## Machine Learning & Deep Learning 1 Assignment 2
### 2023-29914 Chan Gyu Lee

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.feature_selection import SequentialFeatureSelector
import matplotlib.pyplot as plt

In [4]:
# Import Dataset
df_College = pd.read_csv("College.csv", index_col = 0)
df_College.info()
df_College.head()

<class 'pandas.core.frame.DataFrame'>
Index: 777 entries, Abilene Christian University to York College of Pennsylvania
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    object 
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: 

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [5]:
# Quick check of the range of 'Apps' variable.
print('the smallest value of Apps = ',df_College.Apps.min())
print('the largest value of Apps = ',df_College.Apps.max())

the smallest value of Apps =  81
the largest value of Apps =  48094


In [6]:
# Data Manipulation
df_College.Private.unique() # has two values (yes, no)
df_College['Private'] = df_College.Private.factorize()[0] # Change the categorial variable -> numeric factor.
df_College.info()

<class 'pandas.core.frame.DataFrame'>
Index: 777 entries, Abilene Christian University to York College of Pennsylvania
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    int64  
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: 

In [7]:
#1.a) Randomly Split the data
X_train, X_test, y_train, y_test = train_test_split(df_College.loc[:,df_College.columns != 'Apps'],
                                                    df_College.Apps, test_size = 0.1, random_state = 1996)

In [8]:
regr = LinearRegression()
regr.fit(X_train, y_train) #fit a regression
apps_pred = regr.predict(X_test) # predict using test set.

In [9]:
mean_squared_error(y_test, apps_pred) # Test Error obtained.

819169.0394109311

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

In [11]:
# 1.b) Ridge (Updated)
alphas = 10**np.linspace(10,-2,100)*0.5 # Initial set of alphas
ridgeModel = Ridge()

In [20]:
# Cross-Validation
def cross_validation(model, list_X, y, data, num_k, alphas):
    # Shuffle the whole data.
    data = data.sample(frac = 1, random_state = 96).reset_index(drop = True) 
    X = data[list_X]
    y = data[y]
    n = len(data)
    size_fold = n//num_k
    
    evaluation = [] # to store final result.
    
    # Iterate the validating process for 5 times.
    for i in range(num_k):
        start = i*size_fold
        end = (i+1)*size_fold
        X_train = X.drop(X.index[start:end])
        y_train = y.drop(y.index[start:end])
        X_test = X[start:end]
        y_test = y[start:end]

        # calculate MSE for each alpha input.
        ridgecv = RidgeCV(alphas = alphas)
        ridgecv.fit(scaler.transform(X_train), y_train)
        model.set_params(alpha = ridgecv.alpha_)
        
        # Fit the model for each validation set.
        model.fit(scaler.transform(X_train), y_train)
        evaluation.append([i, ridgecv.alpha_, mean_squared_error(y_test, model.predict(scaler.transform(X_test)))])

        
    return evaluation #round(np.average(evaluation), 5) # evaluate the result round to 5 decimals.

In [21]:
trainData = pd.concat([y_train, X_train], axis = 1)
trainData
list_X = list(trainData.loc[:, trainData.columns != 'Apps'].columns)
y = 'Apps'
result = cross_validation(ridgeModel, list_X, y, trainData, 10, alphas)

In [271]:
result

[[0, 0.005, 827629.784716583],
 [1, 0.005, 855720.0017959789],
 [2, 0.005, 1309431.6691319437],
 [3, 0.005, 1080983.0517577277],
 [4, 2.320794416806386, 2963386.954730386],
 [5, 0.005, 1188604.9306884871],
 [6, 0.005, 1127591.729520765],
 [7, 0.005, 1166702.8231651543],
 [8, 0.005, 1651930.1910290048],
 [9, 0.005, 1097962.0643972822]]

In [22]:
df_result = pd.DataFrame(result, columns = ['CV', 'alpha', 'MSE'])

In [23]:
# alpha calculated by 10-fold cross validation
df_result.alpha.mean()

0.3112953636706581

In [24]:
# Fit into the final model.
ridgeModel2 = Ridge(alpha = df_result.alpha.mean())
ridgeModel2.fit(scaler.transform(X_train), y_train)
pred = ridgeModel2.predict(scaler.transform(X_test))
mean_squared_error(y_test, pred)

815001.5279444703

In [25]:
#1.c) Lasso
lassoModel = Lasso()

# Cross-Validation
def cross_validation_lasso(model, list_X, y, data, num_k, alphas):
    # Shuffle the whole data.
    data = data.sample(frac = 1, random_state = 96).reset_index(drop = True) 
    X = data[list_X]
    y = data[y]
    n = len(data)
    size_fold = n//num_k
    
    evaluation = [] # to store final result.
    
    # Iterate the validating process for 5 times.
    for i in range(num_k):
        start = i*size_fold
        end = (i+1)*size_fold
        X_train = X.drop(X.index[start:end])
        y_train = y.drop(y.index[start:end])
        X_test = X[start:end]
        y_test = y[start:end]

        # calculate MSE for each alpha input.
        lassocv = LassoCV(alphas = None, max_iter=1000)
        lassocv.fit(scaler.transform(X_train), y_train)
        model.set_params(alpha = lassocv.alpha_, max_iter = 1000)
        
        # Fit the model for each validation set.
        model.fit(scaler.transform(X_train), y_train)
        evaluation.append([i, lassocv.alpha_, mean_squared_error(y_test, model.predict(scaler.transform(X_test)))])

        
    return evaluation

In [26]:
list_X = list(trainData.loc[:, trainData.columns != 'Apps'].columns)
y = 'Apps'
result_lasso = cross_validation_lasso(lassoModel, list_X, y, trainData, 10, alphas)

In [27]:
df_result_lasso = pd.DataFrame(result_lasso, columns = ['CV', 'alpha', 'MSE'])

In [28]:
df_result_lasso.alpha.mean()

14.864560115399831

In [29]:
# Fit the model
lassoModel2 = Lasso(alpha = df_result_lasso.alpha.mean())
lassoModel2.fit(scaler.transform(X_train), y_train)
pred2 = lassoModel2.predict(scaler.transform(X_test))
mean_squared_error(y_test, pred2)

720823.2457449477

In [30]:
# the list of non-zero coefficient estimates.
pd.Series(lassoModel2.coef_, index=X_train.columns)

Private         220.248169
Accept         3834.442824
Enroll         -391.932999
Top10perc       774.735882
Top25perc      -199.494825
F.Undergrad       0.000000
P.Undergrad      61.339827
Outstate       -304.572692
Room.Board      142.997653
Books            -0.000000
Personal          0.000000
PhD            -124.007623
Terminal        -19.407439
S.F.Ratio        35.711987
perc.alumni      -0.000000
Expend          357.447689
Grad.Rate       103.920837
dtype: float64

## Question 2

In [31]:
df_Boston = pd.read_csv('Boston.csv',index_col = 0)
df_Boston.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 1 to 506
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  lstat    506 non-null    float64
 12  medv     506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 55.3 KB


In [32]:
# 2.a)
# Split Dataset.
X_train2, X_test2, y_train2, y_test2 = train_test_split(df_Boston.loc[:,df_Boston.columns != 'crim'], df_Boston.crim, test_size = 0.1, random_state = 1996)

In [47]:
# Forward Selection
selector_forward = SequentialFeatureSelector(LinearRegression(), 
                                            direction = 'forward', scoring = 'neg_mean_squared_error')
selector_forward.fit(X_train2, y_train2)

SequentialFeatureSelector(estimator=LinearRegression(),
                          scoring='neg_mean_squared_error')

In [48]:
    # Selected Variables
selected_forward = X_train2.columns[selector_forward.get_support(indices = True)]
selected_forward

Index(['nox', 'dis', 'rad', 'ptratio', 'lstat', 'medv'], dtype='object')

In [49]:
    # Model Tune and Fitting
model_forward = LinearRegression()
model_forward.fit(X_train2[selected_forward], y_train2)
    # MSE for Forward Selection
mse_Forward = mean_squared_error(y_test2, model_forward.predict(X_test2[selected_forward]))

In [50]:
mse_Forward

13.728758007070557

In [51]:
# Backward Selection
selector_backward = SequentialFeatureSelector(LinearRegression(), 
                                            direction = 'backward', scoring = 'neg_mean_squared_error')
selector_backward.fit(X_train2, y_train2)

SequentialFeatureSelector(direction='backward', estimator=LinearRegression(),
                          scoring='neg_mean_squared_error')

In [52]:
    # Selected Variables
selected_backward = X_train2.columns[selector_backward.get_support(indices = True)]
selected_backward

Index(['zn', 'nox', 'dis', 'rad', 'ptratio', 'medv'], dtype='object')

In [53]:
    # Model Tune and Fitting
model_backward = LinearRegression()
model_backward.fit(X_train2[selected_backward], y_train2)
    # MSE for backward Selection
mse_Backward = mean_squared_error(y_test2, model_backward.predict(X_test2[selected_backward]))

In [54]:
mse_Backward

14.849203623484309

In [33]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train2)

In [35]:
# Ridge
trainData_Boston = pd.concat([y_train2, X_train2], axis = 1)
ridgeModel = Ridge()
list_X = list(trainData_Boston.loc[:, trainData_Boston.columns != 'crim'].columns)
y = 'crim'
result_ridge_Boston = cross_validation(ridgeModel, list_X, y, trainData_Boston, 10, alphas) #10 fold validation

In [36]:
df_result_boston = pd.DataFrame(result, columns = ['CV', 'alpha', 'MSE'])

In [37]:
df_result_boston.alpha.mean()

0.3112953636706581

In [39]:
ridgeModel2_boston = Ridge(alpha = df_result_boston.alpha.mean())
ridgeModel2_boston.fit(scaler.transform(X_train2), y_train2)
pred_boston = ridgeModel2_boston.predict(scaler.transform(X_test2))
mse_Ridge = mean_squared_error(y_test2, pred_boston)

In [40]:
mse_Ridge

15.01259635593882

In [41]:
# Lasso
lassoModel = Lasso()
list_X = list(trainData_Boston.loc[:, trainData_Boston.columns != 'crim'].columns)
y = 'crim'
result_lasso_boston = cross_validation_lasso(lassoModel, list_X, y, trainData_Boston, 10, alphas)

In [42]:
df_result_lasso_boston = pd.DataFrame(result_lasso_boston, columns = ['CV', 'alpha', 'MSE'])

In [43]:
df_result_lasso_boston.alpha.mean()

0.03152287839583976

In [44]:
lassoModel2_boston = Lasso(alpha = df_result_lasso_boston.alpha.mean())
lassoModel2_boston.fit(scaler.transform(X_train2), y_train2)
pred2 = lassoModel2_boston.predict(scaler.transform(X_test2))
mse_Lasso = mean_squared_error(y_test2, pred2)

In [45]:
mse_Lasso

14.111579290011868

In [55]:
# 2.b)
# Create a model performance table for comparison
modelPerformance = pd.DataFrame({'Model' :['Forward', 'Backward', 'Lasso', 'Ridge'],
             'MSE': [mse_Forward, mse_Backward, mse_Lasso, mse_Ridge]})
modelPerformance.sort_values(by = 'MSE', ascending = True)

Unnamed: 0,Model,MSE
0,Forward,13.728758
2,Lasso,14.111579
1,Backward,14.849204
3,Ridge,15.012596


In [56]:
pd.Series(ridgeModel2_boston.coef_, index = X_train2.columns)

zn         1.219803
indus     -0.393448
chas      -0.172680
nox       -1.321009
rm         0.557232
age       -0.002067
dis       -2.381810
rad        5.661526
tax       -0.760496
ptratio   -0.767536
lstat      0.952548
medv      -2.316412
dtype: float64

In [57]:
pd.Series(lassoModel2_boston.coef_, index=X_train2.columns)

zn         1.089325
indus     -0.429543
chas      -0.148237
nox       -1.101131
rm         0.469548
age       -0.000000
dis       -2.108762
rad        5.268887
tax       -0.350672
ptratio   -0.659894
lstat      0.937450
medv      -2.086144
dtype: float64