In [101]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error


In [102]:
# Read the CSV file
df = pd.read_csv('College.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [103]:
# Read after 3rd column and change to Numpy array
start_column_index = 2
X = df.iloc[:, start_column_index:].values
y = df.iloc[:, start_column_index:].values

In [104]:
# Random sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=87)


In [105]:
X_train, len(X_train)


(array([[8.000e+03, 4.556e+03, 1.464e+03, ..., 2.100e+01, 5.280e+03,
         6.300e+01],
        [2.281e+03, 1.870e+03, 1.408e+03, ..., 8.000e+00, 5.916e+03,
         4.500e+01],
        [1.310e+03, 1.086e+03, 4.580e+02, ..., 3.500e+01, 7.215e+03,
         8.100e+01],
        ...,
        [2.409e+03, 1.939e+03, 7.590e+02, ..., 1.000e+00, 5.968e+03,
         4.600e+01],
        [5.350e+02, 5.020e+02, 2.230e+02, ..., 1.500e+01, 7.114e+03,
         5.100e+01],
        [2.307e+03, 1.896e+03, 5.090e+02, ..., 1.600e+01, 7.120e+03,
         8.200e+01]]),
 699)

In [106]:
X_test, len(X_test)

(array([[9.2000e+02, 6.8400e+02, 2.2500e+02, ..., 1.9000e+01, 7.3600e+03,
         6.7000e+01],
        [1.0160e+03, 8.7200e+02, 3.0000e+02, ..., 3.6000e+01, 7.4110e+03,
         7.0000e+01],
        [3.5500e+02, 3.0000e+02, 1.4200e+02, ..., 2.5000e+01, 8.9540e+03,
         6.5000e+01],
        ...,
        [3.6700e+02, 2.7400e+02, 1.5800e+02, ..., 1.2000e+01, 5.9350e+03,
         4.9000e+01],
        [9.7400e+02, 7.0400e+02, 2.9000e+02, ..., 9.0000e+00, 1.1641e+04,
         5.7000e+01],
        [4.8560e+03, 2.4920e+03, 7.2700e+02, ..., 4.5000e+01, 1.5494e+04,
         9.3000e+01]]),
 78)

In [107]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
lr_error = mean_squared_error(y_test, lr_preds)

print("Linear Regression's test error : ", lr_error)


Linear Regression's test error :  4.318602405044083e-24


In [108]:
# Ridge Regression
ridge_model = Ridge()
alpha_values = {'alpha': np.logspace(-4, 0, 50)}

In [109]:
# Use GridSearchCV for 10-fold corss validation
ridge_grid_search = GridSearchCV(ridge_model, alpha_values, scoring='neg_mean_squared_error', cv=10)

In [110]:
ridge_grid_search.fit(X_train, y_train)
ridge_predictions = ridge_grid_search.predict(X_test)

In [111]:
# Mean squared error of prediction
ridge_mse = mean_squared_error(y_test, ridge_predictions)
print("Ridge Regression's test error : ", ridge_mse)

Ridge Regression's test error :  2.1939080024636782e-16


In [112]:
# Lasso Regression
lasso_model = Lasso()
alpha_values = {'alpha': np.logspace(-4, 0, 50)}

In [113]:
# # Use GridSearchCV for 10-fold corss validation
lasso_grid_search = GridSearchCV(lasso_model, alpha_values, scoring='neg_mean_squared_error', cv=10)

In [114]:
lasso_grid_search.fit(X_train, y_train)
lasso_predictions = lasso_grid_search.predict(X_test)

In [115]:
# Mean squared error of prediction
lasso_mse = mean_squared_error(y_test, lasso_predictions)
print("Lasso Regression's test error : ", lasso_error)

Lasso Regression's test error :  0.001282760725403851


In [116]:
# Number of Non-zero coefficient estimates
non_zero_coefficients = np.sum(lasso_grid_search.best_estimator_.coef_ != 0)
print("Number of non-zero coefficients : ", non_zero_coefficients)

Number of non-zero coefficients :  184


예측 정확도는 Linear regression 모델의 테스트 오류가 가장 낮으므로 세 모델 중 가장 정확하다. Ridge regression 모델은 테스트 오류가 Linear regression 모델에 비해 약간 높지만 0에 가깝고 이는 세 모델 중 테스트 오류가 가장 높은 Lasso regression 모델도 마찬가지 이다. 따라서 세 모델 모두 대학 지원서 접수 수를 잘 예측할 수 있음을 나타낸다.

모델 차이점으로는 Linear regression 모델과 Ridge regression 모델은 예측을 위해 모든 특성을 사용하는 반면 Lasso regression 모델은 특성 선택을 수행하고 200개 중 0이 아닌 계수 184개만 사용한다. 이는 Lasso regression 모델이 다른 두 모델에 비해 더 단순하다는 것을 의미하며 다른 두 모델에 비해 과적합을 피할 수 있음을 나타낸다.

