## Imports

In [122]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression

## Load Data

In [123]:
# Read the EDA-stage college table from disk and print its column summary
# (non-null counts and dtypes).
eda_csv_path = './data/college_data_EDA.csv'
college_data = pd.read_csv(eda_csv_path)
college_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  349 non-null    object 
 1   Private               349 non-null    int64  
 2   Top10perc             349 non-null    int64  
 3   Top25perc             349 non-null    int64  
 4   Room.Board            349 non-null    int64  
 5   PhD                   349 non-null    int64  
 6   S.F.Ratio             349 non-null    float64
 7   perc.alumni           349 non-null    int64  
 8   Expend                349 non-null    int64  
 9   Grad.Rate             349 non-null    int64  
 10  out_of_state_tuition  349 non-null    float64
 11  state_mean            349 non-null    float64
dtypes: float64(3), int64(8), object(1)
memory usage: 32.8+ KB


## Extract Purdue

In [124]:
# Separate the rows whose name starts with "Purdue" from the rest of the table.
# Caution: college_data is rebound to the filtered frame, so running this cell
# a second time without reloading the CSV leaves `purdue` empty.
is_purdue = college_data['name'].str.match('^Purdue')
purdue = college_data.loc[is_purdue]
college_data = college_data.loc[~is_purdue]
purdue.head()

Unnamed: 0,name,Private,Top10perc,Top25perc,Room.Board,PhD,S.F.Ratio,perc.alumni,Expend,Grad.Rate,out_of_state_tuition,state_mean
198,Purdue University at West Lafayette,0,29,60,3990,86,18.2,15,8604,67,28794.0,34490.31


## Train/Test Split

In [125]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible. Only the target column is removed from the predictors here.
# NOTE(review): state_mean looks like a per-state aggregate -- if it was derived
# from out_of_state_tuition over the full table before this split, it leaks
# test targets into training. Confirm against the EDA notebook.
target = college_data.out_of_state_tuition
predictors = college_data.drop(columns='out_of_state_tuition')
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=67,
)

In [126]:
# Keep the school names aligned with each split, then remove the identifier
# column so only numeric predictors remain in X_train / X_test.
#
# The names are looked up in college_data by row label instead of being read
# from the split frames, and the drop is non-mutating with errors='ignore'.
# Together these make the cell safe to re-run: the original in-place drop
# raised KeyError on the second run because 'name' was already gone.
names_train = college_data.loc[X_train.index, 'name']
names_test = college_data.loc[X_test.index, 'name']
X_train = X_train.drop(columns='name', errors='ignore')
X_test = X_test.drop(columns='name', errors='ignore')
X_train.shape, X_test.shape

((243, 10), (105, 10))

In [127]:
# Show the dtype of every training column so any non-numeric column is visible
# before fitting.
train_dtypes = X_train.dtypes
train_dtypes

Private          int64
Top10perc        int64
Top25perc        int64
Room.Board       int64
PhD              int64
S.F.Ratio      float64
perc.alumni      int64
Expend           int64
Grad.Rate        int64
state_mean     float64
dtype: object

In [128]:
# Count missing values per column (isnull is an alias of isna).
null_counts = college_data.isnull().sum()
null_counts

name                    0
Private                 0
Top10perc               0
Top25perc               0
Room.Board              0
PhD                     0
S.F.Ratio               0
perc.alumni             0
Expend                  0
Grad.Rate               0
out_of_state_tuition    0
state_mean              0
dtype: int64

## Mean Model

Use a naive mean model, which always predicts the training-set mean of out-of-state tuition, as the baseline that later models must beat.

In [129]:
# Baseline: DummyRegressor(strategy='mean') ignores the features and always
# predicts the mean of y_train. fit() returns the estimator, so it can be
# chained onto the constructor.
mean_mod = DummyRegressor(strategy='mean').fit(X_train, y_train)
y_tr_pred, y_te_pred = mean_mod.predict(X_train), mean_mod.predict(X_test)
# The fitted constant is the value predicted for every row.
mean_mod.constant_

array([[33055.11111111]])

In [130]:
# Test-set R^2 of the current predictions, rounded to 6 decimals. A value near
# zero means the predictions do no better than the test-set mean.
test_r2 = r2_score(y_true=y_test, y_pred=y_te_pred)
round(test_r2, 6)

-0.004479

In [131]:
# Test-set mean squared error of the current predictions, rounded to 2
# decimals. Units are the square of the target's units.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_te_pred)
round(test_mse, 2)

201920510.92

## Scale Data and Linear Regression

In [132]:
# Three-step pipeline: standardise the predictors, keep the k best by
# univariate F-test, then fit ordinary least squares.
scaler = StandardScaler()
# k=10 is SelectKBest's default written out explicitly. If X_train has 10
# columns this step keeps every feature, so k must be lowered for it to filter.
selector = SelectKBest(score_func=f_regression, k=10)  # test different k in modeling
regressor = LinearRegression()
pipe = make_pipeline(scaler, selector, regressor)
pipe.fit(X_train, y_train)

In [133]:
# Predict on both splits with the fitted pipeline. This rebinds y_tr_pred and
# y_te_pred, replacing any earlier predictions stored under those names.
y_tr_pred, y_te_pred = [pipe.predict(split) for split in (X_train, X_test)]

In [134]:
# (train R^2, test R^2) for the current predictions.
tuple(
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_tr_pred), (y_test, y_te_pred))
)

(0.7021380419154903, 0.7196611687161416)

In [135]:
# (train MSE, test MSE) for the current predictions.
tuple(
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_tr_pred), (y_test, y_te_pred))
)

(43964426.38613531, 56353771.063597016)

Note: The data was already preprocessed so that every feature is numeric, so no categorical encoding was needed.