In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings



In [40]:
df=pd.read_csv('data/stud.csv')

In [41]:
df['total_score']=df['math_score']+df['reading_score']+df['writing_score']
df['average_score']=df['total_score']/3
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [42]:
X=df.drop(columns=['math_score'],axis=1)

In [43]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score,total_score,average_score
0,female,group B,bachelor's degree,standard,none,72,74,218,72.666667
1,female,group C,some college,standard,completed,90,88,247,82.333333
2,female,group B,master's degree,standard,none,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,57,44,148,49.333333
4,male,group C,some college,standard,none,78,75,229,76.333333


In [44]:
y=df['math_score']

In [45]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [46]:
numeric_features=X.select_dtypes(exclude="object").columns
print("Numeric features: ",numeric_features)
categorical_features=X.select_dtypes(include="object").columns
print("Categorical features: ",categorical_features)

Numeric features:  Index(['reading_score', 'writing_score', 'total_score', 'average_score'], dtype='object')
Categorical features:  Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [47]:
#Create column transformer with 3 types of transformations
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()
preprocessor=ColumnTransformer(
    [
        ('OneHotEncoder',oh_transformer,categorical_features),
        ('StandardScaler',numeric_transformer,numeric_features)
    ])

In [48]:
print("Columns going into preprocessor:")
print("Numeric: ", numeric_features.tolist())
print("Categorical: ", categorical_features.tolist())


Columns going into preprocessor:
Numeric:  ['reading_score', 'writing_score', 'total_score', 'average_score']
Categorical:  ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [49]:
X_transformed = preprocessor.fit_transform(X)


In [50]:
X_transformed

array([[ 1.        ,  0.        ,  0.        , ...,  0.39149181,
         0.34357423,  0.34357423],
       [ 1.        ,  0.        ,  0.        , ...,  1.31326868,
         1.0219275 ,  1.0219275 ],
       [ 1.        ,  0.        ,  0.        , ...,  1.64247471,
         1.74706375,  1.74706375],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.20107904,
        -0.19443008, -0.19443008],
       [ 1.        ,  0.        ,  0.        , ...,  0.58901542,
         0.46053169,  0.46053169],
       [ 1.        ,  0.        ,  0.        , ...,  1.18158627,
         1.06871048,  1.06871048]])

In [51]:
X.shape

(1000, 9)

In [52]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_transformed,y,test_size=0.2,random_state=42)

In [53]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r2_square=r2_score(true,predicted)
    return mae,mse,rmse,r2_square

In [58]:
models={
    "Linear Regression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "SVR":SVR(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False),
    "XGBRegressor":XGBRegressor()
}
model_list=[]
r2_list=[]
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    mae,mse,rmse,r2_square=evaluate_model(y_test,y_test_pred)
    model_train_mae,model_train_mse,model_train_rmse,model_train_r2_square=evaluate_model(y_train,y_train_pred)
    print(f"Training performance of {list(models.keys())[i]}:")
    print(f"MAE: {model_train_mae}")
    print(f"MSE: {model_train_mse}")
    print(f"RMSE: {model_train_rmse}")
    print(f"R2 Square: {model_train_r2_square}")
    print("-"*35)
    print(f"Testing performance of {list(models.keys())[i]}:")
    print(f"Model: {list(models.keys())[i]}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Square: {r2_square}")
    print("="*35)
    model_list.append(list(models.keys())[i])
    r2_list.append(r2_square)

Training performance of Linear Regression:
MAE: 1.310507258267535e-13
MSE: 2.677425466756323e-26
RMSE: 1.6362840421993741e-13
R2 Square: 1.0
-----------------------------------
Testing performance of Linear Regression:
Model: Linear Regression
MAE: 1.3521628261514707e-13
MSE: 2.915427957305123e-26
RMSE: 1.7074624321797313e-13
R2 Square: 1.0
Training performance of Ridge:
MAE: 0.2724414942037947
MSE: 0.11532820078628774
RMSE: 0.3396000600504772
R2 Square: 0.9994884476076249
-----------------------------------
Testing performance of Ridge:
Model: Ridge
MAE: 0.26842795106636336
MSE: 0.1177835718904815
RMSE: 0.34319611287204504
R2 Square: 0.9995159680022122
Training performance of Lasso:
MAE: 3.743856971975057
MSE: 22.44920124636832
RMSE: 4.738058805710238
R2 Square: 0.9004238119887846
-----------------------------------
Testing performance of Lasso:
Model: Lasso
MAE: 3.7578731421148284
MSE: 22.250793484434894
RMSE: 4.717074674460315
R2 Square: 0.9085602868908693
Training performance of KN

In [59]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model','R2']).sort_values(by='R2',ascending=False)

Unnamed: 0,Model,R2
0,Linear Regression,1.0
1,Ridge,0.999516
9,XGBRegressor,0.977101
8,CatBoostRegressor,0.9681
5,RandomForestRegressor,0.964828
4,DecisionTreeRegressor,0.942816
7,AdaBoostRegressor,0.926226
2,Lasso,0.90856
3,KNeighborsRegressor,0.889781
6,SVR,0.794023
