In [28]:
 #Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [29]:
df = pd.read_csv('rawdate.csv')

In [30]:
df.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,master's degree,standard,none,62,70,75
1,female,group C,bachelor's degree,free/reduced,completed,66,83,83
2,female,group D,some college,free/reduced,none,79,89,86
3,male,group C,master's degree,free/reduced,none,61,67,66
4,male,group E,high school,standard,none,73,64,57


In [31]:
#Preparting X and Y variables
x = df.drop(columns = "math_score",axis = 1)

In [32]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group D,master's degree,standard,none,70,75
1,female,group C,bachelor's degree,free/reduced,completed,83,83
2,female,group D,some college,free/reduced,none,89,86
3,male,group C,master's degree,free/reduced,none,67,66
4,male,group E,high school,standard,none,64,57
...,...,...,...,...,...,...,...
795,female,group D,master's degree,standard,none,100,100
796,male,group C,bachelor's degree,standard,none,63,61
797,female,group C,associate's degree,standard,none,62,53
798,male,group C,some college,free/reduced,completed,48,53


In [33]:
y = df["math_score"]

In [34]:
y

0      62
1      66
2      79
3      61
4      73
       ..
795    87
796    69
797    53
798    50
799    85
Name: math_score, Length: 800, dtype: int64

In [35]:
#Columns Transformations
num_feature = x.select_dtypes(exclude = 'object').columns
cat_feature = x.select_dtypes(include = 'object').columns

In [36]:
num_feature

Index(['reading_score', 'writing_score'], dtype='object')

In [37]:
cat_feature

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')

In [38]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [39]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",cat_transformer,cat_feature),
        ("StandardScaler",num_transformer,num_feature),
    ]
)


In [40]:
x = preprocessor.fit_transform(x)

In [41]:
x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.03079054,  0.43405338],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.9302895 ,  0.96470125],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.34544287,  1.1636942 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        -0.52274728, -1.02522827],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
        -1.49143847, -1.02522827],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.48382733,  1.36268716]])

In [42]:
x.shape

(800, 19)

In [43]:
#Separate dataset into test and train
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=30)
x_train.shape,x_test.shape

((640, 19), (160, 19))

In [46]:
#Create an Evaluate function to give all matrics after model training

def evaluate_model(true,predicted):
    MAE = mean_absolute_error(true,predicted)
    MSE = mean_squared_error(true,predicted)
    RMSE = np.sqrt(mean_squared_error(true,predicted))
    R2_Square = r2_score(true,predicted)
    return MAE ,MSE ,R2_Square

In [48]:
#MOdel Selections
Models = {
    "LR":LinearRegression(),
    "lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "Decision_tree":DecisionTreeRegressor(),
    "random forest": RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor()
}
model_list =[]
r2_list = []
for i in range(len(list(Models))):
    model = list(Models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(Models.keys())[i])
    model_list.append(list(Models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

LR
Model performance for Training set
- Root Mean Squared Error: 29.2750
- Mean Absolute Error: 4.3426
- R2 Score: 0.8703
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 25.2631
- Mean Absolute Error: 3.9741
- R2 Score: 0.8874


lasso
Model performance for Training set
- Root Mean Squared Error: 44.5820
- Mean Absolute Error: 5.2627
- R2 Score: 0.8025
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 36.3904
- Mean Absolute Error: 4.8515
- R2 Score: 0.8377


Ridge
Model performance for Training set
- Root Mean Squared Error: 29.2711
- Mean Absolute Error: 4.3414
- R2 Score: 0.8703
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 25.2095
- Mean Absolute Error: 3.9679
- R2 Score: 0.8876


KNeighborsRegressor
Model performance for Training set
- Root Mean Squared Error: 35.6756
- Mean Absolute Error: 4.7631
- R2 Score: 0.8419
---------------------------------

In [49]:
#Results
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.887594
0,LR,0.887355
7,CatBoostRegressor,0.854943
8,AdaBoostRegressor,0.845684
1,lasso,0.83774
5,random forest,0.833441
6,XGBRegressor,0.823692
3,KNeighborsRegressor,0.780604
4,Decision_tree,0.717781
