In [25]:
# basic import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Modelling
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor  
from sklearn.metrics import mean_squared_error, r2_score
import warnings

In [46]:
df = pd.read_csv('data/stud.csv')

In [47]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [19]:
X = df.drop("math_score",axis =1)
y = df["math_score"]

In [13]:
print("categorices in gender varibale: ", end =" ")
print(df['gender'].unique())
print("categorices in 'race_ethnicity varibale: ', ", end =" ")
print(df['race_ethnicity'].unique())
print("categorices in 'parental level of education varibale: ', ", end =" ")
print(df['parental_level_of_education'].unique())
print("categorices in 'lunch varibale: ', ", end =" ")
print(df['lunch'].unique())
print("categorices in 'test preparation course varibale: ', ", end =" ")
print(df['test_preparation_course'].unique())

categorices in gender varibale:  ['female' 'male']
categorices in 'race_ethnicity varibale: ',  ['group B' 'group C' 'group A' 'group D' 'group E']
categorices in 'parental level of education varibale: ',  ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
categorices in 'lunch varibale: ',  ['standard' 'free/reduced']
categorices in 'test preparation course varibale: ',  ['none' 'completed']


In [14]:
# create a column transformer with 3 types of transformers

num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

In [15]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformers = StandardScaler()
oh_transformers =OneHotEncoder()
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformers,cat_features),
        ('StandardScaler',numeric_transformers,num_features)
    ]
)


In [20]:
X = preprocessor.fit_transform(X)

In [22]:
X.shape

(1000, 19)

In [23]:
# separate dataset into train and test sets

X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((800, 19), (200, 19), (800,), (200,))

create an evaluate function to give all metrics after model training

In [24]:
def evaluate_model(true,predict):
    mae = mean_squared_error(true,predict)
    r2 = r2_score(true,predict)
    return mae,r2


In [31]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Naive Bayes" :GaussianNB(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Support Vector Regressor": SVR(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    # "CatBoostRegressor": CatBoostRegressor(),
    "XGBoost Regressor": XGBRegressor()
}
model_list =[]
r2_list = []
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # make prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # evaluate model
    mae_train, r2_train = evaluate_model(y_train, y_train_pred)
    mae_test, r2_test = evaluate_model(y_test, y_test_pred)

    model_list.append(list(models.keys())[i])
    r2_list.append((r2_train, r2_test))
    print(list(models.keys())[i])
    print("Model performance for Training set")
    print("- Mean Absolute Error : {:.4f}".format(mae_train))
    print("- R2 Score : {:.4f}\n".format(r2_train))
    print("-"*20)
    print("Model performance for Test set")
    print("- Mean Absolute Error : {:.4f}".format(mae_test))
    print("- R2 Score : {:.4f}\n".format(r2_test))
    print("-"*20)

Linear Regression
Model performance for Training set
- Mean Absolute Error : 28.3469
- R2 Score : 0.8743

--------------------
Model performance for Test set
- Mean Absolute Error : 29.1774
- R2 Score : 0.8801

--------------------
Lasso
Model performance for Training set
- Mean Absolute Error : 43.4784
- R2 Score : 0.8071

--------------------
Model performance for Test set
- Mean Absolute Error : 42.5064
- R2 Score : 0.8253

--------------------
Ridge
Model performance for Training set
- Mean Absolute Error : 28.3378
- R2 Score : 0.8743

--------------------
Model performance for Test set
- Mean Absolute Error : 29.0563
- R2 Score : 0.8806

--------------------
Naive Bayes
Model performance for Training set
- Mean Absolute Error : 392.5300
- R2 Score : -0.7411

--------------------
Model performance for Test set
- Mean Absolute Error : 434.7250
- R2 Score : -0.7865

--------------------
KNeighbors Regressor
Model performance for Training set
- Mean Absolute Error : 32.6859
- R2 Score

# Results

In [38]:
pd.DataFrame(list(zip(models, r2_list)),columns=['Model', 'R2_Score']).sort_values(by =['R2_Score'],ascending=False)

Unnamed: 0,Model,R2_Score
5,Decision Tree Regressor,"(0.9996534669718089, 0.7656761119494498)"
10,XGBoost Regressor,"(0.9954995512962341, 0.8277965784072876)"
7,Random Forest Regressor,"(0.976315394800258, 0.8528041077089902)"
8,Gradient Boosting Regressor,"(0.9050396644022572, 0.8722288225190544)"
2,Ridge,"(0.8743042615212908, 0.8805931485028738)"
0,Linear Regression,"(0.8742639739869204, 0.8800954640546355)"
4,KNeighbors Regressor,"(0.8550176780012468, 0.7836806685669011)"
9,AdaBoost Regressor,"(0.8459247413762656, 0.8499256463133502)"
6,Support Vector Regressor,"(0.8081281585902299, 0.7286001513223704)"
1,Lasso,"(0.8071462015863458, 0.8253197323627853)"


In [45]:
r2_list

[(0.8742639739869204, 0.8800954640546355),
 (0.8071462015863458, 0.8253197323627853),
 (0.8743042615212908, 0.8805931485028738),
 (-0.7411150023150832, -0.786503897453094),
 (0.8550176780012468, 0.7836806685669011),
 (0.9996534669718089, 0.7656761119494498),
 (0.8081281585902299, 0.7286001513223704),
 (0.976315394800258, 0.8528041077089902),
 (0.9050396644022572, 0.8722288225190544),
 (0.8459247413762656, 0.8499256463133502),
 (0.9954995512962341, 0.8277965784072876)]