In [3]:
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')


In [4]:
df = pd.read_csv("data/raw.csv")
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
X = df.drop(columns=['math_score'],axis=1)
y = df['math_score']
print("shape of X:-",X.shape)
print("Shape of y:-",y.shape)

shape of X:- (1000, 7)
Shape of y:- (1000,)


In [6]:
num_feature = X.select_dtypes(exclude="object").columns
cat_feature = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transfomer = StandardScaler()
one_hot_encoder = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", one_hot_encoder, cat_feature),
        ("Standardscaler", numeric_transfomer, num_feature),
    ]
)

In [7]:
X = preprocessor.fit_transform(X)

In [10]:
X.shape

(1000, 19)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [13]:
from sklearn.metrics import mean_absolute_error

In [14]:
def evaluate_model(true, predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    r2_sre = r2_score(true, predicted)
    return(mse, mae, r2_sre)
    
    

In [15]:
from xgboost import XGBRegressor

In [20]:
models = {
    
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "CatBoost": CatBoostRegressor(),
    "XGBoost": XGBRegressor(),
   
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    model_tn_mse, model_tn_mae, model_tn_r2score = evaluate_model(y_train, y_train_pred)
    model_ts_mse, model_ts_mae, model_ts_r2score = evaluate_model(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model performance for test set")
    print("- Mean Absolute Error:- {:.4f}".format(model_ts_mae))
    print("- mean Squared Error:- {:.4f}".format(model_ts_mse))
    print("- R2-Score:- {:.4f}".format(model_ts_r2score))
    r2_list.append(model_ts_r2score)
    
    print("*"*50)
    
    print('Model Performance for train set')
    print("- Mean Absolute Error:- {:.4f}".format(model_tn_mae))
    print("- Mean Squared Error:- {:.4f}".format(model_tn_mse))
    print("- R2-Score:- {:.4f}".format(model_tn_r2score))
    
    

Decision Tree
Model performance for test set
- Mean Absolute Error:- 6.1850
- mean Squared Error:- 61.8550
- R2-Score:- 0.7458
**************************************************
Model Performance for train set
- Mean Absolute Error:- 0.0187
- Mean Squared Error:- 0.0781
- R2-Score:- 0.9997
Random Forest
Model performance for test set
- Mean Absolute Error:- 4.6452
- mean Squared Error:- 35.8146
- R2-Score:- 0.8528
**************************************************
Model Performance for train set
- Mean Absolute Error:- 1.8504
- Mean Squared Error:- 5.3745
- R2-Score:- 0.9762
Learning rate set to 0.039525
0:	learn: 14.5987177	total: 2.81ms	remaining: 2.81s
1:	learn: 14.2251886	total: 5.34ms	remaining: 2.67s
2:	learn: 13.8866124	total: 8.04ms	remaining: 2.67s
3:	learn: 13.5235688	total: 10.5ms	remaining: 2.62s
4:	learn: 13.1887021	total: 13.3ms	remaining: 2.65s
5:	learn: 12.9124226	total: 15.6ms	remaining: 2.59s
6:	learn: 12.6000335	total: 18.6ms	remaining: 2.64s
7:	learn: 12.3299057	tot

In [21]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2 score']).sort_values(by=['R2 score'])

Unnamed: 0,Model Name,R2 score
0,Decision Tree,0.745807
3,XGBoost,0.827797
2,CatBoost,0.851632
1,Random Forest,0.85282
