In [64]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [65]:
df = pd.read_csv('D:/mlproject/notebook/data/data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [66]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [67]:
df.shape

(1000, 8)

In [68]:
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [69]:
df.duplicated().sum()

0

In [70]:
df.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

In [71]:
df['total_score'] = df['math score'] + df['reading score'] + df['writing score']
df['average_score'] = df['total_score'] / 3

In [72]:
X = df.drop(columns=['math score'], axis = 1)
y = df['math score']

In [73]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler

one = OneHotEncoder()
st = StandardScaler()

ct = ColumnTransformer(transformers=[('ONE', one, cat_features)
                                     , ('standard_scalar', st, num_features)],
                                     remainder='passthrough')


In [74]:
X = ct.fit_transform(X)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [76]:
def evaluation_model(true, predicted):
    mae = mean_absolute_error(predicted, true)
    mse = mean_squared_error(predicted, true)
    r2 = r2_score(predicted, true)
    return mae, mse, r2

In [81]:
X_train.shape, y_train.shape

((750, 21), (750,))

In [88]:
models = {
    "LinearRegression":LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(),
    "RandomForestRegressor" : RandomForestRegressor(),
    "XGBRegressor" : XGBRegressor(),
    "AdaBoostRegressor" : AdaBoostRegressor(),
    "CatBoostRegressor" : CatBoostRegressor(),
    "KNeighborsRegressor" : KNeighborsRegressor(),
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mae, train_mse, train_r2 = evaluation_model(y_train, y_train_pred)

    test_mae, test_mse, test_r2 = evaluation_model(y_test, y_test_pred)

    print("\n\n", list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    r2_list.append(test_r2)
    print("\ntrain mae:", train_mae)
    print("\ntrain mse:", train_mse)
    print("\ntrain r2:", train_r2)
    print("\ntest mae:", test_mae)
    print("\ntest mse:", test_mse)
    print("\ntest r2:", test_r2)




 LinearRegression

train mae: 1.0672351891116704e-13

train mse: 1.759111855899819e-26

train r2: 1.0

test mae: 1.1144152267661411e-13

test mse: 2.059651452482196e-26

test r2: 1.0


 Lasso

train mae: 3.715647908194945

train mse: 22.02104601893464

train r2: 0.8672453955418763

test mae: 3.7994027066963616

test mse: 23.102931722510437

test r2: 0.8790296287070711


 Ridge

train mae: 0.289821285740607

train mse: 0.13089724475516268

train r2: 0.9994049783670467

test mae: 0.2966082095992915

test mse: 0.14007970361788857

test r2: 0.9994236218920672


 DecisionTreeRegressor

train mae: 0.0

train mse: 0.0

train r2: 1.0

test mae: 3.432

test mse: 22.808

test r2: 0.9116458898665862


 RandomForestRegressor

train mae: 0.8406266666666665

train mse: 1.1887673333333333

train r2: 0.9944417018812073

test mae: 2.4516000000000004

test mse: 11.566595200000004

test r2: 0.9510268763761488


 XGBRegressor

train mae: 0.0781027234395345

train mse: 0.011849101022095661

train r2: 0.9

In [93]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model', 'R2_score']).sort_values(by=['R2_score'], ascending=False)

Unnamed: 0,Model,R2_score
0,LinearRegression,1.0
2,Ridge,0.999424
5,XGBRegressor,0.966781
7,CatBoostRegressor,0.964205
4,RandomForestRegressor,0.951027
3,DecisionTreeRegressor,0.911646
6,AdaBoostRegressor,0.9046
1,Lasso,0.87903
8,KNeighborsRegressor,0.864554


In [94]:
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)

In [95]:
y_test_pred = model.predict(X_test)

In [96]:
pred_df = pd.DataFrame({'Actual Value':y_test, 'Predicted Value':y_test_pred, 'Difference (y_test_pred - y_test)':y_test_pred - y_test})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference (y_test_pred - y_test)
521,91,91.0,1.421085e-13
737,53,53.0,0.000000e+00
740,80,80.0,2.842171e-14
660,74,74.0,2.842171e-14
411,84,84.0,1.136868e-13
...,...,...,...
109,70,70.0,1.421085e-14
430,64,64.0,-7.105427e-14
77,80,80.0,1.421085e-13
84,42,42.0,-3.126388e-13
