In [2]:
!pip install catboost

Collecting catboost
  Using cached catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Using cached catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
Using cached graphviz-0.20.3-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [4]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1


In [5]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [7]:
df=pd.read_csv('data/stud.csv')

In [8]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [9]:
x=df.drop(columns='math score',axis=1)

In [10]:
y=df['math score']

In [11]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [12]:
num_features = x.select_dtypes(exclude="object").columns
cat_features = x.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [13]:
X=preprocessor.fit_transform(x)

In [14]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [15]:
X.shape

(1000, 19)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

In [17]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r2_score_val=r2_score(true,predicted)
    return mae,mse,rmse,r2_score_val

In [36]:
models={
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K-Nearest Neighbors":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Random Forest":RandomForestRegressor(),
    "AdaBoost":AdaBoostRegressor(),
    "Support Vector Machine":SVR(),
    "CatBoost":CatBoostRegressor(verbose=False),
    "XGBoost":XGBRegressor()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    model_train_mae,model_train_rmse,model_train_r2,_=evaluate_model(y_train,y_train_pred)
    model_test_mae,model_test_rmse,model_test_r2,_=evaluate_model(y_test,y_test_pred)
    
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    
    print("- Root Mean Squared Error:{: .4f}".format(model_train_rmse))
    print("- Mean Absolute Error:{: .4f}".format(model_train_mae))
    print("- R2 Score:{: .4f}".format(model_train_r2))
    
    print('--------------------------------')
    
    print('Model performance for test set')
    print("- root Mean Squared Error:{: .4f}".format(model_test_rmse))
    print("- Mean Absolute Error:{: .4f}".format(model_test_mae))
    print("- R2 Score:{: .4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
- Root Mean Squared Error: 28.3485
- Mean Absolute Error: 4.2671
- R2 Score: 5.3243
--------------------------------
Model performance for test set
- root Mean Squared Error: 29.1167
- Mean Absolute Error: 4.2158
- R2 Score: 5.3960


Lasso
- Root Mean Squared Error: 43.4784
- Mean Absolute Error: 5.2063
- R2 Score: 6.5938
--------------------------------
Model performance for test set
- root Mean Squared Error: 42.5064
- Mean Absolute Error: 5.1579
- R2 Score: 6.5197


Ridge
- Root Mean Squared Error: 28.3378
- Mean Absolute Error: 4.2650
- R2 Score: 5.3233
--------------------------------
Model performance for test set
- root Mean Squared Error: 29.0563
- Mean Absolute Error: 4.2111
- R2 Score: 5.3904


K-Nearest Neighbors
- Root Mean Squared Error: 32.5776
- Mean Absolute Error: 4.5167
- R2 Score: 5.7077
--------------------------------
Model performance for test set
- root Mean Squared Error: 52.6066
- Mean Absolute Error: 5.6210
- R2 Score: 7.2530


Decision Tree


In [39]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by='R2_Score', ascending=True)



Unnamed: 0,Model Name,R2_Score
2,Ridge,5.390387
0,Linear Regression,5.395987
6,AdaBoost,5.911939
5,Random Forest,5.999832
8,CatBoost,6.008632
9,XGBoost,6.473307
1,Lasso,6.519695
3,K-Nearest Neighbors,7.253041
7,Support Vector Machine,8.126623
4,Decision Tree,8.177714
