In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
!pip install xgboost



In [4]:
# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [5]:
data = pd.read_csv('data/StudentsPerformance.csv')

In [6]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Prepare X and Y

In [10]:
X = data.drop(['math score'], axis=1)

In [12]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [13]:
y = data['math score']

In [14]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [18]:
# Create column Transformer with onehotencoder and standardscaler
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

oh_transformer = OneHotEncoder()
sc_transformer = StandardScaler()

preprocessor = ColumnTransformer(
                    [
                        ("OneHotEncoder", oh_transformer, cat_features),
                        ("StandardScaler", sc_transformer, num_features)
                    ]
                )

In [19]:
X = preprocessor.fit_transform(X)

In [20]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [21]:
X.shape

(1000, 19)

In [23]:
# seperate dataset into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=423)

In [24]:
X_train.shape

(800, 19)

In [25]:
X_test.shape

(200, 19)

### Create an Evaluate Function to give all metrics after model Training

In [26]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    
    return mae,mse,r2_square

In [36]:
models = {
    "Linear Regression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "Support Vector":SVR(),
    "K-NeighborsRegressor":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Random Forest":RandomForestRegressor(),
    "AdaBoost":AdaBoostRegressor(),
    "CatBoost":CatBoostRegressor(verbose=False),
    "XG Boost":XGBRegressor()
}

In [40]:
r2_list = []
for algo in models.keys():
    model = models[algo]
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test
    model_train_mae, model_train_mse, model_train_r2_square = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_r2_square = evaluate_model(y_test, y_test_pred)
    
    print("======================")
    print(algo)
    print("R2: {:.4f}".format(model_test_r2_square))
    print("Mean Squared Error: {:.4f}".format(model_test_mse))
    print("Mean Absolute Error: {:.4f}".format(model_test_mae))

    r2_list.append(model_test_r2_square)

Linear Regression
R2: 0.8739
Mean Squared Error: 30.5453
Mean Absolute Error: 4.4402
Ridge
R2: 0.8737
Mean Squared Error: 30.5923
Mean Absolute Error: 4.4251
Lasso
R2: 0.8279
Mean Squared Error: 41.7036
Mean Absolute Error: 5.1130
Support Vector
R2: 0.7173
Mean Squared Error: 68.4870
Mean Absolute Error: 5.7979
K-NeighborsRegressor
R2: 0.7709
Mean Squared Error: 55.4886
Mean Absolute Error: 5.8350
Decision Tree
R2: 0.7257
Mean Squared Error: 66.4400
Mean Absolute Error: 6.2900
Random Forest
R2: 0.8471
Mean Squared Error: 37.0380
Mean Absolute Error: 4.9311
AdaBoost
R2: 0.8321
Mean Squared Error: 40.6712
Mean Absolute Error: 5.1236
CatBoost
R2: 0.8341
Mean Squared Error: 40.1800
Mean Absolute Error: 5.0307
XG Boost
R2: 0.8146
Mean Squared Error: 44.9216
Mean Absolute Error: 5.3317


In [41]:
models_list = models.keys()

In [48]:
pd.DataFrame(list(zip(models_list, r2_list)), columns=['Model Name', 'R2 Square']).sort_values(by=['R2 Square'], ascending=False)

Unnamed: 0,Model Name,R2 Square
0,Linear Regression,0.873912
1,Ridge,0.873718
6,Random Forest,0.847111
8,CatBoost,0.834141
7,AdaBoost,0.832113
2,Lasso,0.827852
9,XG Boost,0.814568
4,K-NeighborsRegressor,0.770949
5,Decision Tree,0.725742
3,Support Vector,0.717292
