In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

#modelling import
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df = pd.read_csv('data/stud.csv')

In [3]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
X=df.drop(columns=['math score'],axis=1)

In [5]:
y=df['math score']

In [6]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [7]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [8]:
#create column transformer with 3-types of transfromer
num_feature=X.select_dtypes(exclude="object").columns
cat_feature=X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numerical_transformer=StandardScaler()
oh_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,cat_feature),
        ("StandardScaler",numerical_transformer,num_feature)
    ]
)

In [9]:
X=preprocessor.fit_transform(X)

In [10]:
X.shape


(1000, 19)

In [11]:
# sepertae datasets into test and train
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [12]:
#evaluate function to give all metrics after model training
def evaluate_model(true, predicted):
    mae=mean_absolute_error(true, predicted)
   # mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,rmse,r2_square
 

In [13]:
models={
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K-Neighbors Regressor":KNeighborsRegressor(),
    "DecisionTree":DecisionTreeRegressor(),
    "Random Forest Regressor":RandomForestRegressor(),
    "CatBoostRegressor":CatBoostRegressor(),
    "XGBRegressor":XGBRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) #traub model

    #make prediction

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    #evakuation Train and test data
    model_train_mae, model_train_rmse , model_train_r2 = evaluate_model(y_train,y_train_pred)
    model_test_mae,model_test_rmse,model_test_r2 = evaluate_model(y_test, y_test_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('model performance for Training set')
    print("- Root Mean Squared Error: {:4f}".format(model_train_rmse))
    print("-  Mean Absolute  Error: {:4f}".format(model_train_mae))
    print("- R2- Score : {:4f}".format(model_train_r2))


    print("------------------------")

    print('Model Performance for test set')
    print("- Root Mean Squard Error{:4f}".format(model_test_rmse))
    print("-  Mean Absolute Error{:4f}".format(model_test_mae))
    print("- R2 Score {:4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
model performance for Training set
- Root Mean Squared Error: 5.323051
-  Mean Absolute  Error: 4.266712
- R2- Score : 0.874317
------------------------
Model Performance for test set
- Root Mean Squard Error5.393994
-  Mean Absolute Error4.214763
- R2 Score 0.880433


Lasso
model performance for Training set
- Root Mean Squared Error: 6.593816
-  Mean Absolute  Error: 5.206303
- R2- Score : 0.807146
------------------------
Model Performance for test set
- Root Mean Squard Error6.519695
-  Mean Absolute Error5.157882
- R2 Score 0.825320


Ridge
model performance for Training set
- Root Mean Squared Error: 5.323325
-  Mean Absolute  Error: 4.264988
- R2- Score : 0.874304
------------------------
Model Performance for test set
- Root Mean Squard Error5.390387
-  Mean Absolute Error4.211101
- R2 Score 0.880593


K-Neighbors Regressor
model performance for Training set
- Root Mean Squared Error: 5.707683
-  Mean Absolute  Error: 4.516750
- R2- Score : 0.855498
----------

In [14]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['model name','R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,model name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.880433
5,Random Forest Regressor,0.854246
6,CatBoostRegressor,0.851632
8,AdaBoostRegressor,0.846657
7,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Neighbors Regressor,0.783813
4,DecisionTree,0.758669


In [15]:
#Linear Regression 
lin_model=LinearRegression(fit_intercept=True)
lin_model=lin_model.fit(X_train,y_train)
y_pred=lin_model.predict(X_test)
score=r2_score(y_test, y_pred)*100
print("Accuracy of the model is %.2f" %score)

Accuracy of the model is 88.04


In [20]:
pred_df=pd.DataFrame({'Actual Value': y_test, 'prediction value':y_pred, 'Difference': y_test-y_test_pred})
pred_df

Unnamed: 0,Actual Value,prediction value,Difference
521,91,76.387970,13.333333
737,53,58.885970,-3.056338
740,80,76.990265,2.465753
660,74,76.851804,-2.917073
411,84,87.627378,2.140351
...,...,...,...
408,52,43.409149,3.694656
332,62,62.152214,4.926606
208,74,67.888395,5.740741
613,65,67.022287,-3.698718
