In [1]:
# Importing Required Libaries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

#Model Libaries:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Evaluation Metrics:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV


import warnings
warnings.simplefilter("ignore")

In [2]:
data=pd.read_csv(r"E:\End to End ML projects\student performance with Github actions\notebook\Data/stud.csv")

In [3]:
data.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Preparing X and Y Variables:
x=data.drop(columns=["math_score"],axis=1)
y=data["math_score"]

In [5]:
x.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [6]:
print("Categories in gender variables:  ",end="  ")
print(data["gender"].unique())

print("Categoriers in race_ethnicity:  ",end="  ")
print(data["race_ethnicity"].unique())

print("Categorires in parental_level_of_education:  ",end="  ")
print(data["parental_level_of_education"].unique())

print("Categories in lunch:  ",end="  ")
print(data["lunch"].unique())

print("Categories in test_preparation_course:  ",end="  ")
print(data["test_preparation_course"].unique())

Categories in gender variables:    ['female' 'male']
Categoriers in race_ethnicity:    ['group B' 'group C' 'group A' 'group D' 'group E']
Categorires in parental_level_of_education:    ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in lunch:    ['standard' 'free/reduced']
Categories in test_preparation_course:    ['none' 'completed']


In [7]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [8]:
# Create column Transform with three types of transformers:
num_features=x.select_dtypes(exclude="object").columns
cat_features=x.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer=StandardScaler()
categorical_transformer=OneHotEncoder()

preprocessor=ColumnTransformer(
[
    ("OneHotEncoder",categorical_transformer,cat_features),
    ("StandardScaler",numeric_transformer,num_features),
]
)

In [9]:
x=preprocessor.fit_transform(x)

In [10]:
x.shape

(1000, 19)

In [11]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(800, 19)
(200, 19)
(800,)
(200,)


In [13]:
# Create an Evaluate Function to give all metrics for all Models:
def evaluate(true,predicted):
    MAE=mean_absolute_error(true,predicted)
    RMSE=np.sqrt(mean_squared_error(true,predicted))
    R2=r2_score(true,predicted)
    return MAE,RMSE,R2

In [14]:
models={
    "LinearRegression":LinearRegression(),
    "KNearestNeighbors":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "catBoostingRegressor":CatBoostRegressor(),
    "XgboostRegressor":XGBRegressor()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    #Make_predictions:
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    
    # Evaluate train and test  dataset:
    model_train_MAE,model_train_RMSE,model_train_R2=evaluate(y_train,y_train_pred)
    model_test_MAE,model_test_RMSE,model_test_R2=evaluate(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model Performance for Training set")
    print("RMSE:{:.4f}".format(model_train_RMSE))
    print("MAE:{:.4f}".format(model_train_MAE))
    print("R2:{:.4f}".format(model_train_R2))
    
    print("---------------------------------------------")
    
    print("Model Performance for Testing set")
    print("RMSE:{:.4f}".format(model_test_RMSE))
    print("MAE:{:.4f}".format(model_test_MAE))
    print("R2:{:.4f}".format(model_test_R2))
    
    r2_list.append(model_test_R2)
    
    print('='*35)
    print('\n')

LinearRegression
Model Performance for Training set
RMSE:5.3231
MAE:4.2667
R2:0.8743
---------------------------------------------
Model Performance for Testing set
RMSE:5.3940
MAE:4.2148
R2:0.8804


KNearestNeighbors
Model Performance for Training set
RMSE:5.6989
MAE:4.5038
R2:0.8559
---------------------------------------------
Model Performance for Testing set
RMSE:7.2520
MAE:5.6360
R2:0.7839


DecisionTreeRegressor
Model Performance for Training set
RMSE:0.2795
MAE:0.0187
R2:0.9997
---------------------------------------------
Model Performance for Testing set
RMSE:7.7343
MAE:6.1800
R2:0.7542


RandomForestRegressor
Model Performance for Training set
RMSE:2.2778
MAE:1.8236
R2:0.9770
---------------------------------------------
Model Performance for Testing set
RMSE:6.0184
MAE:4.6425
R2:0.8512


AdaBoostRegressor
Model Performance for Training set
RMSE:5.7885
MAE:4.7411
R2:0.8514
---------------------------------------------
Model Performance for Testing set
RMSE:6.1928
MAE:4.8440


168:	learn: 4.9176688	total: 491ms	remaining: 2.41s
169:	learn: 4.9103654	total: 492ms	remaining: 2.4s
170:	learn: 4.9064572	total: 494ms	remaining: 2.39s
171:	learn: 4.9019965	total: 495ms	remaining: 2.38s
172:	learn: 4.8968162	total: 499ms	remaining: 2.38s
173:	learn: 4.8911231	total: 501ms	remaining: 2.38s
174:	learn: 4.8876302	total: 502ms	remaining: 2.37s
175:	learn: 4.8852597	total: 503ms	remaining: 2.36s
176:	learn: 4.8806597	total: 505ms	remaining: 2.35s
177:	learn: 4.8772997	total: 506ms	remaining: 2.34s
178:	learn: 4.8740388	total: 507ms	remaining: 2.33s
179:	learn: 4.8694370	total: 508ms	remaining: 2.31s
180:	learn: 4.8612687	total: 511ms	remaining: 2.31s
181:	learn: 4.8586731	total: 512ms	remaining: 2.3s
182:	learn: 4.8540362	total: 514ms	remaining: 2.29s
183:	learn: 4.8513024	total: 515ms	remaining: 2.28s
184:	learn: 4.8475101	total: 516ms	remaining: 2.27s
185:	learn: 4.8456917	total: 518ms	remaining: 2.27s
186:	learn: 4.8420586	total: 519ms	remaining: 2.25s
187:	learn: 4.

362:	learn: 4.2655659	total: 743ms	remaining: 1.3s
363:	learn: 4.2621172	total: 745ms	remaining: 1.3s
364:	learn: 4.2591851	total: 747ms	remaining: 1.3s
365:	learn: 4.2576685	total: 749ms	remaining: 1.3s
366:	learn: 4.2534324	total: 751ms	remaining: 1.3s
367:	learn: 4.2502167	total: 753ms	remaining: 1.29s
368:	learn: 4.2451483	total: 755ms	remaining: 1.29s
369:	learn: 4.2440158	total: 756ms	remaining: 1.29s
370:	learn: 4.2403292	total: 757ms	remaining: 1.28s
371:	learn: 4.2367180	total: 758ms	remaining: 1.28s
372:	learn: 4.2334959	total: 764ms	remaining: 1.28s
373:	learn: 4.2303575	total: 765ms	remaining: 1.28s
374:	learn: 4.2273812	total: 767ms	remaining: 1.28s
375:	learn: 4.2261691	total: 768ms	remaining: 1.27s
376:	learn: 4.2259639	total: 769ms	remaining: 1.27s
377:	learn: 4.2243465	total: 770ms	remaining: 1.27s
378:	learn: 4.2225954	total: 771ms	remaining: 1.26s
379:	learn: 4.2197009	total: 772ms	remaining: 1.26s
380:	learn: 4.2163562	total: 775ms	remaining: 1.26s
381:	learn: 4.214

555:	learn: 3.8150301	total: 1.01s	remaining: 809ms
556:	learn: 3.8135293	total: 1.01s	remaining: 808ms
557:	learn: 3.8088813	total: 1.02s	remaining: 806ms
558:	learn: 3.8087404	total: 1.02s	remaining: 804ms
559:	learn: 3.8071435	total: 1.02s	remaining: 801ms
560:	learn: 3.8057335	total: 1.02s	remaining: 799ms
561:	learn: 3.8033701	total: 1.02s	remaining: 796ms
562:	learn: 3.8032152	total: 1.02s	remaining: 794ms
563:	learn: 3.8013097	total: 1.02s	remaining: 792ms
564:	learn: 3.8000038	total: 1.02s	remaining: 790ms
565:	learn: 3.7980261	total: 1.03s	remaining: 787ms
566:	learn: 3.7947458	total: 1.03s	remaining: 785ms
567:	learn: 3.7943120	total: 1.03s	remaining: 782ms
568:	learn: 3.7922059	total: 1.03s	remaining: 780ms
569:	learn: 3.7889911	total: 1.03s	remaining: 777ms
570:	learn: 3.7873916	total: 1.03s	remaining: 775ms
571:	learn: 3.7826445	total: 1.03s	remaining: 772ms
572:	learn: 3.7817003	total: 1.03s	remaining: 770ms
573:	learn: 3.7800454	total: 1.03s	remaining: 768ms
574:	learn: 

763:	learn: 3.4086148	total: 1.29s	remaining: 398ms
764:	learn: 3.4079067	total: 1.29s	remaining: 397ms
765:	learn: 3.4074940	total: 1.29s	remaining: 396ms
766:	learn: 3.4062871	total: 1.3s	remaining: 395ms
767:	learn: 3.4046004	total: 1.3s	remaining: 394ms
768:	learn: 3.4040127	total: 1.31s	remaining: 392ms
769:	learn: 3.4003988	total: 1.31s	remaining: 391ms
770:	learn: 3.3997377	total: 1.31s	remaining: 390ms
771:	learn: 3.3991853	total: 1.31s	remaining: 389ms
772:	learn: 3.3972454	total: 1.32s	remaining: 387ms
773:	learn: 3.3962298	total: 1.32s	remaining: 385ms
774:	learn: 3.3940028	total: 1.32s	remaining: 384ms
775:	learn: 3.3896172	total: 1.32s	remaining: 382ms
776:	learn: 3.3854265	total: 1.33s	remaining: 381ms
777:	learn: 3.3850723	total: 1.33s	remaining: 379ms
778:	learn: 3.3838424	total: 1.33s	remaining: 378ms
779:	learn: 3.3801107	total: 1.33s	remaining: 376ms
780:	learn: 3.3785952	total: 1.33s	remaining: 375ms
781:	learn: 3.3773712	total: 1.34s	remaining: 373ms
782:	learn: 3.

929:	learn: 3.1448698	total: 1.71s	remaining: 129ms
930:	learn: 3.1434662	total: 1.73s	remaining: 128ms
931:	learn: 3.1419196	total: 1.75s	remaining: 128ms
932:	learn: 3.1410541	total: 1.78s	remaining: 128ms
933:	learn: 3.1393904	total: 1.78s	remaining: 126ms
934:	learn: 3.1390253	total: 1.8s	remaining: 125ms
935:	learn: 3.1378242	total: 1.8s	remaining: 123ms
936:	learn: 3.1356205	total: 1.81s	remaining: 122ms
937:	learn: 3.1344318	total: 1.81s	remaining: 120ms
938:	learn: 3.1327657	total: 1.87s	remaining: 122ms
939:	learn: 3.1314341	total: 1.87s	remaining: 120ms
940:	learn: 3.1283843	total: 1.87s	remaining: 118ms
941:	learn: 3.1281205	total: 1.88s	remaining: 116ms
942:	learn: 3.1263146	total: 1.88s	remaining: 113ms
943:	learn: 3.1254097	total: 1.88s	remaining: 111ms
944:	learn: 3.1230726	total: 1.88s	remaining: 109ms
945:	learn: 3.1229211	total: 1.88s	remaining: 107ms
946:	learn: 3.1221827	total: 1.88s	remaining: 105ms
947:	learn: 3.1200410	total: 1.89s	remaining: 103ms
948:	learn: 3.

# Results

In [20]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=["Model_Name","R2_score"]).sort_values(by=["R2_score"],ascending=False)

Unnamed: 0,Model_Name,R2_score
0,LinearRegression,0.880433
5,catBoostingRegressor,0.851632
3,RandomForestRegressor,0.851151
4,AdaBoostRegressor,0.842397
6,XgboostRegressor,0.821589
1,KNearestNeighbors,0.783876
2,DecisionTreeRegressor,0.75417


In [24]:
linear=LinearRegression()
linear.fit(X_train,y_train)
y_pred=linear.predict(X_test)

In [25]:
pred_data=pd.DataFrame({"Actual_value":y_test,"predicted_value":y_pred,"Difference":y_test-y_pred})
pred_data

Unnamed: 0,Actual_value,predicted_value,Difference
521,91,76.387970,14.612030
737,53,58.885970,-5.885970
740,80,76.990265,3.009735
660,74,76.851804,-2.851804
411,84,87.627378,-3.627378
...,...,...,...
408,52,43.409149,8.590851
332,62,62.152214,-0.152214
208,74,67.888395,6.111605
613,65,67.022287,-2.022287
