<a href="https://colab.research.google.com/github/Foysal348/Compare-Regression-model-Performance/blob/main/Five_Regression_Model_Performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [113]:
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor , StackingRegressor,RandomForestRegressor,GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
# Ignore every warning raised from here on (presumably to keep cell output tidy).
# NOTE(review): with no category/module argument this filter hides all warnings,
# including deprecation notices — consider narrowing it.
warnings.filterwarnings("ignore")


In [114]:
# Location of the student-performance CSV inside the Colab sample-data folder.
DATA_PATH = "/content/sample_data/bangladesh_student_performance.csv"

# Read the raw data and show the frame as the cell output.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,date,gender,age,address,famsize,Pstatus,M_Edu,F_Edu,M_Job,F_Job,relationship,smoker,tuition_fee,time_friends,ssc_result,hsc_result
0,29/04/2018,M,18,Rural,GT3,Together,3,2,At_home,Farmer,No,No,71672,4,4.22,3.72
1,29/04/2018,F,19,Rural,LE3,Apart,0,4,Other,Health,Yes,No,26085,5,3.47,2.62
2,29/04/2018,F,19,Rural,GT3,Together,0,3,Teacher,Services,No,No,40891,3,3.32,2.56
3,29/04/2018,F,19,Rural,LE3,Apart,2,3,At_home,Business,No,No,50600,2,4.57,4.17
4,29/04/2018,M,17,Rural,GT3,Together,1,1,At_home,Farmer,No,No,62458,2,4.50,3.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,29/04/2018,M,18,Urban,GT3,Together,2,2,Teacher,Health,No,No,108426,2,3.57,3.18
2014,29/04/2018,M,19,Urban,GT3,Together,0,3,Other,Teacher,Yes,No,100665,5,4.05,3.13
2015,29/04/2018,M,18,Urban,GT3,Together,3,2,Services,Farmer,Yes,No,92291,1,3.93,3.15
2016,29/04/2018,M,19,Rural,GT3,Together,0,2,At_home,Teacher,No,No,43926,3,3.34,2.83


In [115]:
!pip install ydata-profiling



In [116]:
from ydata_profiling import ProfileReport

# Build a profiling report of the loaded frame and write it to an HTML file
# in the current working directory.
profile = ProfileReport(
    df,
    title="Bangladeshi Student Performance Report",
    explorative=True,
)
profile.to_file("YDataProfiling.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/16 [00:00<?, ?it/s][A
 19%|█▉        | 3/16 [00:00<00:00, 22.96it/s][A
 38%|███▊      | 6/16 [00:00<00:00, 18.68it/s][A
 56%|█████▋    | 9/16 [00:00<00:00, 21.29it/s][A
100%|██████████| 16/16 [00:00<00:00, 26.17it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [117]:
# Show the column labels of the loaded frame.
df.columns

Index(['date', 'gender', 'age', 'address', 'famsize', 'Pstatus', 'M_Edu',
       'F_Edu', 'M_Job', 'F_Job', 'relationship', 'smoker', 'tuition_fee',
       'time_friends', 'ssc_result', 'hsc_result'],
      dtype='object')

In [118]:
# Remove the `date` column before modelling. Reassigning the result instead of
# using inplace=True, together with errors="ignore", keeps this cell safe to
# re-run: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns=["date"], errors="ignore")
df.columns

Index(['gender', 'age', 'address', 'famsize', 'Pstatus', 'M_Edu', 'F_Edu',
       'M_Job', 'F_Job', 'relationship', 'smoker', 'tuition_fee',
       'time_friends', 'ssc_result', 'hsc_result'],
      dtype='object')

## Correlation for Numerical Columns

In [119]:
# Pairwise correlations among the numeric columns only, reduced to the
# `ssc_result` column and ordered from strongest positive to most negative.
# NOTE(review): the target used for modelling further down is `hsc_result`;
# confirm that `ssc_result` is the intended reference column here.
numeric_columns = df.select_dtypes(include=np.number)
correlation_matrix = numeric_columns.corr()
cor = correlation_matrix["ssc_result"].sort_values(ascending=False)
cor

Unnamed: 0,ssc_result
ssc_result,1.0
hsc_result,0.950178
tuition_fee,0.013987
age,-0.008862
M_Edu,-0.017275
F_Edu,-0.023337
time_friends,-0.028549


In [120]:
# Target is the `hsc_result` column; every remaining column is a predictor.
y = df['hsc_result']
X = df.drop(columns=['hsc_result'])

## Pipeline

In [127]:
# Column groups, split by dtype: numeric columns vs. everything else.
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(exclude=np.number).columns

# Numeric columns: median imputation followed by standard scaling.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Non-numeric columns: most-frequent imputation followed by one-hot encoding.
Categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its matching sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', Categorical_transformer, categorical_features),
])

In [128]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. `train_test_split` is imported with the other scikit-learn
# names in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



## Ensemble Learning

In [129]:
# Base learners: a linear model plus two tree ensembles of 100 estimators each,
# both seeded with the same random_state.
reg_lr = LinearRegression()
reg_rf = RandomForestRegressor(n_estimators=100, random_state=42)
reg_gb = GradientBoostingRegressor(n_estimators=100, random_state=42)

In [124]:
# Voting ensemble built from the three base learners, each under a short label.
voting_estimators = [('lr', reg_lr), ('rf', reg_rf), ('gb', reg_gb)]
voting_reg = VotingRegressor(estimators=voting_estimators)

In [130]:
# Stacking ensemble: the three base learners feed a Ridge final estimator
# (the meta learner).
stacking_estimators = [('lr', reg_lr), ('rf', reg_rf), ('gb', reg_gb)]
meta_learner = Ridge()
stacking_reg = StackingRegressor(
    estimators=stacking_estimators,
    final_estimator=meta_learner,
)

## Model Training

In [133]:
# Candidate estimators, keyed by the label shown in the results table.
models = {
    "linear Regression": reg_lr,
    "Random Forest": reg_rf,
    "Gradient Boosting": reg_gb,
    "Voting Regressor": voting_reg,
    "Stacking Regressor": stacking_reg,
}

results = []
for name, model in models.items():
    # Chain the shared preprocessor with the current estimator and train it
    # on the training split.
    pipe = Pipeline([("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)

    # Predict on the held-out split and score the predictions.
    y_pred = pipe.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append(dict(model=name, mse=mse, mae=mae, r2=r2))

# One row per model, in the order the models were evaluated.
df_results = pd.DataFrame(results)
print(df_results)

                model       mse       mae        r2
0   linear Regression  0.020269  0.111376  0.945920
1       Random Forest  0.018647  0.108201  0.950248
2   Gradient Boosting  0.015155  0.098902  0.959565
3    Voting Regressor  0.015919  0.100838  0.957528
4  Stacking Regressor  0.015215  0.098687  0.959405
