In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso 
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor 
from catboost import CatBoostRegressor
import warnings
 

In [2]:
df =  pd.read_csv('Data/cleaned_data.csv')
df.head()

Unnamed: 0,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,gender,math_score,reading_score,writing_score,Total_score,Avrage
0,group B,bachelor's degree,standard,none,female,72,72,74,218,72.67
1,group C,some college,standard,completed,female,69,90,88,247,82.33
2,group B,master's degree,standard,none,female,90,95,93,278,92.67
3,group A,associate's degree,free/reduced,none,male,47,57,44,148,49.33
4,group C,some college,standard,none,male,76,78,75,229,76.33


In [3]:
df.iloc[:,-1]

0      72.67
1      82.33
2      92.67
3      49.33
4      76.33
       ...  
995    94.00
996    57.33
997    65.00
998    74.33
999    83.00
Name: Avrage, Length: 1000, dtype: float64

**Preparing X And Y variabls** 

In [23]:
X = df.drop(columns=['Avrage','Total_score','math_score'],axis=1)
X.sample(5)

Unnamed: 0,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,gender,reading_score,writing_score
257,group C,associate's degree,standard,completed,male,77,77
13,group A,some college,standard,completed,male,72,70
578,group B,some college,free/reduced,completed,female,56,58
703,group D,some college,standard,none,female,64,67
840,group D,high school,free/reduced,none,female,52,46


In [24]:
X.shape

(1000, 7)

In [25]:
y =  df['math_score']
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [26]:
print('Categories in gender variable:',end=" ")
print(df['gender'].unique())

print('Categories in race/Ethnicity variable:',end=" ")
print(df['race_ethnicity'].unique())
print('')
print('Categories in paretale level of education variable:',end=" ")
print(df['parental_level_of_education'].unique())
print('')
print('Categories in lunch variable:',end=" ")
print(df['lunch'].unique())
print('Categories in test preparation course variable:',end=" ")
print(df['test_preparation_course'].unique())


Categories in gender variable: ['female' 'male']
Categories in race/Ethnicity variable: ['group B' 'group C' 'group A' 'group D' 'group E']

Categories in paretale level of education variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']

Categories in lunch variable: ['standard' 'free/reduced']
Categories in test preparation course variable: ['none' 'completed']


**standradtion and encoding**

In [27]:
num_features = X.select_dtypes(exclude='O').columns
num_features

Index(['reading_score', 'writing_score'], dtype='object')

In [28]:
catfeatures = X.select_dtypes(include='O').columns
catfeatures

Index(['race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'gender'],
      dtype='object')

In [29]:
#using ColumnTransformer for apply multiple transforms on multiplle column 
#oneHotencoder for categorical features m and  standerdScaeler for numric features  
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
ohe = OneHotEncoder()

preprocesseur =ColumnTransformer(

    [   ("OneHotEncoder",ohe,catfeatures),
        ('StandrScaler',numeric_transformer,num_features),
    
    ]
)

In [30]:
X =preprocesseur.fit_transform(X)

In [31]:
X

array([[ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.19399858,  0.39149181],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         1.77010859,  1.64247471],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.15336989,  1.18158627]])

In [32]:
X.shape

(1000, 19)

**Split th Data into train and test**

In [33]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape,X_test.shape,

((800, 19), (200, 19))

**Aply multiple models**

In [34]:
models = {
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'K-Neighbors Regressor':KNeighborsRegressor(),
    'XGB Regressor':XGBRegressor(),
    'CatBoost Regressor':CatBoostRegressor(verbose=False),
    'Decision Tree Regressor:':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'Support Vector machine':SVR()
}

'Ridge'

In [16]:
def Model_evaluate(true_val,predicted_val): 
    MAE = mean_absolute_error(true_val,predicted_val)
    MSE = mean_squared_error(true_val,predicted_val)
    rmse = np.sqrt(mean_squared_error(true_val,predicted_val))
    r2score = r2_score(true_val,predicted_val)

    return MAE,r2score,MSE,rmse

In [51]:
model_list =[]
r2_score_list = [] 
report ={}
for mod in range (len(list(models))):
    model = list(models.values())[mod]
    model.fit(X_train,y_train)
    
    #make predictions: 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    #Evaluate model 
    model_train_MAE, model_train_r2score, model_train_MSE,model_train_RMSE  = Model_evaluate(y_train,y_train_pred)    
    model_test_MAE, model_test_r2score, model_test_MSE,model_test_RMSE = Model_evaluate(y_test,y_test_pred) 

    
    report[list(models.keys())[mod]]= model_test_r2score
    print(report)


    #Print information 

    print('\033[1m'+list(models.keys())[mod]+'\033[0m') 
    model_list.append(list(models.keys())[mod])
    print(model_list)
    print('MODEL PERFORMANCE FOR TRAINING SET')
    print("--Mean Squared Error: {:.4f}".format(model_train_MSE))
    print("--Root Mean Squared Error: {:.4f}".format(model_train_RMSE))
    print("--Mean Absolute Error: {:.4f}".format(model_train_MAE))
    print("--R2 Score: {:.4f}".format(model_train_r2score)) 

    print("----------------------------------------------------------")

    print('MODEL PERFORMANCE FOR Test SET')
    print("--Mean Squared Error: {:.4f}".format(model_test_MSE))
    print("--Root Mean Squared Error: {:.4f}".format(model_test_RMSE))
    print("--Mean Absolute Error: {:.4f}".format(model_test_MAE))
    print("--R2 Score: {:.4f}".format(model_test_r2score)) 

    r2_score_list.append(model_test_r2score)
    

    print("="*30)
    print('\n')     

{'LinearRegression': 0.8787434815569594}
[1mLinearRegression[0m
['LinearRegression']
MODEL PERFORMANCE FOR TRAINING SET
--Mean Squared Error: 28.4750
--Root Mean Squared Error: 5.3362
--Mean Absolute Error: 4.2762
--R2 Score: 0.8737
----------------------------------------------------------
MODEL PERFORMANCE FOR Test SET
--Mean Squared Error: 29.5064
--Root Mean Squared Error: 5.4320
--Mean Absolute Error: 4.2450
--R2 Score: 0.8787


{'Ridge': 0.8805931485028738}
[1mRidge[0m
['LinearRegression', 'Ridge']
MODEL PERFORMANCE FOR TRAINING SET
--Mean Squared Error: 28.3378
--Root Mean Squared Error: 5.3233
--Mean Absolute Error: 4.2650
--R2 Score: 0.8743
----------------------------------------------------------
MODEL PERFORMANCE FOR Test SET
--Mean Squared Error: 29.0563
--Root Mean Squared Error: 5.3904
--Mean Absolute Error: 4.2111
--R2 Score: 0.8806


{'Lasso': 0.825319744107968}
[1mLasso[0m
['LinearRegression', 'Ridge', 'Lasso']
MODEL PERFORMANCE FOR TRAINING SET
--Mean Squared E

In [50]:
report

{'Support Vector machine': 0.7286001513223705}

AttributeError: 'list' object has no attribute 'values'

# **Results**

In [19]:
results = pd.DataFrame(list(zip(model_list,r2_score_list)),columns=['MODEL','R2 Score']).round(2).sort_values(by=['R2 Score'],ascending=False)
results

Unnamed: 0,MODEL,R2 Score
0,LinearRegression,1.0
1,Ridge,1.0
2,Lasso,0.99
4,XGB Regressor,0.99
5,CatBoost Regressor,0.99
6,Decision Tree Regressor:,0.99
7,RandomForestRegressor,0.99
3,K-Neighbors Regressor,0.96


* the Right model:</br>
**Linear rigression**

In [20]:
lin_model = LinearRegression(fit_intercept=True)
lin_model.fit(X_train,y_train)
y_pred =lin_model.predict(X_test)
score =r2_score(y_test,y_pred)*100
print('Accuracy of the model is: %.1f'%score) 


Accuracy of the model is: 100.0


**Predicted values and the true values** 


In [21]:
show_values = pd.DataFrame({"Actuale Values":y_test,"Predicted Values":y_pred,"Diffrance":(y_test-y_pred)})
show_values

Unnamed: 0,Actuale Values,Predicted Values,Diffrance
521,87.00,87.000267,-0.000267
737,64.00,63.999725,0.000275
740,75.00,75.000526,-0.000526
660,74.67,74.665718,0.004282
411,81.67,81.665749,0.004251
...,...,...,...
408,55.00,55.001152,-0.001152
332,57.00,56.999817,0.000183
208,77.00,76.999725,0.000275
613,72.00,71.999794,0.000206
