In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import RandomizedSearchCV

In [27]:
data = pd.read_csv(r"D:\P1 Anna\python_ml_end_to_end\notebook\data\data.csv")
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [29]:
X = data.drop("math_score", axis=1)
y = data["math_score"]

In [31]:
numeric_col = [i for i in X.columns if X[i].dtype!="object"]
catergorical_col = [i for i in X.columns if X[i].dtype =="object"]
print(numeric_col)
print(catergorical_col)

['reading_score', 'writing_score']
['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [32]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, catergorical_col),
         ("StandardScaler", numeric_transformer, numeric_col),        
    ]
)

In [33]:
X = preprocessor.fit_transform(X)

In [35]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [36]:
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True,random_state=42)

In [48]:
models = {
    "Linear Regression":LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge": Ridge(),
    "KNN": KNeighborsRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor()

}


res = {}
for name, model in models.items():
    cross_val_res = cross_val_score(model, X, y, cv = kf, scoring="r2")

    print("Cross-Validation R² Scores:", cross_val_res)
    print("Mean R² Score:", cross_val_res.mean())
    res[name]= cross_val_res.mean()

Cross-Validation R² Scores: [0.87597767 0.86169496 0.88101988 0.85340536 0.87859861]
Mean R² Score: 0.8701392965645324
Cross-Validation R² Scores: [0.82531973 0.78962274 0.82811    0.79157463 0.81585015]
Mean R² Score: 0.8100954502273439
Cross-Validation R² Scores: [0.88059315 0.86157924 0.88115774 0.85567702 0.87960879]
Mean R² Score: 0.8717231882943401
Cross-Validation R² Scores: [0.78119113 0.78069478 0.78865767 0.74550039 0.76607985]
Mean R² Score: 0.7724247667788523
Cross-Validation R² Scores: [0.85142605 0.8340484  0.84152594 0.82138706 0.83589075]
Mean R² Score: 0.8368556407040059
Cross-Validation R² Scores: [0.74186154 0.74266948 0.73487273 0.6880644  0.73713702]
Mean R² Score: 0.7289210346617347


In [50]:
res

{'Linear Regression': 0.8701392965645324,
 'Lasso': 0.8100954502273439,
 'Ridge': 0.8717231882943401,
 'KNN': 0.7724247667788523,
 'Random Forest': 0.8368556407040059,
 'Decision Tree': 0.7289210346617347}

In [51]:
result = pd.DataFrame(list(res.items()), columns=["Model", "Result-R2 Score"])
result

Unnamed: 0,Model,Result-R2 Score
0,Linear Regression,0.870139
1,Lasso,0.810095
2,Ridge,0.871723
3,KNN,0.772425
4,Random Forest,0.836856
5,Decision Tree,0.728921
