In [29]:
import pandas as pd
import numpy as np

In [30]:
# Location of the raw dataset. NOTE(review): this is an absolute, machine-specific
# path; the notebook will not run elsewhere until it is made relative/configurable.
DATA_PATH = r'D:\DS_Work\Projects\Student Performance\notebooks\data\Student_Performance.csv'

# Load the full dataset into memory and preview the first five rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [31]:
# Split the frame into the feature matrix X and the regression target y.
TARGET_COLUMN = 'Performance Index'

# 'Unnamed: 0' is the row index that was written into the CSV when it was saved
# (see the df.head() preview). It carries no information about the student, so it
# must not be used as a predictor. errors='ignore' keeps this cell working when
# the CSV is re-exported without that column.
X = (
    df
    .drop(columns=[TARGET_COLUMN])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Kept as a single-column DataFrame (double brackets), as in the original cell.
y = df[[TARGET_COLUMN]]

In [32]:
# Partition the feature columns by dtype so each group can get its own
# preprocessing pipeline.
#
# The check is "numeric vs. everything else" rather than "object vs. everything
# else": a column stored as `category` or as a pandas string dtype is not
# object-dtype, and would otherwise be routed to the numerical pipeline where
# median imputation cannot handle it.
numeric_mask = np.array(
    [pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes],
    dtype=bool,
)

categorical_columns = X.columns[~numeric_mask]
numerical_columns = X.columns[numeric_mask]

In [33]:
## For Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## For Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [34]:
## Numerical Pipeline
# Fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

## Categorical Pipeline
# Fill missing values with the most frequent category, one-hot encode into a
# dense array (categories unseen during fit are ignored rather than raising),
# then standardise the resulting indicator columns.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its pipeline; the transformer names below become
# the prefixes of the names returned by get_feature_names_out().
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)

In [35]:
## Train Test Split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every downstream score) reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=786)
X_train, X_test, y_train, y_test = split_parts

In [36]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split so no test statistics leak into the fit.
#
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so it is not re-runnable on its own; re-run the train/test split cell first.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Resolve the output column names once, after fitting.
feature_names = preprocessor.get_feature_names_out()

# Keep the original row labels: y_train / y_test still carry the shuffled index
# from the split, and a fresh RangeIndex here would silently misalign any later
# label-based operation (concat, join, arithmetic) between X and y.
X_train = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)

In [38]:
## Model Training

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score

In [49]:
## Training Multiple Models

# Candidate regressors, all with their default hyper-parameters.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
}

# Maps model name -> R^2 score on the held-out split.
report = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    ## Make Predictions
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)

    report[model_name] = score

# Name of the highest-scoring model; on a tie, max() returns the first such key
# in insertion order, matching the previous "first match" behaviour.
# NOTE(review): the winner is chosen on the same split that is used to report its
# score, so that score is an optimistic estimate; consider cross-validation or a
# separate validation split for model selection.
best_model = max(report, key=report.get)

In [50]:
# Show the name (a key of `report`) of the model with the highest R^2 score.
best_model

'LinearRegression'