In [1]:
### 1) Download the DataSet:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nikhil7280/student-performance-multiple-linear-regression")

print("Path to dataset files:", path)

Path to dataset files: /Users/romainkuhne/.cache/kagglehub/datasets/nikhil7280/student-performance-multiple-linear-regression/versions/1


In [37]:
### 2) Get the downloaded DataSet and store it into a pd.DataFrame:
import os
import pandas as pd
import numpy as np
# Path to dataset folder
dataset_path = "/Users/romainkuhne/.cache/kagglehub/datasets/nikhil7280/student-performance-multiple-linear-regression/versions/1"

# Find the CSV file (assuming only one CSV in the folder)
for file_name in os.listdir(dataset_path):
    if file_name.endswith(".csv"):
        csv_file = os.path.join(dataset_path, file_name)
        break

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Display the DataFrame
print(df)

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0                 7               99                        Yes            9   
1                 4               82                         No            4   
2                 8               51                        Yes            7   
3                 5               52                        Yes            5   
4                 7               75                         No            8   
...             ...              ...                        ...          ...   
9995              1               49                        Yes            4   
9996              7               64                        Yes            8   
9997              6               83                        Yes            8   
9998              9               97                        Yes            7   
9999              7               74                         No            8   

      Sample Question Papers Practiced 

In [40]:
### Replace yes with 1 and No with 0 in the 'Extracurricular Activities' column:
cleaned_df = df.replace({"Extracurricular Activities": {'Yes':1, 'No':0}})
print(cleaned_df)


      Hours Studied  Previous Scores  Extracurricular Activities  Sleep Hours  \
0                 7               99                           1            9   
1                 4               82                           0            4   
2                 8               51                           1            7   
3                 5               52                           1            5   
4                 7               75                           0            8   
...             ...              ...                         ...          ...   
9995              1               49                           1            4   
9996              7               64                           1            8   
9997              6               83                           1            8   
9998              9               97                           1            7   
9999              7               74                           0            8   

      Sample Question Paper

  cleaned_df = df.replace({"Extracurricular Activities": {'Yes':1, 'No':0}})


In [41]:
### Question 1: Predict the performance index of students using Multiple Linear Regression Model:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import SGDRegressor
import random

async def optimize_model(df: pd.DataFrame):
    # 1) Define the FeatureSet & LabelSet:
    X = df.drop(axis=1, columns=["Performance Index"])
    y = df["Performance Index"]
    # 2) Call helper function to split DS into Training & Test set:
    X_train, X_test, y_train, y_test = await split_dataset_train_test(X, y)
    # 3) Call helper function to normalize featureset in Training & Test set:
    X_train, X_test, scaler = await normalize_feature_set_test_df(X_train, X_test)
    # 4) Call helper function to perform mini-batch SGD:
    optimized_model = await perform_mini_batch_sgd(X_train, y_train)
    # 5) Predict on the Test Set:
    y_pred = optimized_model.predict(X_test)
    # 6) Evaluate the model:
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")
    # 7) Return the optimized Multilinear regression model:
    return optimized_model, scaler
    
### Purpose: Split DataSet into Training(80%) & Validation(20%) to ensure the model will be assessed on unseen data:
async def split_dataset_train_test(X: pd.DataFrame, y:pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, train_size=0.8, random_state=42)
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    return X_train, X_test, y_train, y_test

### Purpose: Normalize the feature set to ensure all features contributes equally to the model's performance:
async def normalize_feature_set_test_df(X_train: pd.DataFrame, X_test: pd.DataFrame):
    scaler = StandardScaler()
    # 1) Calculate the mean & STD of the features in the training set
    # 2) Standardize the features in the training set to ensure each fetures got mean=0 & STD=1
    X_train = scaler.fit_transform(X_train)
    # 3) Apply the same feature normalization to the test set_
    X_test = scaler.transform(X_test)
    return X_train, X_test, scaler
    
### Purpose: Perform mini-batch SGD on training data to find W & biais which optimises the linear regression model:
async def perform_mini_batch_sgd(X_train: pd.DataFrame, y_train: pd.DataFrame):
    # 1) Configure the SGDRegressor object:
    sgd = SGDRegressor(
        loss='squared_error', # Loss (objective) function to use: Default = MSE
        max_iter=1000, # Max nbr of pass over the training data set
        tol=1e-3, # Tolerance for convergence} training will stop when (loss > best_loss - tol)
        shuffle=True, # Shuffle the training dataset after each pass
        learning_rate='adaptive', # Type of learning rate
        eta0=0.01, # Initial learning rate
        random_state=42
    )

    # 2) Perform mini batch SGD:
    batch_size=32
    for _ in range(1000):
        indices = np.random.choice(X_train.shape[0], batch_size, replace=False)
        X_batch = X_train[indices]
        y_batch = y_train.iloc[indices]
        sgd.partial_fit(X_batch, y_batch)
    return sgd


optimized_model, scaler = await optimize_model(cleaned_df)

Mean Squared Error: 4.2835425014274495
R2 Score: 0.9884411372273585


In [43]:
async def predict_grade_performance(model, scaler, prepTime:int, prevScores:int, extraActivities:int, sleepHours:int, nbrPracticeTest:int):
    # 1) Create a dictionary to create a DataFrame:
    data_points = {
        "Hours Studied": [prepTime],
        "Previous Scores": [prevScores],
        "Extracurricular Activities": [extraActivities],
        "Sleep Hours": [sleepHours],
        "Sample Question Papers Practiced": [nbrPracticeTest]
    }
    df_input = pd.DataFrame(data=data_points)
    # 2) Feature Normalization using the pre-trained scaler:
    normalized_inputs = scaler.transform(df_input)
    # 3) Predict the future performance:
    predicted_score = model.predict(normalized_inputs)
    return predicted_score

# Predict using the same scaler
new = await predict_grade_performance(optimized_model, scaler, 3, 50, 1, 5, 1)
print(new)
    

[28.04065178]
