In [None]:
### ------------------------------------------- PART A : Grade Prediction Model ------------------------------------------- 

In [1]:
### 1) Download the DataSet:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nikhil7280/student-performance-multiple-linear-regression")

print("Path to dataset files:", path)

Path to dataset files: /Users/romainkuhne/.cache/kagglehub/datasets/nikhil7280/student-performance-multiple-linear-regression/versions/1


In [37]:
### 2) Get the downloaded DataSet and store it into a pd.DataFrame:
import os
import pandas as pd
import numpy as np
# Path to dataset folder
dataset_path = "/Users/romainkuhne/.cache/kagglehub/datasets/nikhil7280/student-performance-multiple-linear-regression/versions/1"

# Find the CSV file (assuming only one CSV in the folder)
for file_name in os.listdir(dataset_path):
    if file_name.endswith(".csv"):
        csv_file = os.path.join(dataset_path, file_name)
        break

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Display the DataFrame
print(df)

      Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0                 7               99                        Yes            9   
1                 4               82                         No            4   
2                 8               51                        Yes            7   
3                 5               52                        Yes            5   
4                 7               75                         No            8   
...             ...              ...                        ...          ...   
9995              1               49                        Yes            4   
9996              7               64                        Yes            8   
9997              6               83                        Yes            8   
9998              9               97                        Yes            7   
9999              7               74                         No            8   

      Sample Question Papers Practiced 

In [40]:
### Replace yes with 1 and No with 0 in the 'Extracurricular Activities' column:
cleaned_df = df.replace({"Extracurricular Activities": {'Yes':1, 'No':0}})
print(cleaned_df)


      Hours Studied  Previous Scores  Extracurricular Activities  Sleep Hours  \
0                 7               99                           1            9   
1                 4               82                           0            4   
2                 8               51                           1            7   
3                 5               52                           1            5   
4                 7               75                           0            8   
...             ...              ...                         ...          ...   
9995              1               49                           1            4   
9996              7               64                           1            8   
9997              6               83                           1            8   
9998              9               97                           1            7   
9999              7               74                           0            8   

      Sample Question Paper

  cleaned_df = df.replace({"Extracurricular Activities": {'Yes':1, 'No':0}})


In [41]:
### Question 1: Predict the performance index of students using Multiple Linear Regression Model:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import SGDRegressor
import random

async def optimize_model(df: pd.DataFrame):
    # 1) Define the FeatureSet & LabelSet:
    X = df.drop(axis=1, columns=["Performance Index"])
    y = df["Performance Index"]
    # 2) Call helper function to split DS into Training & Test set:
    X_train, X_test, y_train, y_test = await split_dataset_train_test(X, y)
    # 3) Call helper function to normalize featureset in Training & Test set:
    X_train, X_test, scaler = await normalize_feature_set_test_df(X_train, X_test)
    # 4) Call helper function to perform mini-batch SGD:
    optimized_model = await perform_mini_batch_sgd(X_train, y_train)
    # 5) Predict on the Test Set:
    y_pred = optimized_model.predict(X_test)
    # 6) Evaluate the model:
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")
    # 7) Return the optimized Multilinear regression model:
    return optimized_model, scaler
    
### Purpose: Split DataSet into Training(80%) & Validation(20%) to ensure the model will be assessed on unseen data:
async def split_dataset_train_test(X: pd.DataFrame, y:pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, train_size=0.8, random_state=42)
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    return X_train, X_test, y_train, y_test

### Purpose: Normalize the feature set to ensure all features contributes equally to the model's performance:
async def normalize_feature_set_test_df(X_train: pd.DataFrame, X_test: pd.DataFrame):
    scaler = StandardScaler()
    # 1) Calculate the mean & STD of the features in the training set
    # 2) Standardize the features in the training set to ensure each fetures got mean=0 & STD=1
    X_train = scaler.fit_transform(X_train)
    # 3) Apply the same feature normalization to the test set_
    X_test = scaler.transform(X_test)
    return X_train, X_test, scaler
    
### Purpose: Perform mini-batch SGD on training data to find W & biais which optimises the linear regression model:
async def perform_mini_batch_sgd(X_train: pd.DataFrame, y_train: pd.DataFrame):
    # 1) Configure the SGDRegressor object:
    sgd = SGDRegressor(
        loss='squared_error', # Loss (objective) function to use: Default = MSE
        max_iter=1000, # Max nbr of pass over the training data set
        tol=1e-3, # Tolerance for convergence} training will stop when (loss > best_loss - tol)
        shuffle=True, # Shuffle the training dataset after each pass
        learning_rate='adaptive', # Type of learning rate
        eta0=0.01, # Initial learning rate
        random_state=42
    )

    # 2) Perform mini batch SGD:
    batch_size=32
    for _ in range(1000):
        indices = np.random.choice(X_train.shape[0], batch_size, replace=False)
        X_batch = X_train[indices]
        y_batch = y_train.iloc[indices]
        sgd.partial_fit(X_batch, y_batch)
    return sgd


optimized_model, scaler = await optimize_model(cleaned_df)

Mean Squared Error: 4.2835425014274495
R2 Score: 0.9884411372273585


In [43]:
async def predict_grade_performance(model, scaler, prepTime:int, prevScores:int, extraActivities:int, sleepHours:int, nbrPracticeTest:int):
    # 1) Create a dictionary to create a DataFrame:
    data_points = {
        "Hours Studied": [prepTime],
        "Previous Scores": [prevScores],
        "Extracurricular Activities": [extraActivities],
        "Sleep Hours": [sleepHours],
        "Sample Question Papers Practiced": [nbrPracticeTest]
    }
    df_input = pd.DataFrame(data=data_points)
    # 2) Feature Normalization using the pre-trained scaler:
    normalized_inputs = scaler.transform(df_input)
    # 3) Predict the future performance:
    predicted_score = model.predict(normalized_inputs)
    return predicted_score

# Predict using the same scaler
new = await predict_grade_performance(optimized_model, scaler, 3, 50, 1, 5, 1)
print(new)
    

[28.04065178]


In [None]:
### ------------------------------------------- PART B : Diabetes Model ------------------------------------------- 

In [1]:
### 1) Download the CSV DataSet:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
print("Path to dataset files:", path)


Path to dataset files: /Users/romainkuhne/.cache/kagglehub/datasets/alexteboul/diabetes-health-indicators-dataset/versions/1


In [2]:
### 2) Convert the CSV DataSet into a pd.DataFrame:
import os
import pandas as pd

def fetch_dataset(path='/Users/romainkuhne/.cache/kagglehub/datasets/alexteboul/diabetes-health-indicators-dataset/versions/1') -> pd.DataFrame:
    try: # 1) Attempt to Get the dataset present in the path:
        for file_name in os.listdir(path):
            if file_name.endswith('.csv'):
                csv_file = os.path.join(path, file_name)
                break
                
    except Exception as e:
        print(f"Error the filepath contains no dataset: {str(e)}")
        return pd.DataFrame()

    # 2) Convert the CSV file into a pd.DataSet:
    df = pd.read_csv(csv_file)
    return df

df = fetch_dataset()
print(df.head(5))
    

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

In [3]:
### 3) Assess the Pearson correlation of label Y = Diabetes vs other columns:
def evaluate_correlation(df: pd.DataFrame):
    correlation_matrix = df.corr()["Diabetes_012"]
    return correlation_matrix

correlation_matrix = evaluate_correlation(df)
print(correlation_matrix)

Diabetes_012            1.000000
HighBP                  0.271596
HighChol                0.209085
CholCheck               0.067546
BMI                     0.224379
Smoker                  0.062914
Stroke                  0.107179
HeartDiseaseorAttack    0.180272
PhysActivity           -0.121947
Fruits                 -0.042192
Veggies                -0.058972
HvyAlcoholConsump      -0.057882
AnyHealthcare           0.015410
NoDocbcCost             0.035436
GenHlth                 0.302587
MentHlth                0.073507
PhysHlth                0.176287
DiffWalk                0.224239
Sex                     0.031040
Age                     0.185026
Education              -0.130517
Income                 -0.171483
Name: Diabetes_012, dtype: float64


In [10]:
### 4) Convert the whole pd.DataFrame into a Numpy Matrix:
import numpy as np
def convert_df_to_matrix(df: pd.DataFrame) -> np.ndarray:
    return df.to_numpy()

matrix_dataset = convert_df_to_matrix(df)
print(matrix_dataset)

[[0. 1. 1. ... 9. 4. 3.]
 [0. 0. 0. ... 7. 6. 1.]
 [0. 1. 1. ... 9. 4. 8.]
 ...
 [0. 0. 0. ... 2. 5. 2.]
 [0. 1. 0. ... 7. 5. 1.]
 [2. 1. 1. ... 9. 6. 2.]]


In [11]:
### question 1) Otpimize a Multi Linear Regression Model for predicting the diagnosis of diabetes:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import SGDRegressor
import random

async def optimize_multi_linear_regression(dataset: np.ndarray):
    # 1) Define Feature Set (X) & Label Set (y):
    X = dataset[:,1:] # Feature set includes all columns from 1:
    y = dataset[:,0] # Label set only includes column 0:
    # 2) Call helper function to partition the dataset into training set (80%) and validation set (20%):
    X_train, X_test, y_train, y_test = await split_dataset(X,y)
    # 3) Call a helper function to normalize feature set in X_train & X_test:
    X_train, X_test, scaler = await normalize_features(X_train, X_test)
    # 4) Call helper function to tine tune the multi linear regression model using mini batch SGD:
    optimized_multilinear_model = await perform_minibatch_sgd(X_train, y_train)
    # 5) Predict on the validation set:
    y_pred = optimized_multilinear_model.predict(X_test)
    # 6) Evaluate the model:
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"MSE: {mse}")
    print(f"R2: {r2}")
    # 7) Return the optimized machine learning model:
    return optimized_multilinear_model


### Purpose: Split the dataset into a training set (80%) and validation set (20%):
async def split_dataset(X: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)
    return X_train, X_test, y_train, y_test

### Purpose: Normalize Feature Set so that every feature contributes equally to the prediction:
async def normalize_features(X_train: np.ndarray, X_test: np.ndarray):
    scaler = StandardScaler()
    # 1) Compute the mean & STD of each features in the train FeatureSet (X_train):
    # 2) Apply normalization to ensure mean=0 & STD=1 so that each features contributes equally to the prediction:
    X_train = scaler.fit_transform(X_train)
    # 3) Apply the same normalization computed previously to the FeatureSet (X_test):
    X_test = scaler.transform(X_test)
    return X_train, X_test, scaler

### Purpose: Perform Mini Batch Stochastic Gradient Descend:
async def perform_minibatch_sgd(X_train: np.ndarray, y_train: np.ndarray):
    # 1) Configure the SGD Regressor object:
    sgd = SGDRegressor(
        loss='squared_error', # Sets the loss func to MSE:
        max_iter=1000, # Sets the max nbr of iterations:
        tol=0.001, # Sets the stopping criterion =>  training will stop when (loss > best_loss - tol):
        shuffle=True, # Ensures the sample is shuffled back into the DataSet:
        learning_rate='adaptive', # Determine how step size changes
        eta0=0.01, # Sets the initial learning Rate:
        random_state=42
    )
    # 2) Define Mini Batch & Perform optimization to find global minima:
    batch_size = 32
    for _ in range(1000):
        indices = np.random.choice(X_train.shape[0], batch_size, replace=False)
        X_batch = X_train[indices]
        y_batch = y_train[indices]
        sgd.partial_fit(X_batch, y_batch)
    return sgd

optimized_multi_linear_regression = await optimize_multi_linear_regression(matrix_dataset)

    

MSE: 0.474479363008601
R2: 0.023580335460434365


In [12]:
### Conclusion: The Multi Linear Regression Model isn't performing well enough !
### This could be due to the fact that the relationship btw features and label is non-linear