### STEP 1: IMPORTING NECESSARY LIBRARIES 

In [2]:
# Loading essential libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning Pipeline using Scikit-learn:

import sklearn as sc

  # Data Loading & Splitting  
from sklearn.model_selection  import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

  # Preprocessing (Encoding, Scaling, Pipelines)  
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

  # Model Training (Logistic Regression, SVM, Random Forest)  
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier

  # Evaluation (R² Score, MSE, MAE, Cross-Validation)
from sklearn.metrics import (r2_score, mean_squared_error,mean_absolute_error)


# To remove the warnings
import warnings
warnings.filterwarnings("ignore")

### STEP 2:  LOADING DATASET

In [4]:
s = pd.read_excel(r'C:\Users\Dell\Desktop\Student Performance Factors.xlsx')
s

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23.0,84.0,Low,High,No,7.0,73.0,Low,Yes,0.0,Low,Medium,Public,Positive,3.0,No,High School,Near,Male,67
1,19.0,64.0,Low,Medium,No,8.0,59.0,Low,Yes,2.0,Medium,Medium,Public,Negative,4.0,No,College,Moderate,Female,61
2,24.0,98.0,Medium,Medium,Yes,7.0,91.0,Medium,Yes,2.0,Medium,Medium,Public,Neutral,4.0,No,Postgraduate,Near,Male,74
3,29.0,89.0,Low,Medium,Yes,8.0,98.0,Medium,Yes,1.0,Medium,Medium,Public,Negative,4.0,No,High School,Moderate,Male,71
4,19.0,92.0,Medium,Medium,Yes,6.0,65.0,Medium,Yes,3.0,Medium,High,Public,Neutral,4.0,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6603,23.0,76.0,High,Medium,No,8.0,81.0,Medium,Yes,3.0,Low,High,Public,Positive,2.0,No,High School,Near,Female,69
6604,20.0,90.0,Medium,Low,Yes,6.0,65.0,Low,Yes,3.0,Low,Medium,Public,Negative,2.0,No,Postgraduate,Near,Female,68
6605,10.0,86.0,High,High,Yes,6.0,91.0,High,Yes,2.0,Low,Medium,Private,Positive,3.0,No,High School,Far,Female,68
6606,15.0,67.0,Medium,Low,Yes,9.0,94.0,Medium,Yes,0.0,Medium,Medium,Public,Positive,4.0,No,Postgraduate,Near,Male,64


### STEP 3: KNOW ABOUT YOUR DATASET / DATA CLEANING

In [6]:
s.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender',
       'Exam_Score'],
      dtype='object')

In [7]:
s.describe()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6608.0
mean,19.975329,79.977448,7.02906,75.070531,1.493719,2.96761,95.647851
std,5.990594,11.547475,1.46812,14.399784,1.23057,1.031231,2309.619157
min,1.0,60.0,4.0,50.0,0.0,0.0,55.0
25%,16.0,70.0,6.0,63.0,1.0,2.0,65.0
50%,20.0,80.0,7.0,75.0,1.0,3.0,67.0
75%,24.0,90.0,8.0,88.0,2.0,4.0,69.0
max,44.0,100.0,10.0,100.0,8.0,6.0,187815.0


In [8]:
s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6608 entries, 0 to 6607
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Hours_Studied               6607 non-null   float64
 1   Attendance                  6607 non-null   float64
 2   Parental_Involvement        6607 non-null   object 
 3   Access_to_Resources         6607 non-null   object 
 4   Extracurricular_Activities  6607 non-null   object 
 5   Sleep_Hours                 6607 non-null   float64
 6   Previous_Scores             6607 non-null   float64
 7   Motivation_Level            6607 non-null   object 
 8   Internet_Access             6607 non-null   object 
 9   Tutoring_Sessions           6607 non-null   float64
 10  Family_Income               6607 non-null   object 
 11  Teacher_Quality             6529 non-null   object 
 12  School_Type                 6607 non-null   object 
 13  Peer_Influence              6607 

In [9]:
s.isnull().sum()

Hours_Studied                  1
Attendance                     1
Parental_Involvement           1
Access_to_Resources            1
Extracurricular_Activities     1
Sleep_Hours                    1
Previous_Scores                1
Motivation_Level               1
Internet_Access                1
Tutoring_Sessions              1
Family_Income                  1
Teacher_Quality               79
School_Type                    1
Peer_Influence                 1
Physical_Activity              1
Learning_Disabilities          1
Parental_Education_Level      91
Distance_from_Home            68
Gender                         1
Exam_Score                     0
dtype: int64

## Filling The Missing Values

In [11]:
num_var= s.iloc[: ,[0,1,5,6,9,14]].mean()

s.fillna(num_var,inplace= True)

In [12]:
cat_var= s.iloc[: ,[2,3,4,7,8,10,11,12,13,15,16,17,18]].mode().iloc[0]

s.fillna(cat_var,inplace= True)

In [13]:
s.isnull().sum()

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
Exam_Score                    0
dtype: int64

### STEP 3: ASSIGN INDEPENDENT AND DEPENDENT VARIABLE

In [15]:
# INDEPENDENT VARIABLE (FEATURES)

x= s.iloc[:, :-1].values
x

array([[23.0, 84.0, 'Low', ..., 'High School', 'Near', 'Male'],
       [19.0, 64.0, 'Low', ..., 'College', 'Moderate', 'Female'],
       [24.0, 98.0, 'Medium', ..., 'Postgraduate', 'Near', 'Male'],
       ...,
       [10.0, 86.0, 'High', ..., 'High School', 'Far', 'Female'],
       [15.0, 67.0, 'Medium', ..., 'Postgraduate', 'Near', 'Male'],
       [19.975329196306948, 79.97744816104132, 'Medium', ...,
        'High School', 'Near', 'Male']], dtype=object)

In [16]:
# DEPENDENT VARIABLE (TARGET)

y = s.iloc[:,-1]
y

0           67
1           61
2           74
3           71
4           70
         ...  
6603        69
6604        68
6605        68
6606        64
6607    187815
Name: Exam_Score, Length: 6608, dtype: int64

### STEP 4 : Encoding

In [18]:
num_var= [0,1,5,6,9,14]

cat_var= [2,3,4,7,8,10,11,12,13,15,16,17,18]


preprocess = ColumnTransformer(
    transformers=[
        ("num_var", StandardScaler(), num_var),
        ("cat_var", OneHotEncoder(handle_unknown="ignore"), cat_var),
    ],
    remainder="drop"        # drop anything which is not in the list
)

### STEP 5 : SPLITTING THE DATASET

In [20]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.3 , random_state = 42)

In [21]:
x_train

array([[24.0, 73.0, 'High', ..., 'Postgraduate', 'Near', 'Female'],
       [28.0, 83.0, 'High', ..., 'High School', 'Moderate', 'Male'],
       [31.0, 67.0, 'Low', ..., 'College', 'Near', 'Male'],
       ...,
       [17.0, 64.0, 'High', ..., 'High School', 'Moderate', 'Female'],
       [16.0, 100.0, 'High', ..., 'High School', 'Near', 'Male'],
       [21.0, 62.0, 'Medium', ..., 'Postgraduate', 'Far', 'Male']],
      dtype=object)

In [22]:
x_test

array([[16.0, 100.0, 'High', ..., 'High School', 'Moderate', 'Male'],
       [16.0, 95.0, 'Medium', ..., 'High School', 'Near', 'Female'],
       [22.0, 79.0, 'Low', ..., 'College', 'Moderate', 'Male'],
       ...,
       [16.0, 84.0, 'Low', ..., 'High School', 'Near', 'Female'],
       [17.0, 82.0, 'High', ..., 'High School', 'Near', 'Female'],
       [14.0, 91.0, 'Medium', ..., 'High School', 'Moderate', 'Male']],
      dtype=object)

In [23]:
y_train

1718    68
1951    68
1379    70
6134    63
6496    69
        ..
3772    66
5191    65
5226    62
5390    73
860     62
Name: Exam_Score, Length: 4625, dtype: int64

In [24]:
y_test

230     69
2540    66
1385    68
1544    62
3692    64
        ..
2892    75
1465    65
1258    65
2029    66
6455    70
Name: Exam_Score, Length: 1983, dtype: int64

### STEP 6 :CREATE AND FIT THE MODEL

In [26]:
# Logistic Regression pipeline
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=0))
])
lr_pipeline.fit(x_train, y_train)

In [27]:
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('classifier', SVC(probability=True, random_state=0))  # Support Vector Classification (SVC)
])
svm_pipeline.fit(x_train, y_train)

In [28]:
# Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('classifier', RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42))
])
rf_pipeline.fit(x_train, y_train)

### STEP 7 :FEATURE SCALING

#### Using Logistic Regression (lr) and Random Forest (rf)

In [31]:
models = [lr_pipeline, rf_pipeline]
name = ["Support Vector Machine", "Random Forest"]

for model, name in zip(models, name):
    print(f"\n===== {name} =====")

    # Using the full test set
    y_prob = model.predict_proba(x_test)[:, 1]  # Get the predicted probability that each test sample belongs to the positive class (class 1).
    
    # Predict
    y_pred = model.predict(x_test)
    
    print("R² Score:", r2_score(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("MAE:", mean_absolute_error(y_test, y_pred))


===== Support Vector Machine =====
R² Score: -23495048.00169222
MSE: 337740084.3787191
MAE: 1803.0292486132123

===== Random Forest =====
R² Score: 0.4418624868083869
MSE: 8.023197175995966
MAE: 1.756933938477055


#### Using Support Vector Machine (SVM) and Random Forest (rf)

In [33]:
models = [svm_pipeline, rf_pipeline]
name = ["Support Vector Machine", "Random Forest"]

for model, name in zip(models, name):
    print(f"\n===== {name} =====")

    # Using the full test set
    y_prob = model.predict_proba(x_test)[:, 1]  # Get the predicted probability that each test sample belongs to the positive class (class 1).
    
    # Predict
    y_pred = model.predict(x_test)
    
    print("R² Score:", r2_score(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("MAE:", mean_absolute_error(y_test, y_pred))


===== Support Vector Machine =====
R² Score: 0.7184054168202968
MSE: 4.047907211296016
MAE: 0.7216338880484114

===== Random Forest =====
R² Score: 0.4418624868083869
MSE: 8.023197175995966
MAE: 1.756933938477055


### Cross-Validation for Regression

In [None]:
from sklearn.model_selection import cross_val_score

models = [svm_pipeline, rf_pipeline]
names = ["Support Vector Machine", "Random Forest"]

for model, name in zip(models, names):
    scores = cross_val_score(model, x, y, cv=5, scoring='r2')
    print(f"\n===== {name} =====")
    print("R² Scores (5-Fold CV):", scores)
    print("Mean R² Score:", scores.mean())


### K FOLD 

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_score(model, x, y, cv=kf, scoring='r2')

In [None]:
kf_mean = (0.5081-0.0007+0.5852+0.4012+0.3730)/5

kf_mean

### Summary:

#### SVM clearly performs best: Highest test R², lowest MSE & MAE, and best cross-validation mean R².

#### Random Forest is reasonable but less effective than SVM.

#### Logistic Regression is unsuitable here – its metrics suggest it's being used on a non-classification (regression) problem.

## Checking which factors has an impact

In [None]:
target = 'Exam_Score'
num_var = [0, 1, 5, 6, 9, 14]
cat_var = [2, 3, 4, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18]

# --- Numeric Feature Correlations ---
num_data = []
for idx in num_var:
    col = s.columns[idx]
    corr = s[[col, target]].corr().iloc[0, 1]
    num_data.append((col, abs(corr)))  # Use absolute value

num_df = pd.DataFrame(num_data, columns=['Feature', 'Value'])

explode_num = [0.05] * len(num_df)

# --- Categorical Feature Mean Scores ---
cat_data = []
for idx in cat_var:
    col = s.columns[idx]
    mean_score = s.groupby(col)[target].mean().mean()
    cat_data.append((col, mean_score))

cat_df = pd.DataFrame(cat_data, columns=['Feature', 'Value'])
explode_cat = [0.05] * len(cat_df)

# --- Plot: Numeric Feature Pie Chart ---
plt.figure(figsize=(6, 6))
plt.pie(num_df['Value'], labels=num_df['Feature'], autopct='%1.1f%%', startangle=140, explode=explode_num)
plt.title('Relative Impact of Numeric Features (Correlation Strength)',color ="blue")
plt.tight_layout()
plt.show()

# --- Plot: Categorical Feature Pie Chart ---
plt.figure(figsize=(6, 6))
plt.pie(cat_df['Value'], labels=cat_df['Feature'], autopct='%1.1f%%', startangle=140, explode=explode_cat)
plt.title('Average Exam Score by Categorical Feature', color ="blue")
plt.tight_layout()
plt.show()
