In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier


In [64]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [65]:
# Read the student-performance CSV into the working DataFrame `df`.
# The path is relative to the notebook's working directory; the space before
# ".csv" is part of the path exactly as written here.
df = pd.read_csv(filepath_or_buffer='Student_performance_data .csv')


In [78]:
# One-hot encode the categorical 'Ethnicity' column into Ethnicity_<value> columns.
# The encoding line used to be commented out, so this cell only worked when the
# Ethnicity_* columns were already present in the kernel's `df`. The guard makes
# the cell safe both on a fresh "Restart & Run All" (raw column present -> encode)
# and on a re-run (raw column already replaced -> skip).
if 'Ethnicity' in df.columns:
    df = pd.get_dummies(df, columns=['Ethnicity'], prefix='Ethnicity')

ethnicity_cols = ['Ethnicity_0', 'Ethnicity_1', 'Ethnicity_2', 'Ethnicity_3']

# get_dummies may emit boolean indicator columns (depends on the pandas version);
# cast them to plain 0/1 integers so every model sees numeric features.
df[ethnicity_cols] = df[ethnicity_cols].astype(int)

print("Sample of Ethnicity Encoded Columns (converted to integers):")
print(df[ethnicity_cols].head())

Sample of Ethnicity Encoded Columns (converted to integers):
   Ethnicity_0  Ethnicity_1  Ethnicity_2  Ethnicity_3
0            1            0            0            0
1            1            0            0            0
2            0            0            1            0
3            1            0            0            0
4            1            0            0            0


In [80]:
# Show all column names as a plain Python list to confirm which columns df holds.
df.columns.tolist()

['StudentID',
 'Age',
 'Gender',
 'ParentalEducation',
 'StudyTimeWeekly',
 'Absences',
 'Tutoring',
 'ParentalSupport',
 'Extracurricular',
 'Sports',
 'Music',
 'Volunteering',
 'GPA',
 'GradeClass',
 'Ethnicity_0',
 'Ethnicity_1',
 'Ethnicity_2',
 'Ethnicity_3']

In [None]:
# Preview the first five rows to check the new Ethnicity_* columns.
# NOTE: the saved output shows standardized values for Age, StudyTimeWeekly,
# Absences and ParentalSupport, i.e. this cell was last executed after the
# scaling cell further down.
df.head(n=5)

Unnamed: 0,StudentID,Age,Gender,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass,Ethnicity_0,Ethnicity_1,Ethnicity_2,Ethnicity_3
0,1001,0.472919,1,2,1.780336,-0.890822,1,-0.108744,0,0,1,0,2.929196,2.0,1,0,0,0
1,1002,1.362944,0,1,0.997376,-1.717694,0,-0.999551,0,0,0,0,3.042915,1.0,1,0,0,0
2,1003,-1.307132,0,3,-0.984045,1.353542,0,-0.108744,0,0,0,0,0.112602,4.0,0,0,1,0
3,1004,0.472919,1,3,0.045445,-0.063951,0,0.782063,1,0,0,0,2.054218,3.0,1,0,0,0
4,1005,0.472919,1,2,-0.902311,0.290422,1,0.782063,0,0,0,0,1.288061,4.0,1,0,0,0


In [68]:
# Standardize the continuous features so they share a common scale
# (StandardScaler centres each column to mean 0 and scales it to unit variance),
# which helps the logistic regression fitted later.
# NOTE(review): the scaler is fitted on the whole DataFrame before the
# train/test split, so test rows contribute to the mean/std. Consider fitting
# on the training portion only and applying `transform` to the test portion.
numerical_cols = ['Age', 'ParentalSupport', 'StudyTimeWeekly', 'Absences']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values

print("Scaled Data Sample:")
df[numerical_cols].head()

Scaled Data Sample:


Unnamed: 0,Age,ParentalSupport,StudyTimeWeekly,Absences
0,0.472919,-0.108744,1.780336,-0.890822
1,1.362944,-0.999551,0.997376,-1.717694
2,-1.307132,-0.108744,-0.984045,1.353542
3,0.472919,0.782063,0.045445,-0.063951
4,0.472919,0.782063,-0.902311,0.290422


In [69]:
# Target: the grade category each student falls into.
y = df['GradeClass']

# Features: every remaining column except the row identifier, the target
# itself, and GPA.
# NOTE(review): GPA is presumably excluded because GradeClass is derived from
# it and would leak the answer - confirm against the dataset description.
x = df.drop(columns=['StudentID', 'GPA', 'GradeClass'])

In [70]:
# Sanity check of the modelling inputs:
#   - the feature columns the models will be trained on
#   - the first few values of the target variable we are trying to predict
print("Features in X:", list(x.columns))
print("Target Y Sample:", y.head(n=5))

Features in X: ['Age', 'Gender', 'ParentalEducation', 'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering', 'Ethnicity_0', 'Ethnicity_1', 'Ethnicity_2', 'Ethnicity_3']
Target Y Sample: 0    2.0
1    1.0
2    4.0
3    3.0
4    4.0
Name: GradeClass, dtype: float64


In [71]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# stratify=y keeps the GradeClass proportions the same in both parts, and the
# fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [72]:
# Report the (rows, columns) shape of each part to confirm the split.
print("Training set size:", x_train.shape)
print("Testing set size:", x_test.shape)

Training set size: (1913, 15)
Testing set size: (479, 15)


In [73]:
# Baseline model: logistic regression.
#   - class_weight='balanced' re-weights classes inversely to their frequency
#     in y_train (it does not resample the data), so the model does not
#     over-focus on the largest class (4.0 in the saved report below).
#   - max_iter=1000 gives the solver more iterations to converge.
log_model = LogisticRegression(class_weight='balanced', max_iter=1000)
log_model.fit(x_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_log = log_model.predict(x_test)
print("\nLogistic Regression Results:", classification_report(y_test, y_pred_log), sep="\n")



Logistic Regression Results:
              precision    recall  f1-score   support

         0.0       0.11      0.33      0.17        21
         1.0       0.39      0.30      0.34        54
         2.0       0.46      0.47      0.47        78
         3.0       0.35      0.43      0.39        83
         4.0       0.95      0.76      0.84       243

    accuracy                           0.58       479
   macro avg       0.45      0.46      0.44       479
weighted avg       0.67      0.58      0.62       479



In [74]:
# Random forest with 100 trees. class_weight='balanced' compensates for the
# uneven GradeClass frequencies; random_state fixes the seed for repeatable runs.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_model.fit(x_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_rf = rf_model.predict(x_test)
print("\nRandom Forest Results:", classification_report(y_test, y_pred_rf), sep="\n")


Random Forest Results:
              precision    recall  f1-score   support

         0.0       0.20      0.10      0.13        21
         1.0       0.48      0.39      0.43        54
         2.0       0.50      0.62      0.55        78
         3.0       0.47      0.41      0.44        83
         4.0       0.87      0.92      0.90       243

    accuracy                           0.69       479
   macro avg       0.50      0.49      0.49       479
weighted avg       0.67      0.69      0.67       479



In [75]:
# Rank the features by how much the fitted random forest relied on them.
feature_names = x.columns
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)

# Only the displayed copy is sorted (most important first);
# feature_importance_df itself keeps the original column order.
feature_importance_df.sort_values('Importance', ascending=False)


Unnamed: 0,Feature,Importance
4,Absences,0.332502
3,StudyTimeWeekly,0.182718
6,ParentalSupport,0.084954
0,Age,0.071098
2,ParentalEducation,0.070268
1,Gender,0.036486
7,Extracurricular,0.033779
8,Sports,0.032438
5,Tutoring,0.031456
9,Music,0.025202


In [76]:
# XGBoost model
# Class imbalance is handled through per-row weights: compute_sample_weight
# with class_weight='balanced' gives rows of rarer classes larger weights,
# and those weights are passed to fit().
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# 'mlogloss' is the multi-class log-loss evaluation metric.
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss')
xgb_model.fit(x_train, y_train, sample_weight=sample_weights)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_xgb = xgb_model.predict(x_test)
print("\nXGBoost Results:", classification_report(y_test, y_pred_xgb), sep="\n")


XGBoost Results:
              precision    recall  f1-score   support

         0.0       0.33      0.19      0.24        21
         1.0       0.45      0.50      0.47        54
         2.0       0.45      0.49      0.47        78
         3.0       0.41      0.49      0.45        83
         4.0       0.92      0.84      0.88       243

    accuracy                           0.65       479
   macro avg       0.51      0.50      0.50       479
weighted avg       0.67      0.65      0.66       479

