In [94]:
import pandas as pd
import numpy as np
!pip install dash
from dash import Dash
from dash import dcc
from dash import html
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.pipeline import Pipeline
!pip install xgboost
from xgboost import XGBClassifier
import joblib



In [None]:
df = pd.read_csv('Student_performance_data .csv')


In [53]:
print(f"Dataset shape: {df.shape}")  # Should show (2392, 17)
print("\nFirst 5 rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())


Dataset shape: (2392, 17)

First 5 rows:
   studentid       age    gender  parentaleducation  studytimeweekly  \
0       1001  0.472919  0.978492           0.253711         1.780336   
1       1002  1.362944 -1.021981          -0.746087         0.997376   
2       1003 -1.307132 -1.021981           1.253509        -0.984045   
3       1004  0.472919  0.978492           1.253509         0.045445   
4       1005  0.472919  0.978492           0.253711        -0.902311   

   absences  tutoring  parentalsupport  extracurricular    sports     music  \
0 -0.890822  1.522371        -0.108744        -0.788476 -0.660132  2.019544   
1 -1.717694 -0.656870        -0.999551        -0.788476 -0.660132 -0.495161   
2  1.353542 -0.656870        -0.108744        -0.788476 -0.660132 -0.495161   
3 -0.063951 -0.656870         0.782063         1.268269 -0.660132 -0.495161   
4  0.290422  1.522371         0.782063        -0.788476 -0.660132 -0.495161   

   volunteering       gpa  gradeclass  ethnicity_1 

In [54]:
print(f"\nMissing values per column:\n{df.isnull().sum()}")
df = df.dropna()
df


Missing values per column:
studentid            0
age                  0
gender               0
parentaleducation    0
studytimeweekly      0
absences             0
tutoring             0
parentalsupport      0
extracurricular      0
sports               0
music                0
volunteering         0
gpa                  0
gradeclass           0
ethnicity_1          0
ethnicity_2          0
ethnicity_3          0
dtype: int64


Unnamed: 0,studentid,age,gender,parentaleducation,studytimeweekly,absences,tutoring,parentalsupport,extracurricular,sports,music,volunteering,gpa,gradeclass,ethnicity_1,ethnicity_2,ethnicity_3
0,1001,0.472919,0.978492,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866,1.118086,2.0,-0.509520,-0.494507,-0.31985
1,1002,1.362944,-1.021981,-0.746087,0.997376,-1.717694,-0.656870,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866,1.242374,1.0,-0.509520,-0.494507,-0.31985
2,1003,-1.307132,-1.021981,1.253509,-0.984045,1.353542,-0.656870,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866,-1.960277,4.0,-0.509520,2.022217,-0.31985
3,1004,0.472919,0.978492,1.253509,0.045445,-0.063951,-0.656870,0.782063,1.268269,-0.660132,-0.495161,-0.431866,0.161790,3.0,-0.509520,-0.494507,-0.31985
4,1005,0.472919,0.978492,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866,-0.675573,4.0,-0.509520,-0.494507,-0.31985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,1.362944,0.978492,1.253509,0.160762,-1.481445,-0.656870,1.672869,1.268269,-0.660132,-0.495161,-0.431866,1.693315,0.0,-0.509520,-0.494507,-0.31985
2388,3389,0.472919,-1.021981,-0.746087,-0.387285,-1.245196,1.522371,1.672869,-0.788476,1.514848,-0.495161,-0.431866,1.500565,4.0,-0.509520,-0.494507,-0.31985
2389,3390,-0.417106,0.978492,0.253711,-0.524895,0.644796,-0.656870,-0.108744,-0.788476,-0.660132,-0.495161,2.315535,-0.834845,2.0,-0.509520,-0.494507,-0.31985
2390,3391,-0.417106,0.978492,-1.745885,0.467950,0.290422,-0.656870,-0.108744,-0.788476,1.514848,2.019544,-0.431866,-0.112452,1.0,1.962633,-0.494507,-0.31985


In [None]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [89]:
categorical_cols = [col for col in df.columns if df[col].dtype == 'object' and col not in ['studentid', 'gradeclass']]
if categorical_cols:
    print(f"\nEncoding categorical columns: {categorical_cols}")
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

numerical_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col]) and col not in ['studentid', 'gradeclass']]
print(f"\nScaling numerical columns: {numerical_cols}")
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])



Scaling numerical columns: ['age', 'gender', 'parentaleducation', 'studytimeweekly', 'absences', 'tutoring', 'parentalsupport', 'extracurricular', 'sports', 'music', 'volunteering', 'gpa', 'ethnicity_1', 'ethnicity_2', 'ethnicity_3']


In [None]:
#df = pd.get_dummies(df, columns=['Ethnicity'], prefix= 'Ethnicity', drop_first=True)

#scaler = StandardScaler()
#numerical_cols = ['Age', 'ParentalSupport','StudyTimeWeekly', 'Absences' ] #Ensures that all the numeric features are all on the same scale , allows LR to perform better
#df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

#print("Encoded and Scaled Data:")
#print(df.head())

In [68]:
target_col = 'gradeclass'
X = df.drop(['studentid', 'gradeclass'], axis=1, errors='ignore')
y = df[target_col]

'''if y.dtype == 'object':
    y = y.astype('category').cat.codes
    class_names = dict(enumerate(df[target_col].astype('category').cat.categories))
else:
    class_names = dict(enumerate(np.unique(y)))
'''

if not np.issubdtype(y.dtype, np.number):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    target_names = label_encoder.classes_
else:
    target_names = [str(label) for label in np.unique(y)]

In [57]:
# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nTraining set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (1913, 15), Test set: (479, 15)


In [58]:
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [63]:
pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [10, 20, None],
    'clf__min_samples_split': [2, 5],
    'clf__class_weight': ['balanced']
}

In [77]:
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

grid.fit(X_train, y_train, clf__sample_weight=sample_weights)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [78]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print(f"\nBest Parameters: {grid.best_params_}")
print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))


Best Parameters: {'clf__class_weight': 'balanced', 'clf__max_depth': 20, 'clf__min_samples_split': 5, 'clf__n_estimators': 200}
Accuracy Score: 0.9206680584551148

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.52      0.69        21
         1.0       0.91      0.89      0.90        54
         2.0       0.91      0.94      0.92        78
         3.0       0.85      0.89      0.87        83
         4.0       0.95      0.97      0.96       243

    accuracy                           0.92       479
   macro avg       0.92      0.84      0.87       479
weighted avg       0.92      0.92      0.92       479



In [74]:
joblib.dump(best_model, 'best_random_forest_model.pkl')
joblib.dump(scaler, 'scaler_17cols.save')

['scaler_17cols.save']

In [90]:
!pip install dash==2.0.0 pandas==1.3.3

Collecting dash==2.0.0
  Obtaining dependency information for dash==2.0.0 from https://files.pythonhosted.org/packages/96/3f/f287d7a0084460e41abb636694a9dc13d030733687d944256fccdbac2ff8/dash-2.0.0-py3-none-any.whl.metadata
  Downloading dash-2.0.0-py3-none-any.whl.metadata (9.9 kB)
Collecting pandas==1.3.3
  Downloading pandas-1.3.3.tar.gz (4.7 MB)
     ---------------------------------------- 0.0/4.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/4.7 MB ? eta -:--:--
      --------------------------------------- 0.1/4.7 MB 3.2 MB/s eta 0:00:02
     - -------------------------------------- 0.1/4.7 MB 1.8 MB/s eta 0:00:03
     - -------------------------------------- 0.2/4.7 MB 1.8 MB/s eta 0:00:03
     - -------------------------------------- 0.2/4.7 MB 1.8 MB/s eta 0:00:03
     --- ------------------------------------ 0.4/4.7 MB 1.6 MB/s eta 0:00:03
     --- ------------------------------------ 0.5/4.7 MB 1.8 MB/s eta 0:00:03
     ---- -----------------------------

  error: subprocess-exited-with-error
  
  pip subprocess to install build dependencies did not run successfully.
  exit code: 1
  
  [435 lines of output]
  Ignoring numpy: markers 'python_version == "3.7" and (platform_machine != "arm64" or platform_system != "Darwin") and platform_machine != "aarch64"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.8" and (platform_machine != "arm64" or platform_system != "Darwin") and platform_machine != "aarch64"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.7" and platform_machine == "aarch64"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.8" and platform_machine == "aarch64"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.8" and platform_machine == "arm64" and platform_system == "Darwin"' don't match your environment
  Ignoring numpy: markers 'python_version == "3.9" and platform_machine == "arm64" and platform_system == "Da

In [None]:
train_dataset = tf.data.

In [80]:
df.columns.to_list()

['studentid',
 'age',
 'gender',
 'parentaleducation',
 'studytimeweekly',
 'absences',
 'tutoring',
 'parentalsupport',
 'extracurricular',
 'sports',
 'music',
 'volunteering',
 'gpa',
 'gradeclass',
 'ethnicity_1',
 'ethnicity_2',
 'ethnicity_3']

In [81]:
df.head() #checking the new columns

Unnamed: 0,studentid,age,gender,parentaleducation,studytimeweekly,absences,tutoring,parentalsupport,extracurricular,sports,music,volunteering,gpa,gradeclass,ethnicity_1,ethnicity_2,ethnicity_3
0,1001,0.472919,0.978492,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866,1.118086,2.0,-0.50952,-0.494507,-0.31985
1,1002,1.362944,-1.021981,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866,1.242374,1.0,-0.50952,-0.494507,-0.31985
2,1003,-1.307132,-1.021981,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866,-1.960277,4.0,-0.50952,2.022217,-0.31985
3,1004,0.472919,0.978492,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866,0.16179,3.0,-0.50952,-0.494507,-0.31985
4,1005,0.472919,0.978492,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866,-0.675573,4.0,-0.50952,-0.494507,-0.31985


In [85]:
scaler = StandardScaler()
numerical_cols = ['age', 'parentalsupport','studytimeweekly', 'absences' ] #Ensures that all the numeric features are all on the same scale , allows LR to perform better
df[numerical_cols] = scaler.fit_transform(df[numerical_cols]) #scale is between -2 and 2
print("Scaled Data Sample:")
df[numerical_cols].head()

Scaled Data Sample:


Unnamed: 0,age,parentalsupport,studytimeweekly,absences
0,0.472919,-0.108744,1.780336,-0.890822
1,1.362944,-0.999551,0.997376,-1.717694
2,-1.307132,-0.108744,-0.984045,1.353542
3,0.472919,0.782063,0.045445,-0.063951
4,0.472919,0.782063,-0.902311,0.290422


In [86]:
x = df.drop(['studentid', 'gpa', 'gradeclass'], axis=1)
y = df['gradeclass']

In [87]:
print("Features in X:", x.columns.tolist()) #displays the features the Datafeame will use to train modeedls
print("Target Y Sample:", y.head()) # the target variable that is we're tyring to predict

Features in X: ['age', 'gender', 'parentaleducation', 'studytimeweekly', 'absences', 'tutoring', 'parentalsupport', 'extracurricular', 'sports', 'music', 'volunteering', 'ethnicity_1', 'ethnicity_2', 'ethnicity_3']
Target Y Sample: 0    2.0
1    1.0
2    4.0
3    3.0
4    4.0
Name: gradeclass, dtype: float64


In [None]:
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y) #Train-Test Split : Training-Data = 80% Test-Data = 20%


In [88]:
print("Training set size:", x_train.shape) #Displays the data split
print("Testing set size:", x_test.shape)

Training set size: (1913, 14)
Testing set size: (479, 14)


In [70]:
log_model = LogisticRegression(max_iter=1000 , class_weight='balanced') #Creates Logistic regression model , "balanced" handles the implance in GradeClass(over-sampling) ; avoids the model from over-focusing on the majority class (GradeClass = C) 
log_model.fit(x_train, y_train) #trains model on the given data
y_pred_log = log_model.predict(x_test)
print("\nLogistic Regression Results:")
print(classification_report(y_test, y_pred_log))



Logistic Regression Results:
              precision    recall  f1-score   support

         0.0       0.11      0.33      0.17        21
         1.0       0.39      0.30      0.34        54
         2.0       0.46      0.47      0.47        78
         3.0       0.35      0.43      0.39        83
         4.0       0.95      0.76      0.84       243

    accuracy                           0.58       479
   macro avg       0.45      0.46      0.44       479
weighted avg       0.67      0.58      0.62       479



In [71]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42 , class_weight='balanced')
rf_model.fit(x_train, y_train) #trains model on the given data
y_pred_rf = rf_model.predict(x_test)
print("\nRandom Forest Results:")
print(classification_report(y_test, y_pred_rf))


Random Forest Results:
              precision    recall  f1-score   support

         0.0       0.44      0.19      0.27        21
         1.0       0.55      0.44      0.49        54
         2.0       0.51      0.63      0.56        78
         3.0       0.49      0.43      0.46        83
         4.0       0.88      0.92      0.90       243

    accuracy                           0.70       479
   macro avg       0.57      0.52      0.53       479
weighted avg       0.69      0.70      0.69       479



In [None]:
#Checks which feature is most predictive in the rf model
importances = rf_model.feature_importances_
feature_names = x.columns
feature_importance_df = pd.DataFrame({'Feature' :feature_names, 'Importance' : importances})
feature_importance_df.sort_values(by='Importance', ascending=False)


In [None]:
#XBB_Model
xgb_model = XGBClassifier(eval_metric='mlogloss' ,random_state=42) #used in multi-class classifications
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
xgb_model.fit(x_train,y_train,sample_weight=sample_weights) #trains model on the given data
y_pred_xgb = xgb_model.predict(x_test)
print("\nXGBoost Results:")
print(classification_report(y_test, y_pred_xgb))