In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import scipy.stats as stats
import tabulate
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw student-performance dataset from a CSV file.
# (The space before ".csv" is part of the file name used by this notebook.)
data = pd.read_csv("data/Student_performance_data .csv")

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------

# Weekly study time relative to absences. The "+ 1" keeps the denominator
# non-zero for students who have no recorded absences.
data['StudyTimePerAbsence'] = data['StudyTimeWeekly'] / (data['Absences'] + 1)

# Row-wise sum of the four activity columns.
# NOTE(review): presumably these are 0/1 flags, making this a count of
# activities per student -- confirm against the dataset description.
data['TotalExtracurricular'] = data[['Extracurricular', 'Sports', 'Music', 'Volunteering']].sum(axis=1)

# Bucket StudyTimeWeekly into labelled categories.
# The top bin is open-ended (np.inf) on purpose: with a closed upper edge of 20,
# any value above 20 would become NaN, and after get_dummies(drop_first=True)
# a NaN row is encoded as all-zero dummies -- exactly the same encoding as the
# dropped 'Low' baseline. Values in [0, 20] are binned exactly as before.
bins = [0, 5, 10, 15, np.inf]
labels = ['Low', 'Moderate', 'High', 'Very High']
data['StudyTimeCategory'] = pd.cut(data['StudyTimeWeekly'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode the categorical columns. drop_first=True drops the first level
# of each column, so that level becomes the implicit (all-zero) baseline.
categorical_cols_to_encode = ['Ethnicity', 'ParentalEducation', 'StudyTimeCategory']
data_encoded = pd.get_dummies(data, columns=categorical_cols_to_encode, drop_first=True)

# Standardise the continuous columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset here. If a train/test
# split happens later, fit it on the training portion only to avoid leaking
# test-set statistics -- confirm against the downstream cells.
scaler = StandardScaler()
numerical_cols_to_scale = ['StudyTimeWeekly', 'Absences', 'GPA', 'StudyTimePerAbsence', 'TotalExtracurricular']
data_encoded[numerical_cols_to_scale] = scaler.fit_transform(data_encoded[numerical_cols_to_scale])

# Drop the identifier column; reassigning (rather than inplace=True) keeps the
# lineage of `data_encoded` explicit.
data_encoded = data_encoded.drop(columns=['StudentID'])

# Show the final feature set and a preview of the processed rows.
print("\nFinal Feature Set Columns:\n", data_encoded.columns)
print("\nFirst 5 rows of processed dataset:\n", data_encoded.head())

# Persist the processed dataset (without the index column).
data_encoded.to_csv("Processed_Student_Performance.csv", index=False)
print("\nProcessed dataset saved as 'Processed_Student_Performance.csv'")


Final Feature Set Columns:
 Index(['Age', 'Gender', 'StudyTimeWeekly', 'Absences', 'Tutoring',
       'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering',
       'GPA', 'GradeClass', 'StudyTimePerAbsence', 'TotalExtracurricular',
       'Ethnicity_1', 'Ethnicity_2', 'Ethnicity_3', 'ParentalEducation_1',
       'ParentalEducation_2', 'ParentalEducation_3', 'ParentalEducation_4',
       'StudyTimeCategory_Moderate', 'StudyTimeCategory_High',
       'StudyTimeCategory_Very High'],
      dtype='object')

First 5 rows of processed dataset:
    Age  Gender  StudyTimeWeekly  Absences  Tutoring  ParentalSupport  \
0   17       1         1.780336 -0.890822         1                2   
1   18       0         0.997376 -1.717694         0                1   
2   15       0        -0.984045  1.353542         0                2   
3   17       1         0.045445 -0.063951         0                3   
4   17       1        -0.902311  0.290422         1                3   

   E