# Data Preprocessing

## Getting The System Ready

In [43]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import scipy.stats as stats
# import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## Loading The Data

In [44]:
# Location of the raw dataset, relative to the notebook's working directory.
# NOTE(review): the file name as written contains a space before ".csv";
# keep it in sync with the actual file on disk.
file_path = "../data/Student_performance_data .csv"

# Fail early, with a helpful message, when the dataset is missing.
if not os.path.exists(file_path):
    # List the directory the file was expected in (not a hard-coded 'data'
    # folder under the CWD, which may not exist and would itself raise,
    # hiding the message below).
    data_dir = os.path.dirname(file_path) or "."
    available_files = os.listdir(data_dir) if os.path.isdir(data_dir) else []
    raise FileNotFoundError(
        f"The file '{file_path}' was not found in {os.getcwd()}. "
        f"Please ensure the file is in the correct directory or provide the correct path. "
        f"Available files in '{data_dir}': {available_files}"
    )

# Loading the dataset
try:
    data = pd.read_csv(file_path)
except Exception as e:
    # Chain the original exception so the underlying parser/IO traceback is kept.
    raise Exception(f"Failed to load CSV file: {e}") from e

## 1. Missing Value and Outlier Treatment

### *i.* Missing Values

In [45]:
# Count the null entries per column to see whether any imputation is needed
missing_values = data.isna().sum()
print("\nMissing Values:\n", missing_values)


Missing Values:
 StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64


In [46]:
# Continuous columns that must hold numeric data for the steps that follow
numerical_cols = ['StudyTimeWeekly', 'Absences', 'GPA']

for col in numerical_cols:
    try:
        # errors='coerce' turns unparseable entries into NaN instead of raising
        coerced = pd.to_numeric(data[col], errors='coerce')
        data[col] = coerced
    except Exception as e:
        print(f"Warning: Could not convert {col} to numeric: {e}")

In [47]:
# Fill gaps in the numerical columns with each column's median
for col in numerical_cols:
    try:
        # Nothing to do for columns without nulls
        if not data[col].isnull().any():
            continue
        median_value = data[col].median()
        data[col] = data[col].fillna(median_value)
        print(f"Imputed missing values in {col} with median: {median_value}")
    except Exception as e:
        print(f"Error imputing {col}: {e}")

### *ii.* Outliers

In [48]:
# Categorical columns: fill any nulls with the column's most frequent value
categorical_cols = [
    'Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring', 'ParentalSupport',
    'Extracurricular', 'Sports', 'Music', 'Volunteering',
]

for col in categorical_cols:
    try:
        # Columns without nulls are left untouched
        if not data[col].isnull().any():
            continue
        mode_value = data[col].mode()[0]
        data[col] = data[col].fillna(mode_value)
        print(f"Imputed missing values in {col} with mode: {mode_value}")
    except Exception as e:
        print(f"Error imputing {col}: {e}")

In [49]:
# Outlier detection and treatment using Z-score
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is a Z-score outlier.

    The column is coerced to numeric; entries that cannot be parsed (and
    missing values) are ignored when computing the Z-scores and are never
    reported as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    column : str
        Name of the column to test.
    threshold : float, default 3
        A row is an outlier when ``abs(zscore) > threshold``.

    Returns
    -------
    pandas.DataFrame
        The outlier rows of ``df`` in their original order. An empty
        DataFrame is returned when the column has no valid numeric data or
        when any error occurs (the error is printed, not raised).
    """
    try:
        # Coerce to a float array; unparseable / missing entries become NaN.
        values = pd.to_numeric(df[column], errors='coerce').to_numpy(
            dtype=float, na_value=np.nan
        )
        valid = ~np.isnan(values)
        if not valid.any():
            print(f"No valid data in {column} for outlier detection")
            return pd.DataFrame()
        z_scores = np.abs(np.asarray(stats.zscore(values[valid])))
        # Build a positional mask over all rows. Selecting by position rather
        # than by label keeps this correct when the index has duplicate
        # labels (label-based re-selection would multiply rows and make the
        # mask length mismatch).
        is_outlier = np.zeros(len(df), dtype=bool)
        is_outlier[valid] = z_scores > threshold
        return df.loc[is_outlier]
    except Exception as e:
        # Best-effort helper: report the problem and return "no outliers".
        print(f"Error detecting outliers in {column}: {e}")
        return pd.DataFrame()

In [50]:
# Step 1: report Z-score outliers for every numerical column
for col in numerical_cols:
    try:
        outliers = detect_outliers_zscore(data, col)
        if outliers.empty:
            print(f"\nNo outliers detected in {col}")
        else:
            print(f"\nOutliers in {col}:\n", outliers[[col]])
    except Exception as e:
        print(f"Error processing outliers for {col}: {e}")

# Step 2: clip every numerical column to its 1st-99th percentile range
for col in numerical_cols:
    try:
        lower_bound, upper_bound = data[col].quantile(0.01), data[col].quantile(0.99)
        if pd.isna(lower_bound) or pd.isna(upper_bound):
            # Quantiles are NaN when the column has no usable values
            print(f"Skipping outlier capping for {col} due to invalid quantiles")
        else:
            data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)
            print(f"Capped outliers in {col} at 1st ({lower_bound}) and 99th ({upper_bound}) percentiles")
    except Exception as e:
        print(f"Error capping outliers in {col}: {e}")

# Step 3: show summary statistics so the effect of the capping can be checked
for col in numerical_cols:
    try:
        col_summary = data[col].describe()
        print(f"\nSummary of {col} after capping:\n", col_summary)
    except Exception as e:
        print(f"Error summarizing {col}: {e}")


No outliers detected in StudyTimeWeekly

No outliers detected in Absences

No outliers detected in GPA
Capped outliers in StudyTimeWeekly at 1st (0.19036748663715453) and 99th (19.720476474805494) percentiles
Capped outliers in Absences at 1st (0.0) and 99th (29.0) percentiles
Capped outliers in GPA at 1st (0.11152134489227275) and 99th (3.748510617060825) percentiles

Summary of StudyTimeWeekly after capping:
 count    2392.000000
mean        9.771592
std         5.648582
min         0.190367
25%         5.043079
50%         9.705363
75%        14.408410
max        19.720476
Name: StudyTimeWeekly, dtype: float64

Summary of Absences after capping:
 count    2392.000000
mean       14.541388
std         8.467417
min         0.000000
25%         7.000000
50%        15.000000
75%        22.000000
max        29.000000
Name: Absences, dtype: float64

Summary of GPA after capping:
 count    2392.000000
mean        1.905612
std         0.909947
min         0.111521
25%         1.174803
50%  

## 2. Feature Engineering

In [51]:
# Drop columns of the dataset that are unnecessary
# Exclude StudentID and GPA from the feature set.
# NOTE(review): presumably StudentID is a pure identifier and GPA is tied to
# the GradeClass target — confirm.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
data = data.drop(columns=['StudentID', 'GPA'], errors='ignore')

In [52]:
# Creating new features
# Weekly study time relative to absences; the +1 keeps the denominator non-zero
absences_plus_one = data['Absences'] + 1
data['StudyTimePerAbsence'] = data['StudyTimeWeekly'] / absences_plus_one

# Row-wise total of the four activity columns
activity_cols = ['Extracurricular', 'Sports', 'Music', 'Volunteering']
data['TotalExtracurricular'] = data[activity_cols].sum(axis=1)

In [53]:
# StudyTimeWeekly to be divided into categories
# Bucket StudyTimeWeekly into four 5-hour bands. include_lowest=True puts an
# exact 0 into 'Low'; values outside [0, 20] would receive a NaN category.
bins = [0, 5, 10, 15, 20]
labels = ['Low', 'Moderate', 'High', 'Very High']
data['StudyTimeCategory'] = pd.cut(data['StudyTimeWeekly'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode the multi-level categoricals; drop_first=True omits the first
# level of each variable so it acts as the reference category.
categorical_cols_to_encode = ['Ethnicity', 'ParentalEducation', 'StudyTimeCategory']
data_encoded = pd.get_dummies(data, columns=categorical_cols_to_encode, drop_first=True)

# Standardize the continuous features. The scaler is fitted exactly once here
# (a previous duplicate fit_transform call whose result was discarded has been
# removed); the fitted object is pickled later in the notebook.
scaler = StandardScaler()
numerical_cols_to_scale = ['Age','StudyTimeWeekly', 'Absences', 'StudyTimePerAbsence', 'TotalExtracurricular']
data_encoded[numerical_cols_to_scale] = scaler.fit_transform(data_encoded[numerical_cols_to_scale])

In [54]:

# Show the resulting column set and a short preview of the processed frame
final_columns = data_encoded.columns
preview_rows = data_encoded.head()
print("\nFinal Feature Set Columns:\n", final_columns)
print("\nFirst 5 rows of processed dataset:\n", preview_rows)




Final Feature Set Columns:
 Index(['Age', 'Gender', 'StudyTimeWeekly', 'Absences', 'Tutoring',
       'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering',
       'GradeClass', 'StudyTimePerAbsence', 'TotalExtracurricular',
       'Ethnicity_1', 'Ethnicity_2', 'Ethnicity_3', 'ParentalEducation_1',
       'ParentalEducation_2', 'ParentalEducation_3', 'ParentalEducation_4',
       'StudyTimeCategory_Moderate', 'StudyTimeCategory_High',
       'StudyTimeCategory_Very High'],
      dtype='object')

First 5 rows of processed dataset:
         Age  Gender  StudyTimeWeekly  Absences  Tutoring  ParentalSupport  \
0  0.472919       1         1.761675 -0.890822         1                2   
1  1.362944       0         0.998187 -1.717694         0                1   
2 -1.307132       0        -0.984705  1.353542         0                2   
3  0.472919       1         0.045550 -0.063951         0                3   
4  0.472919       1        -0.902910  0.290422         1   

## 3. Saving The Newly Processed Data

In [55]:
# Write the processed frame to disk without the index column
processed_csv_path = "../data/Processed_Student_Performance.csv"
data_encoded.to_csv(processed_csv_path, index=False)
print("\nProcessed dataset saved as 'Processed_Student_Performance.csv'")


Processed dataset saved as 'Processed_Student_Performance.csv'


In [56]:
# Saving the new features and scaler
# Make sure the output directory exists; open(..., "wb") does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs("../artifacts", exist_ok=True)

# Column index of the processed frame.
# NOTE(review): this still includes the 'GradeClass' column — confirm that
# consumers of features.pkl expect it.
with open("../artifacts/features.pkl", "wb") as f:
    pickle.dump(data_encoded.columns, f)

# Fitted StandardScaler, so the same scaling can be re-applied later
with open("../artifacts/scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)