In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import scipy.stats as stats
# import tabulate
import matplotlib.pyplot as plt
import seaborn as sns

# Missing Value and Outlier Treatment

# Define the file path for the dataset, relative to the working directory.
# NOTE(review): the space before ".csv" looks like part of the real file name
# (the recorded cell output shows a successful load) — confirm before "fixing" it.
file_path = "../data/Student_performance_data .csv"

# Print current working directory to help debug relative-path problems
print("Current Working Directory:", os.getcwd())

# Fail early, with a helpful message, when the dataset is not where we expect it
if not os.path.exists(file_path):
    # List the directory the file is actually expected in. A hard-coded
    # relative 'data' directory may not exist under the working directory, in
    # which case os.listdir() would raise on its own and hide this message.
    data_dir = os.path.dirname(file_path) or "."
    if os.path.isdir(data_dir):
        available_files = os.listdir(data_dir)
    else:
        available_files = f"<directory '{data_dir}' does not exist>"
    raise FileNotFoundError(
        f"The file '{file_path}' was not found in {os.getcwd()}. "
        f"Please ensure the file is in the correct directory or provide the correct path. "
        f"Available files in '{data_dir}': {available_files}"
    )

# Loading the dataset
try:
    data = pd.read_csv(file_path)
except Exception as e:
    # Keep the original exception type raised to callers, but chain the cause
    # explicitly so the underlying parser/IO traceback is preserved.
    raise Exception(f"Failed to load CSV file: {e}") from e

# Show which columns the loaded dataset provides
print("\nDataset Columns:\n", data.columns.tolist())

# Count the missing entries per column before any treatment
missing_values = data.isnull().sum()
print("\nMissing Values:\n", missing_values)

# Coerce the numeric features to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce') and are handled by the imputation below
numerical_cols = ['StudyTimeWeekly', 'Absences', 'GPA']
for col in numerical_cols:
    try:
        as_numeric = pd.to_numeric(data[col], errors='coerce')
        data[col] = as_numeric
    except Exception as err:
        print(f"Warning: Could not convert {col} to numeric: {err}")

# Numeric features: replace missing entries with the column median
for col in numerical_cols:
    try:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute for this column
        median_value = column.median()
        data[col] = column.fillna(median_value)
        print(f"Imputed missing values in {col} with median: {median_value}")
    except Exception as err:
        print(f"Error imputing {col}: {err}")

# Categorical features: replace missing entries with the most frequent value
categorical_cols = ['Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring', 'ParentalSupport',
                    'Extracurricular', 'Sports', 'Music', 'Volunteering']
for col in categorical_cols:
    try:
        column = data[col]
        if not column.isnull().any():
            continue  # nothing to impute for this column
        mode_value = column.mode()[0]
        data[col] = column.fillna(mode_value)
        print(f"Imputed missing values in {col} with mode: {mode_value}")
    except Exception as err:
        print(f"Error imputing {col}: {err}")

# Re-count missing entries so the effect of the imputation can be verified
remaining_missing = data.isnull().sum()
print("\nMissing Values After Imputation:\n", remaining_missing)

# Outlier detection and treatment using Z-score
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of *df* whose *column* value is a Z-score outlier.

    The column is coerced to numeric (unparseable entries become NaN), NaN
    entries are excluded from the Z-score computation, and every remaining row
    whose absolute Z-score exceeds *threshold* is returned.

    Args:
        df: DataFrame to inspect; it is not modified.
        column: Name of the column to test.
        threshold: Absolute Z-score above which a value counts as an outlier.

    Returns:
        The outlier rows of *df* (all columns, original index labels). An
        empty DataFrame is returned when the column cannot be read or
        converted, or when it holds no valid numeric data; a message is
        printed in both cases.
    """
    try:
        numeric = pd.to_numeric(df[column], errors='coerce')
    except (KeyError, TypeError, ValueError) as e:
        # Best-effort contract: report the problem and hand back "no outliers"
        print(f"Error detecting outliers in {column}: {e}")
        return pd.DataFrame()

    # Positional flags for rows that hold a usable numeric value
    valid = numeric.notna().to_numpy()
    if not valid.any():
        print(f"No valid data in {column} for outlier detection")
        return pd.DataFrame()

    z_scores = np.abs(stats.zscore(numeric[valid].to_numpy(dtype=float)))

    # Build the selection mask by position rather than re-selecting rows by
    # index label: label-based selection duplicates rows when the index is not
    # unique, which misaligns the mask and used to yield an empty result.
    mask = np.zeros(len(df), dtype=bool)
    mask[valid] = z_scores > threshold
    return df[mask]

# Report the Z-score outliers found in each numeric feature
for col in numerical_cols:
    try:
        outliers = detect_outliers_zscore(data, col)
        if outliers.empty:
            print(f"\nNo outliers detected in {col}")
        else:
            print(f"\nOutliers in {col}:\n", outliers[[col]])
    except Exception as err:
        print(f"Error processing outliers for {col}: {err}")

# Clip every numeric feature to its own 1st-99th percentile range
for col in numerical_cols:
    try:
        column = data[col]
        lower_bound = column.quantile(0.01)
        upper_bound = column.quantile(0.99)
        if pd.isna(lower_bound) or pd.isna(upper_bound):
            # Without both bounds there is no usable range to clip to
            print(f"Skipping outlier capping for {col} due to invalid quantiles")
        else:
            data[col] = column.clip(lower=lower_bound, upper=upper_bound)
            print(f"Capped outliers in {col} at 1st ({lower_bound}) and 99th ({upper_bound}) percentiles")
    except Exception as err:
        print(f"Error capping outliers in {col}: {err}")

# Print descriptive statistics so the effect of the capping can be checked
for col in numerical_cols:
    try:
        summary = data[col].describe()
        print(f"\nSummary of {col} after capping:\n", summary)
    except Exception as err:
        print(f"Error summarizing {col}: {err}")

Current Working Directory: c:\Users\USER-PC\Documents\BC 2025\MLG382\MLG382_GuidedProject\notebooks

Dataset Columns:
 ['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA', 'GradeClass']

Missing Values:
 StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64

Missing Values After Imputation:
 StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music          