# In [None]:
# =========================
# Student Dropout Prediction Preprocessing Pipeline
# Combined Steps from IT_MEMBER1 - IT_MEMBER6
# =========================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.colab import drive
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

# -------------------------
# Mount Google Drive and prepare output folders
# -------------------------
# All input and output paths below live under the mounted Drive.
drive.mount('/content/drive')

# Input CSV and output locations. The directory paths keep their trailing
# slash because file names are appended to them by plain string concatenation.
data_path = '/content/drive/My Drive/rawDataset.csv'
results_path = '/content/drive/My Drive/results/'
eda_vis_path = f'{results_path}eda_visualizations/'

# Creates the results folder as well if it does not exist yet.
os.makedirs(eda_vis_path, exist_ok=True)

# -------------------------
# Load dataset
# -------------------------
# Column names are assigned explicitly (37 in total); the header row of the
# file is read and discarded (header=0) in favour of these names.
_leading_cols = [
    'Marital status', 'Application mode', 'Application order', 'Course',
    'Daytime/evening attendance', 'Previous qualification',
    'Previous qualification (grade)', 'Nacionality',
    "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation", 'Admission grade',
    'Displaced', 'Educational special needs', 'Debtor',
    'Tuition fees up to date', 'Gender', 'Scholarship holder',
    'Age at enrollment', 'International',
]

# The same six per-semester measures exist for the 1st and the 2nd semester.
_semester_measures = ('credited', 'enrolled', 'evaluations', 'approved',
                      'grade', 'without evaluations')
_semester_cols = [
    f'Curricular units {semester} sem ({measure})'
    for semester in ('1st', '2nd')
    for measure in _semester_measures
]

_trailing_cols = ['Unemployment rate', 'Inflation rate', 'GDP', 'Target']

columns = _leading_cols + _semester_cols + _trailing_cols

df = pd.read_csv(
    data_path,
    sep=';',
    names=columns,
    header=0,
)

# Quick sanity report on what was loaded.
print("Initial Dataset Shape:", df.shape)
missing_per_column = df.isnull().sum()
print("Initial Missing Values:\n", missing_per_column)

# -------------------------
# STEP 1: Handle Missing Data (Mean / Mode Imputation)
# -------------------------
# Rows without a label are dropped rather than imputed: filling 'Target' with
# the most frequent class would invent ground truth for those students.
missing_target = df['Target'].isnull()
if missing_target.any():
    print("Rows Dropped (Missing Target):", int(missing_target.sum()))
    df = df.loc[~missing_target].copy()

numerical_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Numeric features: fill gaps with the column mean.
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

# Categorical features: fill gaps with the most frequent value. A column that
# is entirely missing has no mode, so it is left untouched instead of raising.
for col in categorical_cols:
    modes = df[col].mode(dropna=True)
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Total number of cells still missing after imputation.
print("After Imputation (Missing Values):", df.isnull().sum().sum())

# Heatmap of the remaining missing cells, saved as a visual check.
plt.figure(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=True, cmap='viridis')
plt.title("Missing Values Heatmap (After Imputation)")
plt.savefig(eda_vis_path + 'missing_heatmap.png')
plt.close()

# -------------------------
# STEP 2: Encode Categorical Variables
# -------------------------
# Label-encode the target into integer class ids. The original 'Target'
# column is kept next to the new 'Target_encoded' column.
le = LabelEncoder()
df['Target_encoded'] = le.fit_transform(df['Target'])
print("Target Encoding Mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

# One-hot encode the selected columns, dropping the first level of each, and
# swap the source columns for the generated indicator columns.
ohe_cols = ['Gender', 'Marital status']
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe_matrix = ohe.fit_transform(df[ohe_cols])
ohe_encoded = pd.DataFrame(
    ohe_matrix,
    columns=ohe.get_feature_names_out(ohe_cols),
    index=df.index,  # keep row alignment with df for the concat below
)
df = pd.concat([df.drop(columns=ohe_cols), ohe_encoded], axis=1)

# Class distribution of the encoded target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Target_encoded', data=df, ax=ax)
ax.set_title("Target Class Distribution")
fig.savefig(eda_vis_path + 'target_countplot.png')
plt.close(fig)

# -------------------------
# STEP 3: Outlier Removal (Isolation Forest)
# -------------------------
numerical_cols = df.select_dtypes(include=[np.number]).columns

# 'Target_encoded' is numeric as well, but the label must not influence which
# rows are discarded, so it is excluded from the outlier model's inputs.
outlier_features = [col for col in numerical_cols if col != 'Target_encoded']

# contamination=0.05 flags roughly 5% of the rows as outliers; random_state
# makes the result reproducible.
iso = IsolationForest(contamination=0.05, random_state=42)

# fit_predict returns 1 for inliers and -1 for outliers; keep the inliers.
# A boolean mask avoids adding and then removing a temporary column, and
# .copy() gives an independent frame for the in-place assignments that follow.
inlier_mask = iso.fit_predict(df[outlier_features]) == 1
df = df[inlier_mask].copy()

print("Dataset Shape After Outlier Removal:", df.shape)

# Boxplot Age vs Target
plt.figure(figsize=(10, 6))
sns.boxplot(x='Target_encoded', y='Age at enrollment', data=df)
plt.title("Age Boxplot by Target (Post-Outlier Removal)")
plt.savefig(eda_vis_path + 'age_boxplot.png')
plt.close()

# -------------------------
# STEP 4: Normalization / Scaling
# -------------------------
numerical_cols = df.select_dtypes(include=[np.number]).columns

# 'Target_encoded' holds class ids. Standardizing it would replace the labels
# with z-scores for every later use (y, plot hue), so only the remaining
# numeric columns are scaled.
scale_cols = [col for col in numerical_cols if col != 'Target_encoded']

# Standardize each selected column to zero mean and unit variance.
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

print("Scaled Admission Grade Stats:\n", df['Admission grade'].describe())

# Distribution of the scaled admission grade.
plt.figure(figsize=(8, 6))
sns.histplot(df['Admission grade'], kde=True)
plt.title("Scaled Admission Grade Distribution")
plt.savefig(eda_vis_path + 'admission_hist.png')
plt.close()

# -------------------------
# STEP 5: Feature Engineering
# -------------------------
# NOTE(review): STEP 4 has already standardized the numeric columns, so the
# inputs used here are presumably z-scores rather than raw grades / counts.
# Confirm this ordering is intended.

# Average of the two semester grades.
grade_sem1 = df['Curricular units 1st sem (grade)']
grade_sem2 = df['Curricular units 2nd sem (grade)']
df['Avg_Grade'] = grade_sem1.add(grade_sem2).div(2)

# Sum of the approved units over both semesters.
approved_sem1 = df['Curricular units 1st sem (approved)']
approved_sem2 = df['Curricular units 2nd sem (approved)']
df['Total_Approved'] = approved_sem1.add(approved_sem2)

engineered_preview = df[['Avg_Grade', 'Total_Approved']].head()
print("Engineered Features Sample:\n", engineered_preview)

# Scatter of the two engineered features, coloured by target class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='Avg_Grade', y='Total_Approved', hue='Target_encoded', data=df, ax=ax)
ax.set_title("Avg Grade vs Total Approved by Target")
fig.savefig(eda_vis_path + 'scatter_avg_total.png')
plt.close(fig)

# -------------------------
# STEP 6: Feature Selection + PCA
# -------------------------
# Feature matrix: every numeric column except the label columns.
# errors='ignore' tolerates either label column being absent.
feature_frame = df.drop(['Target', 'Target_encoded'], axis=1, errors='ignore')
X = feature_frame.select_dtypes(include=[np.number])
y = df['Target_encoded']

# Keep the 10 features with the highest ANOVA F-score against the target.
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X, y)
support_mask = selector.get_support()
selected_features = X.columns[support_mask]
print("Selected Features:", list(selected_features))

# Project the selected features onto 5 principal components and report the
# share of variance they retain.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_selected)
explained_total = pca.explained_variance_ratio_.sum()
print("PCA Variance Explained:", explained_total)

# Correlation matrix of the selected features.
selected_corr = X[selected_features].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(selected_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Selected Features Correlation Matrix")
fig.savefig(eda_vis_path + 'corr_matrix.png')
plt.close(fig)

# -------------------------
# FINAL OUTPUT
# -------------------------
print("Preprocessing Completed ✅")
