In [1]:
# Student Dropout Prediction Preprocessing Pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.colab import drive
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from collections import Counter

# Mount Google Drive
drive.mount('/content/drive')

# Define paths
data_path = '/content/drive/My Drive/rawDataset.csv'
results_path = '/content/drive/My Drive/results/'
eda_vis_path = results_path + 'eda_visualizations/'
os.makedirs(eda_vis_path, exist_ok=True)

# Load dataset
columns = ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance',
           'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification",
           "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced',
           'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder',
           'Age at enrollment', 'International', 'Curricular units 1st sem (credited)',
           'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)',
           'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)',
           'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)',
           'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)',
           'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)',
           'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP', 'Target']
df = pd.read_csv(data_path, sep=';', names=columns, header=0)

print("Initial Dataset Shape:", df.shape)

# IT_MEMBER1(IT24101839) – Handle Class Imbalance with SMOTE
X = df.drop(columns=["Target"])
y = df["Target"]

# Original distribution
print("Original class distribution:", Counter(y))
plt.bar(Counter(y).keys(), Counter(y).values(), color=['red','green','blue'])
plt.title("Class distribution BEFORE balancing")
plt.xlabel("Class")
plt.ylabel("Count")
plt.savefig(eda_vis_path + 'class_dist_before.png')
plt.close()

# Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# New distribution
print("Balanced class distribution:", Counter(y_res))
plt.bar(Counter(y_res).keys(), Counter(y_res).values(), color=['red','green','blue'])
plt.title("Class distribution AFTER balancing")
plt.xlabel("Class")
plt.ylabel("Count")
plt.savefig(eda_vis_path + 'class_dist_after.png')
plt.close()

# Rebuild dataframe
df = pd.concat([pd.DataFrame(X_res, columns=X.columns),
                pd.Series(y_res, name="Target")], axis=1)

# Save balanced dataset
balanced_path = results_path + "balanced_dataset.csv"
df.to_csv(balanced_path, index=False)
print("Balanced dataset saved to", balanced_path)

# IT_MEMBER2(IT24104027) – Encoding Categorical Variables
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove("Target")

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Encode target with LabelEncoder
label_enc = LabelEncoder()
df["Target"] = label_enc.fit_transform(df["Target"])

print("After encoding shape:", df.shape)

# IT_MEMBER3(IT24101554) – Outlier Detection and Removal (Isolation Forest)
iso = IsolationForest(contamination=0.05, random_state=42)
outliers = iso.fit_predict(df.drop(columns=["Target"]))
df = df[outliers == 1]

print("After outlier removal:", df.shape)

# IT_MEMBER4(IT24101627) – Scaling Numerical Features
scaler = StandardScaler()
num_cols = df.drop(columns=["Target"]).columns
df[num_cols] = scaler.fit_transform(df[num_cols])

print("After scaling:", df.shape)

# IT_MEMBER5(IT24101966) – Feature Engineering
# Example features
df["Total_units_enrolled"] = df["Curricular units 1st sem (enrolled)"] + df["Curricular units 2nd sem (enrolled)"]
df["Total_units_approved"] = df["Curricular units 1st sem (approved)"] + df["Curricular units 2nd sem (approved)"]
df["Approval_rate"] = df["Total_units_approved"] / df["Total_units_enrolled"]

df["Total_units_grade"] = df["Curricular units 1st sem (grade)"] + df["Curricular units 2nd sem (grade)"]
df["Average_unit_grade"] = df["Total_units_grade"] / 2

print("After feature engineering:", df.shape)

# IT_MEMBER6(IT24101987) – Feature Selection + PCA
X = df.drop(columns=["Target"])
y = df["Target"]

# Select K best features
selector = SelectKBest(score_func=f_classif, k=20)
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

print("Selected top features:", selected_features)

# PCA for dimensionality reduction
pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_new)

print("Explained variance ratio (PCA):", pca.explained_variance_ratio_)

# Save final dataset
final_df = pd.DataFrame(X_pca, columns=[f"PCA_{i+1}" for i in range(10)])
final_df["Target"] = y.reset_index(drop=True)

final_path = results_path + "final_preprocessed_dataset.csv"
final_df.to_csv(final_path, index=False)

print("Final preprocessed dataset saved to", final_path)


Mounted at /content/drive
Initial Dataset Shape: (4424, 37)
Original class distribution: Counter({'Graduate': 2209, 'Dropout': 1421, 'Enrolled': 794})
Balanced class distribution: Counter({'Dropout': 2209, 'Graduate': 2209, 'Enrolled': 2209})
Balanced dataset saved to /content/drive/My Drive/results/balanced_dataset.csv
After encoding shape: (6627, 37)
After outlier removal: (6295, 37)
After scaling: (6295, 37)
After feature engineering: (6295, 42)
Selected top features: Index(['Application mode', 'Displaced', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)', 'Total_units_enroll