In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [76]:
categorical_vars = {
    'Marital Status',
    'Application mode',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    'Mother\'s qualification',
    'Father\'s qualification',
    'Mother\'s occupation',
    'Father\'s occupation',
    'Displaced', 'Educational special needs', 'Debtor',
    'Tuition fees up to date', 'Gender', 'Scholarship holder',
    'International',
    'Target'
}
quantitative_vars = {
   'Application order',
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
   'Curricular units 1st sem (enrolled)',
   'Curricular units 1st sem (credited)',
   'Curricular units 1st sem (evaluations)',
   'Curricular units 1st sem (approved)',
   'Curricular units 1st sem (grade)',
   'Curricular units 1st sem (without evaluations)',
   'Curricular units 2nd sem (credited)',
   'Curricular units 2nd sem (enrolled)',
   'Curricular units 2nd sem (evaluations)',
   'Curricular units 2nd sem (approved)',
   'Curricular units 2nd sem (grade)',
   'Curricular units 2nd sem (without evaluations)',
   'Unemployment rate',
   'Inflation rate',
    'GDP'
}
drop_features = set(["Father's qualification",
 'Gender',
 'Previous qualification (grade)',
 'International',
 'Curricular units 2nd sem (without evaluations)',
 'Marital Status',
 'GDP',
 'Displaced',
 'Debtor',
 'Inflation rate',
 "Mother's occupation",
 "Father's occupation",
 'Unemployment rate',
 'Previous qualification',
 'Curricular units 1st sem (without evaluations)',
 "Mother's qualification",
 'Curricular units 2nd sem (enrolled)',
 'Application order',
 'Age at enrollment',
 'Educational special needs','Nacionality'])

second_drop = set(['Curricular units 1st sem (grade)', 'Curricular units 1st sem (evaluations)'])

drop_features = drop_features.union(second_drop)

quantitative_vars -= drop_features
categorical_vars -= drop_features

In [77]:
df = pd.read_csv("/content/filtered_data.csv")
df_encoded = df[df['Target'].isin(['Graduate', 'Dropout'])].copy()
df_encoded["Target"] = df_encoded["Target"].replace({'Graduate': 0, 'Dropout': 1})
# Applying one-hot encoding on categorical variables
df_encoded = pd.get_dummies(df_encoded, columns=list(categorical_vars - {'Target'}))

# normalize quantitative columns:
df_encoded[list(quantitative_vars)] = df_encoded[list(quantitative_vars)].apply(lambda x: (x-x.min())/(x.max()-x.min()))

valid_count = int(len(df_encoded) * 0.7)
test_count = int(len(df_encoded) * 0.8)
df_shuffled = df_encoded.sample(frac=1, random_state=42).reset_index(drop=True)
df_train = df_encoded[:valid_count].reset_index(drop=True)
df_valid = df_encoded[valid_count:test_count].reset_index(drop=True)
df_test = df_encoded[test_count:].reset_index(drop=True)

y_train = df_train["Target"]
x_train = df_train.drop(["Target"], axis=1)

y_test = df_test["Target"]
x_test = df_test.drop(["Target"], axis=1)

y_valid = df_valid["Target"]
x_valid = df_valid.drop(["Target"], axis=1)


  df_encoded["Target"] = df_encoded["Target"].replace({'Graduate': 0, 'Dropout': 1})


In [78]:
import torch.nn as nn
import torch

model = torch.load('/content/mlp-model (1).pt')
model.eval()

  model = torch.load('/content/mlp-model (1).pt')


Sequential(
  (0): Linear(in_features=49, out_features=64, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.25, inplace=False)
  (3): Linear(in_features=64, out_features=32, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.25, inplace=False)
  (6): Linear(in_features=32, out_features=16, bias=True)
  (7): ReLU()
  (8): Linear(in_features=16, out_features=8, bias=True)
  (9): ReLU()
  (10): Linear(in_features=8, out_features=1, bias=True)
  (11): Sigmoid()
)

In [79]:
from sklearn.base import BaseEstimator, ClassifierMixin
class MLPWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, scaler):
        self.model = model
        self.scaler = scaler

    def fit(self, X, y):
        pass  # Pre-trained model; no fitting required

    def predict_proba(self, X):
        # Normalize the input data
        if not hasattr(self.scaler, 'mean_'):
          raise ValueError("The scaler has not been fitted. Call `fit` on the scaler with training data first.")

        X_scaled = self.scaler.transform(X)
        X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
        self.model.eval()  # Ensure model is in evaluation mode
        with torch.no_grad():
            probs = self.model(X_tensor).numpy().flatten()
        return np.vstack((1 - probs, probs)).T  # Probabilities for classes 0 and 1

    def predict(self, X):
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

In [80]:
scaler1 = StandardScaler()
scaler1.fit(x_train)
X_train_scaled = scaler1.transform(x_train)
X_test_scaled = scaler1.transform(x_test)

mlp_wrapper = MLPWrapper(model, scaler1)

In [81]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_scaled, y_train)

xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)

In [82]:
y_pred_rf = rf_model.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Predictions:", y_pred_rf[:10], '\n')

y_pred_xgb = xgb_model.predict(X_test_scaled)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Predictions:", y_pred_xgb[:10], '\n')

y_pred_mlp = mlp_wrapper.predict(X_test_scaled)
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("MLP Predictions:", y_pred_mlp[:10], '\n')

Random Forest Accuracy: 0.9090909090909091
Random Forest Predictions: [0 0 1 1 0 1 1 0 0 1] 

XGBoost Accuracy: 0.8980716253443526
XGBoost Predictions: [0 0 1 1 0 1 1 0 0 1] 

MLP Accuracy: 0.8512396694214877
MLP Predictions: [0 0 1 1 0 1 1 0 0 1] 





In [83]:
ensemble_pred = np.array([y_pred_rf, y_pred_xgb, y_pred_mlp])
final_pred = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=ensemble_pred)
print("Ensemble Accuracy (Hard Voting):", accuracy_score(y_test, final_pred))
print("\nClassification Report:\n", classification_report(y_test, final_pred))

Ensemble Accuracy (Hard Voting): 0.9035812672176309

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92       423
           1       0.92      0.84      0.88       303

    accuracy                           0.90       726
   macro avg       0.91      0.89      0.90       726
weighted avg       0.90      0.90      0.90       726



In [84]:
# soft voting
rf_proba = rf_model.predict_proba(X_test_scaled)

xgb_proba = xgb_model.predict_proba(X_test_scaled)

mlp_proba = mlp_wrapper.predict_proba(X_test_scaled)

ensemble_proba = (rf_proba + xgb_proba + mlp_proba) / 3

final_pred = np.argmax(ensemble_proba, axis=1)


print("Ensemble Accuracy (Soft Voting):", accuracy_score(y_test, final_pred))
print("\nClassification Report:\n", classification_report(y_test, final_pred))

Ensemble Accuracy (Soft Voting): 0.9008264462809917

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.92       423
           1       0.90      0.85      0.88       303

    accuracy                           0.90       726
   macro avg       0.90      0.89      0.90       726
weighted avg       0.90      0.90      0.90       726



