In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
!pip install xgboost
from xgboost import XGBClassifier
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from collections import deque




In [13]:
from ucimlrepo import fetch_ucirepo
np.random.seed(0)
torch.manual_seed(0)
# fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# data (as pandas dataframes) 
X = predict_students_dropout_and_academic_success.data.features
y = predict_students_dropout_and_academic_success.data.targets
# metadata 
metadata = predict_students_dropout_and_academic_success.metadata

# variable information 
variable_info = predict_students_dropout_and_academic_success.variables

df = X
df['Target'] = y

print(f"{df.shape[0]} entries with {df.shape[1]} features")

categorical_vars = {
    'Marital Status',
    'Application mode',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    'Mother\'s qualification',
    'Father\'s qualification',
    'Mother\'s occupation',
    'Father\'s occupation',
    'Displaced', 'Educational special needs', 'Debtor',
    'Tuition fees up to date', 'Gender', 'Scholarship holder',
    'International',
    'Target'
}
quantitative_vars = {
   'Application order',
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
   'Curricular units 1st sem (enrolled)',
   'Curricular units 1st sem (credited)',
   'Curricular units 1st sem (evaluations)',
   'Curricular units 1st sem (approved)',
   'Curricular units 1st sem (grade)',
   'Curricular units 1st sem (without evaluations)',
   'Curricular units 2nd sem (credited)',
   'Curricular units 2nd sem (enrolled)',
   'Curricular units 2nd sem (evaluations)',
   'Curricular units 2nd sem (approved)',
   'Curricular units 2nd sem (grade)',
   'Curricular units 2nd sem (without evaluations)',
   'Unemployment rate',
   'Inflation rate',
    'GDP'
}
drop_features = set(["Father's qualification",
 'Gender',
 'Previous qualification (grade)',
 'International',
 'Curricular units 2nd sem (without evaluations)',
 'Marital Status',
 'GDP',
 'Displaced',
 'Debtor',
 'Inflation rate',
 "Mother's occupation",
 "Father's occupation",
 'Unemployment rate',
 'Previous qualification',
 'Curricular units 1st sem (without evaluations)',
 "Mother's qualification",
 'Curricular units 2nd sem (enrolled)',
 'Application order',
 'Age at enrollment',
 'Educational special needs','Nacionality'])

second_drop = set(['Curricular units 1st sem (grade)', 'Curricular units 1st sem (evaluations)'])

drop_features = drop_features.union(second_drop)

quantitative_vars -= drop_features
categorical_vars -= drop_features

4424 entries with 37 features


In [14]:
df.drop(drop_features, axis=1, inplace=True)
df.to_csv('filtered_data.csv', index=False)

df_encoded = df[df['Target'].isin(['Graduate', 'Dropout'])].copy()
df_encoded["Target"] = df_encoded["Target"].replace({'Graduate': 0, 'Dropout': 1})
# Applying one-hot encoding on categorical variables
df_encoded = pd.get_dummies(df_encoded, columns=list(categorical_vars - {'Target'}))

# normalize quantitative columns:
df_encoded[list(quantitative_vars)] = df_encoded[list(quantitative_vars)].apply(lambda x: (x-x.min())/(x.max()-x.min()))

valid_count = int(len(df_encoded) * 0.7)
test_count = int(len(df_encoded) * 0.8)
df_shuffled = df_encoded.sample(frac=1, random_state=42).reset_index(drop=True)
df_train = df_encoded[:valid_count].reset_index(drop=True)
df_valid = df_encoded[valid_count:test_count].reset_index(drop=True)
df_test = df_encoded[test_count:].reset_index(drop=True)

y_train = df_train["Target"]
x_train = df_train.drop(["Target"], axis=1)

y_test = df_test["Target"]
x_test = df_test.drop(["Target"], axis=1)

y_valid = df_valid["Target"]
x_valid = df_valid.drop(["Target"], axis=1)

print(f"train_ds: {df_train.shape[0]} samples")
print(f"test_ds: {df_test.shape[0]} samples")
print(f'[train_ds]: input shape: {x_train.shape}, output shape: {y_train.shape}')
print(f'[valid_ds]: input shape: {x_valid.shape}, valid shape: {y_valid.shape}')
print(f'[test_ds]: input shape: {x_test.shape}, output shape: {y_test.shape}')


train_ds: 2541 samples
test_ds: 726 samples
[train_ds]: input shape: (2541, 49), output shape: (2541,)
[valid_ds]: input shape: (363, 49), valid shape: (363,)
[test_ds]: input shape: (726, 49), output shape: (726,)


  df_encoded["Target"] = df_encoded["Target"].replace({'Graduate': 0, 'Dropout': 1})


In [15]:
scaler1 = StandardScaler()
scaler1.fit(x_train)
X_train_scaled = scaler1.transform(x_train.copy())
X_test_scaled = scaler1.transform(x_test.copy())

In [16]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_scaled, y_train)

xgb_model = XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)

In [17]:
from torch.utils.data import Dataset, DataLoader
class DropoutDS(Dataset):
    def __init__(self, x,y):
        self.input_df = x
        self.output_df = y

    def __len__(self):
        return self.input_df.shape[0]

    def __getitem__(self, idx):
        inp = self.input_df.iloc[idx].astype(float)
        out = np.expand_dims(self.output_df.iloc[idx].astype(float), axis=0)
        # out = self.output_df.iloc[idx].astype(float)
        inp_t = torch.tensor(inp.values, dtype=torch.float32)
        out_t = torch.tensor(out, dtype=torch.float32)
        return inp_t, out_t

batch_size = 10
train_dl = DataLoader(DropoutDS(x_train, y_train), batch_size=batch_size, shuffle=True)
test_dl = DataLoader(DropoutDS(x_test, y_test), batch_size=batch_size, shuffle=False)
valid_dl = DataLoader(DropoutDS(x_valid, y_valid), batch_size=batch_size, shuffle=True)
# testing dl size
x, y= next(iter(train_dl))
print(x.shape, y.shape)

import torch.nn as nn
dropout_prob = 0.25

model = nn.Sequential(
    # nn.Linear(246, 256),
    # nn.ReLU(),
    # nn.Dropout(p=dropout_prob),
    # nn.Linear(256, 128),
    # nn.ReLU(),
    # nn.Dropout(p=dropout_prob),
    # nn.Linear(49, 64),
    # nn.ReLU(),
    # nn.Dropout(p=dropout_prob),
    nn.Linear(49, 32),
    nn.ReLU(),
    nn.Dropout(p=dropout_prob),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
    nn.Sigmoid()
)

def init_model_weights(model):
    if isinstance(model, nn.Linear):
        nn.init.normal_(model.weight, mean=0, std=0.1)
        nn.init.constant_(model.bias, 0)

model.apply(init_model_weights)
print(f"{sum(p.numel() for p in model.parameters())} trainable params")


torch.Size([10, 49]) torch.Size([10, 1])
2273 trainable params


In [25]:
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
from sklearn.metrics import accuracy_score

def get_mis_cls(outputs, tgt):
    cls = torch.round(outputs).type(torch.int32)
    tgt = tgt.type(torch.int32)
    mis_cls = torch.sum(torch.abs(cls - tgt))
    return mis_cls.item()
    

def valid_epoch(dl):
    model.eval()
    running_loss = 0.
    total_mis_cls = 0
    all_outputs = None
    all_probs = None
    with torch.no_grad():
        for i, data in enumerate(dl):
            inputs, tgt = data

            outputs = model(inputs)
            save_outputs = outputs.round().numpy().squeeze()
            save_probs = outputs.numpy().squeeze()
            if all_outputs is None:
                all_outputs = save_outputs
            else:
                all_outputs = np.concatenate((all_outputs, save_outputs))
            if all_probs is None:
                all_probs = save_probs
            else:
                all_probs = np.concatenate((all_probs, save_probs))
            
            loss = loss_fn(outputs, tgt)
            running_loss += loss.item()
            total_mis_cls += get_mis_cls(torch.round(outputs), tgt)
    return running_loss/len(dl), 1 - float(total_mis_cls) / (len(dl) * batch_size), all_outputs, all_probs

def train_epoch():
    model.train(True)
    running_loss = 0.
    total_mis_cls = 0
    for i, data in enumerate(train_dl):
        inputs, tgt = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs, tgt)
        total_mis_cls += get_mis_cls(outputs, tgt)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss/len(train_dl), 1 - float(total_mis_cls) / x_train.shape[0]

EPOCHS = 200
loss = []
all_train_acc = []
all_valid_acc = []
all_valid_loss = []
all_train_loss = []
acc_queue = deque()
max_acc = -1e10
best_performance = 0,0
count_avg_perf = 5
for epoch in range(EPOCHS):
    avg_train_loss, train_acc = train_epoch()
    avg_valid_loss, valid_acc, _, _ = valid_epoch(valid_dl)
    all_train_loss.append(avg_train_loss)
    all_valid_loss.append(avg_valid_loss)
    all_train_acc.append(train_acc)
    all_valid_acc.append(valid_acc)
    if len(acc_queue) > count_avg_perf:
        acc_queue.popleft()
    acc_queue.append(valid_acc)
    curr_max_acc = sum(list(acc_queue)) / float(count_avg_perf)
    if curr_max_acc > max_acc:
        max_acc =  curr_max_acc
        best_performance = (train_acc, valid_acc, avg_train_loss, avg_valid_loss)
        max_acc_epoch = epoch
        torch.save(model, 'mlp-model.pt')
    print(f'epoch: {epoch} | train_loss: {avg_train_loss:.2f} valid_loss: {avg_valid_loss:.2f}'
          f'| train_acc: {train_acc:.4f} | valid_acc: {valid_acc:.4f}', end= '\r')
print(f'\nBest Model Performance:\n\ttrain_acc: {best_performance[0]:.4f}\n\tvalid_acc: {best_performance[1]:.4f}')
print(f'\nBest Model Performance:\n\ttrain_loss: {best_performance[2]:.4f}\n\tvalid_loss: {best_performance[3]:.4f}')


epoch: 199 | train_loss: 0.17 valid_loss: 0.30| train_acc: 0.9366 | valid_acc: 0.9162
Best Model Performance:
	train_acc: 0.9398
	valid_acc: 0.9324

Best Model Performance:
	train_loss: 0.1716
	valid_loss: 0.2940


In [26]:
model = torch.load('mlp-model.pt')
avg_test_loss, test_acc, all_out, all_prob = valid_epoch(test_dl)
combined_probs = np.stack((all_prob, 1 - all_prob))
avg_test_loss, test_acc

  model = torch.load('mlp-model.pt')


(0.30098932994248934, 0.9095890410958904)

In [27]:
model = torch.load('mlp-model.pt')
# now need to get all mlp_predictions
all_pred = None
all_probs = None
for i, data in enumerate(test_dl):
            inputs, y_out = data
            model.train(False)
            outputs = model(inputs).squeeze()
            out_cls = torch.round(outputs).type(torch.int32).numpy()
            out_prob = outputs.detach().numpy()
            if all_pred is None:
                all_pred = out_cls
            else:
                all_pred = np.concatenate((all_pred, out_cls), axis=0)
            if all_probs is None:
                all_probs = out_prob 
            else:
                all_probs = np.concatenate((all_probs, out_prob), axis=0)

print("MLP Accuracy:", accuracy_score(y_test, all_pred))
print("MLP Predictions:", all_pred[:10], '\n')


# y_pred_mlp = mlp_wrapper.predict(mlp_inp)
# print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
# print("MLP Predictions:", y_pred_mlp[:10], '\n')

MLP Accuracy: 0.9090909090909091
MLP Predictions: [0 0 1 1 0 1 0 0 0 1] 



  model = torch.load('mlp-model.pt')


In [30]:
scaler1 = StandardScaler()
scaler1.fit(x_train)
X_train_scaled = scaler1.transform(x_train.copy())
X_test_scaled = scaler1.transform(x_test.copy())
y_pred_rf = rf_model.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Predictions:", y_pred_rf[:10], '\n')

y_pred_xgb = xgb_model.predict(X_test_scaled)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Predictions:", y_pred_xgb[:10], '\n')

# now need to get all mlp_predictions
all_pred = None
for i, data in enumerate(test_dl):
            inputs, y_out = data
            model.train(False)
            outputs = model(inputs).squeeze()
            outputs = torch.round(outputs).type(torch.int32).numpy()
            if all_pred is None:
                all_pred = outputs
            else:
                all_pred = np.concatenate((all_pred, outputs), axis=0)

print("MLP Accuracy:", accuracy_score(y_test, all_pred))
print("MLP Predictions:", all_out[:10], '\n')


# y_pred_mlp = mlp_wrapper.predict(mlp_inp)
# print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
# print("MLP Predictions:", y_pred_mlp[:10], '\n')

Random Forest Accuracy: 0.9077134986225895
Random Forest Predictions: [0 0 1 1 0 1 1 0 0 1] 

XGBoost Accuracy: 0.8980716253443526
XGBoost Predictions: [0 0 1 1 0 1 1 0 0 1] 

MLP Accuracy: 0.9090909090909091
MLP Predictions: [0. 0. 1. 1. 0. 1. 0. 0. 0. 1.] 



In [31]:
ensemble_pred = np.array([y_pred_rf, y_pred_xgb, all_out.astype(int)])
final_pred = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=ensemble_pred)
print("Ensemble Accuracy (Hard Voting):", accuracy_score(y_test, final_pred))
print("\nClassification Report:\n", classification_report(y_test, final_pred))

Ensemble Accuracy (Hard Voting): 0.9104683195592287

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       423
           1       0.95      0.83      0.89       303

    accuracy                           0.91       726
   macro avg       0.92      0.90      0.91       726
weighted avg       0.91      0.91      0.91       726



In [32]:
# soft voting
rf_proba = rf_model.predict_proba(X_test_scaled)

xgb_proba = xgb_model.predict_proba(X_test_scaled)

mlp_proba = combined_probs.T

ensemble_proba = (rf_proba + xgb_proba + mlp_proba) / 3

final_pred = np.argmax(ensemble_proba, axis=1)


print("Ensemble Accuracy (Soft Voting):", accuracy_score(y_test, final_pred))
print("\nClassification Report:\n", classification_report(y_test, final_pred))

Ensemble Accuracy (Soft Voting): 0.8870523415977961

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.96      0.91       423
           1       0.93      0.79      0.85       303

    accuracy                           0.89       726
   macro avg       0.90      0.87      0.88       726
weighted avg       0.89      0.89      0.89       726

