In [149]:
import numpy as np
import pandas as pd
import kagglehub
import os
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score

In [150]:
# Download the student-retention dataset through kagglehub; `path` is the
# location returned by dataset_download().
dataset_handle = "thedevastator/higher-education-predictors-of-student-retention"
path = kagglehub.dataset_download(dataset_handle)

In [151]:
# Show which files the downloaded dataset location contains.
dataset_files = os.listdir(path)
print(dataset_files)

['dataset.csv']


In [152]:
# Build the full path to the dataset's CSV file.
csv_name = 'dataset.csv'
csv_path = os.path.join(path, csv_name)

In [153]:
# Load the raw CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [154]:
# `df` is already a DataFrame, so re-wrapping it in pd.DataFrame() adds nothing
# and does not guarantee an independent object. Take an explicit copy so that
# later edits to `data` can never leak back into the raw frame `df`.
data = df.copy()

In [155]:
# Preview the loaded frame (the notebook renders a truncated view of its rows and columns).
data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,15,1,1,1,1,1,6,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,15,1,1,19,1,1,10,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,12,1,1,1,22,27,10,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9,1,1,1,22,27,8,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [156]:
# Working copy used for cleaning. An explicit .copy() (rather than
# pd.DataFrame(data), which merely re-wraps the existing frame) guarantees that
# changes made to `data_1` never affect `data`.
data_1 = data.copy()

In [157]:
# Correct the misspelled 'Nacionality' column name. The result is reassigned
# instead of using inplace=True so no frame object is mutated behind another
# name that may still refer to it.
data_1 = data_1.rename(columns={'Nacionality': 'Nationality'})

In [158]:
# Keep only rows whose Target is not 'Enrolled'; copy so the filtered frame
# is independent of the one it was sliced from.
is_enrolled = data_1['Target'].eq('Enrolled')
data_1 = data_1.loc[~is_enrolled].copy()

In [159]:
# Separate the label column from the feature columns.
y = data_1['Target']
X = data_1.drop(columns='Target')

In [160]:
# Encode the target as binary: 'Dropout' -> 1, 'Graduate' -> 0.
# An explicit mapping replaces the former "anything that is not 'Dropout' is 0"
# rule, which silently turned every other value (an unfiltered 'Enrolled' row,
# a missing value, or already-encoded labels when the cell is re-run) into 0.
label_codes = {'Dropout': 1, 'Graduate': 0}
unexpected_labels = set(y.unique()) - set(label_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected Target labels: {sorted(unexpected_labels, key=str)}; "
        f"expected only {sorted(label_codes)}"
    )
y = y.map(label_codes).astype('int64')

In [161]:
# Cast every column with fewer than `cat_thresh` distinct values to the
# pandas 'category' dtype.
# NOTE(review): the test looks only at the number of distinct values, not at
# the dtype, so a genuinely numeric column with few distinct values is
# converted as well — confirm this is intended.
cat_thresh = 10
low_cardinality = [name for name in X.columns if X[name].nunique() < cat_thresh]
X = X.astype({name: 'category' for name in low_cardinality})

In [162]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded variable.
X = pd.get_dummies(data=X, drop_first=True)

In [163]:
# Hold out 20% of the rows for evaluation, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [164]:
# Gradient-boosted tree classifier for the binary dropout label.
# `use_label_encoder` is intentionally not passed: the option is obsolete in
# current XGBoost releases, where it is ignored and only triggers an
# "unused parameter" warning.
model = XGBClassifier(
    n_estimators=300,        # number of boosting rounds
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.8,           # fraction of rows sampled per tree
    colsample_bytree=0.8,    # fraction of columns sampled per tree
    eval_metric='logloss',
    random_state=42,         # fixed seed for repeatable training
)

# Fit on the training split only; the test split is reserved for evaluation.
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [165]:
# Predict on the held-out split and compute the headline metrics.
# Precision is computed for the positive class, label 1.
y_pred = model.predict(X_test)

precision = precision_score(y_test, y_pred, pos_label=1)
accuracy = accuracy_score(y_test, y_pred)

In [166]:
# Build the per-class report first, then print the summary metrics followed by it.
# target_names supplies the display names for the two classes in the report.
report = classification_report(y_test, y_pred, target_names=['Graduated', 'Dropout'])

print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print("\nClassification Report:\n", report)

Accuracy: 92.98%
Precision: 91.46%

Classification Report:
               precision    recall  f1-score   support

   Graduated       0.94      0.95      0.94       442
     Dropout       0.91      0.90      0.91       284

    accuracy                           0.93       726
   macro avg       0.93      0.93      0.93       726
weighted avg       0.93      0.93      0.93       726

