# In [None]:
import subprocess
import sys

def install(package):
    """Install ``package`` with pip, using the interpreter running this script.

    Runs ``<sys.executable> -m pip install <package>`` as a child process and
    waits for it to finish.

    Args:
        package: Requirement string passed to ``pip install`` unchanged.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Going through ``sys.executable -m pip`` targets the same environment
    # this script runs in, rather than whichever ``pip`` is first on PATH.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# pip distribution names of the third-party packages this script needs.
required_packages = [
    "pandas",
    "numpy",
    "scikit-learn",
    "xgboost",
    "category_encoders"
]

# Distributions whose importable module name differs from the pip name.
# Probing with the pip name ("scikit-learn") can never succeed as an import,
# which would trigger a pip install on every single run.
import_names = {
    "scikit-learn": "sklearn",
}

# Install only the packages that cannot be imported.
for package in required_packages:
    module_name = import_names.get(package, package)
    try:
        __import__(module_name)
    except ImportError:
        install(package)

#-------------------------------
# Imports
#-------------------------------
import pandas as pd
import numpy as np
import re
import pickle
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

#-------------------------------
# Load data
#-------------------------------
# Absolute location of the prepared dataset.
DATA_PATH = '/content/drive/MyDrive/data vision/fully_transformed_student_dataset.csv'

# Binary label: 1 for "Dropout", 0 for "Graduate" and "Enrolled".
STATUS_TO_TARGET = {"Dropout": 1, "Graduate": 0, "Enrolled": 0}

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Same exception type as before, but with a hint about the likely cause.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}; check that the drive holding it "
        "is mounted and that the path is correct."
    ) from err

df["target"] = df["Student Status"].map(STATUS_TO_TARGET)

# Series.map yields NaN for any value missing from the mapping. Fail here with
# a clear message instead of letting NaN labels reach the split and the model.
unmapped_statuses = df.loc[df["target"].isna(), "Student Status"].unique()
if unmapped_statuses.size:
    raise ValueError(
        "Unexpected values in 'Student Status' (expected one of "
        f"{list(STATUS_TO_TARGET)}): {list(unmapped_statuses)}"
    )

#-------------------------------
# Feature engineering
#-------------------------------
# Treat a unit count of 0 as missing so every ratio/difference built on it
# becomes NaN instead of a misleading 0.
#
# The result is assigned back explicitly. Calling
# ``df[col].replace(..., inplace=True)`` operates on an intermediate Series:
# pandas warns about this chained in-place pattern and, with Copy-on-Write
# enabled, the change never reaches ``df`` at all.
unit_count_cols = [
    "Enrolled Units (1st Sem)",
    "Enrolled Units (2nd Sem)",
    "Evaluated Units (1st Sem)",
    "Evaluated Units (2nd Sem)",
]
for unit_col in unit_count_cols:
    df[unit_col] = df[unit_col].replace(0, np.nan)

# NOTE: the order in which columns are created below is kept exactly as it
# was, because it determines the feature order seen by the model.

# Evaluated / enrolled units; the 1e-6 keeps the denominator away from zero.
df["load_ratio_sem1"] = df["Evaluated Units (1st Sem)"] / (df["Enrolled Units (1st Sem)"] + 1e-6)
df["load_ratio_sem2"] = df["Evaluated Units (2nd Sem)"] / (df["Enrolled Units (2nd Sem)"] + 1e-6)

# Evaluated minus enrolled units.
df["backlog_sem1"] = df["Evaluated Units (1st Sem)"] - df["Enrolled Units (1st Sem)"]
df["backlog_sem2"] = df["Evaluated Units (2nd Sem)"] - df["Enrolled Units (2nd Sem)"]

# Approved / evaluated units.
df["pass_ratio_sem1"] = df["Approved Units (1st Sem)"] / (df["Evaluated Units (1st Sem)"] + 1e-6)
df["pass_ratio_sem2"] = df["Approved Units (2nd Sem)"] / (df["Evaluated Units (2nd Sem)"] + 1e-6)

# Evaluated units that were not approved.
df["fail_count_sem1"] = df["Evaluated Units (1st Sem)"] - df["Approved Units (1st Sem)"]
df["fail_count_sem2"] = df["Evaluated Units (2nd Sem)"] - df["Approved Units (2nd Sem)"]

# Plain copies of the "not evaluated" counts under shorter names.
df["incomplete_sem1"] = df["Not Evaluated Units (1st Sem)"]
df["incomplete_sem2"] = df["Not Evaluated Units (2nd Sem)"]

# Average grade with 0 treated as missing, divided by 20
# (presumably grades are on a 0-20 scale; TODO confirm).
df["grade_scaled_sem1"] = df["Average Grade (1st Sem)"].replace(0, np.nan) / 20
df["grade_scaled_sem2"] = df["Average Grade (2nd Sem)"].replace(0, np.nan) / 20

# Approved units minus failed units.
df["momentum_sem1"] = df["Approved Units (1st Sem)"] - df["fail_count_sem1"]
df["momentum_sem2"] = df["Approved Units (2nd Sem)"] - df["fail_count_sem2"]

# Copies of existing indicator columns under ``*_flag`` names.
df["is_debtor_flag"] = df["Is Debtor"]
df["scholarship_flag"] = df["Scholarship Holder"]
df["tuition_up_to_date_flag"] = df["Tuition Fees Up-to-Date"]
df["special_needs_flag"] = df["Special Educational Needs"]

#-------------------------------
# Encode categorical variables
#-------------------------------
# Encode on a copy so the engineered frame ``df`` is left as it is.
df_encoded = df.copy()

# Categorical columns that are passed to the target encoder later on.
target_cols = [
    'Marital Status',
    'Application Mode',
    'Course Name',
    'Previous Qualification',
    "Mother's Qualification",
    "Father's Qualification",
    "Mother's Occupation",
    "Father's Occupation",
    'Nationality',
]

# Two-valued columns that are turned into integer 0/1 codes.
binary_cols = [
    'Daytime/Evening Attendance',
    'Displaced Student',
    'Special Educational Needs',
    'Is Debtor',
    'Tuition Fees Up-to-Date',
    'Scholarship Holder',
    'Gender (1=Male, 0=Female)',
    'International Student',
    'is_debtor_flag',
    'scholarship_flag',
    'tuition_up_to_date_flag',
    'special_needs_flag',
]

# Text labels recognised in binary columns and the code each one maps to.
# Any label not listed here becomes NaN when mapped.
binary_value_map = {'Yes':1, 'No':0, 'Evening':1, 'Daytime':0, 'Male':1, 'Female':0}

for column in binary_cols:
    if column not in df_encoded.columns:
        continue  # column absent from this dataset
    if df_encoded[column].dtype == 'object':
        # Text column: translate the labels to 0/1.
        df_encoded[column] = df_encoded[column].map(binary_value_map)
    else:
        # Already numeric: just make sure it is an integer column.
        df_encoded[column] = df_encoded[column].astype(int)

# Categories seen fewer than ``min_count`` times are merged into 'Other'.
min_count = 20
for column in target_cols:
    if column not in df_encoded.columns:
        continue
    frequencies = df_encoded[column].value_counts()
    is_rare = frequencies < min_count
    rare_categories = frequencies[is_rare].index
    df_encoded[column] = df_encoded[column].replace(rare_categories, 'Other')

def sanitize_columns(df_to_sanitize):
    """Remove every character other than letters, digits and ``_`` from column names.

    The frame is renamed in place and also returned.

    Args:
        df_to_sanitize: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object, now carrying the cleaned column names.
    """
    # NOTE(review): spaces are outside the allowed character set, so re.sub
    # already deletes them and the trailing replace(' ', '_') never matches.
    # It is left in place so the generated names stay exactly as before.
    new_columns = [re.sub(r'[^A-Za-z0-9_]+', '', col).replace(' ', '_') for col in df_to_sanitize.columns]
    df_to_sanitize.columns = new_columns
    return df_to_sanitize


y = df_encoded['target']
X = df_encoded.drop(['Student Status', 'Application Order', 'target'], axis=1)

#-------------------------------
# Train-test split
#-------------------------------
# Split BEFORE target encoding. Target encoding replaces each category with a
# statistic of the label, so fitting it on all rows would leak test-set labels
# into the test features and inflate the evaluation. The row assignment is
# unchanged: it depends only on ``y`` and ``random_state``.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Explicit copies so the column assignments below act on independent frames
# and pandas does not warn about writing to data derived from ``X``.
X_train = X_train.copy()
X_test = X_test.copy()

#-------------------------------
# Target encoding (fit on training rows only)
#-------------------------------
te = TargetEncoder(cols=target_cols)
X_train[target_cols] = te.fit_transform(X_train[target_cols], y_train)
X_test[target_cols] = te.transform(X_test[target_cols])

# Convert any column that is still text to numbers; values that cannot be
# parsed become NaN.
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
for features in (X_train, X_test):
    for col in object_cols:
        features[col] = pd.to_numeric(features[col], errors='coerce')

X_train = sanitize_columns(X_train)
X_test = sanitize_columns(X_test)

# Classifier settings gathered in one mapping so they can be read and logged
# as a unit. NOTE(review): the long decimals look like the output of an
# automated hyperparameter search; confirm before hand-editing them.
xgb_params = {
    'n_estimators': 520,
    'max_depth': 8,
    'learning_rate': 0.29823344535055346,
    'subsample': 0.9733285851155771,
    'colsample_bytree': 0.7301163829404953,
    'gamma': 4.330244627848201,
    'min_child_weight': 10,
    'scale_pos_weight': 9.874046887342207,
    'objective': 'binary:logistic',
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}

xgb_model = XGBClassifier(**xgb_params)

# Train on the training partition only.
xgb_model.fit(X_train, y_train)


# Objects written to disk, keyed by output file name: the trained model, the
# fitted target encoder, and the list of training feature names.
artifacts = {
    'xgb_model.pkl': xgb_model,
    'target_encoder.pkl': te,
    'feature_columns.pkl': X_train.columns.tolist(),
}

# Each artifact goes into its own pickle file in the working directory.
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print("Model and necessary files saved: xgb_model.pkl, target_encoder.pkl, feature_columns.pkl")

# Probability cut-off at or above which a student is flagged as a dropout.
dropout_threshold = 0.35

# Column 1 of predict_proba holds the probability of the positive class.
class_probabilities = xgb_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
y_pred = (y_pred_proba >= dropout_threshold).astype(int)

# Test features plus the score, the thresholded prediction and the true label.
df_results = X_test.assign(
    Dropout_Probability=y_pred_proba,
    Predicted_Dropout=y_pred,
    Actual_Dropout=y_test.values,
)

df_results.to_csv('student_dropout_predictions.csv', index=False)
print("Predictions saved to student_dropout_predictions.csv")


# Recorded output of the last run:
# FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/data vision/fully_transformed_student_dataset.csv'