In [8]:
import json
import nbformat
import ipykernel
import requests

# Function to get the current notebook path in Colab
def get_notebook_name():
    """
    Returns the full path of the current notebook in Colab.
    """
    kernel_id = ipykernel.kernelapp.get_connection_file().split('/')[-1].split('-')[1]
    response = requests.get('http://172.28.0.2:9000/api/sessions')
    response.raise_for_status()
    sessions = response.json()
    for sess in sessions:
        if sess['kernel']['id'] == kernel_id:
            return sess['notebook']['path']
    raise FileNotFoundError("Could not find the current notebook path")

try:
    notebook_path = get_notebook_name()
    print(f"Current notebook path: {notebook_path}")

    # Load current notebook
    with open(notebook_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)

    # Remove widgets metadata
    if 'widgets' in nb.metadata:
        print("Removing 'widgets' metadata...")
        del nb.metadata['widgets']

    # Save cleaned version
    cleaned_notebook_path = notebook_path.replace('.ipynb', '_fixed.ipynb')
    with open(cleaned_notebook_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    print(f"Fixed notebook saved to: {cleaned_notebook_path}")

except FileNotFoundError as e:
    print(f"Error: {e}. Make sure you are running this in a Colab environment and the file exists.")
except requests.exceptions.RequestException as e:
    print(f"Error connecting to the Colab backend to get notebook name: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

An unexpected error occurred: module 'ipykernel.kernelapp' has no attribute 'get_connection_file'


In [None]:

import pandas as pd
df = pd.read_csv('Data Set.csv')
df.head(-5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4414,1,1,1,8,1,1,1,3,28,4,...,0,5,8,5,11.600000,0,9.4,-0.8,-3.12,Graduate
4415,4,12,1,12,1,12,1,22,27,7,...,0,7,12,3,11.083333,0,11.1,0.6,2.02,Dropout
4416,1,14,2,12,1,1,1,23,28,10,...,11,14,15,12,12.625000,1,7.6,2.6,0.32,Graduate
4417,1,1,1,5,1,1,1,1,1,10,...,0,6,6,6,13.500000,0,16.2,0.3,-0.92,Graduate


In [None]:
pip install nbformat




In [None]:
# Install CatBoost if needed
!pip install catboost

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Path
path = "/content/drive/MyDrive/SIHdataset.csv"

# Load dataset
df = pd.read_csv(path)

target = "Target"

# ---------------------------
#  Define features
# ---------------------------

# All features except Target
X_full = df.drop(columns=[target])
y = df[target]

# Drop post-outcome leakage features (grades, approvals, etc.)
leakage_cols = [
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
]
X_early = df.drop(columns=[target] + leakage_cols)

# ---------------------------
#  Helper function
# ---------------------------
def train_and_evaluate(X, y, model_name="Model"):
    # Identify categorical columns
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
    for col in categorical_cols:
        X[col] = X[col].astype(str).fillna("Unknown")

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Pools
    train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
    test_pool = Pool(X_test, y_test, cat_features=categorical_cols)

    # Model
    model = CatBoostClassifier(
        iterations=1200,
        depth=10,
        l2_leaf_reg=2, #changed to 2 from 5
        learning_rate=0.025,
        bootstrap_type="Bernoulli",
        subsample=0.8,
        min_data_in_leaf=20,
        loss_function="MultiClass",
        eval_metric="TotalF1",
        task_type = "CPU", #or GPU
        thread_count = -1, #only needed when using CPU
        random_seed=42,
        verbose=100
    )

    # Train
    model.fit(train_pool, eval_set=test_pool, use_best_model=True)

    # Predict
    y_pred = model.predict(X_test)

    # Results
    print(f"\n===== {model_name} Results =====")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=3))

    # Top features
    importances = model.get_feature_importance(prettified=True)
    print("\nTop Feature Importances:")
    print(importances.head(15))

    return model

# ---------------------------
#  Train both models
# ---------------------------

model_full = train_and_evaluate(X_full.copy(), y, model_name="Full Model (All Features)")
model_early = train_and_evaluate(X_early.copy(), y, model_name="Early-Warning Model (Leakage Removed)")

In [None]:
!pip install optuna catboost --quiet

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
from catboost import CatBoostClassifier, Pool

# -------------------
# Config
# -------------------
PATH = "/content/drive/MyDrive/SIHdataset.csv"
TARGET = "Target"
RANDOM_STATE = 42

LEAKAGE_COLS = [
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
]

# -------------------
# Helpers
# -------------------
def prep_X(df, drop_cols=None):
    if drop_cols is None: drop_cols = []
    X = df.drop(columns=[TARGET] + drop_cols)
    y = df[TARGET].astype(str)
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    return X, y, cat_cols

def make_objective(df, drop_cols=None):
    def objective(trial):
        # --- Search space (only what you requested) ---
        depth            = trial.suggest_int("depth", 8, 12)
        l2_leaf_reg      = trial.suggest_float("l2_leaf_reg", 1.0, 5.0)
        learning_rate    = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
        # bootstrap_type   = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "Poisson"])
        min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 10, 50)

        X, y, cat_cols = prep_X(df, drop_cols=drop_cols)
        cat_idx = X.columns.isin(cat_cols).nonzero()[0]

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        scores = []

        for tr_idx, va_idx in skf.split(X, y):
            X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

            train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
            valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

            model = CatBoostClassifier(
                iterations=1200,
                depth=depth,
                l2_leaf_reg=l2_leaf_reg,
                learning_rate=learning_rate,
                bootstrap_type="Bernoulli",
                subsample = 0.8,
                min_data_in_leaf=min_data_in_leaf,
                loss_function="MultiClass",
                eval_metric="TotalF1",
                random_seed=RANDOM_STATE,
                task_type="CPU",
                verbose=False,
                od_type="Iter",
                od_wait=50,
                thread_count=-1
            )

            model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False)
            y_pred = model.predict(X_va)
            scores.append(f1_score(y_va, y_pred, average="macro"))

        return float(np.mean(scores))
    return objective

def train_eval_final(df, best_params, label, drop_cols=None):
    print(f"\n=== Final training: {label} ===")
    X, y, cat_cols = prep_X(df, drop_cols=drop_cols)
    cat_idx = X.columns.isin(cat_cols).nonzero()[0]

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    test_pool  = Pool(X_te, y_te, cat_features=cat_idx)

    model = CatBoostClassifier(
        iterations=800,
        depth=best_params["depth"],
        l2_leaf_reg=best_params["l2_leaf_reg"],
        learning_rate=best_params["learning_rate"],
        bootstrap_type=best_params["bootstrap_type"],
        min_data_in_leaf=best_params["min_data_in_leaf"],
        loss_function="MultiClass",
        eval_metric="TotalF1",
        random_seed=RANDOM_STATE,
        task_type="CPU",
        verbose=200,
        od_type="Iter",
        od_wait=50,
        thread_count=-1
    )
    model.fit(train_pool, eval_set=test_pool, use_best_model=True)

    y_pred = model.predict(X_te)

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_te, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_te, y_pred, digits=3))

    acc = accuracy_score(y_te, y_pred)
    macro = f1_score(y_te, y_pred, average="macro")
    print(f"Accuracy: {acc:.3f} | Macro-F1: {macro:.3f}")

    return {"model": model, "acc": float(acc), "macro_f1": float(macro)}

# -------------------
# Run studies (Early-Warning and Full)
# -------------------
df = pd.read_csv(PATH)

print("\n=== Optuna: EARLY-WARNING dataset (leakage removed) ===")
study_early = optuna.create_study(direction="maximize")
study_early.optimize(make_objective(df, drop_cols=LEAKAGE_COLS), n_trials=30, show_progress_bar=True) #CHANGE n_trials=30 TILL 50 or 60
print("Best EARLY params:", study_early.best_params)
print("Best EARLY Macro-F1:", study_early.best_value)

print("\n=== Optuna: FULL dataset (leakage allowed) ===")
study_full = optuna.create_study(direction="maximize")
study_full.optimize(make_objective(df, drop_cols=None), n_trials=30, show_progress_bar=True) #CHANGE n_trials=30 TILL 50 or 60
print("Best FULL params:", study_full.best_params)
print("Best FULL Macro-F1:", study_full.best_value)

# -------------------
# Final evaluation on holdout splits
# -------------------
early_res = train_eval_final(df, study_early.best_params, label="EARLY-WARNING (leakage removed)", drop_cols=LEAKAGE_COLS)
full_res  = train_eval_final(df, study_full.best_params,  label="FULL (leakage allowed)",     drop_cols=None)

print("\n=== Summary ===")
print(pd.DataFrame([
    {"Dataset": "Early-Warning", "Accuracy": early_res["acc"], "Macro-F1": early_res["macro_f1"]},
    {"Dataset": "Full",          "Accuracy": full_res["acc"],  "Macro-F1": full_res["macro_f1"]},
]))
