In [17]:
import pandas as pd
import os

DATA_PATH = os.path.abspath('../data')

# Load CSV file


def load_csv(filename):
    path = os.path.join(DATA_PATH, filename)
    return pd.read_csv(path)


# Clean up original csv


def clean_csv(df):
    print(f"original rows: {len(df)}")

    df = df.dropna()
    print(f"rows after deleting null: {len(df)}")

    df = df.drop_duplicates()
    print(f"rows after deleting duplicates: {len(df)}")

    return df


df_student_vle = load_csv("studentVle.csv")
df_student_reg = load_csv("studentRegistration.csv")
df_student_info = load_csv("studentInfo.csv")
# Fill 0 or unknown for fields that can cause false deletion
df_student_vle['sum_click'] = df_student_vle['sum_click'].fillna(0)
df_student_reg['date_unregistration'] = df_student_reg['date_unregistration'].fillna(
    0)

cat_cols = ['region', 'gender', 'disability',
            'highest_education', 'imd_band', 'age_band']
df_student_info[cat_cols] = df_student_info[cat_cols].fillna('Not_Provided')

num_cols = ['num_of_prev_attempts', 'studied_credits']
df_student_info[num_cols] = df_student_info[num_cols].fillna(0)

clean_csv(df_student_vle)
clean_csv(df_student_reg)
clean_csv(df_student_info)
df_student_vle = df_student_vle[[
    'id_student', 'code_module', 'code_presentation', 'sum_click']]
df_student_reg = df_student_reg[[
    'id_student', 'code_module', 'code_presentation', 'date_registration']]
clicks_per_student = df_student_vle.groupby(
    ['id_student', 'code_module', 'code_presentation'], as_index=False)['sum_click'].sum()
print(f' click per student:\n{clicks_per_student.head()}')
print(f'student vle: \n{df_student_vle.head()}')
print(f'student reg: \n{df_student_reg.head()} ')
print(f'student info: \n{df_student_info.head()} ')

# Merge csv files
df = pd.merge(df_student_info, clicks_per_student, on=[
              'id_student', 'code_module', 'code_presentation'], how='left')
df = pd.merge(df, df_student_reg, on=[
              'id_student', 'code_module', 'code_presentation'], how='left')

# Fill in sum_click if sum_click is NAH
df['sum_click'] = df['sum_click'].fillna(0)

df['target'] = df['final_result'].map({
    'Pass': 1,
    'Distinction': 1,
    'Fail': 0,
    'Withdrawn': 0
})

print(f'merged df: \n{df.head()} ')

original rows: 10655280
rows after deleting null: 10655280
rows after deleting duplicates: 9868110
original rows: 32593
rows after deleting null: 32548
rows after deleting duplicates: 32548
original rows: 32593
rows after deleting null: 32593
rows after deleting duplicates: 32593
 click per student:
   id_student code_module code_presentation  sum_click
0        6516         AAA             2014J       2791
1        8462         DDD             2013J        646
2        8462         DDD             2014J         10
3       11391         AAA             2013J        934
4       23629         BBB             2013B        161
student vle: 
   id_student code_module code_presentation  sum_click
0       28400         AAA             2013J          4
1       28400         AAA             2013J          1
2       28400         AAA             2013J          1
3       28400         AAA             2013J         11
4       28400         AAA             2013J          1
student reg: 
   id_stude

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

# -----------------------------------------
# Function for numeric feature vs target (e.g., sum_click)
# Shows a boxplot to compare distribution by class
# -----------------------------------------
def plot_numeric_vs_target(df, numeric_col, target_col='target'):
    plt.figure(figsize=(8, 5))
    sns.boxplot(x=target_col, y=numeric_col, data=df)
    plt.title(f"{numeric_col} vs {target_col}")
    plt.xlabel("Target (0 = Fail/Withdrawn, 1 = Pass/Distinction)")
    plt.ylabel(numeric_col)
    plt.tight_layout()
    plt.show()

# -----------------------------------------
# Function for categorical feature vs target
# Shows a countplot (grouped bar chart)
# -----------------------------------------
def plot_categorical_vs_target(df, cat_col, target_col='target'):
    plt.figure(figsize=(8, 5))
    sns.countplot(x=cat_col, hue=target_col, data=df,
                  order=sorted(df[cat_col].dropna().unique()))
    plt.title(f"{cat_col} vs {target_col}")
    plt.xlabel(cat_col)
    plt.ylabel("Count")
    plt.legend(title=target_col)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    if col != 'target' and col != 'id_student':  
        plot_numeric_vs_target(df, col)

categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    if col != 'final_result': 
        plot_categorical_vs_target(df, col)

numeric_df = df.select_dtypes(include=['float64', 'int64'])
numeric_df = numeric_df.drop(columns=['target'])
corr_matrix = numeric_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import joblib

# Separate target from df
X = df.drop(columns=['target'])
y = df['target']

X = pd.get_dummies(X)  # one-hot encoding for catogorical columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

models = {
    'decision_tree': DecisionTreeClassifier(random_state=42),
    'svm': SVC(probability=True, random_state=42)
}


kf = KFold(n_splits=5, shuffle=True, random_state=42)


best_model = None
best_score = 0
best_model_name = ""

for name, model in models.items():
    print(f"\nTraining {name.upper()}...")

    fold_metrics = []

    for train_index, val_index in kf.split(X_train):
        X_ktrain, X_kval = X_train[train_index], X_train[val_index]
        y_ktrain, y_kval = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(X_ktrain, y_ktrain)
        y_pred = model.predict(X_kval)
        y_prob = model.predict_proba(X_kval)[:, 1] if hasattr(model, "predict_proba") else None

        acc = accuracy_score(y_kval, y_pred)
        prec = precision_score(y_kval, y_pred, zero_division=0)
        rec = recall_score(y_kval, y_pred)
        f1 = f1_score(y_kval, y_pred)
        auc = roc_auc_score(y_kval, y_prob) if y_prob is not None else 0

        fold_metrics.append((acc, prec, rec, f1, auc))

    
    avg_scores = np.mean(fold_metrics, axis=0)
    print(f"Avg Accuracy: {avg_scores[0]:.4f}, Precision: {avg_scores[1]:.4f}, Recall: {avg_scores[2]:.4f}, F1: {avg_scores[3]:.4f}, AUC: {avg_scores[4]:.4f}")

    # save the best model based on AUC-ROC
    if avg_scores[4] > best_score:  
        best_score = avg_scores[4]
        best_model = model
        best_model_name = name

# evaluate the best model with test splits
print(f"\nBest model: {best_model_name.upper()}")
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None

print("Test Set Evaluation:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
print(f"F1-score:  {f1_score(y_test, y_pred):.4f}")
if y_prob is not None:
    print(f"AUC-ROC:   {roc_auc_score(y_test, y_prob):.4f}")

# save the best model
joblib.dump(best_model, f"best_model_{best_model_name}.pkl")
print(f"Model saved to best_model_{best_model_name}.pkl")