In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder

# load data
df = pd.read_csv("sorted_attendance_data.csv")

# if there's a timestamp-like column, ensure proper time order
for time_col in ("timestamp", "date", "datetime"):
    if time_col in df.columns:
        df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
        # keep rows with valid dates first, then sort; if parsing failed, assume file already sorted
        if df[time_col].notna().any():
            df = df.sort_values(time_col).reset_index(drop=True)
        break

# target and features
if "note" not in df.columns:
    raise KeyError("Column 'note' not found in sorted_attendance_data.csv")
y = df["note"].copy()
X = df.drop(columns=["note"])

# simple preprocessing: convert categoricals to dummies, fill NA with median
X = pd.get_dummies(X, drop_first=True)
# numeric median fill (get_dummies produces numeric columns)
X = X.fillna(X.median())

# encode target if not numeric
if not np.issubdtype(y.dtype, np.number):
    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))

# time-series split: first 2/3 for train, last 1/3 for test
n = len(df)
n_train = int(np.floor(n * 2 / 3))
X_train = X.iloc[:n_train].to_numpy()
X_test = X.iloc[n_train:].to_numpy()
y_train = y[:n_train].to_numpy()
y_test = y[n_train:].to_numpy()

print("Train size:", len(y_train), "Test size:", len(y_test))
print("Train class distribution before SMOTE:", Counter(y_train))
print("Test class distribution:", Counter(y_test))


AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Fit GridSearchCV pipeline (includes SMOTE) using existing variables
# Requires: search, X_train, y_train, X_test, y_test already defined in the notebook
search.fit(X_train, y_train)

# best pipeline (with fitted SMOTE + scaler + classifier)
smote_model = search.best_estimator_
print("Best params:", search.best_params_)
print("CV best roc_auc:", search.best_score_)

# Evaluate on test set (use new variable names to avoid overwriting existing ones)

y_pred_smote = smote_model.predict(X_test)
y_proba_smote = smote_model.predict_proba(X_test)[:, 1] if hasattr(smote_model, "predict_proba") else None

if y_proba_smote is not None:
    print("Test ROC AUC:", roc_auc_score(y_test, y_proba_smote))
else:
    print("Model doesn't provide predict_proba; cannot compute ROC AUC.")

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_smote))
print("Classification report:\n", classification_report(y_test, y_pred_smote))