In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, median_absolute_error, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd
import numpy as np

In [2]:
featured_df = pd.read_csv("attendance_with_features.csv")

In [3]:
df_model = featured_df.copy()
df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')

features = ['Count_Telat_7D', 'Count_Alpa_30D', 'Streak_Telat', 'Avg_Arrival_Time_7D']

df_model['has_checkin'] = df_model['checkin_time'].notna().astype(int)
df_model['Lag_1_Status_filled'] = df_model['Lag_1_Status'].fillna('None')
cat_cols = ['DayOfWeek', 'Lag_1_Status_filled']

X_num = df_model[features + ['has_checkin']].copy()
X_num['Avg_Arrival_Time_7D'] = X_num['Avg_Arrival_Time_7D'].fillna(X_num['Avg_Arrival_Time_7D'].median())

X_cat = pd.get_dummies(df_model[cat_cols].astype(str), prefix=cat_cols, drop_first=True)
X = pd.concat([X_num, X_cat], axis=1)
y = df_model['note'].astype(str) 

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train_enc)

y_pred_cont = dt_reg.predict(X_test)

In [8]:
mae = mean_absolute_error(y_test_enc, y_pred_cont)
mse = mean_squared_error(y_test_enc, y_pred_cont)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_enc, y_pred_cont)
expl_var = explained_variance_score(y_test_enc, y_pred_cont)
med_ae = median_absolute_error(y_test_enc, y_pred_cont)

print("Regression metrics on encoded target:")
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}, Explained Var: {expl_var:.4f}, MedianAE: {med_ae:.4f}")

# map continuous predictions to nearest class index and evaluate as classification
y_pred_round = np.clip(np.rint(y_pred_cont).astype(int), 0, len(le.classes_) - 1)
y_pred_labels = le.inverse_transform(y_pred_round)

acc = accuracy_score(y_test, y_pred_labels)
print("\nClassification report (rounded predictions):")
print(classification_report(y_test, y_pred_labels))
print("Confusion matrix (rounded predictions):")
print(confusion_matrix(y_test, y_pred_labels))
print(f"\nApproximate classification accuracy (by rounding regressor output): {acc:.4f}")

Regression metrics on encoded target:
MAE: 0.1832, MSE: 0.3449, RMSE: 0.5873, R2: 0.5699, Explained Var: 0.5700, MedianAE: 0.0000

Classification report (rounded predictions):
              precision    recall  f1-score   support

        alpa       0.99      0.99      0.99      9246
       hadir       0.89      0.87      0.88     11407
       libur       0.95      0.99      0.97      9402
       telat       0.28      0.26      0.26      1852

    accuracy                           0.90     31907
   macro avg       0.78      0.78      0.77     31907
weighted avg       0.90      0.90      0.90     31907

Confusion matrix (rounded predictions):
[[9142    0  104    0]
 [   0 9936  224 1247]
 [ 138    0 9264    0]
 [   0 1185  194  473]]

Approximate classification accuracy (by rounding regressor output): 0.9031


In [None]:

# joblib.dump({"model": dt_reg, "label_encoder": le}, "dt_regressor_with_le.joblib")