In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd
import numpy as np

In [8]:
featured_df = pd.read_csv("attendance_with_features.csv")

In [9]:
df_model = featured_df.copy()
df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')

features = ['Count_Telat_7D', 'Count_Alpa_30D', 'Streak_Telat', 'Avg_Arrival_Time_7D']

df_model['has_checkin'] = df_model['checkin_time'].notna().astype(int)
df_model['Lag_1_Status_filled'] = df_model['Lag_1_Status'].fillna('None')
cat_cols = ['DayOfWeek', 'Lag_1_Status_filled']

X_num = df_model[features + ['has_checkin']].copy()
X_num['Avg_Arrival_Time_7D'] = X_num['Avg_Arrival_Time_7D'].fillna(X_num['Avg_Arrival_Time_7D'].median())

X_cat = pd.get_dummies(df_model[cat_cols].astype(str), prefix=cat_cols, drop_first=True)
X = pd.concat([X_num, X_cat], axis=1)
y = df_model['note'].astype(str) 

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train_enc)

y_pred_enc = dt_clf.predict(X_test)

In [None]:
acc = accuracy_score(y_test_enc, y_pred_enc)
prec_macro = precision_score(y_test_enc, y_pred_enc, average='macro', zero_division=0)
rec_macro = recall_score(y_test_enc, y_pred_enc, average='macro', zero_division=0)
f1_macro = f1_score(y_test_enc, y_pred_enc, average='macro', zero_division=0)
prec_weight = precision_score(y_test_enc, y_pred_enc, average='weighted', zero_division=0)
rec_weight = recall_score(y_test_enc, y_pred_enc, average='weighted', zero_division=0)
f1_weight = f1_score(y_test_enc, y_pred_enc, average='weighted', zero_division=0)

print("Classification metrics (encoded labels):")
print(f"Accuracy: {acc:.4f}")
print(f"Precision (macro / weighted): {prec_macro:.4f} / {prec_weight:.4f}")
print(f"Recall    (macro / weighted): {rec_macro:.4f} / {rec_weight:.4f}")
print(f"F1-score  (macro / weighted): {f1_macro:.4f} / {f1_weight:.4f}")

# human-readable report using original label strings
y_pred_labels = le.inverse_transform(y_pred_enc)
print("\nClassification report (string labels):")
print(classification_report(y_test, y_pred_labels, zero_division=0))

print("Confusion matrix (string labels):")
print(confusion_matrix(y_test, y_pred_labels))

Classification metrics (encoded labels):
Accuracy: 0.9116
Precision (macro / weighted): 0.7946 / 0.9134
Recall    (macro / weighted): 0.7968 / 0.9116
F1-score  (macro / weighted): 0.7956 / 0.9125

Classification report (string labels):
              precision    recall  f1-score   support

        alpa       0.99      0.99      0.99      9246
       hadir       0.89      0.88      0.89     11407
       libur       0.99      0.99      0.99      9402
       telat       0.31      0.33      0.32      1852

    accuracy                           0.91     31907
   macro avg       0.79      0.80      0.80     31907
weighted avg       0.91      0.91      0.91     31907

Confusion matrix (string labels):
[[ 9142     0   104     0]
 [    0 10069     0  1338]
 [  138     0  9264     0]
 [    0  1240     0   612]]


In [None]:

# joblib.dump({"model": dt_reg, "label_encoder": le}, "dt_regressor_with_le.joblib")