In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
import joblib
import pandas as pd
import numpy as np

In [19]:
# Load the pre-computed attendance feature table and preview the first rows.
# The CSV carries a saved index, which pandas surfaces as an "Unnamed: 0"
# column; it is not used as a model feature further down.
FEATURES_CSV = "attendance_with_features.csv"
featured_df = pd.read_csv(FEATURES_CSV)
featured_df.head()

Unnamed: 0,date,id,rfid_tag,checkin_time,checkout_time,note,DayOfWeek,Lag_1_Status,Count_Telat_7D,Count_Alpa_30D,Streak_Telat,Avg_Arrival_Time_7D
0,2025-09-10,94907,1418C9BC,14:39:32,14:39:40,telat,Wednesday,,0,0,0,
1,2025-09-11,96391,1418C9BC,,,alpa,Thursday,telat,1,0,1,879.533333
2,2025-09-12,97876,1418C9BC,07:22:42,13:50:34,telat,Friday,alpa,1,1,0,879.533333
3,2025-09-13,99361,1418C9BC,,,libur,Saturday,telat,2,1,2,661.116667
4,2025-09-14,100846,1418C9BC,,,libur,Sunday,libur,2,1,0,661.116667


In [16]:
# Build the feature matrix and target, split into train/test, and fit a
# random-forest classifier that predicts the attendance `note` label.
df_model = featured_df.copy()
df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')

# Numeric history features taken directly from the feature table.
features = ['Count_Telat_7D', 'Count_Alpa_30D', 'Streak_Telat', 'Avg_Arrival_Time_7D']

# NOTE(review): has_checkin is read from the same row's checkin_time as the
# `note` label, so it reveals whether a check-in happened on the day being
# classified. If the model is meant to forecast the status *before* the day
# occurs, this is target leakage and the feature should be dropped — TODO confirm
# the intended use before relying on the reported scores.
df_model['has_checkin'] = df_model['checkin_time'].notna().astype(int)

# Missing previous-day status becomes its own explicit category.
df_model['Lag_1_Status_filled'] = df_model['Lag_1_Status'].fillna('None')
cat_cols = ['DayOfWeek', 'Lag_1_Status_filled']

X_num = df_model[features + ['has_checkin']].copy()
X_cat = pd.get_dummies(df_model[cat_cols].astype(str), prefix=cat_cols, drop_first=True)
X = pd.concat([X_num, X_cat], axis=1)
y = df_model['note'].astype(str)

# NOTE(review): this is a random, stratified split over rows that carry
# lag/rolling features; a chronological split may be more appropriate if the
# goal is forecasting — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute Avg_Arrival_Time_7D AFTER the split, using the median of the training
# rows only, so no statistic computed from the test rows reaches the model.
# The same training median is applied to X so it stays NaN-free for later use.
X_train = X_train.copy()
X_test = X_test.copy()
arrival_median = X_train['Avg_Arrival_Time_7D'].median()
for frame in (X, X_train, X_test):
    frame['Avg_Arrival_Time_7D'] = frame['Avg_Arrival_Time_7D'].fillna(arrival_median)

rf_clf = RandomForestClassifier(
    n_estimators=100,          # default value, following common practice
    random_state=42,           # fixed seed for reproducible trees
    n_jobs=-1                  # use all available CPU cores
)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

In [17]:
# Per-class precision/recall/F1 followed by the raw confusion matrix.
print("Classification report:\n")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))

# Overall accuracy plus support-weighted precision/recall/F1; classes with no
# predicted samples score 0 instead of raising a warning.
weighted_opts = dict(average='weighted', zero_division=0)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_opts)
recall = recall_score(y_test, y_pred, **weighted_opts)
f1 = f1_score(y_test, y_pred, **weighted_opts)

summary_lines = [
    "\nAggregate metrics:",
    f"Accuracy: {acc:.6f}",
    f"Precision (weighted): {precision:.6f}",
    f"Recall (weighted): {recall:.6f}",
    f"F1-score (weighted): {f1:.6f}",
]
print("\n".join(summary_lines))


Classification report:

              precision    recall  f1-score   support

        alpa       0.99      0.99      0.99      9246
       hadir       0.89      0.91      0.90     11407
       libur       0.99      0.99      0.99      9402
       telat       0.36      0.31      0.33      1852

    accuracy                           0.92     31907
   macro avg       0.81      0.80      0.80     31907
weighted avg       0.92      0.92      0.92     31907

Confusion matrix:

[[ 9176     0    70     0]
 [    0 10359     0  1048]
 [  136     0  9266     0]
 [    0  1270     0   582]]

Aggregate metrics:
Accuracy: 0.920895
Precision (weighted): 0.917196
Recall (weighted): 0.920895
F1-score (weighted): 0.918929


In [None]:
# print("\nLabel -> integer mapping (used for regression metrics):")
# print(label_to_int)

# show the top feature importances
# importances = pd.Series(rf_clf.feature_importances_, index=X.columns).sort_values(ascending=False)
# print("\nTop 15 feature importances:")
# print(importances.head(15))

# save the model for later use
# joblib.dump(rf_clf, "random_forest_attendance_model.joblib")
# print("\nModel tersimpan -> random_forest_attendance_model.joblib")