In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
import joblib
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

In [24]:
# Load the feature-engineered attendance table that every later cell builds on.
ATTENDANCE_CSV = "sorted_attendance_data.csv"
featured_df = pd.read_csv(ATTENDANCE_CSV)
# Uncomment to preview the first rows:
# featured_df.head()

In [25]:
# Build the feature matrix X and label vector y from the attendance table,
# make a stratified 80/20 train/test split, and only then impute missing
# arrival times so that no test-set statistic reaches the training features.
df_model = featured_df.copy()
# Unparseable dates become NaT instead of raising.
df_model['date'] = pd.to_datetime(df_model['date'], errors='coerce')

# Numeric features taken as-is from the input table.
features = ['Count_Telat_7D', 'Count_Alpa_30D', 'Streak_Telat', 'Avg_Arrival_Time_7D']

# 1 when a check-in time is recorded for the row, 0 otherwise.
# NOTE(review): if `note` labels the same day that `checkin_time` belongs to,
# this flag may reveal the target (present vs. absent) - confirm it is
# available at prediction time.
df_model['has_checkin'] = df_model['checkin_time'].notna().astype(int)
# A missing lag status gets its own explicit 'None' category.
df_model['Lag_1_Status_filled'] = df_model['Lag_1_Status'].fillna('None')
cat_cols = ['DayOfWeek', 'Lag_1_Status_filled']

X_num = df_model[features + ['has_checkin']].copy()

# One-hot encode the categoricals (as strings); drop_first removes one
# reference level per column.
X_cat = pd.get_dummies(df_model[cat_cols].astype(str), prefix=cat_cols, drop_first=True)
X = pd.concat([X_num, X_cat], axis=1)
# NOTE(review): astype(str) turns a missing label into the literal class 'nan'
# - confirm `note` has no missing values.
y = df_model['note'].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing arrival times with the median of the TRAINING rows only.
# Computing the median on the full table before splitting would let test
# rows influence the training features (data leakage).
arrival_col = 'Avg_Arrival_Time_7D'
arrival_median = X_train[arrival_col].median()
X_train = X_train.fillna({arrival_col: arrival_median})
X_test = X_test.fillna({arrival_col: arrival_median})
# Keep the unsplit frames free of NaN in this column, as they were before,
# in case a later cell reads them.
X_num = X_num.fillna({arrival_col: arrival_median})
X = X.fillna({arrival_col: arrival_median})

In [26]:
# Baseline: Gaussian Naive Bayes trained on the original (non-resampled) split.
# Boolean columns are cast to integers on fresh frames, so X_train / X_test
# themselves are left unmodified.
bool_cols = X_train.select_dtypes(include=['bool']).columns
bool_to_int = {col: int for col in bool_cols}
X_train_nb = X_train.astype(bool_to_int)
X_test_nb = X_test.astype(bool_to_int)

# fit() returns the estimator itself, so the call can be chained.
nb_clf = GaussianNB().fit(X_train_nb, y_train)

# Predicted labels for the held-out rows.
y_pred_nb = nb_clf.predict(X_test_nb)



In [27]:
# Evaluate the baseline Naive Bayes predictions on the held-out split.
# All scores are computed first, then reported.
weighted_avg = dict(average='weighted', zero_division=0)
acc = accuracy_score(y_test, y_pred_nb)
precision = precision_score(y_test, y_pred_nb, **weighted_avg)
recall = recall_score(y_test, y_pred_nb, **weighted_avg)
f1 = f1_score(y_test, y_pred_nb, **weighted_avg)

# Per-class breakdown; zero_division=0 reports 0 instead of warning when a
# class has no predicted samples.
print(classification_report(y_test, y_pred_nb, zero_division=0))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred_nb))
print(f"Accuracy: {acc:.6f}")

# Support-weighted averages across all classes.
print("\nAggregate metrics:")
print(f"Accuracy: {acc:.6f}")
print(f"Precision (weighted): {precision:.6f}")
print(f"Recall (weighted): {recall:.6f}")
print(f"F1-score (weighted): {f1:.6f}")

              precision    recall  f1-score   support

        alpa       1.00      0.79      0.88      9246
       hadir       0.88      0.91      0.90     11407
       libur       0.83      1.00      0.91      9402
       telat       0.32      0.27      0.29      1852

    accuracy                           0.86     31907
   macro avg       0.76      0.74      0.74     31907
weighted avg       0.87      0.86      0.86     31907

Confusion matrix:
[[ 7291     0  1955     0]
 [    0 10344     0  1063]
 [    0     0  9402     0]
 [    0  1348     0   504]]
Accuracy: 0.863165

Aggregate metrics:
Accuracy: 0.863165
Precision (weighted): 0.868683
Recall (weighted): 0.863165
F1-score (weighted): 0.859745

Aggregate metrics:
Accuracy: 0.863165
Precision (weighted): 0.868683
Recall (weighted): 0.863165
F1-score (weighted): 0.859745


In [28]:
# SMOTE variant: oversample the training split, then train GaussianNB on the
# resampled data. The test split is never resampled.
# Boolean columns are cast to integers on fresh frames first.
X_train_nb_sm = X_train.astype(
    {col: int for col in X_train.select_dtypes(include=['bool']).columns}
)
X_test_nb_sm = X_test.astype(
    {col: int for col in X_test.select_dtypes(include=['bool']).columns}
)

# Apply SMOTE to the training data only.
# NOTE(review): SMOTE interpolates between neighbouring samples, so synthetic
# rows can carry fractional values in the 0/1 dummy columns; SMOTENC may be a
# better fit for these features - confirm.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_nb_sm, y_train)
print(f"Original training samples: {X_train_nb_sm.shape[0]}, Resampled: {X_res.shape[0]}")

# Train Naive Bayes on the SMOTE output and predict the untouched test rows.
nb_clf_smote = GaussianNB().fit(X_res, y_res)
y_pred_nb_smote = nb_clf_smote.predict(X_test_nb_sm)

Original training samples: 127627, Resampled: 182496


In [29]:
# Evaluate the SMOTE-trained Naive Bayes on the same held-out split.
# This rebinds acc / precision / recall / f1 from the baseline cell.
weighted_avg = dict(average='weighted', zero_division=0)
acc = accuracy_score(y_test, y_pred_nb_smote)
precision = precision_score(y_test, y_pred_nb_smote, **weighted_avg)
recall = recall_score(y_test, y_pred_nb_smote, **weighted_avg)
f1 = f1_score(y_test, y_pred_nb_smote, **weighted_avg)

# Per-class breakdown followed by the raw confusion matrix.
print("SMOTE Naive Bayes - classification report:")
print(classification_report(y_test, y_pred_nb_smote, zero_division=0))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred_nb_smote))

# Support-weighted averages across all classes.
print("\nAggregate metrics (SMOTE):")
print(f"Accuracy: {acc:.6f}")
print(f"Precision (weighted): {precision:.6f}")
print(f"Recall (weighted): {recall:.6f}")
print(f"F1-score (weighted): {f1:.6f}")

SMOTE Naive Bayes - classification report:
              precision    recall  f1-score   support

        alpa       1.00      0.79      0.88      9246
       hadir       0.91      0.81      0.86     11407
       libur       0.83      1.00      0.91      9402
       telat       0.30      0.50      0.38      1852

    accuracy                           0.84     31907
   macro avg       0.76      0.78      0.75     31907
weighted avg       0.88      0.84      0.85     31907

Confusion matrix:
[[7291    0 1955    0]
 [   0 9228    0 2179]
 [   0    0 9402    0]
 [   0  920    0  932]]

Aggregate metrics (SMOTE):
Accuracy: 0.841602
Precision (weighted): 0.876210
Recall (weighted): 0.841602
F1-score (weighted): 0.850348
[[7291    0 1955    0]
 [   0 9228    0 2179]
 [   0    0 9402    0]
 [   0  920    0  932]]

Aggregate metrics (SMOTE):
Accuracy: 0.841602
Precision (weighted): 0.876210
Recall (weighted): 0.841602
F1-score (weighted): 0.850348


In [30]:
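# # Save the model (disabled). Uncommenting the lines below persists the baseline `nb_clf`, not the SMOTE-trained `nb_clf_smote`.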
# joblib.dump(nb_clf, "naive_bayes_attendance_model.joblib")
# print("Model tersimpan -> naive_bayes_attendance_model.joblib")