In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from datetime import datetime

In [2]:
df = pd.read_csv('attendance_with_features.csv')

In [3]:
# 2.1 Convert date → float timestamp
df["date_ts"] = pd.to_datetime(df["date"]).astype('int64') / 10**9   # POSIX timestamp

# 2.2 Convert check-in time → seconds
def time_to_seconds(t):
    if pd.isna(t):
        return None
    h, m, s = t.split(":")
    return int(h)*3600 + int(m)*60 + int(s)

df["checkin_seconds"] = df["checkin_time"].apply(time_to_seconds)

# Fill missing checkin time (libur/alpa) dengan median
df["checkin_seconds"].fillna(df["checkin_seconds"].median(), inplace=True)

# 2.3 Extract hour
df["checkin_hour"] = pd.to_datetime(df["checkin_time"], errors='coerce').dt.hour
df["checkin_hour"].fillna(df["checkin_hour"].median(), inplace=True)

# 2.4 Encode DayOfWeek → number
df["day_number"] = df["DayOfWeek"].astype("category").cat.codes

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["checkin_seconds"].fillna(df["checkin_seconds"].median(), inplace=True)
  df["checkin_hour"] = pd.to_datetime(df["checkin_time"], errors='coerce').dt.hour
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["checkin_hour"].fillna(df["checkin_hour"].median(), inplace=True)


In [4]:

features = [
    "date_ts",
    "checkin_seconds",
    "checkin_hour",
    "day_number",
    "Count_Telat_7D",
    "Count_Alpa_30D",
    "Streak_Telat",
    "Avg_Arrival_Time_7D"
]

X = df[features]

In [6]:

model = IsolationForest(
    n_estimators=300,
    contamination=0.05,   # asumsi 5% data adalah anomali
    random_state=42
)

model.fit(X)

In [7]:
df["anomaly_score"] = model.decision_function(X)
df["anomaly"] = model.predict(X)

# Convert: -1 = anomaly, 1 = normal
df["anomaly"] = df["anomaly"].map({1: "Normal", -1: "Anomaly"})

In [8]:

df.to_csv("attendance_with_anomaly.csv", index=False)
df.head()


Unnamed: 0,date,id,rfid_tag,checkin_time,checkout_time,note,DayOfWeek,Lag_1_Status,Count_Telat_7D,Count_Alpa_30D,Streak_Telat,Avg_Arrival_Time_7D,date_ts,checkin_seconds,checkin_hour,day_number,anomaly_score,anomaly
0,2025-09-10,94907,1418C9BC,14:39:32,14:39:40,telat,Wednesday,,0,0,0,,1757462000.0,52772.0,14.0,6,-0.116443,Anomaly
1,2025-09-11,96391,1418C9BC,,,alpa,Thursday,telat,1,0,1,879.533333,1757549000.0,20667.0,5.0,4,0.021964,Normal
2,2025-09-12,97876,1418C9BC,07:22:42,13:50:34,telat,Friday,alpa,1,1,0,879.533333,1757635000.0,26562.0,7.0,0,-0.048414,Anomaly
3,2025-09-13,99361,1418C9BC,,,libur,Saturday,telat,2,1,2,661.116667,1757722000.0,20667.0,5.0,2,-0.01193,Anomaly
4,2025-09-14,100846,1418C9BC,,,libur,Sunday,libur,2,1,0,661.116667,1757808000.0,20667.0,5.0,3,0.059144,Normal
