# Preprocessing from Raw Dataset

In [12]:
import pandas as pd
import matplotlib.pyplot as plt  # NOTE(review): not used in the cells visible here — confirm it is still needed

import sys
# Put the parent directory on the import path so `src.preprocessing` (imported
# on the next line) can be found. The entry is relative, so it is resolved
# against the kernel's working directory — presumably the notebook's own
# folder; verify when launching the kernel from somewhere else.
sys.path.append('../')
from src.preprocessing import apply_anomaly_detection, add_temporal_and_lag_features, remove_low_attendance_students

## Load Dataset

In [13]:
# Load the raw CSV as-is; no dtype or date parsing options are requested here.
raw_dataset_path = "../data/raw-dataset.csv"
df = pd.read_csv(raw_dataset_path)

In [14]:
# Peek at the first five records to check the column layout.
df.head(n=5)

Unnamed: 0,id,rfid_tag,checkin_time,checkout_time,note,date
0,231966,6A3599CB,,,,2025-12-11
1,231965,5AEC24CB,2025-12-10 21:54:41.57978+00,2025-12-11 04:12:48.000143+00,,2025-12-11
2,231964,DAFD7CC0,,,,2025-12-11
3,231963,4AECC6CB,,,,2025-12-11
4,231962,6A48BDCB,,,,2025-12-11


In [15]:
# Summarise column dtypes and non-null counts to see how much of each field is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183755 entries, 0 to 183754
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             183755 non-null  int64 
 1   rfid_tag       183755 non-null  object
 2   checkin_time   75502 non-null   object
 3   checkout_time  44666 non-null   object
 4   note           118278 non-null  object
 5   date           183755 non-null  object
dtypes: int64(1), object(5)
memory usage: 8.4+ MB


## Removing Outlier Students (rfid_tag with attendance below 10%)

In [16]:
# Students (rfid_tag) whose attendance percentage is under this cutoff are dropped.
min_attendance_pct = 10.0
df_filtered = remove_low_attendance_students(df, threshold_pct=min_attendance_pct)

--- Removing Low Attendance Students (< 10.0%) ---
Total siswa: 1491
Siswa dengan kehadiran < 10.0%: 62
Siswa yang dipertahankan: 1429

Contoh siswa outlier yang dihapus:
rfid_tag  total_records  total_hadir  pct_hadir
1B29409D            129            5       3.88
 1B4A79D            129           12       9.30
1BD5279D            129            5       3.88
 285D2D3             30            1       3.33
2B58469D            129            0       0.00
2B604A9D            129            0       0.00
 3AC8EC3             93            7       7.53
3B7C359D            129            9       6.98
3BD0379D            129           10       7.75
 3BF049D            129            5       3.88

Data sebelum: 183755 baris
Data setelah: 176730 baris
Baris dihapus: 7025


## Anomaly Detection Cleaning

In [None]:
# contamination='auto' does not pin the anomaly share in advance: in the
# recorded run below, 14059 of the 75240 rows that have a check-in (~18.7%)
# were flagged, which is far from a fixed 5%.
# NOTE(review): if a fixed 5% anomaly rate is what is intended, pass
# contamination=0.05 instead — confirm before changing, since the saved
# outputs downstream were produced with 'auto'.
# NOTE(review): presumably forwarded to scikit-learn's IsolationForest; verify
# in src.preprocessing that a fixed random_state is set so reruns reproduce.
df_clean = apply_anomaly_detection(df_filtered, contamination='auto')

--- Anomaly Detection (Isolation Forest) ---
Total data valid (checkin ada): 75240
Terdeteksi 14059 anomali teknis. Contoh:
                       checkin_time                    checkout_time  \
1  2025-12-10 21:54:41.579780+00:00 2025-12-11 04:12:48.000143+00:00   
8  2025-12-11 02:53:19.911990+00:00 2025-12-11 03:01:20.076312+00:00   
31 2025-12-11 05:18:24.802613+00:00                              NaT   
58 2025-12-10 23:05:57.227735+00:00 2025-12-11 04:35:10.822720+00:00   
70 2025-12-11 03:17:26.788000+00:00 2025-12-11 03:17:32.934234+00:00   

    duration_hours  
1         6.301783  
8         0.133379  
31        0.000000  
58        5.487110  
70        0.001707  
Data setelah pembersihan anomali: 162671


## Feature Engineering

### Without Anomaly Detection Cleaning

In [18]:
# Feature-engineer the attendance-filtered frame directly (df_filtered, not
# df_clean), i.e. without the anomaly-removal step.
df_without_anomaly = add_temporal_and_lag_features(df_filtered)

In [19]:
# Persist the variant that skipped anomaly cleaning; the row index is not written.
without_anomaly_csv = "../data/processed_data_without_anomaly.csv"
df_without_anomaly.to_csv(without_anomaly_csv, index=False)

### With Anomaly Detection Cleaning

In [20]:
# Feature-engineer df_clean, the frame returned by apply_anomaly_detection.
# Naming caveat: "with_anomaly" here means "with anomaly-detection cleaning
# applied", not "still containing anomalies".
df_with_anomaly = add_temporal_and_lag_features(df_clean)

In [21]:
# Save the anomaly-cleaned variant. Despite the file name, "with_anomaly" marks
# the run WITH anomaly-detection cleaning, not data that still holds anomalies.
with_anomaly_csv = "../data/processed_data_with_anomaly.csv"
df_with_anomaly.to_csv(with_anomaly_csv, index=False)

In [22]:
# Distribution of the `note` label; dropna=False keeps a NaN bucket so any
# unlabeled rows would show up in the counts.
note_counts = df_with_anomaly['note'].value_counts(dropna=False)
note_counts

note
hadir    61181
alpa     50892
Name: count, dtype: int64