## Perubahan Struktur Dataset Pembelian

### Perubahan Struktur Dataset Pembelian

In [1]:
import pandas as pd
import re
from pathlib import Path

def parse_number(x):
    """Konversi string angka format Indonesia menjadi float"""
    if pd.isna(x) or str(x).strip() == '':
        return 0.0
    x = str(x).replace('.', '').replace(',', '.')
    try:
        return float(x)
    except:
        return 0.0

# Lokasi file
file_path = Path("pembelian.tsv")

# Baca file baris per baris
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

data = []
kode, nama, unit = '', '', ''

for line in lines:
    line = line.strip()
    if not line:
        continue

    # Deteksi baris header produk (contoh: A000001 ANATON TAB STRIP)
    header_match = re.match(r'^([A-Z0-9]+)\s+(.+?)\s+(STRIP|BTL|BOX|PCS)$', line)
    if header_match:
        kode = header_match.group(1).strip()
        nama = header_match.group(2).strip()
        unit = header_match.group(3).strip()
        continue

    # Deteksi baris transaksi (dimulai tanggal DD-MM-YY)
    date_match = re.match(r'^(\d{2}-\d{2}-\d{2})', line)
    if date_match:
        parts = re.split(r'\s{2,}', line)
        tanggal = parts[0]
        no_transaksi = parts[1] if len(parts) > 1 else ''

        qty_msk = nilai_msk = qty_klr = nilai_klr = 0.0

        # Format umum: tanggal | no_transaksi | qty | nilai
        if len(parts) == 4:
            # Cek kode transaksi (1. = masuk, 2. = keluar)
            if no_transaksi.strip().startswith('1.'):
                qty_msk = parse_number(parts[2])
                nilai_msk = parse_number(parts[3])
            elif no_transaksi.strip().startswith('2.'):
                qty_klr = parse_number(parts[2])
                nilai_klr = parse_number(parts[3])

        elif len(parts) == 5:
            # Bisa terjadi karena format tidak konsisten, kita cek pola juga
            if no_transaksi.strip().startswith('1.'):
                qty_msk = parse_number(parts[2])
                nilai_msk = parse_number(parts[3])
            elif no_transaksi.strip().startswith('2.'):
                qty_klr = parse_number(parts[2])
                nilai_klr = parse_number(parts[3])

        elif len(parts) >= 6:
            # Kalau keduanya ada (jarang terjadi)
            qty_msk = parse_number(parts[2])
            nilai_msk = parse_number(parts[3])
            qty_klr = parse_number(parts[4])
            nilai_klr = parse_number(parts[5])

        # Tentukan kategori dan total
        if qty_msk > 0:
            kategori = 'MASUK'
            qty_total = qty_msk
            nilai_total = nilai_msk
        elif qty_klr > 0:
            kategori = 'KELUAR'
            qty_total = qty_klr
            nilai_total = nilai_klr
        else:
            kategori = 'LAINNYA'
            qty_total = 0
            nilai_total = 0

        data.append({
            'KODE': kode,
            'NAMA_PRODUK': nama,
            'UNIT': unit,
            'TANGGAL': pd.to_datetime(tanggal, format='%d-%m-%y', errors='coerce'),
            'NO_TRANSAKSI': no_transaksi,
            'QTY_MSK': qty_msk,
            'NILAI_MSK': nilai_msk,
            'QTY_KLR': qty_klr,
            'NILAI_KLR': nilai_klr,
            'KATEGORI': kategori,
            'QTY_TOTAL': qty_total,
            'NILAI_TOTAL': nilai_total
        })

# Buat DataFrame
df = pd.DataFrame(data)

# Urutkan berdasarkan kode dan tanggal
df = df.sort_values(by=["KODE", "TANGGAL"]).reset_index(drop=True)

# Simpan hasil ke CSV
output_path = "pembelian_final_fix.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✅ File '{output_path}' berhasil dibuat!")
print(df.head(15))


✅ File 'pembelian_final_fix.csv' berhasil dibuat!
       KODE NAMA_PRODUK   UNIT    TANGGAL          NO_TRANSAKSI  QTY_MSK  \
0   A000001  ANATON TAB  STRIP 2021-07-06  1.13-210706.0908-003     10.0   
1   A000001  ANATON TAB  STRIP 2021-07-12   2.6-210712.1519-097      0.0   
2   A000001  ANATON TAB  STRIP 2021-07-12  2.11-210712.1633-013      0.0   
3   A000001  ANATON TAB  STRIP 2021-07-12  2.13-210712.1807-013      0.0   
4   A000001  ANATON TAB  STRIP 2021-07-12  2.11-210712.1855-018      0.0   
5   A000001  ANATON TAB  STRIP 2021-07-12  2.11-210712.1925-027      0.0   
6   A000001  ANATON TAB  STRIP 2021-07-12  2.11-210712.1957-035      0.0   
7   A000001  ANATON TAB  STRIP 2021-07-12   2.6-210712.0907-023      0.0   
8   A000001  ANATON TAB  STRIP 2021-07-13  2.11-210713.1102-011      0.0   
9   A000001  ANATON TAB  STRIP 2021-07-13   2.6-210713.1701-006      0.0   
10  A000001  ANATON TAB  STRIP 2021-08-23  1.12-210823.1955-003     10.0   
11  A000001  ANATON TAB  STRIP 2021-09

### Perubahan Struktur Dataset Stok

In [3]:
import pandas as pd
import re
from pathlib import Path

# === Fungsi bantu untuk parsing angka format Indonesia ===
def parse_number(x):
    """Konversi string angka format Indonesia menjadi float"""
    if pd.isna(x) or str(x).strip() == '':
        return 0.0
    x = str(x).replace('.', '').replace(',', '.')
    try:
        return float(x)
    except:
        return 0.0

# === Lokasi file input ===
file_path = Path("stok.tsv")

# Baca file baris per baris
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

data = []

for line in lines:
    line = line.strip()
    if not line:
        continue

    # Lewati baris header
    if line.startswith("KODE") or line.startswith("Kode"):
        continue

    # Pisahkan berdasarkan 2+ spasi
    parts = re.split(r'\s{2,}', line)
    # Contoh hasil split:
    # ['A000001', 'ANATON TAB', 'ETL1', '12,00', 'STRIP']

    if len(parts) >= 5:
        kode = parts[0].strip()
        nama_produk = parts[1].strip()
        lokasi = parts[2].strip()
        qty_stok = parse_number(parts[3])
        unit = parts[4].strip()

        data.append({
            'KODE': kode,
            'NAMA_PRODUK': nama_produk,
            'LOKASI': lokasi,
            'QTY_STOK': qty_stok,
            'UNIT': unit
        })

# Buat DataFrame
df = pd.DataFrame(data)

# Urutkan berdasarkan KODE
df = df.sort_values(by=["KODE"]).reset_index(drop=True)

# Simpan ke CSV
output_path = "stok_final_fix.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✅ File '{output_path}' berhasil dibuat!")
print(df.head(10))


✅ File 'stok_final_fix.csv' berhasil dibuat!
      KODE         NAMA_PRODUK LOKASI  QTY_STOK   UNIT
0  A000001          ANATON TAB   ETL1      12.0  STRIP
1   A00001       ACTIVED HIJAU  ETL3A       2.0    BTL
2  A000012  APIALYS SYR 100 ML  ETL3A       2.0    BTL
3  A000014     ALKOHOL 1000 ML  ETL3B       7.0    BTL
4  A000016     ALLOPURINOL 300   RAK2      40.0  STRIP
5  A000018   ATORVASTATIN 10MG   RAK2       6.0  STRIP
6   A00004     ACYCLOVIR 200MG   RAK2      13.0  STRIP
7  A000040         MEFIX 500MG   RAK1       9.0  STRIP
8   A00005     ACYCLOVIR 400MG   RAK2      21.0  STRIP
9  A000066       ANDALAN KB FE   RAK4      35.0  STRIP


## Proses Pengecekan Dataset Pembelian

#### Load Dataset

In [114]:
import pandas as pd
import numpy as np
import os

# Gunakan current working directory
cwd = os.getcwd()
file_path = os.path.join(cwd, "pembelian_final_fix.csv")  

# Baca file CSV
df = pd.read_csv(file_path)

# Tampilkan 5 baris pertama
df.head()

Unnamed: 0,KODE,NAMA_PRODUK,UNIT,TANGGAL,NO_TRANSAKSI,QTY_MSK,NILAI_MSK,QTY_KLR,NILAI_KLR,KATEGORI,QTY_TOTAL,NILAI_TOTAL
0,A000001,ANATON TAB,STRIP,2021-07-06,1.13-210706.0908-003,10.0,2520.0,0.0,0.0,MASUK,10.0,2520.0
1,A000001,ANATON TAB,STRIP,2021-07-12,2.6-210712.1519-097,0.0,0.0,1.0,3000.0,KELUAR,1.0,3000.0
2,A000001,ANATON TAB,STRIP,2021-07-12,2.11-210712.1633-013,0.0,0.0,1.0,3000.0,KELUAR,1.0,3000.0
3,A000001,ANATON TAB,STRIP,2021-07-12,2.13-210712.1807-013,0.0,0.0,1.0,3000.0,KELUAR,1.0,3000.0
4,A000001,ANATON TAB,STRIP,2021-07-12,2.11-210712.1855-018,0.0,0.0,1.0,3000.0,KELUAR,1.0,3000.0


#### Dataframes

In [115]:
print("📌 Informasi DataFrame:")
df.info()

print("\n📌 Tipe Data Kolom:")
print(df.dtypes)

print("\n📌 Cek Missing Value:")
print(df.isna().sum())

📌 Informasi DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138364 entries, 0 to 138363
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0       KODE      138364 non-null  object 
 1   NAMA_PRODUK   138364 non-null  object 
 2   UNIT          138364 non-null  object 
 3   TANGGAL       138364 non-null  object 
 4   NO_TRANSAKSI  138364 non-null  object 
 5   QTY_MSK       138364 non-null  float64
 6   NILAI_MSK     138364 non-null  float64
 7   QTY_KLR       138364 non-null  float64
 8   NILAI_KLR     138364 non-null  float64
 9   KATEGORI      138364 non-null  object 
 10  QTY_TOTAL     138364 non-null  float64
 11  NILAI_TOTAL   138364 non-null  float64
dtypes: float64(6), object(6)
memory usage: 12.7+ MB

📌 Tipe Data Kolom:
    KODE         object
NAMA_PRODUK      object
UNIT             object
TANGGAL          object
NO_TRANSAKSI     object
QTY_MSK         float64
NILAI_MSK       float64
QTY_KLR  

#### Konversi Tipe Data

In [116]:
# Konversi kolom tanggal
df['TANGGAL'] = pd.to_datetime(df['TANGGAL'], errors='coerce')

# Kolom numerik
num_cols = ['QTY_MSK', 'NILAI_MSK', 'QTY_KLR', 'NILAI_KLR', 'QTY_TOTAL', 'NILAI_TOTAL']
for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Kolom teks
str_cols = ['KODE', 'NAMA_PRODUK', 'UNIT', 'NO_TRANSAKSI', 'KATEGORI']
for col in str_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

df.dtypes

    KODE                object
NAMA_PRODUK             object
UNIT                    object
TANGGAL         datetime64[ns]
NO_TRANSAKSI            object
QTY_MSK                float64
NILAI_MSK              float64
QTY_KLR                float64
NILAI_KLR              float64
KATEGORI                object
QTY_TOTAL              float64
NILAI_TOTAL            float64
dtype: object

#### Cek duplikasi berdasarkan NO_TRANSAKSI

In [117]:
# Cek duplikasi berdasarkan NO_TRANSAKSI
duplikat_no_transaksi = df[df.duplicated(subset=['NO_TRANSAKSI'], keep=False)]

print(f"📌 Jumlah duplikasi No Transaksi   : {duplikat_no_transaksi.shape[0]}")

# Tampilkan sebagian contoh duplikasi
duplikat_no_transaksi.sort_values('NO_TRANSAKSI').head(10)

📌 Jumlah duplikasi No Transaksi   : 111749


Unnamed: 0,KODE,NAMA_PRODUK,UNIT,TANGGAL,NO_TRANSAKSI,QTY_MSK,NILAI_MSK,QTY_KLR,NILAI_KLR,KATEGORI,QTY_TOTAL,NILAI_TOTAL
136190,Y000001,YUSIMOX SYR,BTL,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
36016,E0000055,"ETADEX 0,5",STRIP,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
48284,H0000006,HEROCYN TALK 85G,BTL,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
132248,V0000025,VITALONG C (4),STRIP,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
138169,Z000006,ZAMBUK,PCS,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
75336,M0000060,MYLANTA SYR (B),BTL,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
74579,M0000055,MYLANTA SYR (K),BTL,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
74038,M0000050,MOLEXFLU,STRIP,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
53842,I00000028,IMBOOST SYR 120ML,BTL,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0
106620,P0000197,POLOFAR PLUS,STRIP,2021-04-14,-----1.12-210414.1355-001,0.0,0.0,0.0,0.0,LAINNYA,0.0,0.0


#### Menghapus Duplikasi

In [118]:
# Hapus duplikasi dengan mempertahankan baris pertama
df = df.drop_duplicates(subset=['NO_TRANSAKSI'], keep='first')

print(f"✅ Jumlah data setelah hapus duplikasi: {len(df)}")

✅ Jumlah data setelah hapus duplikasi: 59823


#### Deteksi Inkonsistensi Nama Produk per Kode

In [119]:
print(df.columns.tolist())

df.columns = df.columns.str.strip().str.upper()
print(df.columns.tolist())

# Group by KODE → hitung berapa nama unik per kode
kode_nama_group = df.groupby('KODE')['NAMA_PRODUK'].nunique()
kode_nama_tidak_konsisten = kode_nama_group[kode_nama_group > 1]

print(f"📌 Jumlah kode inkonsisten nama    : {len(kode_nama_tidak_konsisten)}")

# Tampilkan nama-nama produk untuk setiap kode yang bermasalah
for kode in kode_nama_tidak_konsisten.index:
    print(f"\nKode {kode} punya nama produk:")
    print(df[df['KODE'] == kode]['NAMA_PRODUK'].unique())

['    KODE', 'NAMA_PRODUK', 'UNIT', 'TANGGAL', 'NO_TRANSAKSI', 'QTY_MSK', 'NILAI_MSK', 'QTY_KLR', 'NILAI_KLR', 'KATEGORI', 'QTY_TOTAL', 'NILAI_TOTAL']
['KODE', 'NAMA_PRODUK', 'UNIT', 'TANGGAL', 'NO_TRANSAKSI', 'QTY_MSK', 'NILAI_MSK', 'QTY_KLR', 'NILAI_KLR', 'KATEGORI', 'QTY_TOTAL', 'NILAI_TOTAL']
📌 Jumlah kode inkonsisten nama    : 0


#### Cek No Transaksi yang Tanggalnya Beda

In [120]:
no_transaksi_group = df.groupby('NO_TRANSAKSI')['TANGGAL'].nunique()
no_transaksi_beda_tanggal = no_transaksi_group[no_transaksi_group > 1]

print(f"📌 Jumlah No Transaksi beda tanggal: {len(no_transaksi_beda_tanggal)}")

# Tampilkan jika ada
if len(no_transaksi_beda_tanggal) > 0:
    print(df[df['NO_TRANSAKSI'].isin(no_transaksi_beda_tanggal.index)].sort_values('NO_TRANSAKSI'))


📌 Jumlah No Transaksi beda tanggal: 0


In [121]:
rekap = df.groupby(['KODE', 'NAMA_PRODUK', 'UNIT']).agg({
    'QTY_MSK': 'sum',
    'NILAI_MSK': 'sum',
    'QTY_KLR': 'sum',
    'NILAI_KLR': 'sum',
    'QTY_TOTAL': 'sum',
    'NILAI_TOTAL': 'sum'
}).reset_index()

print("📊 Rekapitulasi Data:")
rekap.head(10)

📊 Rekapitulasi Data:


Unnamed: 0,KODE,NAMA_PRODUK,UNIT,QTY_MSK,NILAI_MSK,QTY_KLR,NILAI_KLR,QTY_TOTAL,NILAI_TOTAL
0,A000001,ANATON TAB,STRIP,30.0,8846.13,18.0,60000.0,48.0,68846.13
1,A00000156,zz,BTL,4.0,14355.43,4.0,52500.0,8.0,66855.43
2,A00000157,z,BTL,3.0,21425.19,6.0,150000.0,9.0,171425.19
3,A000002,ASEPTIC PLUS SPRAY,BTL,3.0,11000.0,3.0,30000.0,6.0,41000.0
4,A00001,ACTIVED HIJAU,BTL,20.0,527359.96,19.0,1120000.0,39.0,1647359.96
5,A000011,ALLERIN EXP,BTL,0.0,0.0,3.0,40500.0,3.0,40500.0
6,A000012,APIALYS SYR 100 ML,BTL,18.0,575784.53,16.0,649000.0,34.0,1224784.53
7,A000014,ALKOHOL 1000 ML,BTL,20.0,195375.16,25.0,966000.0,45.0,1161375.16
8,A000016,ALLOPURINOL 300,STRIP,230.0,64728.45,210.0,1032000.0,440.0,1096728.45
9,A000018,ATORVASTATIN 10MG,STRIP,9.0,61879.02,9.0,297000.0,18.0,358879.02


#### Standarisasi Format Teks dan Satuan Numerik

In [3]:
# Daftar kolom teks
text_cols = ['KODE', 'NAMA_PRODUK', 'UNIT', 'KATEGORI', 'NO_TRANSAKSI']
num_cols = ['QTY_MSK', 'NILAI_MSK', 'QTY_KLR', 'NILAI_KLR', 'QTY_TOTAL', 'NILAI_TOTAL']

# Pastikan semua numerik, hilangkan anomali karakter seperti koma, titik salah posisi
for col in num_cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(',', '.', regex=False)  
        .str.replace(r'[^\d\.]', '', regex=True)  
    )
    df[col] = pd.to_numeric(df[col], errors='coerce').round(2) 

# Ubah huruf besar semua, hilangkan spasi berlebih, dan karakter tidak penting
for col in text_cols:
    df[col] = (
        df[col]
        .astype(str)
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)  # hilangkan spasi ganda
        .str.upper()
    )
    
df.loc[:, 'TANGGAL'] = pd.to_datetime(df['TANGGAL'], errors='coerce', format='%Y-%m-%d')

df.loc[:, 'KODE'] = (
    df['KODE']
    .astype(str)
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)   # hanya huruf besar & angka
    .str.upper()
)

def valid_kode(k):
    return bool(re.match(r'^[A-Z0-9]{3,12}$', str(k).strip()))

df.loc[:, 'KODE_VALID'] = df['KODE'].apply(valid_kode)

KeyError: 'QTY_MSK'

#### Validasi Format Tanggal dan Kode Obat

In [123]:
# Konversi tanggal dan validasi
df['TANGGAL'] = pd.to_datetime(df['TANGGAL'], errors='coerce')
invalid_date = df[df['TANGGAL'].isna()]
invalid_kode = df[df['KODE_VALID'] == False]

print(f"⚠️ Tanggal tidak valid: {len(invalid_date)} baris")
print(f"⚠️ Kode obat tidak valid: {len(invalid_kode)} baris")

⚠️ Tanggal tidak valid: 0 baris
⚠️ Kode obat tidak valid: 0 baris


#### RULE-BASED METHOD

In [124]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [125]:
# Aturan logika:

# - Jika KATEGORI = MASUK maka QTY_MSK > 0 dan QTY_KLR = 0
# - Jika KATEGORI = KELUAR maka QTY_KLR > 0 dan QTY_MSK = 0
# - Nilai total tidak boleh negatif

df.loc[:, 'RULE_FLAG'] = np.where(
    ((df['KATEGORI'] == 'MASUK') & ((df['QTY_MSK'] <= 0) | (df['QTY_KLR'] != 0))) |
    ((df['KATEGORI'] == 'KELUAR') & ((df['QTY_KLR'] <= 0) | (df['QTY_MSK'] != 0))) |
    (df['NILAI_TOTAL'] < 0),
    True, False
)

rule_count = df['RULE_FLAG'].sum()
print(f"🧩 [Rule-Based Detection] Ditemukan {rule_count} baris anomali berdasarkan aturan logika.")
if rule_count > 0:
    print(df[df['RULE_FLAG']].head(5))
print("--------------------------------------------------\n")

🧩 [Rule-Based Detection] Ditemukan 0 baris anomali berdasarkan aturan logika.
--------------------------------------------------



#### CONSTRAINT-BASED DETECTION


In [126]:
df.loc[:, 'CONSTRAINT_FLAG'] = np.where(
    (df['QTY_TOTAL'] > 10000) | 
    (df['NILAI_TOTAL'] > 1_000_000) |
    (df['TANGGAL'].isna()),
    True, False
)

constraint_count = df['CONSTRAINT_FLAG'].sum()
print(f"📏 [Constraint-Based Detection] Ditemukan {constraint_count} baris melanggar batas wajar.")
if constraint_count > 0:
    print(df[df['CONSTRAINT_FLAG']].head(5))
print("--------------------------------------------------\n")

📏 [Constraint-Based Detection] Ditemukan 0 baris melanggar batas wajar.
--------------------------------------------------



#### STATISTICAL / PATTERN-BASED DETECTION

In [127]:
# PENANGANAN ANOMALI STATISTIK PER PRODUK (Final Tanpa Error)

# Pastikan kolom yang dibutuhkan tersedia
if 'NILAI_TOTAL' not in df.columns:
    df['NILAI_TOTAL'] = df.get('NILAI_TOTAL_FIX', df['NILAI_MSK'] - df['NILAI_KLR'])

if 'QTY_TOTAL' not in df.columns:
    df['QTY_TOTAL'] = df.get('QTY_TOTAL_FIX', df['QTY_MSK'] - df['QTY_KLR'])

# Fungsi deteksi dan perbaikan outlier per produk
def detect_outlier_group(df_group, col):
    mean = df_group[col].mean()
    median = df_group[col].median()
    std = df_group[col].std(ddof=0)

    if len(df_group) < 3 or std == 0 or np.isnan(std):
        df_group[f'STAT_FLAG_{col}'] = False
        df_group[f'{col}_FIX_STAT'] = df_group[col]
        return df_group

    zscore = (df_group[col] - mean) / std
    df_group[f'STAT_FLAG_{col}'] = abs(zscore) > 3
    df_group[f'{col}_FIX_STAT'] = np.where(abs(zscore) > 3, median, df_group[col])
    return df_group

# Terapkan ke tiap kolom numerik per produk (tanpa subset kolom)
for col in ['QTY_TOTAL', 'NILAI_TOTAL']:
    if col in df.columns:
        df = df.groupby('NAMA_PRODUK', group_keys=False).apply(lambda g: detect_outlier_group(g, col))

# LAPORAN HASIL DETEKSI DAN PERBAIKAN

stat_anom_qty = df['STAT_FLAG_QTY_TOTAL'].sum() if 'STAT_FLAG_QTY_TOTAL' in df else 0
stat_anom_nilai = df['STAT_FLAG_NILAI_TOTAL'].sum() if 'STAT_FLAG_NILAI_TOTAL' in df else 0
total_anom = stat_anom_qty + stat_anom_nilai

df['DIKOREKSI_STAT'] = df.get('STAT_FLAG_QTY_TOTAL', False) | df.get('STAT_FLAG_NILAI_TOTAL', False)

print(f"📊 [Statistical Detection] (Per Produk)")
print(f"   Anomali QTY_TOTAL: {stat_anom_qty}")
print(f"   Anomali NILAI_TOTAL: {stat_anom_nilai}")
print(f"   Total anomali terdeteksi: {total_anom}")
print(f"✅ Jumlah data dikoreksi otomatis (statistik): {df['DIKOREKSI_STAT'].sum()}\n")

# Tampilkan contoh hasil koreksi
print("🔍 Contoh hasil koreksi outlier statistik:")
cols_show = [c for c in [
    'KODE', 'NAMA_PRODUK',
    'QTY_TOTAL', 'QTY_TOTAL_FIX_STAT',
    'NILAI_TOTAL', 'NILAI_TOTAL_FIX_STAT',
    'STAT_FLAG_QTY_TOTAL', 'STAT_FLAG_NILAI_TOTAL'
] if c in df.columns]
print(df[df['DIKOREKSI_STAT']].head(10)[cols_show])
print("--------------------------------------------------\n")


  df = df.groupby('NAMA_PRODUK', group_keys=False).apply(lambda g: detect_outlier_group(g, col))


📊 [Statistical Detection] (Per Produk)
   Anomali QTY_TOTAL: 1240
   Anomali NILAI_TOTAL: 975
   Total anomali terdeteksi: 2215
✅ Jumlah data dikoreksi otomatis (statistik): 1827

🔍 Contoh hasil koreksi outlier statistik:
        KODE         NAMA_PRODUK  QTY_TOTAL  QTY_TOTAL_FIX_STAT  NILAI_TOTAL  \
49    A00001       ACTIVED HIJAU        3.0                 1.0     49666.67   
82   A000012  APIALYS SYR 100 ML        1.0                 1.0     49000.00   
84   A000012  APIALYS SYR 100 ML        2.0                 1.0     35398.37   
93   A000012  APIALYS SYR 100 ML        2.0                 1.0     35392.92   
125  A000014     ALKOHOL 1000 ML        6.0                 1.0     32500.00   
153  A000016     ALLOPURINOL 300       20.0                 1.0      4300.01   
155  A000016     ALLOPURINOL 300        0.0                 0.0         0.00   
181  A000016     ALLOPURINOL 300        0.0                 0.0         0.00   
196  A000016     ALLOPURINOL 300       20.0               

  df = df.groupby('NAMA_PRODUK', group_keys=False).apply(lambda g: detect_outlier_group(g, col))


#### CLUSTERING-BASED DETECTION

In [128]:
# CLUSTERING-BASED DETECTION (Per Produk) + PENANGANAN

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np

df = df.copy()
df.loc[:, 'CLUSTER_FLAG'] = False
df.loc[:, 'DIKOREKSI_CLUSTER'] = False
df.loc[:, 'QTY_TOTAL_FIX_CLUSTER'] = df['QTY_TOTAL']
df.loc[:, 'NILAI_TOTAL_FIX_CLUSTER'] = df['NILAI_TOTAL']

# Fungsi deteksi + koreksi per produk
def clustering_per_produk(group):
    if len(group) < 4:
        return group 

    X = group[['QTY_TOTAL', 'NILAI_TOTAL']].fillna(0)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # 2 cluster per produk
    kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
    group = group.copy()  
    group['CLUSTER'] = kmeans.fit_predict(X_scaled)

    centroids = kmeans.cluster_centers_
    distances = np.linalg.norm(X_scaled - centroids[group['CLUSTER']], axis=1)
    threshold = distances.mean() + 2 * distances.std()
    group['CLUSTER_FLAG'] = distances > threshold

    # Koreksi nilai ekstrem
    for i, row in group[group['CLUSTER_FLAG']].iterrows():
        cid = row['CLUSTER']
        centroid_unscaled = scaler.inverse_transform([centroids[cid]])[0]
        group.at[i, 'QTY_TOTAL_FIX_CLUSTER'] = centroid_unscaled[0]
        group.at[i, 'NILAI_TOTAL_FIX_CLUSTER'] = centroid_unscaled[1]
        group.at[i, 'DIKOREKSI_CLUSTER'] = True

    return group

# Terapkan per produk (kompatibel semua pandas)
df = df.groupby('NAMA_PRODUK', group_keys=False).apply(clustering_per_produk).reset_index(drop=True)

# HASIL AKHIR
outlier_count = df['CLUSTER_FLAG'].sum()
fixed_count = df['DIKOREKSI_CLUSTER'].sum()

print(f"🧮 [Clustering-Based Detection per Produk] Ditemukan {outlier_count} data jauh dari pusat cluster produk.")
print(f"🩺 {fixed_count} data telah dikoreksi mendekati centroid cluster produk masing-masing.\n")

print(df[df['DIKOREKSI_CLUSTER']].head(10)[[
    'KODE', 'NAMA_PRODUK', 'QTY_TOTAL', 'NILAI_TOTAL',
    'QTY_TOTAL_FIX_CLUSTER', 'NILAI_TOTAL_FIX_CLUSTER',
    'CLUSTER', 'DIKOREKSI_CLUSTER'
]])


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

🧮 [Clustering-Based Detection per Produk] Ditemukan 1949 data jauh dari pusat cluster produk.
🩺 1949 data telah dikoreksi mendekati centroid cluster produk masing-masing.

        KODE         NAMA_PRODUK  QTY_TOTAL  NILAI_TOTAL  \
48    A00001       ACTIVED HIJAU        3.0     49666.67   
54    A00001       ACTIVED HIJAU        2.0     58550.17   
80   A000012  APIALYS SYR 100 ML        1.0     49000.00   
100  A000014     ALKOHOL 1000 ML        2.0     29275.55   
110  A000014     ALKOHOL 1000 ML        3.0     42000.00   
123  A000014     ALKOHOL 1000 ML        6.0     32500.00   
151  A000016     ALLOPURINOL 300       20.0      4300.01   
153  A000016     ALLOPURINOL 300        0.0         0.00   
179  A000016     ALLOPURINOL 300        0.0         0.00   
194  A000016     ALLOPURINOL 300       20.0      4299.90   

     QTY_TOTAL_FIX_CLUSTER  NILAI_TOTAL_FIX_CLUSTER  CLUSTER  \
48                2.111111             52652.583333      0.0   
54                2.111111             

  df = df.groupby('NAMA_PRODUK', group_keys=False).apply(clustering_per_produk).reset_index(drop=True)


#### CROSS-DATASET CONSISTENCY

In [129]:
# CROSS-DATASET CONSISTENCY 

# Simpan nilai sebelum perbaikan
df['NILAI_TOTAL_BEFORE_FIX'] = df['NILAI_TOTAL']

# Fungsi logika nilai seharusnya berdasarkan kategori
def nilai_seharusnya(row):
    kategori = str(row['KATEGORI']).upper()
    if kategori == 'MASUK':
        return row['NILAI_MSK']
    elif kategori == 'KELUAR':
        return row['NILAI_KLR']
    else:
        return row['NILAI_MSK'] - row['NILAI_KLR']

# Hitung nilai seharusnya
df['NILAI_TOTAL_EXPECTED'] = df.apply(nilai_seharusnya, axis=1)

# Deteksi inkonsistensi awal
df['CONSIST_FLAG'] = abs(df['NILAI_TOTAL'] - df['NILAI_TOTAL_EXPECTED']) > 1e-6
before_fix = df['CONSIST_FLAG'].sum()

# Koreksi otomatis hanya untuk baris tidak konsisten
df.loc[df['CONSIST_FLAG'], 'NILAI_TOTAL'] = df.loc[df['CONSIST_FLAG'], 'NILAI_TOTAL_EXPECTED']
df['NILAI_TOTAL_FIX_CONSIST'] = df['NILAI_TOTAL']

# Recheck setelah perbaikan
df['CONSIST_FLAG_AFTER'] = abs(df['NILAI_TOTAL'] - df['NILAI_TOTAL_EXPECTED']) > 1e-6
after_fix = df['CONSIST_FLAG_AFTER'].sum()

# Laporan hasil perbaikan
print(f"🔗 [Cross-Dataset Consistency] Sebelum koreksi: {before_fix} baris tidak konsisten.")
print(f"🧮 Setelah koreksi: {after_fix} baris masih tidak konsisten.")
print(f"✅ Berhasil memperbaiki {before_fix - after_fix} baris data otomatis.\n")

# Contoh hasil koreksi
print(df[df['CONSIST_FLAG_AFTER'] == False].head(10)[[
    'KODE', 'NAMA_PRODUK', 'KATEGORI',
    'NILAI_MSK', 'NILAI_KLR',
    'NILAI_TOTAL_BEFORE_FIX', 'NILAI_TOTAL_FIX_CONSIST'
]])
print("--------------------------------------------------\n")

🔗 [Cross-Dataset Consistency] Sebelum koreksi: 0 baris tidak konsisten.
🧮 Setelah koreksi: 0 baris masih tidak konsisten.
✅ Berhasil memperbaiki 0 baris data otomatis.

      KODE NAMA_PRODUK KATEGORI  NILAI_MSK  NILAI_KLR  NILAI_TOTAL_BEFORE_FIX  \
0  A000001  ANATON TAB    MASUK     2520.0        0.0                  2520.0   
1  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
2  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
3  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
4  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
5  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
6  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
7  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
8  A000001  ANATON TAB   KELUAR        0.0     3000.0                  3000.0   
9  A000001  ANATON TA

In [132]:
# HASIL AKHIR FINAL BERSIH

cols_final = [
    'KODE', 'NAMA_PRODUK', 'UNIT', 'TANGGAL', 'NO_TRANSAKSI',
    'KATEGORI', 'QTY_MSK', 'NILAI_MSK', 'QTY_KLR', 'NILAI_KLR',
    'QTY_TOTAL_FIX_STAT', 'NILAI_TOTAL_FIX_CONSIST'
]

df_clean = df[cols_final].copy()

from pathlib import Path
import datetime

output_dir = Path("output")
output_dir.mkdir(exist_ok=True)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = output_dir / f"dataset_final_bersih_{timestamp}.csv"

df_clean.to_csv(output_path, index=False, encoding='utf-8-sig', float_format='%.2f')

print(f"✅ Versi final bersih berhasil diekspor ke:\n📂 {output_path}")
print(f"📊 Total baris: {len(df_clean)} | Total kolom: {len(df_clean.columns)}")


✅ Versi final bersih berhasil diekspor ke:
📂 output\dataset_final_bersih_20251020_145253.csv
📊 Total baris: 59823 | Total kolom: 12
