# Deteksi Double Counting dengan Isolation Forest

Notebook ini menerapkan pendekatan dua tahap:
1. Mengidentifikasi data klaim yang diduga double counting menggunakan Isolation Forest.
2. Menghitung estimasi klaim ultimate (dasar perhitungan cadangan klaim) dengan metode Chain Ladder dan Bornhuetter‑Ferguson pada data mentah dan data yang telah dibersihkan dari anomali.

Dataset: `claims_company_style_with_reported_year.csv`


In [None]:
# Install the notebook's third-party dependencies (output suppressed by --quiet).
# NOTE(review): chainladder is installed here, but the cells shown implement
# Chain Ladder / Bornhuetter-Ferguson by hand and never import it.
!pip install chainladder pandas scikit-learn matplotlib seaborn --quiet

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the claims dataset from a CSV in the working directory, print its
# (rows, columns) shape, and display the first rows as the cell output.
df = pd.read_csv('claims_company_style_with_reported_year.csv')
print('Shape:', df.shape)
df.head()

In [None]:
# Features used for anomaly detection: the four paid/incurred amount columns.
a_features = df[['incremental_paid','incremental_incurred','cumulative_paid','cumulative_incurred']]
# contamination=0.02 asks the model to flag about 2% of the rows as outliers;
# random_state is fixed so the result is reproducible across runs.
iso = IsolationForest(contamination=0.02, random_state=42)
# fit_predict labels each row: -1 for an outlier, 1 for an inlier.
df['anomaly'] = iso.fit_predict(a_features)

# Visualize the anomaly scores (per scikit-learn, lower decision_function
# values are more abnormal and negative values correspond to outliers).
sns.histplot(iso.decision_function(a_features), bins=30)
plt.xlabel('Isolation Forest score')
plt.show()

# Number of rows flagged as anomalies.
print('Jumlah anomali:', (df['anomaly']==-1).sum())


In [None]:
# Keep only the rows that were not flagged as anomalies (label -1). The copy
# makes clean_df independent of df, so later edits to one do not affect the other.
clean_df = df[df['anomaly'] != -1].copy()
print('Data bersih:', clean_df.shape)


In [None]:
def build_triangle(data):
    """Pivot claim rows into a cumulative paid development triangle.

    Args:
        data: DataFrame with ``accident_year``, ``development_year`` and
            ``cumulative_paid`` columns.

    Returns:
        DataFrame indexed by accident year (ascending) with one column per
        development year; each cell is the sum of ``cumulative_paid`` over
        the rows of that (accident year, development year) pair.
    """
    pivot_spec = {
        'index': 'accident_year',
        'columns': 'development_year',
        'values': 'cumulative_paid',
        'aggfunc': 'sum',
    }
    return data.pivot_table(**pivot_spec).sort_index()

def chain_ladder(triangle):
    """Project ultimate claims per origin year with the Chain Ladder method.

    Volume-weighted age-to-age factors are computed from the rows that are
    observed in BOTH adjacent development columns. Selecting rows by
    observation rather than by position keeps the factors unbiased when the
    triangle is not perfectly square or has holes (for example after
    anomalous records were removed). For a regular triangle the result is the
    same as the classic positional calculation.

    Args:
        triangle: DataFrame of cumulative amounts, one row per origin year
            and one column per development period (ordered by age); cells
            that are not observed are NaN.

    Returns:
        Tuple ``(ultimate, factors)``:
        ``ultimate`` is a Series indexed like ``triangle`` holding the latest
        observed value of each row multiplied by the remaining factors;
        ``factors`` is a list with one age-to-age factor per development
        column, the last entry being a tail factor of 1.0.

    Raises:
        ValueError: if an age-to-age factor cannot be computed because the
            paired observations of the earlier column sum to zero (or there
            are no paired observations), or if a row contains no observed
            value at all.
    """
    dev_periods = triangle.columns.tolist()
    factors = []
    for current, following in zip(dev_periods[:-1], dev_periods[1:]):
        # Only rows observed at both ages may contribute, otherwise the
        # numerator and the denominator would cover different origin years.
        paired = (
            triangle.loc[:, current].notna()
            & triangle.loc[:, following].notna()
        )
        denom = triangle.loc[paired, current].sum()
        if denom == 0:
            raise ValueError(
                f"cannot compute development factor {current} -> {following}: "
                "no paired observations with a non-zero base"
            )
        numer = triangle.loc[paired, following].sum()
        factors.append(numer / denom)
    # No development is assumed beyond the last observed period.
    factors.append(1.0)

    ultimate = {}
    for origin, row in triangle.iterrows():
        last_dev = row.last_valid_index()
        idx = dev_periods.index(last_dev)
        # factors[idx:] are the factors still to be applied from the latest
        # observed age up to ultimate.
        ultimate[origin] = row[last_dev] * np.prod(factors[idx:])
    return pd.Series(ultimate), factors

def bornhuetter_ferguson(triangle, exposures, factors):
    """Estimate ultimate claims per origin year with Bornhuetter-Ferguson.

    ultimate = latest observed + expected ultimate * (1 - percent reported),
    where percent reported is the reciprocal of the cumulative development
    factor from the latest observed age to ultimate.

    Args:
        triangle: DataFrame of cumulative amounts, one row per origin year
            and one column per development period (ordered by age); cells
            that are not observed are NaN.
        exposures: Series of exposure indexed by the same origin-year labels
            as ``triangle``.
        factors: Sequence of age-to-age factors, one per development column
            of ``triangle``, the last one being the tail factor (the format
            returned by ``chain_ladder``).

    Returns:
        Series of ultimate estimates indexed like ``triangle``.

    Raises:
        ValueError: if the exposure of the origin years observed at the last
            development period sums to zero, so no a priori loss ratio can be
            derived.
        KeyError: if an origin year of ``triangle`` is missing from
            ``exposures``.
    """
    dev_periods = triangle.columns.tolist()

    # A priori loss ratio from the origin years that reached the final
    # development period. The exposure in the denominator is restricted to
    # those same years so numerator and denominator cover the same business.
    mature = triangle.iloc[:, -1].dropna()
    mature_exposure = exposures.loc[mature.index].sum()
    if mature_exposure == 0:
        raise ValueError(
            "cannot derive the a priori loss ratio: exposure of the fully "
            "developed origin years sums to zero"
        )
    lr = mature.sum() / mature_exposure

    # cdf_to_ultimate[k] is the product of factors[k:], i.e. the development
    # still expected from age k to ultimate (1.0 at the final age when the
    # tail factor is 1.0).
    cdf_to_ultimate = np.cumprod(np.asarray(factors, dtype=float)[::-1])[::-1]

    ultimates = {}
    for i, row in triangle.iterrows():
        last_dev = row.last_valid_index()
        idx = dev_periods.index(last_dev)
        percent_reported = 1.0 / cdf_to_ultimate[idx]
        expected = exposures.loc[i] * lr
        ultimates[i] = row[last_dev] + expected * (1 - percent_reported)
    return pd.Series(ultimates)


In [None]:
# Cumulative paid triangles from the raw data and from the data with the
# flagged anomalies removed.
triangle_raw = build_triangle(df)
triangle_clean = build_triangle(clean_df)

# Exposure per accident year, taken from the raw data.
# NOTE(review): this sums exposure_premium over every row of an accident
# year; if the premium is repeated on each row it is over-counted here.
# Confirm against the dataset.
exposure = df.groupby('accident_year')['exposure_premium'].sum()

# Chain Ladder ultimates and development factors for each data set.
cl_raw, factors = chain_ladder(triangle_raw)
cl_clean, factors_clean = chain_ladder(triangle_clean)

# Each Bornhuetter-Ferguson run uses the development pattern of its own
# triangle, so the "clean" estimate is not driven by factors that were
# fitted on the anomalous raw data.
bf_raw = bornhuetter_ferguson(triangle_raw, exposure, factors)
bf_clean = bornhuetter_ferguson(triangle_clean, exposure, factors_clean)

# One row per accident year, one column per method / data-set combination.
result = pd.DataFrame({
    'CL_raw': cl_raw,
    'CL_clean': cl_clean,
    'BF_raw': bf_raw,
    'BF_clean': bf_clean
})
result

In [None]:
# Grouped bar chart: one group per row of `result`, one bar per estimate column.
# NOTE(review): the title reads "Estimasi Cadangan" (reserve estimates) while
# the y-label says the plotted values are ultimates. Confirm which is intended.
result.plot(kind='bar', figsize=(12,6))
plt.ylabel('Ultimate Estimate')
plt.title('Perbandingan Estimasi Cadangan per Accident Year')
plt.show()


Notebook selesai.