In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# 0. Load the raw claims data (unused columns are dropped further down)
df = pd.read_csv("Training_Dataset.csv")

# Inject ~1% targeted anomalies before any splitting
# -----------------------------------------------
# Use a locally seeded generator so the same rows are corrupted, with the same
# random values, on every Restart & Run All. Drawing from the unseeded global
# np.random state made the injected rows (and therefore every downstream
# score and metric) change from run to run.
rng = np.random.default_rng(42)

# Decide how many rows to corrupt
n_anom = int(len(df) * 0.01)
anom_idx = rng.choice(df.index.to_numpy(), size=n_anom, replace=False)

# Inject numeric anomalies
df.loc[anom_idx, "repair_amount"] = df.loc[anom_idx, "market_value"] * 2.5  # excessive repair cost
df.loc[anom_idx, "num_third_parties"] = 0
df.loc[anom_idx, "num_witnesses"] = 0
# Generator.integers excludes the upper bound, like the np.random.randint it replaces
df.loc[anom_idx, "vehicle_age_years"] = rng.integers(36, 60, size=n_anom)  # very old vehicles
df.loc[anom_idx, "deductible_amount"] = 0
df.loc[anom_idx, "damage_severity_score"] = 10  # very high damage score
df.loc[anom_idx, "age"] = rng.integers(16, 20, size=n_anom)  # unusually young drivers
df.loc[anom_idx, "months_as_customer"] = 0  # new customer
df.loc[anom_idx, "policy_expired_flag"] = 1
df.loc[anom_idx, "license_type_missing_flag"] = 1
df.loc[anom_idx, "at_fault_flag"] = 1
df.loc[anom_idx, "claim_reported_to_police_flag"] = 0  # serious claim not reported
df.loc[anom_idx, "time_to_report_days"] = rng.integers(40, 180, size=n_anom)  # very late report

# Mark true anomalies for later evaluation
df["true_anomaly"] = 0
df.loc[anom_idx, "true_anomaly"] = 1

# Remove the columns that are not used from here on.
# "true_anomaly" is kept in X so the label travels with each split and is
# available for evaluation; it is not listed among the model's features.
columns_to_drop = ["ic_number", "approval_flag", "coverage_amount"]
X = df.drop(columns=columns_to_drop)

# 1. Hold out 20% of the rows as a test set, stratified on the injected label
#    so both splits contain anomalies
X_train, X_test = train_test_split(
    X,
    stratify=X["true_anomaly"],
    test_size=0.2,
    random_state=42,
)

# 2. Preprocessor (same as before)
# Numeric columns are standardised; the single categorical column is one-hot
# encoded with its first category dropped.
numeric_features = [
    "age", "months_as_customer", "vehicle_age_years",
    "policy_expired_flag", "deductible_amount", "market_value",
    "damage_severity_score", "repair_amount", "at_fault_flag",
    "time_to_report_days", "claim_reported_to_police_flag",
    "license_type_missing_flag", "num_third_parties", "num_witnesses"
]
categorical_features = ["vehicle_make"]

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so `sparse=False` raises TypeError on
# current releases. Instead of a version-specific keyword, let the encoder use
# its default output and force a dense combined matrix with
# `sparse_threshold=0`, which behaves the same on old and new versions.
# NOTE(review): the encoder keeps its default handling of unknown categories,
# so a vehicle_make unseen during fit raises at transform time - confirm that
# is acceptable for wherever the saved model is used.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ],
    sparse_threshold=0,
)

# 3. Assemble the preprocessing + Isolation Forest pipeline.
#    contamination matches the ~1% share of anomalies that was injected.
anomaly_detector = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=42,
    n_jobs=-1,
)
iso_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("anomaly_detector", anomaly_detector),
])

# Columns the pipeline consumes, in the order the preprocessor lists them
model_columns = numeric_features + categorical_features

# 4. Fit on the training rows only
iso_pipeline.fit(X_train[model_columns])

# 5. Score and label the held-out rows.
#    Lower decision_function values are ranked as more anomalous below;
#    predict() returns -1 for rows flagged as anomalies, mapped here to 1/0.
test_inputs = X_test[model_columns]
df_test = X_test.copy()
df_test["anomaly_score"] = iso_pipeline.decision_function(test_inputs)
preds = iso_pipeline.predict(test_inputs)
df_test["is_anomaly"] = (preds == -1).astype(int)

# 6. Show the 10 test rows with the lowest (most anomalous) scores
report_columns = [
    "age", "months_as_customer", "vehicle_make",
    "repair_amount", "market_value",
    "anomaly_score", "is_anomaly", "true_anomaly",
]
suspects = df_test.sort_values("anomaly_score").iloc[:10]
print(suspects[report_columns])

# 7. Compare the predicted flags against the injected ground-truth labels
print("\nClassification Report (anomaly vs. normal):")
detection_report = classification_report(
    df_test["true_anomaly"],
    df_test["is_anomaly"],
    digits=4,
)
print(detection_report)




      age  months_as_customer   vehicle_make  repair_amount  market_value  \
3092   16                   0     Honda City       200000.0       80000.0   
2296   17                   0    Toyota Vios       187500.0       75000.0   
479    16                   0   Perodua Myvi       125000.0       50000.0   
31     17                   0     Proton X70       275000.0      110000.0   
2035   19                   0     Proton X70       275000.0      110000.0   
3160   16                   0    Toyota Vios       187500.0       75000.0   
2668   17                   0  Nissan Almera       175000.0       70000.0   
1309   18                   0   Perodua Myvi       125000.0       50000.0   
945    17                   0   Perodua Myvi       125000.0       50000.0   
1185   16                   0   Perodua Myvi       125000.0       50000.0   

      anomaly_score  is_anomaly  true_anomaly  
3092      -0.114794           1             1  
2296      -0.113283           1             1  
479     

In [2]:
# Display the held-out test rows together with the anomaly_score and
# is_anomaly columns added during scoring.
df_test

Unnamed: 0,plate_number,age,months_as_customer,vehicle_age_years,vehicle_make,policy_expired_flag,deductible_amount,market_value,damage_severity_score,repair_amount,at_fault_flag,time_to_report_days,claim_reported_to_police_flag,license_type_missing_flag,num_third_parties,num_witnesses,true_anomaly,anomaly_score,is_anomaly
4867,RUJ7407,33,9,10,Perodua Myvi,1,1000.0,50000.0,0.43,9525.17,0,5,0,0,0,0,0,0.128343,0
4382,MNM1400,29,4,4,Proton X70,0,500.0,110000.0,0.43,23240.73,1,6,1,0,0,1,0,0.160610,0
1155,MFX9215,40,1,5,Proton X70,0,1000.0,110000.0,0.57,30851.94,0,2,1,0,1,1,0,0.177701,0
4598,MMX8441,48,19,11,Perodua Myvi,0,1000.0,50000.0,0.72,16489.38,0,8,0,0,0,1,0,0.179421,0
3135,XYO1324,22,4,4,Proton X70,0,1000.0,110000.0,0.48,20959.71,0,5,0,0,2,1,0,0.136278,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4905,TOO5542,59,79,14,Toyota Vios,0,800.0,75000.0,0.45,12901.17,0,1,0,0,0,0,0,0.092282,0
735,TZU5041,27,56,3,Mazda 3,1,200.0,130000.0,0.80,31505.93,1,5,0,0,0,2,0,0.026287,0
4785,DSK4250,32,50,16,Perodua Myvi,0,1000.0,50000.0,0.67,11653.67,0,4,1,0,0,0,0,0.203900,0
265,VJH2360,29,55,6,Proton X70,0,800.0,110000.0,0.63,26135.00,0,5,1,0,0,0,0,0.183838,0


In [3]:
import pickle

# Persist the fitted pipeline (preprocessor + Isolation Forest) as one pickle
# so both steps are saved together in a single file.
MODEL_PATH = "Anomaly_Model.pkl"
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(iso_pipeline, model_file)