In [17]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 0. Load the data and drop columns that this analysis does not use.
#    "true_anomaly" is intentionally left in X: it is never fed to the model
#    (it is absent from the feature lists below) but is needed for the
#    stratified split in step 1 and for the evaluation in steps 6-7.
df = pd.read_csv("Training_Dataset.csv")
X = df.drop(columns=[
    "ic_number",
    "approval_flag",
    "coverage_amount"
])

# 1. Split into train/test
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
    stratify=df["true_anomaly"]  # ensure anomalies are represented in both splits
)

# 2. Preprocessor: standardise numeric columns, one-hot encode the vehicle make
numeric_features = [
    "age", "months_as_customer", "vehicle_age_years",
    "policy_expired_flag", "deductible_amount", "market_value",
    "damage_severity_score", "repair_amount", "at_fault_flag",
    "time_to_report_days", "claim_reported_to_police_flag",
    "license_type_missing_flag", "num_third_parties", "num_witnesses"
]
categorical_features = ["vehicle_make"]

# Single source of truth for the columns the pipeline consumes, so that fit,
# decision_function and predict are guaranteed to see the same inputs.
feature_cols = numeric_features + categorical_features

# Dense output is requested via `sparse_threshold=0` on the ColumnTransformer
# rather than through the encoder: OneHotEncoder's `sparse=` keyword was renamed
# to `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so neither
# spelling works across versions, whereas `sparse_threshold=0` does.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
    ],
    sparse_threshold=0,
)

# 3. Build pipeline with known contamination rate
iso_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("anomaly_detector", IsolationForest(
        n_estimators=100,
        contamination=0.01,     # we know ~1% anomalies were injected
        random_state=42,
        n_jobs=-1
    )),
])

# 4. Fit only on the training set
iso_pipeline.fit(X_train[feature_cols])

# 5. Score & predict on the test set.
#    Work on a copy so X_test itself is not modified by the added columns.
df_test = X_test.copy()
df_test["anomaly_score"] = iso_pipeline.decision_function(X_test[feature_cols])
preds = iso_pipeline.predict(X_test[feature_cols])
# IsolationForest.predict labels outliers as -1; map that to a 1/0 flag so it
# is directly comparable with the "true_anomaly" column.
df_test["is_anomaly"] = np.where(preds == -1, 1, 0)

# 6. See the top 10 most anomalous test observations
#    (ascending sort: the lowest decision-function scores are the most anomalous)
suspects = df_test.sort_values("anomaly_score").head(10)
print(suspects[[
    "age", "months_as_customer", "vehicle_make",
    "repair_amount", "market_value", "anomaly_score", "is_anomaly", "true_anomaly"
]])

# 7. Evaluate detection performance
#    (`classification_report` comes from sklearn.metrics, imported at the top
#    of this cell.)
print("\nClassification Report (anomaly vs. normal):")
print(classification_report(df_test["true_anomaly"], df_test["is_anomaly"], digits=4))

      age  months_as_customer   vehicle_make  repair_amount  market_value  \
1422   16                   0   Perodua Myvi      150000.00       50000.0   
2656   18                   0   Perodua Myvi      150000.00       50000.0   
2042   19                   0   Perodua Myvi      150000.00       50000.0   
1041   21                  36     Honda City       21243.51       80000.0   
1381   53                  22        Mazda 3       30547.93      130000.0   
2475   50                  57  Nissan Almera       20414.67       70000.0   
1662   46                  38     Honda City       15015.11       80000.0   
1128   25                  12    Toyota Vios       16168.75       75000.0   
2284   22                  48     Honda City       18238.55       80000.0   
1518   55                  39     Honda City       20776.99       80000.0   

      anomaly_score  is_anomaly  
1422      -0.207765           1  
2656      -0.206470           1  
2042      -0.201528           1  
1041      -0.105



In [2]:
# Display the scored test-set frame: a copy of X_test with the added
# "anomaly_score" and "is_anomaly" columns.
# NOTE(review): relies on `df_test` created by the modeling cell; that cell
# must be run first on a fresh kernel.
df_test

Unnamed: 0,age,months_as_customer,vehicle_age_years,vehicle_make,policy_expired_flag,deductible_amount,market_value,damage_severity_score,repair_amount,at_fault_flag,time_to_report_days,claim_reported_to_police_flag,license_type_missing_flag,num_third_parties,num_witnesses,anomaly_score,is_anomaly
2384,46,0,6,Proton X70,0,200.0,110000.0,0.56,22458.14,1,5,1,0,0,2,0.247475,0
2538,35,2,5,Perodua Myvi,0,300.0,50000.0,0.55,8742.14,0,8,0,0,1,3,0.249069,0
2176,69,2,11,Perodua Myvi,0,300.0,50000.0,0.57,9981.35,0,4,1,0,0,0,0.292059,0
897,31,2,8,Perodua Myvi,1,1000.0,50000.0,0.66,11869.11,0,3,0,0,2,0,0.205626,0
214,31,47,4,Nissan Almera,0,500.0,70000.0,0.73,17608.03,0,3,1,0,0,2,0.242426,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2936,46,14,12,Honda City,0,300.0,80000.0,0.68,18474.46,0,4,1,0,1,2,0.241899,0
1468,44,50,18,Perodua Myvi,0,1000.0,50000.0,0.75,14600.52,1,2,0,0,1,1,0.237310,0
561,50,6,12,Mazda 3,0,500.0,130000.0,0.72,31938.39,0,2,1,0,0,1,0.215296,0
282,38,5,1,Proton X70,0,1000.0,110000.0,0.57,28489.38,0,7,1,0,0,1,0.292773,0


In [16]:
import pickle

# Serialise the whole pipeline object (preprocessing step and anomaly detector
# together) to disk as a single pickle file.
with open("Anomaly_Model.pkl", "wb") as model_file:
    pickle.dump(iso_pipeline, model_file)