In [1]:
import pandas as pd

# Load the claims dataset (assumes the generator has already been run and its
# output saved as this CSV).
df = pd.read_csv("Training_Dataset.csv")

# Columns left out of the anomaly-detection feature matrix:
#  - identifiers and long free-text fields
#  - approval_flag & coverage_amount (the approach is unsupervised)
excluded_columns = [
    "ic_number",
    "claim_description",
    "customer_background",
    "approval_flag",
    "coverage_amount",
]

# `df` keeps every column; `X` is the reduced copy handed to the model.
X = df.drop(excluded_columns, axis=1)

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns that are standardised (zero mean, unit variance) before modelling.
# Note that the 0/1 flag columns are scaled together with the continuous ones.
numeric_features = [
    "age", "months_as_customer", "vehicle_age_years",
    "policy_expired_flag", "deductible_amount", "market_value",
    "damage_severity_score", "repair_amount", "at_fault_flag",
    "time_to_report_days", "claim_reported_to_police_flag",
    "license_type_missing_flag", "num_third_parties", "num_witnesses"
]
# Columns that are one-hot encoded.
categorical_features = ["vehicle_make"]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises TypeError on current
# releases. Try the new keyword first and fall back for older installations.
# drop="first" removes one dummy column per feature to avoid redundancy.
try:
    vehicle_make_encoder = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    vehicle_make_encoder = OneHotEncoder(drop="first", sparse=False)

# Applies each transformer to its own column group and concatenates the
# results; columns not listed in either group are dropped (ColumnTransformer's
# default `remainder="drop"`).
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", vehicle_make_encoder, categorical_features),
])


In [4]:
# 0. Define the pipeline
# numpy is imported here because step 3 uses np.where; without it this cell
# raises NameError on a fresh kernel (Restart & Run All), since the only other
# numpy import in the notebook lives in a later cell.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest

# Preprocessing and the detector are chained so that fit / predict /
# decision_function all apply the same transformations to the raw columns.
iso_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("anomaly_detector", IsolationForest(
        n_estimators=100,
        contamination="auto",
        random_state=42  # fixed seed so scores are reproducible across runs
    )),
])

# 1. Fit pipeline (unsupervised: no target is passed)
iso_pipeline.fit(X)

# 2. Compute anomaly scores (lower = more anomalous; negative = outlier)
df["anomaly_score"] = iso_pipeline.decision_function(X)

# 3. Predict anomalies: +1 = normal, -1 = outlier → stored as 0 / 1
preds = iso_pipeline.predict(X)
df["is_anomaly"] = np.where(preds == -1, 1, 0)

# 4. Inspect the top 10 most “anomalous” claims (ascending sort puts the
#    lowest, i.e. most anomalous, scores first)
suspects = df.sort_values("anomaly_score").head(10)
print(suspects[[
    "age", "months_as_customer", "vehicle_make",
    "repair_amount", "market_value", "anomaly_score", "is_anomaly"
]])




      age  months_as_customer  vehicle_make  repair_amount  market_value  \
1183   43                  59    Honda City       54077.74      74396.41   
350    30                  59    Proton X70       54694.39     139718.20   
289    42                   6    Honda City       13355.77      29097.33   
1323   39                   7    Honda City      103241.51     142283.74   
1960   55                 151    Honda City        2100.68      20496.84   
1385   38                  66    Honda City        7648.49      46798.74   
2715   59                  78   Toyota Vios        1276.21      59655.20   
2242   49                  35    Honda City       25204.69      65597.50   
1419   45                   3   Toyota Vios        5686.11      13086.91   
947    27                  91  BMW 3 Series       27432.17      78180.35   

      anomaly_score  is_anomaly  
1183      -0.153353           1  
350       -0.126226           1  
289       -0.116991           1  
1323      -0.104320        

In [5]:
import numpy as np

# 1) Score every claim with the fitted pipeline (higher = more “normal”)
df["anomaly_score"] = iso_pipeline.decision_function(X)

# 2) Label every claim: the pipeline returns +1 for normal and -1 for
#    anomaly, which is re-coded here as 0 (normal) / 1 (anomaly)
preds = iso_pipeline.predict(X)
flagged_as_outlier = preds == -1
df["is_anomaly"] = np.where(flagged_as_outlier, 1, 0)

# 3) Show the 10 most suspicious claims, i.e. the 10 lowest scores
report_columns = [
    "age", "months_as_customer", "vehicle_make",
    "repair_amount", "market_value", "anomaly_score", "is_anomaly",
]
suspects = df.sort_values(by="anomaly_score", ascending=True).head(10)
print(suspects[report_columns])


      age  months_as_customer  vehicle_make  repair_amount  market_value  \
1183   43                  59    Honda City       54077.74      74396.41   
350    30                  59    Proton X70       54694.39     139718.20   
289    42                   6    Honda City       13355.77      29097.33   
1323   39                   7    Honda City      103241.51     142283.74   
1960   55                 151    Honda City        2100.68      20496.84   
1385   38                  66    Honda City        7648.49      46798.74   
2715   59                  78   Toyota Vios        1276.21      59655.20   
2242   49                  35    Honda City       25204.69      65597.50   
1419   45                   3   Toyota Vios        5686.11      13086.91   
947    27                  91  BMW 3 Series       27432.17      78180.35   

      anomaly_score  is_anomaly  
1183      -0.153353           1  
350       -0.126226           1  
289       -0.116991           1  
1323      -0.104320        