In [1]:
# Mini Project: Auto Insurance Fraud Prediction
# ---------------------------------------------
# Goal:
# Use past insurance claims to predict whether a claim is fraudulent or not.
# This script loads the data, does a few simple checks, trains a model,
# and shows how well it performs.

import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

from joblib import dump

# Load the data
# NOTE: The default path below is specific to the original author's machine.
# Set the FRAUD_DATA_PATH environment variable to point at your own copy of
# the CSV instead of editing this file.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    r"C:\Users\kevin\Downloads\car_insurance_fraud_dataset.csv",
)
TARGET = "fraud_reported"   # Y = fraud, N = not fraud

df = pd.read_csv(DATA_PATH)

# Drop rows that have no label at all. Without this, astype(str) below would
# turn a missing label into a plain string (e.g. "NAN" after upper-casing),
# and the row would be counted as a non-fraud claim in the fraud rate and in
# the training target.
n_unlabeled = int(df[TARGET].isna().sum())
if n_unlabeled:
    print(f"Dropping {n_unlabeled} rows with a missing {TARGET} label")
    df = df.dropna(subset=[TARGET])

# Clean up the target column: strip stray whitespace and normalize the case
# so that " y" and "Y" are treated as the same label.
df[TARGET] = df[TARGET].astype(str).str.strip().str.upper()

print("Rows, Columns:", df.shape)
print("\nFraud label counts:")
print(df[TARGET].value_counts())

# Mean of a boolean Series is the share of True values, i.e. the fraud rate.
fraud_rate = (df[TARGET] == "Y").mean()
print("\nFraud rate:", round(fraud_rate, 4))


# Simple data checks
# Share of missing values per column, ten worst columns first
missing_share = df.isna().mean()
missing = missing_share.sort_values(ascending=False).head(10)
print("\nTop missing columns:")
print(missing)

# Boolean view of the label (True where the claim is marked "Y"); reused by
# the per-category fraud-rate breakdowns below.
is_fraud = df[TARGET].eq("Y")

# Mean claim amount per label, skipped when the dataset lacks the column
if "total_claim_amount" in df.columns:
    print("\nAverage total claim amount by fraud label:")
    print(df["total_claim_amount"].groupby(df[TARGET]).mean())

# Fraud rate per value of a few categorical columns (highest rates first,
# at most eight values shown); columns missing from the dataset are skipped.
breakdown_columns = ("incident_type", "collision_type", "police_report_available")
for column in breakdown_columns:
    if column not in df.columns:
        continue
    print(f"\nFraud rate by {column}:")
    rate_by_value = is_fraud.groupby(df[column]).mean()
    print(rate_by_value.sort_values(ascending=False).head(8))


# Prepare data for modeling
# Remove ID-like columns: they don't help prediction, and a per-row
# identifier left in the features only gives the model something to memorize.
id_columns = [
    "policy_id",      # this dataset's identifier column (it shows up in the
                      # missing-value report printed by the checks above)
    "policy_number",
    "claim_id",
    "customer_id",
    "insured_zip",
    "incident_location",
]

# errors="ignore" skips any listed column that this dataset does not have.
df = df.drop(columns=id_columns, errors="ignore")

# Features are every remaining column except the label; the label is encoded
# as 1 for fraud ("Y") and 0 otherwise.
X = df.drop(columns=[TARGET])
y = (df[TARGET] == "Y").astype(int)

# Hold out 20% for testing. stratify=y keeps the fraud share the same in the
# train and test parts; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Separate numeric and categorical columns: anything that is not a numeric
# dtype is treated as categorical.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
numeric_names = set(num_cols)
cat_cols = [name for name in X.columns if name not in numeric_names]

# Numeric features: fill gaps with the column median, then standardize.
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most common value, then one-hot
# encode. handle_unknown="ignore" lets categories that were not seen during
# fitting pass through at prediction time instead of raising.
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Fill missing values and encode categories
preprocess = ColumnTransformer(transformers=[
    ("num", numeric_steps, num_cols),
    ("cat", categorical_steps, cat_cols),
])


# Train the model
# class_weight="balanced" helps with class imbalance; max_iter is raised to
# 2000 to give the solver more iterations to converge.
model = LogisticRegression(class_weight="balanced", max_iter=2000)

# Preprocessing and classifier are chained so that imputation, scaling and
# encoding are fitted on the training data only.
pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
pipeline.fit(X_train, y_train)


# Evaluate the model on the held-out test split
# Column 1 of predict_proba is the score for the positive class (label 1).
probabilities = pipeline.predict_proba(X_test)[:, 1]
predictions = pipeline.predict(X_test)

# Compute every metric first, then print them in a fixed order.
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions, digits=4)
test_auc = roc_auc_score(y_test, probabilities)

print("\nConfusion matrix:")
print(test_confusion)

print("\nClassification report:")
print(test_report)

print("ROC-AUC:", round(test_auc, 4))


# Save the trained model
# The whole fitted pipeline (preprocessing + classifier) is written to a
# single file; its folder is created first if it does not exist yet.
model_path = "mini_outputs/fraud_logreg_pipeline.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(pipeline, model_path)

print("\nModel saved to:", model_path)


Rows, Columns: (30000, 24)

Fraud label counts:
fraud_reported
N    26560
Y     3440
Name: count, dtype: int64

Fraud rate: 0.1147

Top missing columns:
authorities_contacted          0.252133
policy_id                      0.000000
policy_state                   0.000000
total_claim_amount             0.000000
claim_amount                   0.000000
police_report_available        0.000000
witnesses                      0.000000
bodily_injuries                0.000000
number_of_vehicles_involved    0.000000
incident_hour_of_the_day       0.000000
dtype: float64

Average total claim amount by fraud label:
fraud_reported
N    12768.44124
Y    12675.08682
Name: total_claim_amount, dtype: float64

Fraud rate by incident_type:
incident_type
Vehicle Theft               0.117794
Single Vehicle Collision    0.115707
Multi-vehicle Collision     0.115543
Parked Car                  0.109479
Name: fraud_reported, dtype: float64

Fraud rate by collision_type:
collision_type
Rear       0.114988
Sid