In [None]:
!pip install -q pandas numpy scikit-learn imbalanced-learn xgboost joblib matplotlib kaggle


In [None]:
!unzip -q "archive (1).zip" -d data

!ls data



y
'insurance fraud claims.csv'


In [None]:
import textwrap

import pandas as pd

csv_path = "data/insurance fraud claims.csv"

# Only the header is of interest here, so a handful of rows is enough.
df = pd.read_csv(csv_path, nrows=3)

# Print the column names as one comma-separated list wrapped at 100 characters.
column_list = ", ".join(df.columns)
print("preview：")
print(textwrap.fill(column_list, 100))


preview：
months_as_customer, age, policy_number, policy_bind_date, policy_state, policy_csl,
policy_deductable, policy_annual_premium, umbrella_limit, insured_zip, insured_sex,
insured_education_level, insured_occupation, insured_hobbies, insured_relationship, capital-gains,
capital-loss, incident_date, incident_type, collision_type, incident_severity,
authorities_contacted, incident_state, incident_city, incident_location, incident_hour_of_the_day,
number_of_vehicles_involved, property_damage, bodily_injuries, witnesses, police_report_available,
total_claim_amount, injury_claim, property_claim, vehicle_claim, auto_make, auto_model, auto_year,
fraud_reported, _c39


In [None]:
%%writefile fraud_experiment.py
#!/usr/bin/env python3
import os, joblib, argparse, pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

def load_data(path):
    """Load the claims CSV and return it with a clean integer ``FraudFound`` target.

    The first column whose name starts with "fraud" (case-insensitive) is
    renamed to ``FraudFound``, columns that are entirely empty are dropped,
    and textual Y/N or YES/NO labels are converted to 1/0.

    Parameters
    ----------
    path : str or path-like
        CSV file passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an integer ``FraudFound`` column.

    Raises
    ------
    ValueError
        If no column name starts with "fraud", or if the target column holds
        missing values or labels other than Y/YES/N/NO.
    """
    df = pd.read_csv(path)

    # 1) Locate the target column (first header starting with "fraud").
    tgt_col = next((col for col in df.columns if col.lower().startswith("fraud")), None)
    if tgt_col is None:
        raise ValueError("No column starting with 'fraud' found as target.")
    df = df.rename(columns={tgt_col: "FraudFound"})

    # 2) Drop columns that contain no data at all (e.g. a trailing empty column).
    df = df.dropna(axis=1, how="all")

    # 3) Map textual labels Y/N / yes/no -> 1/0. Checking for "not numeric"
    #    instead of "== object" also covers pandas' dedicated string dtypes.
    target = df["FraudFound"]
    if not pd.api.types.is_numeric_dtype(target):
        target = target.str.strip().str.upper().map({"Y": 1, "YES": 1, "N": 0, "NO": 0})

    # Unmapped or missing labels show up as NaN; report them explicitly rather
    # than letting astype(int) fail with an opaque casting error.
    invalid = target.isna()
    if invalid.any():
        offending = df.loc[invalid, "FraudFound"].unique().tolist()
        raise ValueError(
            f"Target column has missing or unrecognised labels: {offending!r}"
        )

    df["FraudFound"] = target.astype(int)
    return df


def make_preprocessor(df):
    """Build the feature-preprocessing ColumnTransformer for ``df``.

    Numeric columns are median-imputed and standardised; object-dtype columns
    are imputed with their most frequent value and one-hot encoded (categories
    unseen at fit time are ignored at transform time). The ``FraudFound``
    target is excluded from both groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose dtypes determine which columns go to which branch.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer with a "num" and a "cat" branch.
    """
    target = ["FraudFound"]
    numeric_features = df.select_dtypes(include="number").columns.difference(target)
    categorical_features = df.select_dtypes(include="object").columns.difference(target)

    numeric_branch = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ])

    branches = [
        ("num", numeric_branch, numeric_features),
        ("cat", categorical_branch, categorical_features),
    ]
    return ColumnTransformer(branches)

def build_pipe(model, sampler, df, random_state=42):
    """Assemble the preprocessing -> (optional SMOTE) -> classifier pipeline.

    Parameters
    ----------
    model : estimator
        Classifier placed in the final "clf" step.
    sampler : str
        ``"smote"`` inserts a SMOTE over-sampling step between preprocessing
        and the classifier; any other value adds no resampling step.
    df : pandas.DataFrame
        Frame handed to ``make_preprocessor`` to decide the column groups.
    random_state : int or None, default 42
        Seed for SMOTE so that resampling (and therefore the reported
        metrics) is reproducible between runs.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline with steps "pre", optionally "smote", and "clf".
    """
    steps = [("pre", make_preprocessor(df))]
    if sampler == "smote":
        # Seeded: an unseeded SMOTE draws different synthetic samples each run.
        steps.append(("smote", SMOTE(random_state=random_state)))
    steps.append(("clf", model))
    return ImbPipeline(steps)

def main(csv):
    """Run the model/sampler comparison and write the artifacts to ``results/``.

    Every combination of classifier (LR, SVM, RF, XGB) and sampler
    ("none", "smote") is scored with 5-fold cross-validation on F1 and ROC AUC.
    The per-combination means go to ``results/metrics.csv``. The combination
    with the highest mean F1 is then refitted on an 80% stratified training
    split, saved to ``results/best_model.pkl``, and its ROC curve on the
    remaining 20% is saved to ``results/roc.png``.

    Parameters
    ----------
    csv : str or path-like
        Path of the claims CSV, passed to ``load_data``.
    """
    os.makedirs("results", exist_ok=True)
    df = load_data(csv)
    X, y = df.drop("FraudFound", axis=1), df["FraudFound"]
    models = {
        "LR":  LogisticRegression(max_iter=200, n_jobs=-1),
        # probability=True fits an internal, randomised calibration; seed it so
        # the saved model's predict_proba is reproducible.
        "SVM": SVC(kernel="rbf", probability=True, random_state=42),
        "RF":  RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
        "XGB": XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1,
                              subsample=0.9, eval_metric="logloss",
                              tree_method="hist", random_state=42)
    }
    samplers = ["none", "smote"]

    rows = []
    for sampler in samplers:
        for name, model in models.items():
            pipe = build_pipe(model, sampler, df)
            # Only the fold scores are used, so the fitted per-fold estimators
            # are not requested (avoids holding 5 fitted pipelines per config).
            cv = cross_validate(pipe, X, y,
                                scoring={"F1": "f1", "AUC": "roc_auc"},
                                cv=5, n_jobs=-1)
            f1_mean = cv["test_F1"].mean()
            auc_mean = cv["test_AUC"].mean()
            rows.append({"Model": name, "Sampler": sampler,
                         "F1_mean": f1_mean, "AUC_mean": auc_mean})
            print(f"✓ {name:3} | {sampler}  F1={f1_mean:.3f}")

    pd.DataFrame(rows).to_csv("results/metrics.csv", index=False)

    # Refit the configuration with the best mean F1 on a stratified 80% split
    # and keep the other 20% for the ROC plot.
    best = max(rows, key=lambda r: r["F1_mean"])
    pipe = build_pipe(models[best["Model"]], best["Sampler"], df)
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, stratify=y,
                                          random_state=42)
    pipe.fit(Xtr, ytr)
    joblib.dump(pipe, "results/best_model.pkl")

    RocCurveDisplay.from_estimator(pipe, Xte, yte)
    plt.savefig("results/roc.png", dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ Artifacts saved to results/")

if __name__ == "__main__":
    # Command-line entry point: --data selects the CSV to analyse.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default="data/insurance fraud claims.csv")
    cli_args = parser.parse_args()
    main(cli_args.data)


Writing fraud_experiment.py


In [None]:
# Run the script written by the %%writefile cell above on the extracted CSV.
# It prints one line per model/sampler combination and writes its artifacts
# (metrics.csv, best_model.pkl, roc.png) under results/.
!python fraud_experiment.py --data "data/insurance fraud claims.csv"


✓ LR  | none  F1=0.574
✓ SVM | none  F1=0.261
✓ RF  | none  F1=0.120
✓ XGB | none  F1=0.607
✓ LR  | smote  F1=0.634
✓ SVM | smote  F1=0.551
✓ RF  | smote  F1=0.268
✓ XGB | smote  F1=0.614
✓ Artifacts saved to results/


In [None]:
from google.colab import files

# Send the experiment artifacts to the browser as downloads (Colab only).
for artifact in ("results/metrics.csv", "results/roc.png"):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import joblib, pandas as pd
pipe = joblib.load("results/best_model.pkl")  # best pipeline saved by fraud_experiment.py

# Caveat: these rows come from the same CSV the model was fitted on, so the
# predictions below are a smoke test of the saved pipeline, not an evaluation.
sample_raw = pd.read_csv("data/insurance fraud claims.csv").head()

# Both predictions are computed from the untouched frame. Previously the
# Pred_label column was appended before predict_proba ran, so the second call
# saw a different input than the first.
sample = sample_raw.assign(
    Pred_label=pipe.predict(sample_raw),
    Fraud_prob=pipe.predict_proba(sample_raw)[:, 1],  # probability of class 1 (fraud)
)

# Show the predictions next to the first five original columns.
display(sample[["Pred_label", "Fraud_prob"] + list(sample_raw.columns[:5])])


Unnamed: 0,Pred_label,Fraud_prob,months_as_customer,age,policy_number,policy_bind_date,policy_state
0,1,0.878862,328,48,521585,2014-10-17,OH
1,1,0.701836,228,42,342868,2006-06-27,IN
2,0,0.088484,134,29,687698,2000-09-06,OH
3,1,0.990074,256,41,227811,1990-05-25,IL
4,0,0.004454,228,44,367455,2014-06-06,IL
