In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Apply seaborn's "whitegrid" style to the plotting defaults.
sns.set(style="whitegrid")

# Location of the raw claims file, resolved relative to the working directory.
RAW_DATA_PATH = "insurance_claims.csv"
df = pd.read_csv(RAW_DATA_PATH)

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Every cell equal to the string "?" becomes NaN, so pandas' missing-data
# tooling (isnull, dropna, ...) treats those entries as missing.
df = df.replace(to_replace="?", value=np.nan)
df.head()

In [None]:
# Encode the target as integers: 'Y' -> 1, 'N' -> 0.
# Series.map yields NaN for any value that is not a key of the mapping, so
# running this cell a second time on the already-encoded 0/1 column would wipe
# the target out. The dtype guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['fraud_reported']):
    df['fraud_reported'] = df['fraud_reported'].map({'Y': 1, 'N': 0})

# dropna=False also surfaces values that failed to map (they show up as NaN).
print(df['fraud_reported'].value_counts(dropna=False))
df['fraud_reported'].unique()

In [None]:
# Write the current state of the frame to disk; index=False keeps the
# row index out of the output file.
df.to_csv("insurance_claims_cleaned.csv", index=False)
# The arrow was previously mojibake (a UTF-8 "→" decoded with the wrong codec).
print("Saved cleaned dataset → insurance_claims_cleaned.csv")

In [None]:
# Per-column missing-value summary.
# A notebook cell only renders its last expression, so the count and the
# percentage are combined into one frame instead of two bare expressions
# (the first of which would be silently discarded).
missing_count = df.isnull().sum()
missing_summary = pd.DataFrame({
    "missing_count": missing_count,
    "missing_pct": (missing_count / len(df)) * 100,    # % missing
})
missing_summary

In [None]:
# Nullity matrix: one column per feature, gaps where values are missing.
# msno.matrix opens its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) would only leave a blank extra figure and its size
# would not apply to the matrix; the size is passed to msno directly instead.
msno.matrix(df, figsize=(12, 6))
# Figure-level title: the "current axes" after msno.matrix may be its sparkline
# panel rather than the matrix, so an axes-level plt.title could be misplaced.
plt.suptitle("Missing Value Matrix")
plt.show()

In [None]:
# Column dtypes and non-null counts (info() prints directly).
df.info()

# Only the last bare expression of a cell is rendered, so both summary tables
# are shown explicitly; otherwise the numeric describe() would be computed and
# then thrown away.
display(df.describe())                   # numeric columns
display(df.describe(include='object'))   # object (string) columns

In [None]:
# Names of every numeric-dtype column; later cells reuse this list.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

# Grid of histograms, one panel per numeric column, 30 bins each.
df.hist(column=numeric_cols, figsize=(15, 12), bins=30)
plt.suptitle("Numeric Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Names of every object-dtype column; later cells reuse this list.
categorical_cols = list(df.select_dtypes(include='object').columns)

# Only plot columns with at most 20 distinct (non-null) categories.
category_levels = df[categorical_cols].nunique()
plot_cols = category_levels[category_levels <= 20].index.tolist()

# One bar chart of category frequencies per retained column.
for col in plot_cols:
    fig, ax = plt.subplots(figsize=(10, 4))
    df[col].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(f"Category Counts: {col}")
    plt.show()

In [None]:
# Class balance of the target.
# value_counts() orders by frequency, not by label, while the colours and tick
# labels below are hard-coded for the order [0, 1]. Reindexing pins that order
# (and shows a zero-height bar if a class is absent), so the labels can never
# end up attached to the wrong bar.
class_counts = df['fraud_reported'].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(5,4))
class_counts.plot(kind='bar', color=['green', 'red'])
plt.xticks([0,1], ['Not Fraud', 'Fraud'])
plt.title("Fraud Class Distribution")
plt.show()

In [None]:
# Box plot of each numeric feature, split by the fraud label.
for col in numeric_cols:
    # The target is the grouping variable, not a feature to plot.
    if col == "fraud_reported":
        continue

    feature = df[col]

    # Skip columns with too many missing values (fewer than 5 usable entries)
    if feature.notna().sum() < 5:
        print(f"Skipping {col}: not enough numeric data")
        continue

    # Skip columns that are constant (all values identical)
    if feature.nunique() < 2:
        print(f"Skipping {col}: constant values")
        continue

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=df['fraud_reported'], y=feature, ax=ax)
    ax.set_title(f"{col} vs Fraud")
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Not Fraud', 'Fraud'])
    plt.show()

In [None]:
# Keep categorical columns with fewer than 30 distinct values;
# high-cardinality columns are left out of the per-category tables.
level_counts = df[categorical_cols].nunique()
cat_for_analysis = level_counts[level_counts < 30].index.tolist()

# Mean of the 0/1 target within each category, highest rate first.
for col in cat_for_analysis:
    print(f"Fraud Rate by {col}")
    rate_by_level = df.groupby(col)['fraud_reported'].mean()
    display(rate_by_level.sort_values(ascending=False))
    print("-"*50)

In [None]:
# Pairwise correlation between all numeric columns, drawn without cell annotations.
numeric_corr = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(numeric_corr, cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Numeric Correlation Heatmap")
plt.show()

In [None]:
# Target encode categorical columns: each category is replaced by the mean of
# fraud_reported within that category, then every column is correlated with
# the target.
#
# The encoding is computed on the same rows it is applied to. For a column with
# (nearly) one distinct value per row, the "category mean" is the row's own
# label, so the encoded column would correlate almost perfectly with the target
# without carrying any real signal. Such columns are therefore excluded, using
# the same "< 30 distinct values" cut-off as the fraud-rate-by-category cell.
MAX_ENCODE_CARDINALITY = 30
encodable_cols = [
    name for name in categorical_cols
    if df[name].nunique() < MAX_ENCODE_CARDINALITY
]
high_cardinality_cols = [
    name for name in categorical_cols if name not in encodable_cols
]

# Dropping the excluded columns keeps df_encoded free of the original
# object-dtype columns, so .corr() below needs no extra filtering.
df_encoded = df.drop(columns=high_cardinality_cols)

for col in encodable_cols:
    fraud_means = df.groupby(col)['fraud_reported'].mean()
    # Rows whose category is NaN have no group mean and stay NaN.
    df_encoded[col] = df[col].map(fraud_means)

plt.figure(figsize=(10,16))
# Correlation of every remaining column with the target, strongest positive first.
corr = df_encoded.corr()['fraud_reported'].sort_values(ascending=False)
sns.heatmap(corr.to_frame(), annot=True, cmap='coolwarm')
plt.title("Fraud-Focused Correlation Heatmap (Target Encoded)")
plt.show()

In [None]:
# Keep the entries of `corr` whose absolute value exceeds 0.10.
# NOTE(review): `corr` is the 'fraud_reported' column of a full correlation
# matrix, so the target's own entry presumably passes this filter as well;
# confirm whether downstream use expects it.
is_strong = corr.abs() > 0.10
important_features = corr.loc[is_strong]
important_features