In [1]:
import pandas as pd

# Load data
df = pd.read_csv("insurance_fraud_claims.csv")

# Basic overview
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Sample rows:\n", df.head())

# Data types and missing values summary
print(df.info())
print(df.isnull().sum())


Shape: (1000, 40)
Columns: ['months_as_customer', 'age', 'policy_number', 'policy_bind_date', 'policy_state', 'policy_csl', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex', 'insured_education_level', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'capital-gains', 'capital-loss', 'incident_date', 'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 'incident_hour_of_the_day', 'number_of_vehicles_involved', 'property_damage', 'bodily_injuries', 'witnesses', 'police_report_available', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make', 'auto_model', 'auto_year', 'fraud_reported', '_c39']
Sample rows:
    months_as_customer  age  policy_number policy_bind_date policy_state  \
0                 328   48         521585       2014-10-17           OH   
1                 228   42         342868       2006-06-27     

In [2]:
# Drop completely empty or irrelevant column
df.drop(columns=['_c39'], inplace=True)

# Encode target variable: fraud_reported ('Y'->1, 'N'->0)
df['fraud_reported'] = df['fraud_reported'].map({'Y': 1, 'N': 0})

# Convert date columns to datetime dtype
df['policy_bind_date'] = pd.to_datetime(df['policy_bind_date'])
df['incident_date'] = pd.to_datetime(df['incident_date'])

# Fill missing values in 'authorities_contacted' with 'Unknown'
df['authorities_contacted'] = df['authorities_contacted'].fillna('Unknown')

# Replace '?' with 'Unknown' in 'police_report_available'
df['police_report_available'] = df['police_report_available'].replace('?', 'Unknown')

# Drop columns unlikely useful for modeling (IDs, unique codes, locations)
df.drop(columns=['policy_number', 'insured_zip', 'incident_location'], inplace=True)

# Drop columns with datetime type
datetime_cols = df.select_dtypes(include=['datetime64']).columns
df = df.drop(columns=datetime_cols)

# Convert remaining object columns to categorical for easier encoding later
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category')

print("\nData info after cleaning:")
print(df.info())



Data info after cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   months_as_customer           1000 non-null   int64   
 1   age                          1000 non-null   int64   
 2   policy_state                 1000 non-null   category
 3   policy_csl                   1000 non-null   category
 4   policy_deductable            1000 non-null   int64   
 5   policy_annual_premium        1000 non-null   float64 
 6   umbrella_limit               1000 non-null   int64   
 7   insured_sex                  1000 non-null   category
 8   insured_education_level      1000 non-null   category
 9   insured_occupation           1000 non-null   category
 10  insured_hobbies              1000 non-null   category
 11  insured_relationship         1000 non-null   category
 12  capital-gains                1000 no

In [3]:
categorical_cols = [
    'policy_state', 'policy_csl', 'insured_sex', 'insured_education_level',
    'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type',
    'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state',
    'incident_city', 'property_damage', 'police_report_available', 'auto_make'
]


In [4]:
# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Frequency encode 'auto_model'
auto_model_freq = df['auto_model'].value_counts(normalize=True)
df['auto_model_freq_enc'] = df['auto_model'].map(auto_model_freq)

# Drop original 'auto_model' column
df_encoded = df_encoded.drop(columns=['auto_model'])

# Add the frequency encoded column
df_encoded['auto_model_freq_enc'] = df['auto_model_freq_enc']


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE  # Import SMOTE

# Assuming df_encoded is your preprocessed DataFrame and 'fraud_reported' is the target
df_encoded['fraud_reported'] = df_encoded['fraud_reported'].astype(int)

# Split features and target
X = df_encoded.drop('fraud_reported', axis=1)
y = df_encoded['fraud_reported']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE only on training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize features (fit scaler only on resampled training data)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Build Deep Learning model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train_scaled, y_train_resampled,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate on test set
y_pred_prob = model.predict(X_test_scaled).flatten()
y_pred_class = (y_pred_prob > 0.5).astype(int)

# Print evaluation
print(classification_report(y_test, y_pred_class))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.5614 - loss: 0.7029 - val_accuracy: 0.6639 - val_loss: 0.6284
Epoch 2/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7266 - loss: 0.5276 - val_accuracy: 0.9253 - val_loss: 0.3298
Epoch 3/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7993 - loss: 0.4354 - val_accuracy: 0.9336 - val_loss: 0.2531
Epoch 4/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8500 - loss: 0.3702 - val_accuracy: 0.9876 - val_loss: 0.1169
Epoch 5/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8825 - loss: 0.2961 - val_accuracy: 0.9876 - val_loss: 0.0766
Epoch 6/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8907 - loss: 0.2877 - val_accuracy: 0.9876 - val_loss: 0.0505
Epoch 7/50
[1m31/31[0m [32m━━━━