# In [None]:
# Insurance Fraud Detection: A Comprehensive Analysis
# This notebook provides an end-to-end solution for detecting fraudulent insurance claims.
# It includes data preprocessing, exploratory data analysis, and predictive modeling.


# Importing essential libraries for analysis and modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings('ignore')

# Plot theme: seaborn dark-grid style with the "muted" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("muted")

# Figure and text defaults, applied in a single rcParams update
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'axes.titlesize': 16,
    'axes.labelsize': 12,
    'font.family': 'Arial',
})

# --- 1. Data Loading and Initial Inspection ---
# Load the insurance claims dataset from the working directory
data = pd.read_csv('insurance_claims.csv')

# Remove the irrelevant '_c39' column. errors='ignore' keeps this step from
# raising KeyError when the CSV being loaded does not contain that column.
data.drop(columns='_c39', errors='ignore', inplace=True)

# Display dataset overview: shape, count of columns per dtype, and a preview
print("Dataset Overview:")
print(f"Shape: {data.shape}")
print("\nColumn Types:")
print(data.dtypes.value_counts())
print("\nFirst 5 Rows:")
print(data.head())

# --- 2. Handling Missing Values ---
def display_missing_values(df):
    """Print and return a per-column summary of missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns 'Missing Values'
        (null count) and '% of Total' (null share of the rows, rounded to one
        decimal). Only columns with at least one null are kept, ordered by
        descending percentage.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total': (100 * null_counts / len(df)).round(1),
    })

    # Keep only the columns that actually contain nulls, worst first.
    has_nulls = summary['Missing Values'] > 0
    summary = summary[has_nulls].sort_values('% of Total', ascending=False)

    print(f"\nMissing Values in Dataset ({df.shape[1]} columns):")
    print(summary)
    return summary

# Summarise the columns that contain missing values
missing_values = display_missing_values(data)

# Impute missing values in 'authorities_contacted' with 'Unknown'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data['authorities_contacted']: that chained in-place
# call acts on an intermediate object, triggers a FutureWarning in pandas 2.x,
# and leaves `data` unchanged once Copy-on-Write is enabled.
data['authorities_contacted'] = data['authorities_contacted'].fillna('Unknown')

# --- 3. Exploratory Data Analysis (EDA) ---
# Overall balance between fraudulent and non-fraudulent claims
plt.figure(figsize=(8, 5))
sns.countplot(data=data, x='fraud_reported', palette='coolwarm')
plt.title('Distribution of Fraudulent vs Non-Fraudulent Claims', pad=20)
plt.xlabel('Fraud Reported (Y = Yes, N = No)')
plt.ylabel('Number of Claims')
plt.show()

# Fraud counts split by incident type and by incident severity.
# Each entry: (column, title, x-axis label, x tick rotation or None).
fraud_breakdowns = (
    ('incident_type', 'Fraud Distribution by Incident Type', 'Incident Type', 45),
    ('incident_severity', 'Fraud Distribution by Incident Severity', 'Incident Severity', None),
)
for breakdown_col, breakdown_title, breakdown_xlabel, tick_rotation in fraud_breakdowns:
    plt.figure(figsize=(12, 6))
    sns.countplot(data=data, x=breakdown_col, hue='fraud_reported', palette='coolwarm')
    plt.title(breakdown_title, pad=20)
    plt.xlabel(breakdown_xlabel)
    plt.ylabel('Number of Claims')
    plt.legend(title='Fraud Reported', loc='upper right')
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()

# Pairwise correlations between the int64/float64 columns, drawn as a heatmap
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = data[numerical_cols].corr()
plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features', pad=20)
plt.show()

# --- 4. Data Preprocessing ---
# Encode every object-typed feature column as integer labels, keeping the
# fitted encoder for each column in `label_encoders`.
label_encoders = {}
categorical_cols = data.select_dtypes(include=['object']).columns.drop('fraud_reported')
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) lets any remaining missing values be encoded as a label too
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Encode target variable; le_target.classes_ keeps the original class labels
le_target = LabelEncoder()
data['fraud_reported'] = le_target.fit_transform(data['fraud_reported'])

# Split features and target
X = data.drop('fraud_reported', axis=1)
y = data['fraud_reported']

# Train-test split on the unscaled features (70/30, stratified on the target).
# Splitting BEFORE scaling means the scaler never sees the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting on the full dataset would leak test-set means and
# variances into training and bias the reported evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix (using the train-fitted scaler),
# kept so the `X_scaled` name remains available to any later code.
X_scaled = scaler.transform(X)

# --- 5. Random Forest Model ---
# Fit a 100-tree forest (depth capped at 10) on the training split
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out split
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, target_names=le_target.classes_)
print("\nRandom Forest Model Performance:")
print(f"Accuracy: {rf_accuracy:.2%}")
print("\nClassification Report:")
print(rf_report)

# Confusion matrix, labelled with the original target classes
rf_confusion = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(7, 5))
sns.heatmap(
    rf_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le_target.classes_,
    yticklabels=le_target.classes_,
)
plt.title('Random Forest Confusion Matrix', pad=20)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Ten features with the highest importance scores from the fitted forest
all_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = all_importances.sort_values('Importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Most Important Features (Random Forest)', pad=20)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

# --- 6. Deep Learning Model ---
# Feed-forward network: 128 -> 64 -> 32 ReLU units with dropout after the
# first two hidden layers, and a single sigmoid output unit.
n_input_features = X_train.shape[1]
model_dl = Sequential()
model_dl.add(Dense(128, activation='relu', input_shape=(n_input_features,)))
model_dl.add(Dropout(0.3))
model_dl.add(Dense(64, activation='relu'))
model_dl.add(Dropout(0.3))
model_dl.add(Dense(32, activation='relu'))
model_dl.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer, tracking accuracy
model_dl.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train silently for 100 epochs, holding out 20% of the training data for validation
history = model_dl.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    verbose=0,
    validation_split=0.2,
)

# Threshold the sigmoid outputs at 0.5 to obtain class predictions
dl_probabilities = model_dl.predict(X_test)
y_pred_dl = (dl_probabilities > 0.5).astype('int32')
dl_accuracy = accuracy_score(y_test, y_pred_dl)
dl_report = classification_report(y_test, y_pred_dl, target_names=le_target.classes_)
print("\nDeep Learning Model Performance:")
print(f"Accuracy: {dl_accuracy:.2%}")
print("\nClassification Report:")
print(dl_report)

# Confusion matrix, labelled with the original target classes
dl_confusion = confusion_matrix(y_test, y_pred_dl)
plt.figure(figsize=(7, 5))
sns.heatmap(
    dl_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le_target.classes_,
    yticklabels=le_target.classes_,
)
plt.title('Deep Learning Confusion Matrix', pad=20)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Training history: accuracy (left panel) and loss (right panel) per epoch.
# Each entry: (train key, validation key, train label, validation label, title, y label).
history_panels = (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Deep Learning Model: Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Deep Learning Model: Loss Over Epochs', 'Loss'),
)
plt.figure(figsize=(14, 5))
for panel_index, panel in enumerate(history_panels, start=1):
    train_key, val_key, train_label, val_label, panel_title, panel_ylabel = panel
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=train_label, color='navy')
    plt.plot(history.history[val_key], label=val_label, color='orange')
    plt.title(panel_title, pad=20)
    plt.xlabel('Epoch')
    plt.ylabel(panel_ylabel)
    plt.legend()
plt.tight_layout()
plt.show()
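# Insurance Fraud Detection: A Comprehensive Analysis
# This notebook provides an end-to-end solution for detecting fraudulent insurance claims.
# It includes data preprocessing, exploratory data analysis, and predictive modeling
# with professional visualizations for a polished presentation.
# NOTE(review): this cell appears to repeat the preceding cell's pipeline in full
# (same imports, data load, plots, and models) - confirm whether both copies are needed.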

# Importing essential libraries for analysis and modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import warnings
warnings.filterwarnings('ignore')

# Plot theme: seaborn dark-grid style with the "muted" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("muted")

# Figure and text defaults, applied in a single rcParams update
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'axes.titlesize': 16,
    'axes.labelsize': 12,
    'font.family': 'Arial',
})

# --- 1. Data Loading and Initial Inspection ---
# Load the insurance claims dataset from the working directory
data = pd.read_csv('insurance_claims.csv')

# Remove the irrelevant '_c39' column. errors='ignore' keeps this step from
# raising KeyError when the CSV being loaded does not contain that column.
data.drop(columns='_c39', errors='ignore', inplace=True)

# Display dataset overview: shape, count of columns per dtype, and a preview
print("Dataset Overview:")
print(f"Shape: {data.shape}")
print("\nColumn Types:")
print(data.dtypes.value_counts())
print("\nFirst 5 Rows:")
print(data.head())

# --- 2. Handling Missing Values ---
def display_missing_values(df):
    """Print and return a per-column summary of missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns 'Missing Values'
        (null count) and '% of Total' (null share of the rows, rounded to one
        decimal). Only columns with at least one null are kept, ordered by
        descending percentage.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total': (100 * null_counts / len(df)).round(1),
    })

    # Keep only the columns that actually contain nulls, worst first.
    has_nulls = summary['Missing Values'] > 0
    summary = summary[has_nulls].sort_values('% of Total', ascending=False)

    print(f"\nMissing Values in Dataset ({df.shape[1]} columns):")
    print(summary)
    return summary

# Summarise the columns that contain missing values
missing_values = display_missing_values(data)

# Impute missing values in 'authorities_contacted' with 'Unknown'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data['authorities_contacted']: that chained in-place
# call acts on an intermediate object, triggers a FutureWarning in pandas 2.x,
# and leaves `data` unchanged once Copy-on-Write is enabled.
data['authorities_contacted'] = data['authorities_contacted'].fillna('Unknown')

# --- 3. Exploratory Data Analysis (EDA) ---
# Overall balance between fraudulent and non-fraudulent claims
plt.figure(figsize=(8, 5))
sns.countplot(data=data, x='fraud_reported', palette='coolwarm')
plt.title('Distribution of Fraudulent vs Non-Fraudulent Claims', pad=20)
plt.xlabel('Fraud Reported (Y = Yes, N = No)')
plt.ylabel('Number of Claims')
plt.show()

# Fraud counts split by incident type and by incident severity.
# Each entry: (column, title, x-axis label, x tick rotation or None).
fraud_breakdowns = (
    ('incident_type', 'Fraud Distribution by Incident Type', 'Incident Type', 45),
    ('incident_severity', 'Fraud Distribution by Incident Severity', 'Incident Severity', None),
)
for breakdown_col, breakdown_title, breakdown_xlabel, tick_rotation in fraud_breakdowns:
    plt.figure(figsize=(12, 6))
    sns.countplot(data=data, x=breakdown_col, hue='fraud_reported', palette='coolwarm')
    plt.title(breakdown_title, pad=20)
    plt.xlabel(breakdown_xlabel)
    plt.ylabel('Number of Claims')
    plt.legend(title='Fraud Reported', loc='upper right')
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()

# Pairwise correlations between the int64/float64 columns, drawn as a heatmap
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = data[numerical_cols].corr()
plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features', pad=20)
plt.show()

# --- 4. Data Preprocessing ---
# Encode every object-typed feature column as integer labels, keeping the
# fitted encoder for each column in `label_encoders`.
label_encoders = {}
categorical_cols = data.select_dtypes(include=['object']).columns.drop('fraud_reported')
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) lets any remaining missing values be encoded as a label too
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Encode target variable; le_target.classes_ keeps the original class labels
le_target = LabelEncoder()
data['fraud_reported'] = le_target.fit_transform(data['fraud_reported'])

# Split features and target
X = data.drop('fraud_reported', axis=1)
y = data['fraud_reported']

# Train-test split on the unscaled features (70/30, stratified on the target).
# Splitting BEFORE scaling means the scaler never sees the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting on the full dataset would leak test-set means and
# variances into training and bias the reported evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix (using the train-fitted scaler),
# kept so the `X_scaled` name remains available to any later code.
X_scaled = scaler.transform(X)

# --- 5. Random Forest Model ---
# Fit a 100-tree forest (depth capped at 10) on the training split
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out split
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, target_names=le_target.classes_)
print("\nRandom Forest Model Performance:")
print(f"Accuracy: {rf_accuracy:.2%}")
print("\nClassification Report:")
print(rf_report)

# Confusion matrix, labelled with the original target classes
rf_confusion = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(7, 5))
sns.heatmap(
    rf_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le_target.classes_,
    yticklabels=le_target.classes_,
)
plt.title('Random Forest Confusion Matrix', pad=20)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Ten features with the highest importance scores from the fitted forest
all_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = all_importances.sort_values('Importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature', palette='viridis')
plt.title('Top 10 Most Important Features (Random Forest)', pad=20)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

# --- 6. Deep Learning Model ---
# Feed-forward network: 128 -> 64 -> 32 ReLU units with dropout after the
# first two hidden layers, and a single sigmoid output unit.
n_input_features = X_train.shape[1]
model_dl = Sequential()
model_dl.add(Dense(128, activation='relu', input_shape=(n_input_features,)))
model_dl.add(Dropout(0.3))
model_dl.add(Dense(64, activation='relu'))
model_dl.add(Dropout(0.3))
model_dl.add(Dense(32, activation='relu'))
model_dl.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer, tracking accuracy
model_dl.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train silently for 100 epochs, holding out 20% of the training data for validation
history = model_dl.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    verbose=0,
    validation_split=0.2,
)

# Threshold the sigmoid outputs at 0.5 to obtain class predictions
dl_probabilities = model_dl.predict(X_test)
y_pred_dl = (dl_probabilities > 0.5).astype('int32')
dl_accuracy = accuracy_score(y_test, y_pred_dl)
dl_report = classification_report(y_test, y_pred_dl, target_names=le_target.classes_)
print("\nDeep Learning Model Performance:")
print(f"Accuracy: {dl_accuracy:.2%}")
print("\nClassification Report:")
print(dl_report)

# Confusion matrix, labelled with the original target classes
dl_confusion = confusion_matrix(y_test, y_pred_dl)
plt.figure(figsize=(7, 5))
sns.heatmap(
    dl_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le_target.classes_,
    yticklabels=le_target.classes_,
)
plt.title('Deep Learning Confusion Matrix', pad=20)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Training history: accuracy (left panel) and loss (right panel) per epoch.
# Each entry: (train key, validation key, train label, validation label, title, y label).
history_panels = (
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Deep Learning Model: Accuracy Over Epochs', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Deep Learning Model: Loss Over Epochs', 'Loss'),
)
plt.figure(figsize=(14, 5))
for panel_index, panel in enumerate(history_panels, start=1):
    train_key, val_key, train_label, val_label, panel_title, panel_ylabel = panel
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[train_key], label=train_label, color='navy')
    plt.plot(history.history[val_key], label=val_label, color='orange')
    plt.title(panel_title, pad=20)
    plt.xlabel('Epoch')
    plt.ylabel(panel_ylabel)
    plt.legend()
plt.tight_layout()
plt.show()


# Final status message. Nothing is saved here; the message only reminds the
# reader that 'insurance_claims.csv' must be in the working directory.
print("\nNotebook ready for presentation. Ensure 'insurance_claims.csv' is in the working directory.")