In [None]:
# EDA.ipynb
# ---------------------------------------------
# Task 1: Exploratory Data Analysis (EDA)
# Dataset: cleaned_data_sample.csv (100k rows)
# ---------------------------------------------

# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Configure plots
%matplotlib inline
sns.set(style='whitegrid')


In [None]:
# ---------------------------------------------
# Step 2: Load data
# ---------------------------------------------
# Relative path: resolved against the kernel's current working directory.
data_file = "../data/processed/cleaned_data_sample.csv"
data = pd.read_csv(filepath_or_buffer=data_file)

# Confirm the load and report the frame's (rows, columns) shape.
print("Data loaded successfully!", f"Shape: {data.shape}", sep="\n")

In [None]:
# ---------------------------------------------
# Step 3: Data overview
# ---------------------------------------------
print("\nColumns:\n", data.columns.tolist())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
data.info()

# Check first few rows (bare last expression -> rendered by the notebook)
data.head()

In [None]:
# ---------------------------------------------
# Step 4: Missing values
# ---------------------------------------------
# Per-column count of null cells; only columns with at least one are printed.
missing = data.isna().sum()
print("\nMissing values per column:")
print(missing.loc[missing.gt(0)])


In [None]:
# ---------------------------------------------
# Step 5: Convert numeric columns
# ---------------------------------------------
# Example numeric columns (adjust based on dataset)
numeric_cols = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate', 'SumInsured', 'CalculatedPremiumPerTerm']

for col in numeric_cols:
    if col in data.columns:
        # errors='coerce' replaces every unparseable value with NaN without any
        # warning. Count the nulls before and after so that data lost to
        # coercion is reported instead of disappearing silently.
        nulls_before = data[col].isna().sum()
        data[col] = pd.to_numeric(data[col], errors='coerce')
        coerced = data[col].isna().sum() - nulls_before
        if coerced > 0:
            print(f"{col}: {coerced} non-numeric value(s) coerced to NaN")

In [None]:
# ---------------------------------------------
# Step 6: Descriptive statistics
# ---------------------------------------------
print("\nDescriptive statistics for numeric columns:")
# Selecting data[numeric_cols] directly raises KeyError as soon as one listed
# column is absent. Restrict to the columns that exist, matching the
# `if col in data.columns` guards used by the other steps.
available_numeric = [c for c in numeric_cols if c in data.columns]
if available_numeric:
    print(data[available_numeric].describe())
else:
    # describe() cannot summarise a frame with no columns.
    print("None of the expected numeric columns are present in the dataset.")

In [None]:

# ---------------------------------------------
# Step 7: Feature engineering
# ---------------------------------------------
# Loss Ratio = TotalClaims / TotalPremium
if 'TotalClaims' in data.columns and 'TotalPremium' in data.columns:
    raw_ratio = data['TotalClaims'] / data['TotalPremium']
    # pandas does not raise on division by zero: a zero premium yields +/-inf
    # (or NaN for 0/0). Store the infinite results as NaN so that later means
    # and plots treat them as "undefined" rather than as real values.
    data['LossRatio'] = raw_ratio.replace([np.inf, -np.inf], np.nan)
    print("\nLossRatio column added!")
    undefined_ratio = data['LossRatio'].isna().sum()
    if undefined_ratio > 0:
        print(f"LossRatio is undefined (NaN) for {undefined_ratio} row(s)")

In [None]:

# ---------------------------------------------
# Step 8: Univariate analysis
# ---------------------------------------------
# Histograms for numeric variables (only for columns that exist in the frame)
hist_targets = [name for name in numeric_cols + ['LossRatio'] if name in data.columns]
for col in hist_targets:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[col].dropna(), bins=50, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

# Bar plots for categorical variables
categorical_cols = ['Province', 'VehicleType', 'Gender', 'make']
for col in categorical_cols:
    if col not in data.columns:
        continue
    # Keep only the 20 most frequent levels, most frequent first.
    top_levels = data[col].value_counts().index[:20]
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(y=col, data=data, order=top_levels, ax=ax)
    ax.set_title(f'Counts of {col} (Top 20)')
    plt.show()


In [None]:
# ---------------------------------------------
# Step 9: Bivariate analysis
# ---------------------------------------------
# Scatter plot: TotalPremium vs TotalClaims
if 'TotalPremium' in data.columns and 'TotalClaims' in data.columns:
    plt.figure(figsize=(6,4))
    sns.scatterplot(x='TotalPremium', y='TotalClaims', data=data, alpha=0.3)
    plt.title('TotalPremium vs TotalClaims')
    plt.show()

# Correlation heatmap for numeric columns
# data[numeric_cols] raises KeyError when any listed column is absent, so
# restrict to the columns actually present (same guard as the other plots).
corr_cols = [c for c in numeric_cols if c in data.columns]
if len(corr_cols) >= 2:
    plt.figure(figsize=(10,8))
    sns.heatmap(data[corr_cols].corr(), annot=True, fmt=".2f", cmap='coolwarm')
    plt.title("Correlation Matrix")
    plt.show()
else:
    print("Skipping correlation heatmap: fewer than two numeric columns are present.")

# LossRatio by Province and by VehicleType
if 'LossRatio' in data.columns:
    # A ratio computed against a zero premium is +/-inf; mask those as NaN on a
    # copy used only for plotting so they cannot stretch the axis.
    ratio_rows = data.assign(
        LossRatio=data['LossRatio'].replace([np.inf, -np.inf], np.nan)
    )
    for group_col in ('Province', 'VehicleType'):
        if group_col in data.columns:
            plt.figure(figsize=(8,4))
            sns.boxplot(x=group_col, y='LossRatio', data=ratio_rows)
            plt.xticks(rotation=45)
            plt.title(f"Loss Ratio by {group_col}")
            plt.show()

In [None]:
# ---------------------------------------------
# Step 10: Outlier detection
# ---------------------------------------------
# One horizontal boxplot per numeric column that exists in the frame.
for col in (name for name in numeric_cols if name in data.columns):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f'Boxplot of {col} to detect outliers')
    plt.show()


In [None]:
# ---------------------------------------------
# Step 11: Top 3 insights (example)
# ---------------------------------------------
# A ratio computed against a zero premium is +/-inf, and a single such value
# turns its whole group's mean into inf (or NaN when both signs occur), which
# makes the rankings below meaningless. Treat those ratios as missing instead;
# mean() skips NaN.
if 'LossRatio' in data.columns:
    finite_loss_ratio = data['LossRatio'].replace([np.inf, -np.inf], np.nan)

# 1. Provinces with highest average loss ratio
if 'Province' in data.columns and 'LossRatio' in data.columns:
    province_loss = (
        finite_loss_ratio.groupby(data['Province']).mean().sort_values(ascending=False)
    )
    print("\nTop 5 Provinces by Loss Ratio:\n", province_loss.head())

# 2. Vehicle makes with highest average claim
if 'make' in data.columns and 'TotalClaims' in data.columns:
    make_claim = data.groupby('make')['TotalClaims'].mean().sort_values(ascending=False)
    print("\nTop 5 Vehicle Makes by Average Claim:\n", make_claim.head())

# 3. Gender differences in average loss ratio
if 'Gender' in data.columns and 'LossRatio' in data.columns:
    gender_loss = finite_loss_ratio.groupby(data['Gender']).mean()
    print("\nAverage Loss Ratio by Gender:\n", gender_loss)

print("\nEDA completed successfully!")
