# Exploratory Data Analysis (EDA)

This notebook contains the exploratory data analysis for the insurance risk analytics project.

## Objectives

1. Data Understanding
2. Data Quality Assessment
3. Univariate Analysis
4. Bivariate/Multivariate Analysis
5. Temporal Trends Analysis
6. Outlier Detection


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Make the project's `src` package importable from this notebook by appending
# the parent of the current working directory to sys.path.
# NOTE(review): this assumes the notebook is executed from a directory that
# sits directly under the project root — confirm if the kernel's cwd differs.
sys.path.append(str(Path().resolve().parent))

# Project-local helpers; these imports only resolve after the sys.path tweak above.
from src.data.load_data import load_insurance_data, get_data_info
from src.utils.config import RAW_DATA_DIR

# Plot defaults for the notebook: seaborn "whitegrid" theme and a default
# figure size of (12, 6) for every matplotlib figure created afterwards.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


In [None]:
# Read the insurance dataset through the project's loader.
df = load_insurance_data()

# Structural overview: dimensions first, then the column labels, then the
# per-column dtype / non-null summary that DataFrame.info() writes to stdout.
print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names:\n{df.columns.tolist()}")
print("\nData Info:")
df.info()


In [None]:
# Data quality assessment: per-column missing-value counts and percentages.
print("Missing Values:")

# Number of null cells in each column, and the same figure as a share of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Keep only the columns that actually have gaps, worst offenders first.
missing_df = (
    pd.DataFrame({'Missing Count': missing, 'Missing Percentage': missing_pct})
    .loc[lambda summary: summary['Missing Count'] > 0]
    .sort_values(by='Missing Count', ascending=False)
)
print(missing_df)


In [None]:
# Calculate Loss Ratio = TotalClaims / TotalPremium.
#
# A row whose TotalPremium is exactly 0 would produce inf (or NaN for 0/0) under
# plain division, and a single inf is enough to distort the mean/std reported by
# describe() below. Zero premiums are therefore masked to NaN before dividing, so
# the ratio is NaN for those rows and describe() leaves them out of the statistics.
zero_premium_mask = df['TotalPremium'] == 0
df['LossRatio'] = df['TotalClaims'] / df['TotalPremium'].where(~zero_premium_mask)

# Make the masking visible instead of silently dropping rows from the statistics.
print(f"Rows with zero TotalPremium (LossRatio set to NaN): {int(zero_premium_mask.sum())}")

# Descriptive statistics
print("Descriptive Statistics for Key Financial Variables:")
financial_cols = ['TotalPremium', 'TotalClaims', 'LossRatio', 'SumInsured', 'CustomValueEstimate']
print(df[financial_cols].describe())
