# Exploratory Data Analysis for Auto Insurance Fraud Dataset
This notebook loads and explores the datasets provided: `pa_decisions_2022_2023.parquet` and `rl_tensors_2022_2023.npz`.

## 1. Imports and Dataset Loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq

# Locations of the two processed artifacts, relative to the notebook's working directory.
pa_decisions_path = "../data/processed/pa_decisions_2022_2023.parquet"
rl_tensors_path = '../data/processed/rl_tensors_2022_2023.npz'

# Read the PA decisions Parquet file with pyarrow, then convert the Arrow table
# into a pandas DataFrame for the rest of the analysis.
table = pq.read_table(pa_decisions_path)
pa_df = table.to_pandas()

# Open the RL tensors archive.
# NOTE(review): allow_pickle=True permits unpickling of object arrays, which can
# execute arbitrary code if the archive is not trusted — confirm the file's
# provenance, or switch to allow_pickle=False if no object arrays are stored.
rl_npz = np.load(rl_tensors_path, allow_pickle=True)

# Preview the first rows of the PA decisions frame (rendered as the cell output).
pa_df.head()

## 2. Basic Information

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
pa_shape = pa_df.shape
print("PA Decisions shape:", pa_shape)

# Per-column dtype and non-null summary (printed by pandas itself).
pa_df.info()

# Summary statistics; left as the last expression so the notebook renders the table.
pa_df.describe()

## 3. Missing Values

In [None]:
# Count nulls per column and express each count as a percentage of all rows.
total_rows = len(pa_df)
missing_values = pa_df.isnull().sum()
missing_percent = 100 * missing_values / total_rows

# One row per source column, holding the absolute count and the percentage.
missing_df = missing_values.to_frame("Missing Count").assign(**{"Missing %": missing_percent})

# Show the columns with the most missing data first.
missing_df.sort_values(by="Missing %", ascending=False)

## 4. Categorical Features Exploration

In [None]:
# Treat both plain object columns and pandas `category` columns as categorical;
# selecting only 'object' would silently skip category-dtype columns.
categorical_cols = pa_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Upper bound on how many distinct values are listed and plotted per column, so
# that high-cardinality columns do not flood the output or produce unreadable plots.
max_levels_shown = 30

for col in categorical_cols:
    # Compute the frequency table once and reuse it for both the printout and the plot order.
    level_counts = pa_df[col].value_counts()

    print(f"\nColumn: {col}")
    if level_counts.empty:
        # Nothing to list or plot (e.g. the column holds only missing values).
        print("(no non-null values)")
        continue
    if len(level_counts) > max_levels_shown:
        print(f"(showing top {max_levels_shown} of {len(level_counts)} distinct values)")

    top_counts = level_counts.head(max_levels_shown)
    print(top_counts)

    # Scale the figure height with the number of bars so the labels stay legible.
    fig, ax = plt.subplots(figsize=(8, max(3, 0.3 * len(top_counts))))
    sns.countplot(y=col, data=pa_df, order=top_counts.index, ax=ax)
    ax.set_title(f"Value counts: {col}")
    plt.show()
    # Release the figure explicitly; the loop may create many of them.
    plt.close(fig)

## 5. Numerical Features Exploration

In [None]:
# Select every numeric dtype (int32, float32, nullable ints, ...) rather than only
# int64/float64. Timedelta columns are excluded explicitly because NumPy classifies
# timedelta64 under its numeric hierarchy.
numerical_cols = pa_df.select_dtypes(include='number', exclude='timedelta').columns.tolist()

if numerical_cols:
    # Three histograms per row; ceiling division avoids allocating a spare empty
    # row when the column count is an exact multiple of three.
    hist_cols = 3
    hist_rows = -(-len(numerical_cols) // hist_cols)
    pa_df[numerical_cols].hist(bins=30, figsize=(15, 10), layout=(hist_rows, hist_cols))
    plt.tight_layout()
    plt.show()
else:
    # With no numeric columns there is nothing to plot; say so instead of erroring.
    print("No numerical columns found; skipping histograms.")

## 6. Correlations

In [None]:
# Pairwise correlations between the numerical columns (DataFrame.corr with its defaults).
corr_matrix = pa_df[numerical_cols].corr()

# Draw the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap of Numerical Features")
plt.show()

## 7. Target Variable Analysis (if there is a fraud/decision column)

In [None]:
# Name of the label column; the whole cell is skipped when the frame has no such column.
target_col = 'fraud'

if target_col in pa_df.columns:
    # Bar chart of how many rows fall into each label value.
    ax = sns.countplot(x=target_col, data=pa_df)
    ax.set_title("Fraud vs Non-Fraud Cases")
    plt.show()

    # Relative frequency of each label value (fractions that sum to 1).
    class_shares = pa_df[target_col].value_counts(normalize=True)
    print(class_shares)

## 8. RL Tensors Exploration

In [None]:
# List the array names stored in the archive.
print("Keys in RL npz file:", rl_npz.files)

# Display shapes of all arrays
for key in rl_npz.files:
    # Arrays in an .npz archive are loaded lazily on access, so index the archive
    # once per key and reuse the result instead of loading the same array twice.
    arr = rl_npz[key]
    print(f"{key}: shape={arr.shape}, dtype={arr.dtype}")

## 9. Example Visualization for RL Tensors (if numeric)

In [None]:
# Replace 'observations' with actual key names if available
obs_key = 'observations'

if obs_key in rl_npz.files:
    obs = rl_npz[obs_key]

    # Only summarize arrays that are numeric and non-empty: min()/max() raise on
    # zero-size arrays, and object/string arrays (possible because the archive is
    # opened with allow_pickle=True) cannot be meaningfully histogrammed.
    if obs.size == 0 or not np.issubdtype(obs.dtype, np.number):
        print(f"Skipping '{obs_key}': shape={obs.shape}, dtype={obs.dtype} is empty or non-numeric.")
    else:
        print("Observations stats:")
        print("Min:", obs.min(), "Max:", obs.max(), "Mean:", obs.mean(), "Std:", obs.std())

        # ravel() avoids the unconditional copy made by flatten(). Non-finite values
        # are dropped for the plot only, because histogram binning cannot derive a
        # range from NaN/inf; the statistics above still reflect the raw array.
        flat_obs = obs.ravel()
        finite_obs = flat_obs[np.isfinite(flat_obs)]

        plt.figure(figsize=(10, 4))
        plt.hist(finite_obs, bins=50)
        plt.title("Distribution of RL Observations")
        plt.show()

## 10. Summary
- Number of records, columns
- Missing values overview
- Key categorical and numerical distributions
- Correlations
- RL tensor shapes and basic stats

In [None]:
# Final marker so a full "Run All" visibly reaches the end of the notebook.
completion_message = "EDA Complete: Dataset overview ready for modeling or further analysis."
print(completion_message)