# Exploratory Data Analysis for the PA Decisions and RL Tensors Datasets
This notebook loads and explores the two provided datasets: `pa_decisions_2022_2023.parquet` and `rl_tensors_2022_2023.npz`.

## 1. Imports and Dataset Loading

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq

# Load the PA decisions dataset. The Arrow table is converted to pandas in the
# same expression so no second, notebook-global copy of the data stays alive.
pa_df = pq.read_table("../data/processed/pa_decisions_2022_2023.parquet").to_pandas()

# Load the RL tensors archive. For .npz input np.load returns a lazy NpzFile;
# individual arrays are read when the archive is indexed by key.
# SECURITY NOTE(review): allow_pickle=True permits unpickling object arrays,
# which can execute arbitrary code. Only load archives from a trusted source,
# and switch to allow_pickle=False if the file holds no object-dtype arrays.
rl_npz = np.load('../data/processed/rl_tensors_2022_2023.npz', allow_pickle=True)

# Show first few rows of PA decisions (last expression, so the notebook renders it)
pa_df.head()

Unnamed: 0,game_pk,game_date,game_year,pitch_number,at_bat_number,batter,pitcher_on_mound,home_team,away_team,stand,...,lineup_idx,next_hitters_ids,time_index_pa,delta_re_pa,half_inning_over,game_over,reward_folded,next_state_idx,action_idx,next_pitcher_id
0,661032,2022-04-26,2022,1,1,664702,663776,LAA,CLE,R,...,0,"[642708, 608070, 614177, 680911, 640458]",1650931200000000011,-0.247,False,False,0.740466,2,0,-1
1,661032,2022-04-26,2022,1,2,642708,663776,LAA,CLE,R,...,1,"[608070, 614177, 680911, 640458, 676391]",1650931200000000015,-0.246,False,False,0.49845,2,0,-1
2,661032,2022-04-26,2022,1,3,608070,663776,LAA,CLE,R,...,2,"[614177, 680911, 640458, 676391, 595978]",1650931200000000017,-0.255,True,False,0.255,2,0,-1
3,661032,2022-04-26,2022,1,7,614177,663776,LAA,CLE,R,...,3,"[680911, 640458, 676391, 595978, 665926]",1650931200000000029,-0.246,False,False,0.740446,5,0,-1
4,661032,2022-04-26,2022,1,8,680911,663776,LAA,CLE,R,...,4,"[640458, 676391, 595978, 665926, 664702]",1650931200000000033,-0.246,False,False,0.49944,5,0,-1


## 2. Basic Information

In [2]:
# Dimensions of the PA decisions frame, printed as the (rows, columns) tuple
print(f"PA Decisions shape: {pa_df.shape}")

# Per-column dtype and non-null counts (pandas prints this directly)
pa_df.info()

# Summary statistics; kept as the final expression so the notebook renders it
pa_df.describe()

PA Decisions shape: (407660, 63)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407660 entries, 0 to 407659
Data columns (total 63 columns):
 #   Column                              Non-Null Count   Dtype         
---  ------                              --------------   -----         
 0   game_pk                             407660 non-null  Int64         
 1   game_date                           407660 non-null  datetime64[ns]
 2   game_year                           407660 non-null  Int64         
 3   pitch_number                        407660 non-null  Int64         
 4   at_bat_number                       407660 non-null  Int64         
 5   batter                              407660 non-null  Int64         
 6   pitcher_on_mound                    407660 non-null  Int64         
 7   home_team                           407660 non-null  object        
 8   away_team                           407660 non-null  object        
 9   stand                               407660 non-n

Unnamed: 0,game_pk,game_date,game_year,pitch_number,at_bat_number,batter,pitcher_on_mound,inning,outs,balls,...,pitch_count,n_priorpa_thisgame_player_at_bat_y,tto,lineup_idx,time_index_pa,delta_re_pa,reward_folded,next_state_idx,action_idx,next_pitcher_id
count,407660.0,407660,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,...,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0,407660.0
mean,692558.881313,2022-12-24 13:11:19.307266048,2022.496021,1.0,38.633133,623655.338655,619991.870627,4.959952,0.975948,0.0,...,28.674354,1.49111,0.466766,4.183763,1.671887e+18,0.001411,-0.010024,203830.794726,0.264885,56398.633773
min,661032.0,2022-03-17 00:00:00,2022.0,1.0,1.0,405395.0,405395.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.647475e+18,-0.646,-7.71138,2.0,0.0,-1.0
25%,662393.75,2022-06-26 00:00:00,2022.0,1.0,19.0,595281.0,594577.0,3.0,0.0,0.0,...,7.0,0.0,0.0,2.0,1.656202e+18,-0.267,-0.2598,101916.25,0.0,-1.0
50%,715745.0,2022-10-12 00:00:00,2022.0,1.0,38.0,643393.0,641729.0,5.0,1.0,0.0,...,19.0,1.0,0.0,4.0,1.665533e+18,-0.247,0.196906,203830.0,0.0,-1.0
75%,717673.0,2023-06-23 00:00:00,2023.0,1.0,57.0,666211.0,665152.0,7.0,2.0,0.0,...,47.0,2.0,1.0,6.0,1.687478e+18,0.348,0.46891,305745.0,0.0,-1.0
max,748585.0,2023-10-31 00:00:00,2023.0,1.0,133.0,807799.0,806288.0,18.0,2.0,0.0,...,126.0,7.0,4.0,21.0,1.69871e+18,2.612,1.149316,407659.0,10.0,806288.0
std,27521.228777,,0.499985,0.0,22.455897,58427.158417,60079.043687,2.587286,0.816651,0.0,...,26.513684,1.242937,0.702647,2.995651,1.645552e+16,0.444072,0.685971,117681.453785,1.253929,179258.808347


## 3. Missing Values

In [3]:
# Count nulls per column, then express each count as a percentage of all rows
missing_values = pa_df.isna().sum()
missing_percent = missing_values.mul(100).div(len(pa_df))

# Assemble both measures side by side, one row per column of pa_df
missing_df = pd.concat(
    [
        missing_values.rename("Missing Count"),
        missing_percent.rename("Missing %"),
    ],
    axis=1,
)

# Worst-affected columns first; final expression so the table is displayed
missing_df.sort_values("Missing %", ascending=False)

Unnamed: 0,Missing Count,Missing %
on_3b,366737,89.961488
on_2b,325411,79.824118
on_1b,273537,67.099298
launch_speed_angle,147224,36.114409
bb_type,132503,32.503312
...,...,...
game_over,0,0.000000
reward_folded,0,0.000000
next_state_idx,0,0.000000
action_idx,0,0.000000


## 4. Categorical Features Exploration

In [None]:
# Explore every object-dtype column: print its value counts and plot the most
# frequent categories as a horizontal count plot.
categorical_cols = pa_df.select_dtypes(include='object').columns.tolist()

# Cap on plotted categories so high-cardinality columns stay legible and fast.
max_categories = 20

for col in categorical_cols:
    print(f"\nColumn: {col}")
    try:
        # Computed once and reused for both the printout and the plot order.
        counts = pa_df[col].value_counts()
    except TypeError:
        # Object columns holding unhashable values (e.g. lists or arrays)
        # cannot be counted; skip them instead of aborting the whole loop.
        print("Skipped: values are not hashable, so they cannot be counted.")
        continue
    print(counts)

    if counts.empty:
        # Nothing to draw for an all-null column.
        continue

    top_counts = counts.head(max_categories)
    # Height grows with the number of bars so labels do not overlap.
    plt.figure(figsize=(8, max(3, 0.3 * len(top_counts))))
    sns.countplot(y=col, data=pa_df, order=top_counts.index)
    if len(counts) > len(top_counts):
        plt.title(f"{col} (top {len(top_counts)} of {len(counts)} categories)")
    else:
        plt.title(col)
    plt.show()

## 5. Numerical Features Exploration

In [None]:
# Select every numeric column regardless of bit width; 'number' also covers
# int32/float32 and similar, which an explicit int64/float64 list would miss.
numerical_cols = pa_df.select_dtypes(include='number').columns.tolist()

if numerical_cols:
    hist_ncols = 3
    # Ceiling division: just enough rows to hold one subplot per column.
    hist_nrows = -(-len(numerical_cols) // hist_ncols)
    pa_df[numerical_cols].hist(
        bins=30,
        # Scale the height with the row count so subplots are not squashed
        # when there are many numeric columns.
        figsize=(15, max(10, 3 * hist_nrows)),
        layout=(hist_nrows, hist_ncols),
    )
    plt.tight_layout()
    plt.show()
else:
    # DataFrame.hist raises when there is nothing to plot; report instead.
    print("No numeric columns to plot.")

## 6. Correlations

In [None]:
# Correlation heatmap over the numeric columns (numerical_cols is defined in
# the numerical-features cell).
# Zero-variance columns have an undefined correlation and would show up as
# all-NaN (blank) rows and columns, so they are dropped first.
varying_cols = [c for c in numerical_cols if pa_df[c].nunique(dropna=True) > 1]

if len(varying_cols) >= 2:
    corr_matrix = pa_df[varying_cols].corr()
    n_features = len(varying_cols)

    # Grow the canvas with the number of features; never smaller than 12x8.
    plt.figure(figsize=(max(12, 0.5 * n_features), max(8, 0.4 * n_features)))
    sns.heatmap(
        corr_matrix,
        # Cell annotations become illegible on large matrices.
        annot=n_features <= 20,
        fmt=".2f",
        cmap='coolwarm',
        # Pin the colour scale to the full correlation range, centred at zero.
        vmin=-1,
        vmax=1,
    )
    plt.title("Correlation Heatmap of Numerical Features")
    plt.show()
else:
    print("Fewer than two non-constant numeric columns; skipping correlation heatmap.")

## 7. Target Variable Analysis (only if a `fraud` column is present)

In [None]:
# Plot and print the class balance of the target column when it exists.
target_col = 'fraud'

if target_col in pa_df.columns:
    sns.countplot(x=target_col, data=pa_df)
    plt.title("Fraud vs Non-Fraud Cases")
    plt.show()
    # Relative frequency of each class.
    print(pa_df[target_col].value_counts(normalize=True))
else:
    # Say so explicitly: a cell with no output is indistinguishable from one
    # that was never run.
    print(f"No '{target_col}' column in pa_df; skipping target variable analysis.")

## 8. RL Tensors Exploration

In [None]:
# List the array names stored in the RL tensors archive.
print("Keys in RL npz file:", rl_npz.files)

# Display shapes of all arrays
for key in rl_npz.files:
    # Index the archive once per key: the lazy NpzFile reads the array from
    # disk on every lookup, so indexing twice would load each array twice.
    arr = rl_npz[key]
    print(f"{key}: shape={arr.shape}, dtype={arr.dtype}")

## 9. Example Visualization for RL Tensors (if an `observations` array is present)

In [None]:
# Replace 'observations' with actual key names if available
obs_key = 'observations'
if obs_key in rl_npz.files:
    obs = rl_npz[obs_key]

    # Summary statistics over every element of the array
    obs_min = obs.min()
    obs_max = obs.max()
    obs_mean = obs.mean()
    obs_std = obs.std()
    print("Observations stats:")
    print("Min:", obs_min, "Max:", obs_max, "Mean:", obs_mean, "Std:", obs_std)

    # Histogram of all values, flattened to one dimension
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(obs.flatten(), bins=50)
    ax.set_title("Distribution of RL Observations")
    plt.show()

## 10. Summary
- Number of records, columns
- Missing values overview
- Key categorical and numerical distributions
- Correlations
- RL tensor shapes and basic stats

In [None]:
# Final marker so a full run visibly reaches the end of the notebook
completion_message = "EDA Complete: Dataset overview ready for modeling or further analysis."
print(completion_message)