# Notebook 1 – Exploratory Data Analysis (EDA)

This notebook expects dependencies to have been installed beforehand via `setup.py`; it loads `public_cases.json`, computes summary statistics, and explores correlations.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw cases. The encoding is pinned to UTF-8 so that decoding does
# not depend on the platform's default locale encoding.
with open('public_cases.json', encoding='utf-8') as f:
    data = json.load(f)
print(f'Total records: {len(data)}')

# The prints below index data[0]; fail with a clear message instead of a
# bare IndexError when the file holds no records.
if not data:
    raise ValueError('public_cases.json contains no records')
print('Record keys:', data[0].keys())
print('Input keys:', data[0]['input'].keys())

# Flatten each record into one row; fields nested under 'input' come out
# as 'input.<field>' columns.
df = pd.json_normalize(data)

# Drop only a *leading* 'input.' so that a column which merely contains
# that substring elsewhere in its name is left untouched.
input_prefix = 'input.'
df.columns = [
    c[len(input_prefix):] if c.startswith(input_prefix) else c
    for c in df.columns
]
print(df.head())


In [None]:
# Summary statistics: mean, median and standard deviation of the three
# input columns and of expected_output, one statistic per row.
summary_columns = [
    'trip_duration_days',
    'miles_traveled',
    'total_receipts_amount',
    'expected_output',
]
stats = df.loc[:, summary_columns].agg(['mean', 'median', 'std'])
print(stats)


In [None]:
# Histograms: one panel per column on a 2x2 grid, each with a KDE overlay.
cols = [
    'trip_duration_days',
    'miles_traveled',
    'total_receipts_amount',
    'expected_output',
]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))

# axes is a 2x2 array; ravel() walks it row by row so each column name is
# paired with the next free panel.
for col, ax in zip(cols, axes.ravel()):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(col)

plt.tight_layout()
plt.show()


In [None]:
# Correlation analysis

# Derive per-day rates by dividing mileage and receipts by trip length.
# NOTE(review): assumes trip_duration_days is never zero - TODO confirm.
trip_days = df['trip_duration_days']
df['miles_per_day'] = df['miles_traveled'].div(trip_days)
df['receipts_per_day'] = df['total_receipts_amount'].div(trip_days)

# Correlation matrix of the two derived rates and expected_output.
rate_columns = ['miles_per_day', 'receipts_per_day', 'expected_output']
print(df[rate_columns].corr())

# Scatter each derived rate against expected_output; plt.show() is called
# after every plot, exactly as in the two original stanzas.
scatter_specs = [
    ('miles_per_day', 'Miles per Day vs Reimbursement'),
    ('receipts_per_day', 'Receipts per Day vs Reimbursement'),
]
for rate_col, plot_title in scatter_specs:
    sns.scatterplot(x=rate_col, y='expected_output', data=df)
    plt.title(plot_title)
    plt.show()


## Findings
- Five-day trips often pay a bit more than surrounding durations.
- Mileage reimbursement growth slows after ~100 miles.
- Receipts around \$600-800 total (or \$100-120/day) seem to align with higher reimbursements.

These patterns generally match the interview notes.