In [None]:
import sys
import os

# Make the directory above the current working directory importable so the
# `src` package imports below resolve. The membership check keeps the cell
# idempotent: re-running it no longer piles duplicate entries onto sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_loader import load_data, parse_dates
from src.eda import compute_loss_ratio, group_loss_ratio, check_missing, get_summary_stats
from src.visualizations import plot_histogram, plot_boxplot, plot_loss_ratio_by_category
# Read the raw ratings file; the fields are pipe-separated, hence sep="|".
# NOTE(review): load_data is a project helper - presumably returns a pandas
# DataFrame (the code below relies on .shape and .columns); confirm.
df = load_data(
    "../data/raw/MachineLearningRating_v3.txt",
    sep="|",
    encoding="utf-8",
)

# Quick sanity check of what was loaded: dimensions and the leading headers.
print("✅ Shape:", df.shape)
leading_columns = df.columns.tolist()[:5]
print("✅ First 5 columns:", leading_columns)


In [None]:
# Clean column names: trim surrounding whitespace and drop embedded spaces.
df.columns = df.columns.str.strip().str.replace(" ", "")

# Resolve the transaction-month column without assuming its capitalisation.
# The headers are never lower-cased in this notebook and other cells address
# columns in CamelCase (e.g. 'TotalPremium'), so a hard-coded lowercase
# "transactionmonth" may not match the actual header.
date_column = next(
    (name for name in df.columns if str(name).lower() == "transactionmonth"),
    None,
)
if date_column is None:
    raise KeyError(
        "No transaction month column found (case-insensitive match on 'transactionmonth')"
    )

# Parse the date column with the project helper.
# NOTE(review): assumes parse_dates returns the frame with the same column
# name converted to a datetime dtype - confirm against src.data_loader.
df = parse_dates(df, date_column=date_column)

# Confirm the conversion by showing the first values and the resulting dtype.
print(df[date_column].head())
print(df[date_column].dtype)

In [None]:

# Data quality: run the project's missing-value check on the loaded frame
# and print whatever report it returns.
missing = check_missing(df)
print(
    "Missing values:\n",
    missing,
)


In [None]:
# Normalise the column labels in one pass: trim leading/trailing whitespace,
# then remove any spaces inside the names. Lower-casing is deliberately not
# applied here; later cells refer to columns by their mixed-case names.
df.columns = df.columns.str.strip().str.replace(" ", "")

# List every column name, one per line.
for col in df.columns.tolist():
    print(col)


In [None]:
# --- Descriptive statistics for the key financial columns ---
# The column list is kept in its own variable and handed to the project
# helper together with the frame.
key_numeric = [
    'TotalPremium',
    'TotalClaims',
    'CustomValueEstimate',
]
summary_stats = get_summary_stats(df, key_numeric)
print("\nSummary Stats:\n", summary_stats)

In [None]:
# --- Loss ratio across the whole portfolio ---
# NOTE(review): compute_loss_ratio is a project helper; presumably total
# claims divided by total premium - confirm in src.eda.
portfolio_loss_ratio = compute_loss_ratio(df)

# Report the figure rounded to three decimal places.
rounded_loss_ratio = round(portfolio_loss_ratio, 3)
print("\n📊 Portfolio Loss Ratio:", rounded_loss_ratio)

In [None]:
from src.visualizations import plot_histogram, plot_boxplot, plot_loss_ratio_by_category

# Numeric columns: draw a histogram followed by a box plot for each one.
numeric_columns = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate']
for col in numeric_columns:
    plot_histogram(df, col)
    plot_boxplot(df, col)

# Categorical columns: plot the grouped loss ratio for each category that is
# actually present in the frame; absent ones are skipped silently.
category_columns = ['Province', 'VehicleType', 'Gender']
for cat in category_columns:
    if cat not in df.columns:
        continue
    grouped = group_loss_ratio(df, cat)
    plot_loss_ratio_by_category(grouped, cat)