# Title Insurance Cost Prediction - E2E Notebook

This notebook helps you:
- Download or load the dataset
- Explore features (EDA)
- Train and evaluate multiple models
- Save the best model

Run the cells top-to-bottom. Update the `DATASET_PATH` and `TARGET_COLUMN` as needed.


In [5]:
# Config
DATASET_PATH = "insurance.csv"  # Your dataset file
TARGET_COLUMN = "charges"  # Target column in insurance dataset
ID_COLS = []  # No ID columns in this dataset
RANDOM_STATE = 42
VALID_SIZE = 0.2

import os
os.makedirs("data", exist_ok=True)
os.makedirs("artifacts", exist_ok=True)



In [6]:
# Optional: Download from Kaggle (uncomment and set dataset)
# Requires Kaggle API to be configured per README
# Example uses a placeholder dataset path; replace with the correct dataset slug
# ex: kaggle datasets download -d someuser/title-insurance-pricing -p data --unzip

import subprocess
from shutil import which

if which("kaggle") is None:
    print("Kaggle CLI not found. Configure Kaggle per README if you want to auto-download.")
else:
    print("Kaggle CLI is available. To download, run a cell like:")
    print("!kaggle datasets download -d someuser/title-insurance-pricing -p data --unzip")



Kaggle CLI not found. Configure Kaggle per README if you want to auto-download.


In [7]:
# Load data
import pandas as pd

df = pd.read_csv(DATASET_PATH)
print(df.shape)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'insurance.csv'

In [None]:
# Basic EDA
import numpy as np

print("Columns:", df.columns.tolist())
print("\nDtypes:\n", df.dtypes)
print("\nMissing values:\n", df.isna().sum().sort_values(ascending=False).head(20))

numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != TARGET_COLUMN]
categorical_cols = [c for c in df.columns if c not in numeric_cols + [TARGET_COLUMN]]
print("\nNumeric cols:", numeric_cols[:20])
print("Categorical cols:", categorical_cols[:20])

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6,4))
sns.histplot(df[TARGET_COLUMN], kde=True)
plt.title("Target distribution")
plt.show()

corr_candidates = [c for c in numeric_cols + [TARGET_COLUMN] if pd.api.types.is_numeric_dtype(df[c])]
if len(corr_candidates) <= 60:
    plt.figure(figsize=(min(0.6*len(corr_candidates)+4, 16), 10))
    sns.heatmap(df[corr_candidates].corr(numeric_only=True), cmap="coolwarm", center=0)
    plt.title("Correlation heatmap (numeric)")
    plt.show()


In [None]:
# Train/validation split and preprocessing + models
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

try:
    from lightgbm import LGBMRegressor
except Exception:
    LGBMRegressor = None

features = [c for c in df.columns if c not in ID_COLS + [TARGET_COLUMN]]
X = df[features]
y = df[TARGET_COLUMN].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=VALID_SIZE, random_state=RANDOM_STATE
)

numeric_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
categorical_cols = [c for c in X_train.columns if c not in numeric_cols]

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

pre = ColumnTransformer([
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols),
])

candidates = {}
candidates["linear_regression"] = Pipeline([
    ("pre", pre),
    ("est", LinearRegression()),
])

candidates["random_forest"] = Pipeline([
    ("pre", pre),
    ("est", RandomForestRegressor(n_estimators=400, n_jobs=-1, random_state=RANDOM_STATE)),
])

if XGBRegressor is not None:
    candidates["xgboost"] = Pipeline([
        ("pre", pre),
        ("est", XGBRegressor(
            n_estimators=800,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            tree_method="hist",
        )),
    ])

if LGBMRegressor is not None:
    candidates["lightgbm"] = Pipeline([
        ("pre", pre),
        ("est", LGBMRegressor(
            n_estimators=1200,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        )),
    ])

results = {}
for name, pipe in candidates.items():
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_valid)
    rmse = float(np.sqrt(mean_squared_error(y_valid, pred)))
    mae = float(mean_absolute_error(y_valid, pred))
    r2 = float(r2_score(y_valid, pred))
    results[name] = {"rmse": rmse, "mae": mae, "r2": r2}

results


In [None]:
# Pick best and save model
import joblib

best_name = min(results, key=lambda k: results[k]["rmse"])
print("Best model:", best_name, results[best_name])

best_pipe = candidates[best_name]
best_pipe.fit(X, y)

joblib.dump(best_pipe, "artifacts/best_model.joblib")
print("Saved to artifacts/best_model.joblib")


In [None]:
# Quick inference demo
import pandas as pd

# Create sample data from the original dataset (without target column)
sample = df.sample(5, random_state=RANDOM_STATE).drop(columns=[TARGET_COLUMN])
print("Sample input data:")
print(sample)
print("\nMaking predictions...")

# Load the saved model and predict
loaded = joblib.load("artifacts/best_model.joblib")
preds = loaded.predict(sample)

# Show results
output = sample.copy()
output["prediction"] = preds
print("\nPredictions:")
print(output)

# Also show actual values for comparison
actual = df.loc[sample.index, TARGET_COLUMN]
output["actual"] = actual
print("\nPredictions vs Actual:")
print(output[["prediction", "actual"]])
