In [3]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 1-2. Read the training table and discard the columns that are not used
#      (two identifier-style columns and the approval flag).
df = (
    pd.read_csv("Training_Dataset.csv")
    .drop(columns=["ic_number", "approval_flag", "plate_number"])
)

# 3. Target is the coverage amount; every remaining column is a candidate feature.
y = df["coverage_amount"]
X = df.drop(columns="coverage_amount")

# 4. Identify columns
# Columns routed through StandardScaler. The "*_flag" columns are listed here
# too, so they are standardised like every other numeric input.
numeric_features = [
    "age", "months_as_customer", "vehicle_age_years",
    "policy_expired_flag", "deductible_amount", "market_value",
    "damage_severity_score", "repair_amount", "at_fault_flag",
    "time_to_report_days", "claim_reported_to_police_flag",
    "license_type_missing_flag", "num_third_parties", "num_witnesses"
]
# Column that is one-hot encoded.
categorical_features = ["vehicle_make"]

# 5. Preprocessor
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the cell runs on both old and new installations; either way
# the encoder is asked for a dense array, with the first category's column dropped.
try:
    onehot_encoder = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    onehot_encoder = OneHotEncoder(drop="first", sparse=False)
# NOTE(review): handle_unknown is left at its default ("error"), so transforming
# a vehicle_make value that was not present during fit raises. Confirm this is
# the intended behaviour at inference time.

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", onehot_encoder, categorical_features),
])

# 6. Choose a regressor
regressor = Ridge(alpha=1.0)
# Alternative (needs `from sklearn.ensemble import RandomForestRegressor`):
# regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Preprocessing and model are bundled so both are fitted together and travel
# as a single estimator.
pipe = Pipeline([
    ("preproc", preprocessor),
    ("reg", regressor),
])

# 7. Hold back 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 8. Fit the whole pipeline (preprocessing + regressor) on the training rows only.
pipe.fit(X_train, y_train)

# 9. Score the fitted pipeline on the held-out rows.
y_pred = pipe.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"HOLD-OUT MSE: {mse:.2f}")
print(f"HOLD-OUT R²:  {r2:.3f}")

# 10. Cross-validated performance
# Both metrics are computed in a single cross_validate pass, so the pipeline is
# fitted once per fold (5 fits) instead of once per fold per metric (10 fits),
# and MSE and R² are guaranteed to come from the same fitted models.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_validate(
    pipe, X, y, cv=cv, scoring=("neg_mean_squared_error", "r2")
)
# The scorer reports negated MSE (higher is better); flip the sign back.
cv_mse = -cv_scores["test_neg_mean_squared_error"]
cv_r2 = cv_scores["test_r2"]

print(f"5-fold CV MSE: {cv_mse.mean():.2f} ± {cv_mse.std():.2f}")
print(f"5-fold CV R²:  {cv_r2.mean():.3f} ± {cv_r2.std():.3f}")



HOLD-OUT MSE: 26070133.58
HOLD-OUT R²:  0.805
5-fold CV MSE: 29756285.36 ± 5958872.84
5-fold CV R²:  0.749 ± 0.064




In [2]:
# 11. Serialize the trained pipeline
# Run this cell after the training cell so the file on disk matches the
# metrics reported there. The path is defined once so the confirmation message
# always names the file that was actually written (the previous message used a
# different letter case than the real filename).
model_path = "Coverage_Model.pkl"
joblib.dump(pipe, model_path)
print(f"Trained regression pipeline saved to {model_path}")

Trained regression pipeline saved to coverage_model.pkl
