In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

#Define FeatureEngineer (same as training)
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Add engineered feature columns to a DataFrame.

    ``fit`` learns nothing. Every statistic used by ``transform`` (the capping
    quantiles and the quantile band edges) is recomputed from the frame passed
    to ``transform`` itself. Each step only runs when the columns it needs are
    present, so missing inputs are skipped instead of raising.
    """

    def __init__(self):
        # Numeric score substituted for each known "Property Type" value.
        self.risk_map = {"House": 3, "Condo": 2, "Apartment": 1}

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered columns added.

        Modified in place on the copy (when present):
            * "Previous Claims" is clipped to its own 1st-99th percentile range.
            * "Annual Income" and "Health Score" are replaced by ``log1p``.

        Added (when their source columns are present): "Income_Band",
        "Health_Band", "Credit_Band", "Age_Group", "Age_Smoking",
        "VehiclePolicy", "Duration_Category", "Claims_per_Year" and
        "Property_Risk".

        The input frame itself is never mutated.
        """
        X = X.copy()

        def has(*names):
            # True only when every requested column exists in the frame.
            return all(name in X.columns for name in names)

        # Cap outliers
        # NOTE(review): the bounds come from the frame being transformed, so
        # they can differ between training and inference data - confirm this
        # is intended.
        if has("Previous Claims"):
            claims = X["Previous Claims"]
            floor = claims.quantile(0.01)
            ceiling = claims.quantile(0.99)
            X["Previous Claims"] = np.clip(claims, floor, ceiling)

        # Log transforms
        for skewed in ("Annual Income", "Health Score"):
            if has(skewed):
                X[skewed] = np.log1p(X[skewed])

        # Binning: up to five quantile bands per column, stored as integer
        # codes; duplicate bin edges are dropped rather than raising.
        band_columns = {
            "Annual Income": "Income_Band",
            "Health Score": "Health_Band",
            "Credit Score": "Credit_Band",
        }
        for source, band in band_columns.items():
            if has(source):
                X[band] = pd.qcut(X[source], q=5, labels=False, duplicates="drop")

        # Age groups
        # NOTE(review): pd.cut uses right-closed bins by default, so an Age of
        # exactly 18 (and anything above 80) presumably ends up NaN - verify.
        if has("Age"):
            age_edges = [18, 30, 45, 60, 80]
            age_names = ["Young", "Mid", "Mature", "Senior"]
            X["Age_Group"] = pd.cut(X["Age"], bins=age_edges, labels=age_names)
        if has("Smoking Status", "Age"):
            # Age for smokers, 0 for everyone else.
            smoker_flag = (X["Smoking Status"] == "Yes").astype(int)
            X["Age_Smoking"] = X["Age"] * smoker_flag

        # Vehicle policy: string key "<Vehicle Age>_<Policy Type>".
        if has("Vehicle Age", "Policy Type"):
            vehicle_age_text = X["Vehicle Age"].astype(str)
            policy_text = X["Policy Type"].astype(str)
            X["VehiclePolicy"] = vehicle_age_text + "_" + policy_text

        # Duration categories
        if has("Insurance Duration"):
            duration_edges = [0, 2, 5, 10, 20]
            duration_names = ["Short", "Mid", "Long", "Very Long"]
            X["Duration_Category"] = pd.cut(
                X["Insurance Duration"], bins=duration_edges, labels=duration_names
            )

        # Claims per year: the +1 keeps the denominator non-zero for a
        # duration of 0. Uses the already-capped claim counts.
        if has("Previous Claims", "Insurance Duration"):
            years = X["Insurance Duration"] + 1
            X["Claims_per_Year"] = X["Previous Claims"] / years

        # Property risk: look each property type up in risk_map.
        if has("Property Type"):
            X["Property_Risk"] = X["Property Type"].map(self.risk_map)

        return X

#Load trained pipeline
# NOTE: joblib.load unpickles the file, so only load a model file you trust.
# NOTE(review): presumably the pickled pipeline contains a FeatureEngineer
# step, which is why the class is defined before this call - confirm.
model = joblib.load("best_model.pkl")

#Load Test Data
test = pd.read_csv("test.csv")

#Keep ID column for submission
if "id" not in test.columns:
    # No id column supplied: number the rows 1..N instead.
    test_ids = pd.Series(range(1, len(test) + 1), name="ID")
else:
    # pop() hands back the column and removes it from the feature frame.
    test_ids = test.pop("id")

#Fix Policy Start Date features
if "Policy Start Date" in test.columns:
    # Unparseable dates become NaT, which yields NaN year/month values.
    start_dates = pd.to_datetime(test["Policy Start Date"], errors="coerce")
    test = test.assign(
        Policy_Start_Year=start_dates.dt.year,
        Policy_Start_Month=start_dates.dt.month,
    ).drop(columns=["Policy Start Date"])

print("✅ Test data ready:", test.shape)

#Predictions
# expm1 undoes a log1p applied to the target.
# NOTE(review): assumes the model was trained on log1p(target) - confirm.
preds_log = model.predict(test)
preds = np.expm1(preds_log)

#Save Submission
# NOTE(review): the input column is "id" but the output header is "ID";
# check which spelling the consumer of submission.csv expects.
submission = pd.DataFrame({"ID": test_ids, "Premium Amount": preds})
submission.to_csv("submission.csv", index=False)

print("✅ Predictions saved to submission.csv")
print(submission.head())

✅ Test data ready: (800000, 20)
✅ Predictions saved to submission.csv
        ID  Premium Amount
0  1200000      750.093445
1  1200001      823.804993
2  1200002      775.093872
3  1200003      766.412476
4  1200004      729.217773
