In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

np.random.seed(42)
n_normal = 90000
n_extreme = 10000

policy_types = ["Health", "Life", "Property", "Vehicle"]
regions = ["North", "South", "East", "West", "Central"]

def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))

start_date = datetime.strptime("2015-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2023-12-31", "%Y-%m-%d")

normal_data = {
    "Policy_ID": [f"N{100000+i}" for i in range(n_normal)],
    "Policy_Type": np.random.choice(policy_types, n_normal),
    "Region": np.random.choice(regions, n_normal),
    "Start_Date": [random_date(start_date, end_date).strftime("%d-%m-%Y") for _ in range(n_normal)],
    "Sum_Insured": np.random.randint(100000, 500000, n_normal),
    "Premium": np.random.randint(1000, 5000, n_normal),
    "Claim_Count": np.random.poisson(1, n_normal),
    "Total_Claim_Amount": np.random.randint(0, 50000, n_normal)
}

extreme_data = {
    "Policy_ID": [f"X{200000+i}" for i in range(n_extreme)],
    "Policy_Type": np.random.choice(["Health", "Life", "Property"], n_extreme, p=[0.4, 0.4, 0.2]),
    "Region": np.random.choice(["South", "North", "Central"], n_extreme),
    "Start_Date": [random_date(start_date, end_date).strftime("%d-%m-%Y") for _ in range(n_extreme)],
    "Sum_Insured": np.random.randint(300000, 1000000, n_extreme),
    "Premium": np.random.randint(5000, 20000, n_extreme),
    "Claim_Count": np.random.randint(10, 30, n_extreme),
    "Total_Claim_Amount": np.random.randint(100000, 1000000, n_extreme)
}

df_normal = pd.DataFrame(normal_data)
df_extreme = pd.DataFrame(extreme_data)
df_combined = pd.concat([df_normal, df_extreme], ignore_index=True)

df_combined.to_csv("insurance_training_data_with_extremes.csv", index=False)
df_combined.shape


(100000, 8)

In [2]:
import pandas as pd

# Load the synthetic training data with normal + extreme cases
df = pd.read_csv("insurance_training_data_with_extremes.csv")

# Show a preview
df.head()


Unnamed: 0,Policy_ID,Policy_Type,Region,Start_Date,Sum_Insured,Premium,Claim_Count,Total_Claim_Amount
0,N100000,Property,South,29-09-2017,235435,4678,1,41360
1,N100001,Vehicle,South,06-06-2019,184003,2148,0,38847
2,N100002,Health,West,01-12-2019,387500,4182,0,31394
3,N100003,Property,North,16-08-2019,190032,2520,0,4318
4,N100004,Property,South,13-10-2023,493778,3993,2,42513


In [3]:
import numpy as np
from datetime import datetime

# Parse Start_Date correctly (format is "DD-MM-YYYY")
df["Start_Date"] = pd.to_datetime(df["Start_Date"], format="%d-%m-%Y", errors="coerce")

# Calculate policy age in days
df["Policy_Age_Days"] = (pd.to_datetime("today") - df["Start_Date"]).dt.days

# Drop rows with missing Start_Date if any
df = df.dropna(subset=["Policy_Age_Days"])

# Preview result
df[["Start_Date", "Policy_Age_Days"]].head()


Unnamed: 0,Start_Date,Policy_Age_Days
0,2017-09-29,2838
1,2019-06-06,2223
2,2019-12-01,2045
3,2019-08-16,2152
4,2023-10-13,633


In [4]:
# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=["Policy_Type", "Region"], drop_first=False)

# Preview new columns
df_encoded.head()


Unnamed: 0,Policy_ID,Start_Date,Sum_Insured,Premium,Claim_Count,Total_Claim_Amount,Policy_Age_Days,Policy_Type_Health,Policy_Type_Life,Policy_Type_Property,Policy_Type_Vehicle,Region_Central,Region_East,Region_North,Region_South,Region_West
0,N100000,2017-09-29,235435,4678,1,41360,2838,False,False,True,False,False,False,False,True,False
1,N100001,2019-06-06,184003,2148,0,38847,2223,False,False,False,True,False,False,False,True,False
2,N100002,2019-12-01,387500,4182,0,31394,2045,True,False,False,False,False,False,False,False,True
3,N100003,2019-08-16,190032,2520,0,4318,2152,False,False,True,False,False,False,True,False,False
4,N100004,2023-10-13,493778,3993,2,42513,633,False,False,True,False,False,False,False,True,False


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Define features (drop ID and Start_Date)
features = [col for col in df_encoded.columns if col not in ["Policy_ID", "Start_Date"]]

# Split into X and y
X = df_encoded[features]
y = df_encoded["Total_Claim_Amount"]  # 👈 We're predicting claim amount as proxy for risk

# Optional: scale y if using large values
# y = np.log1p(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
from math import sqrt
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ Model Trained!")
print(f"📉 RMSE: {rmse:.4f}")
print(f"📈 R² Score: {r2:.4f}")


✅ Model Trained!
📉 RMSE: 32.2910
📈 R² Score: 1.0000


In [7]:
import joblib

# Save model as a .pkl file
joblib.dump(model, "risk_model.pkl")


['risk_model.pkl']

In [8]:
from google.colab import files
files.download("risk_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>