In [30]:
import pandas as pd
import statsmodels.api as sm
import pandas as pd 
import numpy as np 

# Load
df = pd.read_csv("historical_data.csv")

# One-hot encode categoricals
df = pd.get_dummies(df, columns=["province", "carcolour"], drop_first=True)
bool_cols = df.select_dtypes(include=["bool"]).columns
df[bool_cols] = df[bool_cols].astype(int)



In [29]:
# Response variable
y = df["claims"]

# Predictors (exclude 'id', 'claims', 'exp_years')
X = df.drop(columns=["id", "claims", "exp_years"])

# Add constant
X = sm.add_constant(X)

# Fit model with exposure offset
model = sm.GLM(
    y,
    X,
    family=sm.families.Poisson(),
    offset=np.log(df["exp_years"])
).fit()

print(model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 claims   No. Observations:                80000
Model:                            GLM   Df Residuals:                    79980
Model Family:                 Poisson   Df Model:                           19
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -32288.
Date:                Mon, 17 Nov 2025   Deviance:                       43565.
Time:                        08:41:10   Pearson chi2:                 8.09e+04
No. Iterations:                     6   Pseudo R-squ. (CS):            0.01824
Covariance Type:            nonrobust                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -1.4256      0.285  

In [15]:
new_df = pd.read_csv("new_policies.csv")

In [16]:
new_df.head()


Unnamed: 0,id,province,carcolour,hp,gender,age,lic_years,pastclaims,exp_years
0,1,WC,White,218,1,29,11,2,0.481862
1,2,FS,White,189,0,27,8,2,0.232717
2,3,GP,Blue,166,1,31,13,0,0.169747
3,4,GP,Yellow,94,0,31,13,0,0.156057
4,5,MP,Blue,308,0,33,15,0,0.427105


In [17]:
new_df = pd.get_dummies(new_df, columns=["province", "carcolour"], drop_first=True)
bool_cols = new_df.select_dtypes(include=["bool"]).columns
new_df[bool_cols] = df[bool_cols].astype(int)

In [18]:
new_df.head()

Unnamed: 0,id,hp,gender,age,lic_years,pastclaims,exp_years,province_FS,province_GP,province_KZN,...,province_MP,province_NC,province_NW,province_WC,carcolour_Blue,carcolour_Green,carcolour_Red,carcolour_Silver,carcolour_White,carcolour_Yellow
0,1,218,1,29,11,2,0.481862,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,2,189,0,27,8,2,0.232717,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,3,166,1,31,13,0,0.169747,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,4,94,0,31,13,0,0.156057,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,308,0,33,15,0,0.427105,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [19]:
X_new = new_df.drop(columns=["id", "exp_years"])
X_new = sm.add_constant(X_new)

In [20]:
X.head()

Unnamed: 0,const,hp,gender,age,lic_years,pastclaims,province_FS,province_GP,province_KZN,province_LIM,province_MP,province_NC,province_NW,province_WC,carcolour_Blue,carcolour_Green,carcolour_Red,carcolour_Silver,carcolour_White,carcolour_Yellow
0,1.0,74,0,18,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
1,1.0,283,0,81,63,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0
2,1.0,122,0,54,35,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0
3,1.0,172,0,39,21,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1.0,161,0,54,35,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [21]:
prediction = model.predict(X_new, offset=np.log(new_df["exp_years"]))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [22]:
prediction 

0        0.137353
1        0.056276
2        0.035942
3        0.023374
4        0.113807
           ...   
14547    0.040070
14548    0.048379
14549    0.069917
14550    0.043779
14551    0.004666
Name: exp_years, Length: 14552, dtype: float64

In [23]:
annual_frequency = prediction / new_df["exp_years"]


In [24]:
annual_frequency

0        0.285047
1        0.241820
2        0.211741
3        0.149780
4        0.266461
           ...   
14547    0.147832
14548    0.123570
14549    0.290197
14550    0.124923
14551    0.131109
Name: exp_years, Length: 14552, dtype: float64

In [27]:
simulated_claims = np.random.poisson(lam=prediction)
simulated_claims

array([0, 0, 0, ..., 0, 0, 0], shape=(14552,), dtype=int32)

In [34]:
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split

In [35]:
# new

# Response variable
y = df["claims"]

# Predictors (exclude 'id', 'claims', 'exp_years')
X = df.drop(columns=["id", "claims"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
model = PoissonRegressor(alpha=1, fit_intercept=True, max_iter=1000)
model.fit(X_train, y_train, sample_weight=df.loc[y_train.index, "exp_years"])
y_pred = model.predict(X_test)

In [36]:
#from sklearn.metrics import mean_poisson_deviance
    # y_true and y_pred are arrays of true and predicted counts
#mpd = mean_poisson_deviance(y_true, y_pred)
from sklearn.metrics import (
    mean_absolute_error,
    mean_poisson_deviance,
    mean_squared_error,
)

In [38]:
y_pred

array([0.20527501, 0.12798628, 0.21128533, ..., 0.1834419 , 0.12324621,
       0.07873606], shape=(16000,))

In [41]:
mean_absolute_error(y_test, y_pred)


0.2579646637688003

In [42]:
mean_squared_error(y_test, y_pred)


0.1475550255297249

In [43]:
mean_poisson_deviance(y_test, y_pred)

0.575143575686425