In [490]:
import datetime
import re
from datetime import date

import numpy as np
import pandas as pd
import statsmodels.api as sm
from dateutil.relativedelta import relativedelta
from sklearn.compose import ColumnTransformer, make_column_transformer, TransformedTargetRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

In [444]:
# Load the claims parquet file into a DataFrame.
claims = pd.read_parquet(path="claims.parquet")

In [448]:
# Load the policies workbook into a DataFrame using the openpyxl engine.
policies = pd.read_excel(io="policies.xlsx", engine="openpyxl")

In [451]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [452]:
# Count how many rows share each `pol` value.
policies.loc[:, "pol"].value_counts()

f976a747    4
1edad92c    4
206daa37    4
03d18350    4
bd8d5692    4
           ..
c0c31cc6    1
66e20e0c    1
5df4372c    1
5d3eca51    1
7908e09d    1
Name: pol, Length: 1000, dtype: int64

In [496]:
def read_files(parquet_file, csv_file, xlsx_file):
    """Load the three input files and return the combined modelling table.

    Parameters
    ----------
    parquet_file : path of the claims parquet file.
    csv_file : path of the properties CSV file.
    xlsx_file : path of the policies Excel workbook (read with openpyxl).

    Returns
    -------
    The DataFrame produced by ``data_process`` from the three tables, after
    the claims ``start_date`` column and the policies ``start`` / ``end``
    columns have been run through ``date_process``.
    """
    raw_claims = pd.read_parquet(parquet_file)
    raw_policies = pd.read_excel(xlsx_file, engine="openpyxl")
    raw_properties = pd.read_csv(csv_file)

    parsed_claims = date_process(raw_claims, "start_date")

    # Both policy date columns get the same treatment, "start" first.
    parsed_policies = raw_policies
    for date_column in ("start", "end"):
        parsed_policies = date_process(parsed_policies, date_column)

    return data_process(parsed_claims, parsed_policies, raw_properties)

In [497]:
def date_process(df, column):
    """Return a copy of ``df`` with ``column`` parsed into datetimes.

    Each value is expected to look like ``<day digits><3-letter month><year>``
    (for example ``12Jan2020``). Any non-digit characters in the year part
    are stripped before parsing. Values that do not match the pattern, or
    that cannot be parsed with the ``%Y%b%d`` format, become ``NaT``.

    Parameters
    ----------
    df : pandas.DataFrame containing ``column`` as a string column.
    column : name of the column to parse.

    Returns
    -------
    A new DataFrame; ``df`` itself is not modified. As in the previous
    implementation, the parsed column ends up as the last column.
    """
    # Raw string: the pattern contains "\d", which is an invalid escape
    # sequence in a normal string literal.
    parts = df[column].str.extract(r'(?P<day>\d+)(?P<month>[A-Za-z]{3})(?P<year>.*)')

    # Clean the year into a new Series. Calling replace(..., inplace=True) on
    # a column selection is chained assignment, which does not update the
    # parent frame under pandas Copy-on-Write and would leave the year dirty.
    year = parts['year'].str.replace(r'[^0-9]', '', regex=True)

    parsed = pd.to_datetime(
        year.astype(str) + parts['month'] + parts['day'].astype(str),
        format='%Y%b%d',
        errors='coerce',
    )

    # The helper columns are never joined onto the frame, so caller columns
    # that happen to be called day/month/year are left alone.
    out = df.drop(columns=[column])
    out[column] = parsed
    return out

In [498]:
def data_process(claims, policies, properties):
    """Join claims, properties and policies into one modelling table.

    Parameters
    ----------
    claims : DataFrame with at least ``property``, ``pol``, ``start_date``
        and ``amount`` columns.
    policies : DataFrame with at least ``pol``, ``start`` and ``end``
        columns, where ``start`` / ``end`` are datetimes.
    properties : DataFrame with at least ``prop_id``, ``pol`` and ``sqft``
        columns.

    Returns
    -------
    A new DataFrame with one row per (property, policy, start date) group of
    claims that matched both a property and a policy. The key and date
    columns are dropped, and these columns are added:

    * ``claims`` - count of non-null ``amount`` values in the group,
    * ``duration`` - policy length in whole 365-day years (truncated),
    * ``exposure`` - ``sqft / 1000 * duration``,
    * ``claim_frequency`` - ``claims / exposure``.

    Rows whose exposure is not positive are removed, because the frequency
    is undefined for them. The input frames are not modified.
    """
    # dropna() / assign() return new frames, so the caller's objects are
    # left untouched and the function can be re-run on the same inputs.
    claims = claims.dropna()
    policies = policies.dropna()

    policy_length = policies['end'] - policies['start']
    policies = policies.assign(
        duration=(policy_length / datetime.timedelta(days=365)).astype(int)
    )

    # count() tallies the non-null values of every non-key column per group;
    # the `amount` count is renamed to `claims` below.
    claims_grouped = claims.groupby(['property', 'pol', 'start_date']).count().reset_index()

    # Both frames carry a `pol` column, so the merge suffixes them: pol_x
    # comes from the claims side, pol_y from the properties side (kept).
    interim_result = (
        pd.merge(claims_grouped, properties, left_on="property", right_on="prop_id", how="inner")
        .drop(columns=['pol_x', 'property'])
        .rename(columns={'pol_y': 'pol', 'amount': 'claims'})
    )

    result = pd.merge(
        interim_result,
        policies,
        left_on=['pol', 'start_date'],
        right_on=['pol', 'start'],
        how="inner",
    ).drop(columns=['end', 'start', 'prop_id', 'start_date', 'pol'])

    result['exposure'] = (result['sqft'] / 1000) * result['duration']

    # A policy shorter than 365 days truncates to duration 0, giving zero
    # exposure; dividing by it would produce an infinite frequency.
    result = result[result['exposure'] > 0].reset_index(drop=True)

    result['claim_frequency'] = result['claims'] / result['exposure']
    return result

In [499]:
# Build the modelling table from the three input files.
result_final = read_files(
    parquet_file="claims.parquet",
    csv_file="properties.csv",
    xlsx_file="policies.xlsx",
)

In [505]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    result_final,
    test_size=0.2,
    random_state=0,
)

In [506]:
# Numeric columns are standardised; the categorical column is one-hot encoded,
# with categories unseen at fit time ignored at transform time.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# NOTE(review): the target used further down (`claim_frequency`) is computed in
# data_process as claims / exposure, and both 'claims' and 'exposure' are fed
# in as features here. That looks like target leakage - confirm whether
# 'claims' should be excluded from the feature list.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ['sqft', 'age', 'exposure', 'claims']),
        ("cat", categorical_transformer, ['state']),
    ]
)

# Preprocessing followed by a very lightly regularised ridge regression.
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Ridge(alpha = 1e-6))]
)

In [529]:
# Separate the target column (`claim_frequency`) from the remaining columns
# for both the training and the held-out split.
y_train, y_test = df_train['claim_frequency'], df_test['claim_frequency']

X_train = df_train.drop(columns='claim_frequency')
X_test = df_test.drop(columns='claim_frequency')

In [509]:
# Cross-validate the ridge pipeline on the training split and tabulate, per
# fold, the fit/score times and the train/test negative mean absolute error.
# (Ridge is already imported in the notebook's import cell, so it is not
# re-imported here.)
pd.DataFrame(
    cross_validate(
        pipe,
        X_train,
        y_train,
        scoring=["neg_mean_absolute_error"],
        return_train_score=True,
    )
)


Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error
0,0.026948,0.021511,-0.108987,-0.1183
1,0.028242,0.009153,-0.120165,-0.111731
2,0.022598,0.008371,-0.114362,-0.116266
3,0.023083,0.008475,-0.115167,-0.116023
4,0.022605,0.008435,-0.116796,-0.112906


In [515]:
# Fit the preprocessing step on the training features, then apply it to them
# to get the design matrix used by the GLM below.
X_transformed = preprocessor.fit(X_train).transform(X_train)

In [516]:
# Gamma GLM of claim frequency on the preprocessed training features, using
# the family's default link. `sm` comes from the notebook's import cell.
# NOTE(review): no constant column is added to X_transformed here; the model
# only has an intercept insofar as the one-hot state columns span one -
# confirm that this is intended.
gamma_model = sm.GLM(y_train, X_transformed, family=sm.families.Gamma())
gamma_results = gamma_model.fit()
print(gamma_results.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:        claim_frequency   No. Observations:                14242
Model:                            GLM   Df Residuals:                    14236
Model Family:                   Gamma   Df Model:                            5
Link Function:          inverse_power   Scale:                         0.34653
Method:                          IRLS   Log-Likelihood:                 13370.
Date:                Fri, 26 Nov 2021   Deviance:                       9768.7
Time:                        05:02:16   Pearson chi2:                 4.93e+03
No. Iterations:                    26                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             3.2806      0.017    195.867      0.0

In [530]:
# Poisson regression on claim frequency, with each training row weighted by
# its exposure. PoissonRegressor is already imported in the notebook's import
# cell, so it is not re-imported here.
# The same `preprocessor` object used by the earlier cells is passed in, so
# fitting this pipeline re-fits that shared instance on X_train.
pipe_1 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", PoissonRegressor(alpha=1e-6, max_iter=300)),
    ]
).fit(X_train, y_train, regressor__sample_weight=X_train['exposure'])

In [531]:
# Predicted claim frequencies for the held-out rows.
pipe_1.predict(X=X_test)

array([0.14210745, 0.03227437, 0.19305008, ..., 0.24241986, 0.22168222,
       0.08315073])