In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

In [29]:
# Define custom objective function: Adjusted R2 score
def adjusted_r2(y_true, y_pred, n_features):
    n_samples = len(y_true)
    r2 = r2_score(y_true, y_pred)
    adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)
    return adjusted_r2

# Define the custom scoring function
def custom_scoring(estimator, X, y):
    y_pred = estimator.predict(X)
    return adjusted_r2(y, y_pred, X.shape[1])

In [30]:
# Read in data set
df = pd.read_csv("data/data.csv", parse_dates=['DateTimeOfAccident', 'DateReported'])

# Copy original data frame
df_engineer = df.copy()

# Dummy encode marital status and gender
df_engineer['MaritalStatus'] = df_engineer['MaritalStatus'].apply(lambda x: 0 if x == 'M' else 1)
df_engineer['Gender'] = df_engineer['Gender'].apply(lambda x: 0 if x == 'M' else 1)

# Create feature of time since report
df_engineer['time_to_report'] = df_engineer['DateTimeOfAccident'] - df_engineer['DateReported']

# Create time since first time point in data to proxy for inflation
df_engineer['trend'] = df_engineer['DateTimeOfAccident'] - df_engineer['DateTimeOfAccident'].min()

# Create Target: Claim complexity (absolute difference)
df_engineer['claim_complexity'] = abs(df_engineer['UltimateIncurredClaimCost'] - df_engineer['InitialIncurredClaimsCost'])

In [32]:
# Declare target and feature set
y = df_engineer['claim_complexity']
X = df_engineer[['InitialIncurredClaimsCost', 'trend', 'Gender', 'time_to_report', 'MaritalStatus', 'Age',
                 'DependentChildren', 'DependentsOther', 'WeeklyWages', 'HoursWorkedPerWeek'] + 
                 [f'ClaimDescriptionKeyword_{i}' for i in range(12)]]

# Convert time columns to hours
X['trend'] = X['trend'] / pd.Timedelta(hours=1)
X['time_to_report'] = X['time_to_report'] / pd.Timedelta(hours=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Identify the continuous features to standardize
continuous_features = ['InitialIncurredClaimsCost', 'trend', 'time_to_report', 'MaritalStatus', 'Age',
                       'DependentChildren', 'DependentsOther'] + [f'ClaimDescriptionKeyword_{i}' for i in range(12) if i != 5]

# Standardize the continuous features
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test_scaled[continuous_features] = scaler.transform(X_test[continuous_features])

# Define the parameter grid for alpha values
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['trend'] = X['trend'] / pd.Timedelta(hours=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['time_to_report'] = X['time_to_report'] / pd.Timedelta(hours=1)


In [33]:
# Create a Lasso model
lasso = Lasso()

# Perform grid search with cross-validation
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Get the best alpha value
best_alpha = grid_search.best_params_['alpha']
print("Best alpha:", best_alpha)

# Train the Lasso model with the best alpha value
best_lasso = Lasso(alpha=best_alpha)
best_lasso.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_lasso.predict(X_test_scaled)

# Calculate the out-of-sample adjusted R-squared
out_of_sample_adjusted_r2 = adjusted_r2(y_test, y_pred, X_test.shape[1])
print("Out-of-sample Adjusted R-squared:", out_of_sample_adjusted_r2)

# Display the coefficients in ascending order of the features
coef_dict = dict(zip(X.columns, best_lasso.coef_))
sorted_coef = sorted(coef_dict.items(), key=lambda x: x[1])
print("Coefficients in ascending order:")
for feature, coef in sorted_coef:
    print(f"{feature}: {coef}")

Best alpha: 100
Out-of-sample Adjusted R-squared: 0.2405233043013283
Coefficients in ascending order:
ClaimDescriptionKeyword_6: -243.4729289411492
ClaimDescriptionKeyword_10: -138.6583898791414
ClaimDescriptionKeyword_3: -84.73294361379969
ClaimDescriptionKeyword_2: -59.822585681300154
DependentsOther: -13.295105780064748
trend: -5.608041499221645
Gender: 0.0
MaritalStatus: -0.0
ClaimDescriptionKeyword_0: 0.0
ClaimDescriptionKeyword_8: -0.0
WeeklyWages: 0.12565365942949136
HoursWorkedPerWeek: 0.6236037848038544
DependentChildren: 23.007528145111625
Age: 109.08009218935064
ClaimDescriptionKeyword_5: 124.34577936519285
ClaimDescriptionKeyword_4: 207.31084923947577
ClaimDescriptionKeyword_9: 235.235652809322
ClaimDescriptionKeyword_7: 290.1347005050749
time_to_report: 380.576361320848
ClaimDescriptionKeyword_11: 412.3010857212126
ClaimDescriptionKeyword_1: 775.2162564308585
InitialIncurredClaimsCost: 13456.485948999949
