<div style="border-radius:10px; border:#242e87 solid; padding: 15px; background-color: #9feced; font-size:100%; text-align:left">

<h3 align="left"><font color='#242e87'>💡 Inspiration:</font></h3>
    
* Purpose: to predict Base Price for Insurance
    
* We are going to use two separate models in order to predict 
    * Frequency of insurance claim
    * Severity of insurance claim
    <br>    <br>
* Then, we are going to multiply those in order to find potential cost for each person

**The main plan is to build a baseline model for frequency and another for severity first, and then improve them after comparing the predicted base price with the real one.**


## Importing Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
from category_encoders import TargetEncoder
import xgboost as xgb
from xgboost import plot_importance
from PIL import Image
import requests
from io import BytesIO
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Data

In [None]:
# Load the raw car-insurance table. read_csv already returns a DataFrame,
# so no additional pd.DataFrame(...) wrapping is needed.
df = pd.read_csv("/kaggle/input/car-insurance-claim/file(3).csv")
df.head()

In [None]:
# The column descriptions can be seen here
url = "https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F2566208%2F50dd4f899017ad7c6ba848cfa513a853%2FScreenshot_2020-04-17%20GreyAtom%20-%20Learning%20Platform(1).png?generation=1587141048165615&alt=media"
# Bound the wait so a stalled connection cannot hang "Run All" forever, and
# fail with a clear HTTP error instead of an opaque image-decoding error.
response = requests.get(url, timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))
img

In [None]:
df.info()

In [None]:
# There are some missing values
# But I am not going to handle those yet
df.isna().sum()

In [None]:
# Clean the oddly formatted text cells in three ordered passes:
#   1. remove '$'   2. remove a leading 'z_' prefix   3. remove ','
def _without(char):
    """Return a cell-level cleaner that strips `char` from strings and passes other values through."""
    def _clean(cell):
        return cell.replace(char, '') if isinstance(cell, str) else cell
    return _clean

df = (
    df.applymap(_without('$'))
      .replace(to_replace=r'^z_', value='', regex=True)
      .applymap(_without(','))
)
df

In [None]:
# Turn the columns into floats if available after the cleaning
for col in df.columns:
    if df[col].dtype == "object":
        try:
            df[col] = df[col].astype(float)
        except (ValueError, TypeError):
            # The column holds non-numeric text: leave it as object.
            # Only conversion failures are ignored; anything else
            # (e.g. KeyboardInterrupt, MemoryError) still propagates.
            continue

In [None]:
df.info()

In [None]:
# Build the real frequency and severity targets: per ID, the mean of
# CLAIM_FLAG becomes FREQUENCY and the mean of CLM_AMT becomes SEVERITY.
df_freq_sev = (
    df.groupby('ID')
      .agg({'CLAIM_FLAG': 'mean', 'CLM_AMT': 'mean'})
      .reset_index()
      .rename(columns={'CLAIM_FLAG': 'FREQUENCY', 'CLM_AMT': 'SEVERITY'})
)
df_freq_sev

In [None]:
# The raw claim columns are replaced by their per-ID aggregates, so drop them
# from the feature table ...
df_merged = df.drop(['CLAIM_FLAG', 'CLM_AMT'], axis=1)

# ... and attach the remaining features to the aggregated targets by ID.
df_final = df_freq_sev.merge(df_merged, how='left', on='ID')
df_final

In [None]:
# Keep only the first occurrence of each fully identical row
df_final = df_final.drop_duplicates()

## Train and Test split 
Before doing anything else, I split the data into train and test sets to avoid data leakage.

In [None]:
# 80/20 split with a fixed seed, stratified on the FREQUENCY target
train_data, test_data = train_test_split(
    df_final,
    test_size=0.2,
    random_state=42,
    stratify=df_final['FREQUENCY'],
)

## Some Utility Functions

In [None]:
def plot_preds(y_true, y_preds):
    """Overlay the histograms of predicted and real values on one figure.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_preds : array-like
        Model predictions to compare against `y_true`.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    plt.hist(y_preds, bins = 100, alpha = 0.5, label = 'Predictions')
    plt.hist(y_true, bins = 100, alpha = 0.5, label = 'Real')

    # A histogram has the values on the x-axis and the number of samples
    # per bin on the y-axis, so label the axes accordingly.
    plt.xlabel('Value')
    plt.ylabel('Count')
    plt.title('Prediction vs Real')

    plt.legend()

    plt.show()

In [None]:
def create_xgb_model(params):
    """Return a new, unfitted `xgb.XGBRegressor` configured with the keyword arguments in `params`."""
    regressor = xgb.XGBRegressor(**params)
    return regressor

## Frequency Prediction
We are going to use a custom metric and an XGBoost regressor as the model.

In [None]:
# We want to model this using Poisson since we want to find the occurence count for each value
df_final['FREQUENCY'].hist()

In [None]:
def preprocess_for_freq(train_data, test_data):
    """Build target-encoded features and FREQUENCY targets for the frequency model.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames holding the 'FREQUENCY', 'ID' and 'SEVERITY' columns plus the
        feature columns.

    Returns
    -------
    tuple
        (X_train_encoded, X_test_encoded, y_train, y_test)
    """
    non_feature_cols = ['FREQUENCY', 'ID', "SEVERITY"]

    X_train = train_data.drop(non_feature_cols, axis=1)
    y_train = train_data['FREQUENCY']

    X_test = test_data.drop(non_feature_cols, axis=1)
    y_test = test_data['FREQUENCY']

    # To avoid data leakage the encoder is fitted on the train split only ...
    encoder = TargetEncoder()
    X_train_encoded = encoder.fit_transform(X_train, y_train)
    # ... and the test split is encoded WITHOUT its targets: the
    # category_encoders docs specify y=None when transforming a test set.
    X_test_encoded = encoder.transform(X_test)

    return X_train_encoded, X_test_encoded, y_train, y_test

In [None]:
def train_test_freq(model, train_data = train_data, test_data = test_data):
    """Fit `model` on the frequency target and score it on the test split.

    The printed metric is 1 - mean(actual) / mean(predicted); it is 0 when the
    two means coincide.

    Returns
    -------
    tuple
        (fitted model, predictions for every row of `test_data`)
    """
    features_train, features_test, target_train, target_test = preprocess_for_freq(train_data, test_data)

    model_fit = model.fit(features_train, target_train)
    y_pred = model_fit.predict(features_test)

    # We want the predicted mean to match the observed mean
    mean_ratio = target_test.mean() / y_pred.mean()
    metric = 1 - mean_ratio

    print('Final metric: %f' % metric)

    return model_fit, y_pred

In [None]:
# Hyper-parameters of the baseline frequency regressor
xgb_freq_params = dict(learning_rate=0.05, max_depth=5, n_estimators=250)
xgb_freq_model = create_xgb_model(params=xgb_freq_params)

In [None]:
model_freq, preds_freq = train_test_freq(xgb_freq_model, train_data, test_data)

## Severity Prediction
We are going to use a custom metric and an XGBoost regressor as the model.

In [None]:
# We are going to remove 0 values in order to model Severity
df_final['SEVERITY'].hist(bins = 100)

In [None]:
# This looks like a Gamma distribution and we are going to use Gamma as objective in our XGBRegressor model
df_final[df_final['SEVERITY'] != 0]['SEVERITY'].hist(bins = 100)

In [None]:
def preprocess_for_sev(train_data, test_data):
    """Build target-encoded features and SEVERITY targets for the severity model.

    Training rows with SEVERITY == 0 are removed; the test split is left
    unfiltered, so it still contains zero-severity rows.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames holding the 'FREQUENCY', 'ID' and 'SEVERITY' columns plus the
        feature columns.

    Returns
    -------
    tuple
        (X_train_encoded, X_test_encoded, y_train, y_test)
    """
    non_feature_cols = ['FREQUENCY', 'ID', "SEVERITY"]

    # Make the train data non-zero to make the Gamma function work
    train_nonzero = train_data[train_data['SEVERITY'] != 0]

    X_train = train_nonzero.drop(non_feature_cols, axis=1)
    y_train = train_nonzero['SEVERITY']

    X_test = test_data.drop(non_feature_cols, axis=1)
    y_test = test_data['SEVERITY']

    # Fit the encoder on the train split only; encode the test split WITHOUT
    # its targets (the category_encoders docs specify y=None for test sets).
    encoder = TargetEncoder()
    X_train_encoded = encoder.fit_transform(X_train, y_train)
    X_test_encoded = encoder.transform(X_test)

    return X_train_encoded, X_test_encoded, y_train, y_test

In [None]:
# Hyper-parameters of the baseline severity regressor (Gamma objective)
xgb_sev_params = dict(
    objective='reg:gamma',
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=250,
)
xgb_sev_model = create_xgb_model(params=xgb_sev_params)

In [None]:
def train_test_sev(model, train_data = train_data, test_data = test_data):
    """Fit `model` on the severity target and score it on the test split.

    The model is trained on non-zero severities only (see
    `preprocess_for_sev`), so the metric is computed on the test rows that
    actually have a non-zero severity. Including the zero rows would drag the
    observed median towards 0 and make the metric meaningless.

    The printed metric is 1 - median(actual) / median(predicted) over those
    rows; it is 0 when the two medians coincide and NaN when the test split
    has no non-zero severity.

    Returns
    -------
    tuple
        (fitted model, predictions for EVERY row of `test_data`, in order)
    """
    # Process the data
    X_train, X_test, y_train, y_test = preprocess_for_sev(train_data, test_data)

    # Fit the model
    model_fit = model.fit(X_train, y_train)

    y_pred = model_fit.predict(X_test)

    # Because we have a Gamma-like distribution we want to fit the median,
    # compared like-for-like on the rows that really had a claim amount.
    claim_mask = np.asarray(y_test != 0)
    if claim_mask.any():
        metric = 1 - np.median(y_test[claim_mask]) / np.median(y_pred[claim_mask])
    else:
        metric = float('nan')

    print('Final metric: %f' %  metric)

    return model_fit, y_pred

In [None]:
# The plot suggests room for improvement, but the real comparison is made on Base Price
model_sev, preds_sev = train_test_sev(model=xgb_sev_model, train_data=train_data, test_data=test_data)
nonzero_test_severity = test_data.loc[test_data['SEVERITY'] != 0, 'SEVERITY']
plot_preds(nonzero_test_severity, preds_sev)

## Final Calculations

In [None]:
# Attach the frequency and severity predictions to the test rows,
# aligned on the test index
preds_freq_series = pd.Series(preds_freq, index=test_data.index, name='FREQ_PREDS')
preds_sev_series = pd.Series(preds_sev, index=test_data.index, name='SEV_PREDS')

df_merged_final = pd.concat([test_data, preds_freq_series, preds_sev_series], axis=1)
df_merged_final.head(2)

In [None]:
# Base price = frequency x severity, once from the predictions and once from the real values
df_merged_final = df_merged_final.assign(
    BASE_PRICE_PREDS=lambda frame: frame['FREQ_PREDS'] * frame['SEV_PREDS'],
    BASE_PRICE_REAL=lambda frame: frame['FREQUENCY'] * frame['SEVERITY'],
)
df_merged_final.head(2)

In [None]:
# Compare the total predicted base price with the total real base price on the test set.
# A negative difference means the predictions under-price the portfolio.
pred_price = df_merged_final['BASE_PRICE_PREDS'].sum()
real_price = df_merged_final['BASE_PRICE_REAL'].sum()
real_dif = pred_price - real_price
# Signed gap relative to the real total, in percent. The previous value was the
# plain ratio pred/real, which is not a percentage difference.
percent_dif = real_dif / real_price * 100
print(f'Real Base Price: {real_price}\n\
Percentage difference between real and predictions: {percent_dif:.2f}%\n\
Real difference between real and predictions: {real_dif} ')

In [None]:
# The cost of the large losses is meant to be spread over the more profitable policies.
# The result looks promising, but can still be improved.
real_base_price = df_merged_final['BASE_PRICE_REAL']
# Show only non-zero real prices below 20000
shown_real = real_base_price[(real_base_price < 20000) & (real_base_price != 0)]

plt.hist(df_merged_final['BASE_PRICE_PREDS'], alpha=0.5, bins=100, label='Pred')
plt.hist(shown_real, alpha=0.5, bins=100, label='Real')

plt.legend()

plt.show()

## Optimizing the model

In [None]:
# Large losses are handled separately: train on rows with SEVERITY below 7500 and
# add a large-loss effect to the predictions afterwards to push them to the right
is_regular_loss = train_data['SEVERITY'].lt(7500)
train_clipped = train_data.loc[is_regular_loss]

In [None]:
# Refit the severity model on the clipped training data
xgb_sev_model_clipped = create_xgb_model(xgb_sev_params)
model_sev_clipped, preds_sev_clipped = train_test_sev(xgb_sev_model_clipped, train_clipped, test_data)
# Plot the predictions of the CLIPPED model. The original passed `preds_sev`,
# which re-plotted the baseline model's predictions.
plot_preds(test_data[test_data['SEVERITY'] != 0]['SEVERITY'], preds_sev_clipped)

In [None]:
# Refit the frequency model on the clipped training data
xgb_freq_model_clipped = create_xgb_model(params=xgb_freq_params)
model_freq_clipped, preds_freq_clipped = train_test_freq(
    model=xgb_freq_model_clipped,
    train_data=train_clipped,
    test_data=test_data,
)

In [None]:
# Attach the clipped-model predictions to the test rows, aligned on the test index
preds_freq_clipped_series = pd.Series(preds_freq_clipped, index=test_data.index, name='FREQ_CLIPPED_PREDS')
preds_sev_clipped_series = pd.Series(preds_sev_clipped, index=test_data.index, name='SEV_CLIPPED_PREDS')

df_clipped_final = pd.concat(
    [test_data, preds_freq_clipped_series, preds_sev_clipped_series],
    axis=1,
)
df_clipped_final

In [None]:
# Base price = frequency x severity, from the clipped predictions and from the real values
df_clipped_final = df_clipped_final.assign(
    BASE_PRICE_PREDS=lambda frame: frame['FREQ_CLIPPED_PREDS'] * frame['SEV_CLIPPED_PREDS'],
    BASE_PRICE_REAL=lambda frame: frame['FREQUENCY'] * frame['SEVERITY'],
)
df_clipped_final.head(2)

In [None]:
# Large-loss loading: the expected cost of the large losses that were removed
# from the clipped training data, spread evenly over every policy.
#
# It is estimated from the TRAINING split only. The previous version used
# len(df_clipped_final) / len(test_data), which is always 1, multiplied by the
# mean of the real TEST base price, so it leaked the test labels into the price.
LARGE_LOSS_THRESHOLD = 7500  # must match the cut-off used to build train_clipped
train_large = train_data[train_data['SEVERITY'] >= LARGE_LOSS_THRESHOLD]

# Share of training rows that are large losses ...
large_loss_share = len(train_large) / len(train_data)
# ... times their average real base price (0 when there are no large losses).
if len(train_large):
    large_loss_mean = (train_large['FREQUENCY'] * train_large['SEVERITY']).mean()
else:
    large_loss_mean = 0.0

df_clipped_final["LARGE_LOSS_EFFECT"] = large_loss_share * large_loss_mean
df_clipped_final.head(2)

In [None]:
# Final price = clipped-model base price plus the large-loss loading
clipped_price = df_clipped_final['BASE_PRICE_PREDS']
df_clipped_final['BASE_PRICE_PREDS_w_LL'] = clipped_price.add(df_clipped_final['LARGE_LOSS_EFFECT'])
df_clipped_final.head(2)

In [None]:
plot_preds(df_clipped_final[df_clipped_final['BASE_PRICE_REAL']!= 0]['BASE_PRICE_REAL'], df_clipped_final['BASE_PRICE_PREDS_w_LL'])

In [None]:
# Compare the total predicted base price (including the large-loss loading) with
# the total real base price on the test set.
# A negative difference means the predictions under-price the portfolio.
pred_price = df_clipped_final['BASE_PRICE_PREDS_w_LL'].sum()
real_price = df_clipped_final['BASE_PRICE_REAL'].sum()
real_dif = pred_price - real_price
# Signed gap relative to the real total, in percent. The previous value was the
# plain ratio pred/real, which is not a percentage difference.
percent_dif = real_dif / real_price * 100
print(f'Real Base Price: {real_price}\n\
Percentage difference between real and predictions: {percent_dif:.2f}%\n\
Real difference between real and predictions: {real_dif} ')

For future work, some columns could be grouped or processed further to create more meaningful features.