In [1]:
from sklearn import preprocessing
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt

def read_data_file(filename):
    Data = pd.read_csv(filename,low_memory = False)

    # Drop unneccessary columns #

    Data = Data.drop('Vehicle_Annual_Miles', 1)
    Data = Data.drop('Vehicle_Comprehensive_Coverage_Limit', 1)
    Data = Data.drop('Driver_Minimum_Age', 1)
    Data = Data.drop('Driver_Maximum_Age', 1)
    Data = Data.drop('EEA_PolicyYear', 1)
    Data = Data.drop('Vehicle_New_Cost_Amount', 1)
    Data = Data.drop('Vehicle_Make_Description', 1)
    Data = Data.drop('EEA_Policy_Zip_Code_3', 1)
    # Clearing unneccessary rows #

    Data = Data[Data.EEA_Policy_Tenure != -1]
    Data = Data[Data.Vehicle_Symbol != -1]
    Data = Data[Data.Vehicle_Days_Per_Week_Driven != -1]
    Data = Data[Data.Vehicle_Anti_Theft_Device != 'Unknown']

    # Replace missing Data #

    Data['Policy_Zip_Code_Garaging_Location'] = Data['Policy_Zip_Code_Garaging_Location'].replace('Unknown', '00000')
    Data['Vehicle_Miles_To_Work'] = Data['Vehicle_Miles_To_Work'].replace('-1', np.nan)
    Data['Vehicle_Passive_Restraint'] = Data['Vehicle_Passive_Restraint'].replace('Unknown', 'Y')
    # Data['EEA_Policy_Zip_Code_3'] = Data['EEA_Policy_Zip_Code_3'].replace('Unknown', '000')
    Data['Vehicle_Med_Pay_Limit'] = Data['Vehicle_Med_Pay_Limit'].replace('-1', np.nan)
    Data['Vehicle_Physical_Damage_Limit'] = Data['Vehicle_Physical_Damage_Limit'].replace('-1', np.nan)
    Data['Vehicle_Collision_Coverage_Deductible'] = Data['Vehicle_Collision_Coverage_Deductible'].replace('-1', np.nan)
    # Data['EEA_Prior_Bodily_Injury_Limit'] = Data['EEA_Prior_Bodily_Injury_Limit'].replace('-1', np.nan)
    # Fill nan data #

    Data['Vehicle_Miles_To_Work'].fillna((Data['Vehicle_Miles_To_Work'].mean()), inplace=True)
    Data['Vehicle_Med_Pay_Limit'].fillna((Data['Vehicle_Med_Pay_Limit'].mean()), inplace=True)
    Data['EEA_Prior_Bodily_Injury_Limit'].fillna('0000', inplace=True)
    Data['Vehicle_Bodily_Injury_Limit'].fillna('0000', inplace=True)
    Data['Vehicle_Physical_Damage_Limit'].fillna('0000', inplace=True)
    Data['Vehicle_Collision_Coverage_Deductible'].fillna('0000', inplace=True)

    # move to after splitting up data
    # #Categorizing continuous data #

    # Data['Vehicle_Miles_To_Work']=pd.cut(Data['Vehicle_Miles_To_Work'], bins=[0, 20,40,60,80,100], include_lowest=True, labels=['lowest', 'low', 'mid', 'high', 'highest'])

    # Data['Annual_Premium'] = pd.cut(Data['Annual_Premium'],9, include_lowest=True , labels= ['highest', 'very high', 'moderately high', 'higher', 'medium', 'lower', 'moderately low', 'very low', 'lowest'])

    # Save modified data to new csv file #

    # Data.to_csv('new_data.csv', sep=',', encoding='utf-8')
    return Data

Data = read_data_file('training_data_2016.csv')

In [2]:
# get some training and test data sets

# training data for categorical model
category_training_data_size = 200000
#training data for regression model
claims_training_data_size = 200000
# testing data
testing_data_size = 1000
number_test_sets = 100

# get data with equal number claims and not
category_training_data = Data[:category_training_data_size].copy(deep=True)

#Categorizing continuous data #

category_training_data['Vehicle_Miles_To_Work']=pd.cut(category_training_data['Vehicle_Miles_To_Work'], bins=[0, 20,40,60,80,100], include_lowest=True, labels=['lowest', 'low', 'mid', 'high', 'highest'])

category_training_data['Annual_Premium'] = pd.cut(category_training_data['Annual_Premium'],9, include_lowest=True , labels= ['highest', 'very high', 'moderately high', 'higher', 'medium', 'lower', 'moderately low', 'very low', 'lowest'])


claims_training_data = Data[:category_training_data_size].copy(deep=True)#Data[category_training_data_size: category_training_data_size + claims_training_data_size].copy(deep=True)


training_data_size = 200000#category_training_data_size + claims_training_data_size
test_sets = []
for i in range(number_test_sets):
    t_set = Data[training_data_size + i * testing_data_size: training_data_size + (i + 1) * testing_data_size].copy(deep=True)
    test_sets.append(t_set)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge

# train model to determine claim or not

rfc = RandomForestClassifier(max_features='sqrt')#, max_depth=20)

X_t = category_training_data.ix[:, 0:-5]
Y_t = category_training_data.ix[:, -4]

for col in X_t[1:]:
    X_t[col] = X_t[col].astype('category')

X_t = X_t.apply(lambda x: x.cat.codes)
Y_t = Y_t.apply(lambda x: 1 if x > 0 else 0)

model = Ridge()
rfe = RFE(model, 23)
fit = rfe.fit(X_t, Y_t)
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

X_t=fit.transform(X_t)

print(X_t.shape)
rfc.fit(X_t, Y_t)

print("Done!")

Num Features:  23
Selected Features:  [ True  True  True  True False False False False False False False False
 False  True False  True False False False False False  True  True  True
 False  True False False  True  True  True False  True False False False
 False False False False  True False  True  True  True  True False False
 False  True  True False  True False  True]
Feature Ranking:  [ 1  1  1  1  2 32 19 25 23 30 26 27 18  1 16  1 14 29 21 17 11  1  1  1 24
  1 22  3  1  1  1 15  1  9 10  7  6  5  8  4  1 13  1  1  1  1 12 31 28  1
  1 20  1 33  1]
(200000, 23)
Done!


In [None]:
# get scaled data sets
from sklearn.preprocessing import StandardScaler  
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# pca = PCA(n_components=.95)

categorical_columns = ['Policy_Company', 'Policy_Installment_Term', 'Policy_Billing_Code', 'Policy_Method_Of_Payment', 'Policy_Reinstatement_Fee_Indicator', 'Policy_Zip_Code_Garaging_Location', 'Vehicle_Performance', 'Vehicle_Number_Of_Drivers_Assigned', 'Vehicle_Usage', 'Vehicle_Anti_Theft_Device', 'Vehicle_Passive_Restraint', 'Vehicle_Med_Pay_Limit', 'Vehicle_Bodily_Injury_Limit', 'Vehicle_Comprehensive_Coverage_Indicator', 'Vehicle_Comprehensive_Coverage_Limit', 'Vehicle_Collision_Coverage_Indicator', 'Vehicle_Collision_Coverage_Deductible', 'Vehicle_Youthful_Driver_Indicator', 'Vehicle_Youthful_Driver_Training_Code', 'Vehicle_Youthful_Good_Student_Code', 'Vehicle_Safe_Driver_Discount_Indicator', 'EEA_Liability_Coverage_Only_Indicator', 'EEA_Multi_Auto_Policies_Indicator', 'EEA_Policy_Zip_Code_3', 'EEA_Agency_Type', 'EEA_Packaged_Policy_Indicator', 'EEA_Full_Coverage_Indicator', 'EEA_Prior_Bodily_Injury_Limit', 'SYS_Renewed', 'SYS_New_Business']

for col in claims_training_data.ix[1:]:
    if col in categorical_columns:
        claims_training_data[col] = claims_training_data[col].astype('category')

cat_columns = claims_training_data.select_dtypes(['category']).columns

claims_training_data[cat_columns] = claims_training_data[cat_columns].apply(lambda x: x.cat.codes)

claims_training_data_with_claim = claims_training_data[claims_training_data.Claim_Count > 0].copy(deep=True)
claims_training_data_without_claims = claims_training_data[claims_training_data.Claim_Count == 0]
truncated_without_claims = claims_training_data_without_claims.head(n=8*claims_training_data_with_claim.shape[0])

claims_training_data = claims_training_data_with_claim

print("With claims", claims_training_data_with_claim.shape)
print("without", claims_training_data_without_claims.shape)
print(claims_training_data.shape)
    
def run_fit_reg(hiddenLayerssize, claims_training_data):
    

    X = claims_training_data.ix[:,0:-5]
    losses = claims_training_data.ix[:,-4]
    Y = losses#np.divide(losses, premium)
    # scaler_x = StandardScaler()  
    # print(X)
    # X = pca.fit_transform(X)
    # print("X after pca", X)
    # X = scaler_x.fit_transform(X)  

    pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA(n_components=.95))])

    X = pipeline.fit_transform(X)

    print(X.shape)

    from sklearn.neural_network import MLPRegressor

    mlp = MLPRegressor(max_iter=2000, hidden_layer_sizes=(hiddenLayerssize,hiddenLayerssize))#, hiddenLayerssize))

    mlp.fit(X, Y)

    print("Done!")
    return pipeline, mlp

With claims (7864, 60)
without (192136, 60)
(7864, 60)


In [None]:
# run both models
pipeline, mlp = run_fit_reg(10, claims_training_data)


#get classifications

predicted_claims = []
measureds = []
for i in range(len(test_sets)):
    X_t = test_sets[i].ix[:, 0:-5].copy(deep=True)
    Y_t = test_sets[i].ix[:, -4].copy(deep=True)
    for col in X_t:
        X_t[col] = X_t[col].astype('category')
    X_t = X_t.apply(lambda x: x.cat.codes)
    Y_t = Y_t.apply(lambda x: 1 if x > 0 else 0)
#     X_t = enc.transform(X_t)
#     X_t = t_svd.transform(X_t)
#     prediction = clf.predict(csr_matrix(X_t))
    prediction = rfc.predict(fit.transform(X_t))
    
    predicted_claims.append(prediction)
    measureds.append(Y_t.values)
    
# grab each record that will likely have a claim and predict    


predictions = []
measureds = []
scores = []
for i in range(len(test_sets)):
    
    X_t = test_sets[i].ix[:,0:-5].copy(deep=True)
    Y_t = test_sets[i].ix[:,-4].copy(deep=True)
    premium = sum(X_t.ix[:,-1])
    measured = sum(Y_t)
    
    k = 0
    for j in range(X_t.shape[0]):
        if predicted_claims[i][j] == 0:
            X_t.drop(X_t.index[k], inplace=True)
            Y_t.drop(Y_t.index[k], inplace=True)
        else:
            k = k + 1
            
    
    for col in X_t.ix[1:]:
        if col in categorical_columns:
            X_t[col] = X_t[col].astype('category')
    X_t[cat_columns] = X_t[cat_columns].apply(lambda x: x.cat.codes)
    
    X_t = pipeline.transform(X_t)
    prediction = mlp.predict(X_t)
    prediction = sum(prediction)
    
    predictions.append(prediction / premium)
    measureds.append(measured / premium)

print("Done!")


(7864, 37)
Done!


In [None]:
plt.plot(predictions, 'r--')#np.divide(predictions, 10), 'r--')
plt.plot(measureds)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(measureds, predictions))

In [None]:
# read in 600 excel files lol 
predictions = []
for i in range(1,601):
    file_name = "test_portfolios/test_portfolio_"+str(i)+".csv"
    TestData = read_data_file(file_name)
#     print(TestData.shape)
#     print(type(TestData))
    TestData = TestData.drop('PolicyNo', 1)
    X_t = TestData
    premium = sum(X_t.ix[:,-1])
    print("Premium:", premium)
    for col in X_t:
        X_t[col] = X_t[col].astype('category')
    X_t = X_t.apply(lambda x: x.cat.codes)
    
    # classificaztaion
    
    prediction = rfc.predict(fit.transform(X_t))
    predicted_claims = prediction
#     print ("Predicted claims:", predicted_claims)
    #regression
    X_t = TestData
    k = 0
    for j in predicted_claims:
        if j == 0:
            X_t.drop(X_t.index[k], inplace=True)
        else:
            k = k + 1
            
    for col in X_t:
        if col in categorical_columns:
            X_t[col] = X_t[col].astype('category')
    X_t[cat_columns] = X_t[cat_columns].apply(lambda x: x.cat.codes)
    
    X_t = pipeline.transform(X_t)
    prediction = mlp.predict(X_t)
    prediction = sum(prediction)
    
    print("Sum Predicitpon", prediction)
    predictions.append(prediction / premium)
    
print(predictions)