In [1]:
# -*- coding: utf-8 -*-
"""
This script runs the multi GAN and allows you to step through each part.
TODO: divide y by exposure in xpxixpy.
"""

# import modules
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.autograd.variable import Variable
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from patsy import dmatrices
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
# Render every float in DataFrame/Series output with exactly two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

## Import created modules
from Functions.MC_WGAN_GP.gan_scripts.auto_loader import PolicyDataset
from Functions.MC_WGAN_GP.gan_scripts.generator2_v2 import Generator2
from Functions.MC_WGAN_GP.gan_scripts.discriminator2_v3 import Discriminator2
from Functions.MC_WGAN_GP.gan_scripts.gradiant_penalty import calculate_gradient_penalty
from Functions.MC_WGAN_GP.gan_scripts.undo_dummy import back_from_dummies

# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Allow up to 999 columns to be shown when a DataFrame is rendered.
pd.set_option('display.max_columns', 999)

In [2]:
# Load the prepared training policies.
policy1 = pd.read_pickle("./data/common_dataprep/train.pickle")

# Columns treated as continuous (cast to float below).
cont_vars = [
    'VehPower',
    'VehAge',
    'DrivAge',
    'DensityGLM',
    'BonusMalus',
    'Exposure',
]
# Columns treated as categorical (cast to 'category' and one-hot encoded below).
cats_vars = [
    "ClaimNb",
    'VehBrand',
    'VehGas',
    'Region',
    "AreaGLM",
]

# Apply the dtype of each column group in turn: continuous first, then categorical.
for _columns, _dtype in ((cont_vars, float), (cats_vars, 'category')):
    policy1[_columns] = policy1[_columns].astype(_dtype)

# One-hot encode the categorical columns; the continuous columns pass through unchanged.
selected_columns = cont_vars + cats_vars
policy2 = pd.get_dummies(policy1[selected_columns])

In [3]:
# Standardise every column of the encoded frame (continuous and dummy columns
# alike) and wrap the resulting array back into a DataFrame with the same
# column names. The new frame gets a fresh default index.
scaler = preprocessing.StandardScaler()
policy3 = pd.DataFrame(
    data=scaler.fit_transform(policy2),
    columns=policy2.columns,
)

In [4]:
# Split the standardised data into a 10% validation set and a 90% training set.
# Uncomment the .sample(...) call to work on a small sample for quick training.
pol_dat = policy3  # .sample(n=10000, random_state=1)

# Positional index of every row. np.arange excludes its stop value, so the stop
# must be the row count itself; stopping at n - 1 would silently drop the last
# row from both splits.
second_inds = np.arange(pol_dat.shape[0])

# A fixed seed makes the split identical on every Restart & Run All, so the
# train/validation frames derived from it are reproducible.
split_rng = np.random.RandomState(1)
val_inds = split_rng.choice(
    second_inds,
    size=int(pol_dat.shape[0] * 0.1),  # 10% of the rows, rounded down
    replace=False,
)
# Everything that was not drawn for validation is used for training.
train_inds = np.setdiff1d(second_inds, val_inds)

val = pol_dat.iloc[val_inds]
train = pol_dat.iloc[train_inds]

# Every row must land in exactly one of the two splits.
assert len(train) + len(val) == len(pol_dat)


In [5]:
# Persist both splits as pickles, one file per split.
for _split, _pickle_path in (
    (train, "./data/gan_dataprep/train_gan.pickle"),
    (val, "./data/gan_dataprep/val_gan.pickle"),
):
    _split.to_pickle(_pickle_path)


In [6]:
# Display the training split as the cell's output.
train

Unnamed: 0,VehPower,VehAge,DrivAge,DensityGLM,BonusMalus,Exposure,ClaimNb_0.0,ClaimNb_1.0,ClaimNb_2.0,ClaimNb_3.0,ClaimNb_4.0,VehBrand_B1,VehBrand_B10,VehBrand_B11,VehBrand_B12,VehBrand_B13,VehBrand_B14,VehBrand_B2,VehBrand_B3,VehBrand_B4,VehBrand_B5,VehBrand_B6,VehGas_Diesel,VehGas_Regular,Region_R11,Region_R21,Region_R22,Region_R23,Region_R24,Region_R25,Region_R26,Region_R31,Region_R41,Region_R42,Region_R43,Region_R52,Region_R53,Region_R54,Region_R72,Region_R73,Region_R74,Region_R82,Region_R83,Region_R91,Region_R93,Region_R94,AreaGLM_1,AreaGLM_2,AreaGLM_3,AreaGLM_4,AreaGLM_5,AreaGLM_6
0,-0.71,-1.29,0.67,0.60,-0.62,-1.18,-4.35,4.48,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,-0.98,0.98,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,2.64,-0.09,-0.24,-0.36,-0.08,-0.43,-0.35,-0.63,1.87,-0.50,-0.16
1,-0.71,-1.29,0.67,0.60,-0.62,0.66,-4.35,4.48,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,-0.98,0.98,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,2.64,-0.09,-0.24,-0.36,-0.08,-0.43,-0.35,-0.63,1.87,-0.50,-0.16
2,-0.22,-0.92,0.46,-1.06,-0.62,0.61,-4.35,4.48,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,1.02,-1.02,-0.34,-0.07,9.14,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,-0.38,-0.09,-0.24,-0.36,-0.08,-0.43,2.82,-0.63,-0.54,-0.50,-0.16
5,0.26,-1.29,-0.89,-0.57,0.53,-0.71,-4.35,4.48,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,1.02,-1.02,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,-0.38,-0.09,4.24,-0.36,-0.08,-0.43,-0.35,1.59,-0.54,-0.50,-0.16
6,0.26,-1.29,-0.89,-0.57,0.53,0.50,-4.35,4.48,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,1.02,-1.02,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,-0.38,-0.09,4.24,-0.36,-0.08,-0.43,-0.35,1.59,-0.54,-0.50,-0.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541825,2.21,-1.29,-0.32,0.15,-0.62,-1.44,0.23,-0.22,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,1.02,-1.02,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,-0.38,-0.09,-0.24,2.75,-0.08,-0.43,-0.35,-0.63,1.87,-0.50,-0.16
541826,-0.22,-0.55,-0.39,1.03,0.53,-1.44,0.23,-0.22,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,-0.98,0.98,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,-0.38,-0.09,-0.24,2.75,-0.08,-0.43,-0.35,-0.63,-0.54,1.99,-0.16
541827,-1.20,-1.29,0.60,1.14,-0.62,-1.45,0.23,-0.22,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,-0.98,0.98,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,-0.38,-0.09,-0.24,2.75,-0.08,-0.43,-0.35,-0.63,-0.54,1.99,-0.16
541829,-0.22,-0.92,-0.04,0.65,-0.62,-1.45,0.23,-0.22,-0.05,-0.01,-0.01,-0.56,-0.16,-0.14,1.75,-0.13,-0.08,-0.56,-0.29,-0.20,-0.23,-0.21,1.02,-1.02,-0.34,-0.07,-0.11,-0.11,-0.56,-0.13,-0.13,-0.21,-0.14,-0.06,-0.04,-0.25,-0.26,-0.17,-0.22,-0.16,-0.08,2.64,-0.09,-0.24,-0.36,-0.08,-0.43,-0.35,-0.63,1.87,-0.50,-0.16


In [7]:

# # Wrangle train data
# td = back_from_dummies(train_half)
# td['ClaimNb'] = td['ClaimNb'].astype('int')
# y_real, X_real = dmatrices('ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
#                  data=td,
#                  return_type='dataframe')
# td['Exposure'] = td['ExposureCat'].astype('float32')/11
# def xpxixpy(X,y):
#             return np.dot(np.linalg.inv(np.dot(X.T,X)), np.dot(X.T,y))
# xy = xpxixpy(X_real,y_real)


# # Fit a poisson Model
# poisson_mod = sm.GLM(y_real,X_real,family = sm.families.Poisson(), offset = td['Exposure']).fit()
# original_params = poisson_mod.params

# lower = poisson_mod.params - 1.96*poisson_mod.bse  
# upper = poisson_mod.params + 1.96*poisson_mod.bse 


# # Fit a random forest
# real_features= X_real
# real_feature_list = list(real_features.columns)
# real_features = np.array(real_features)
# y_rep = np.squeeze(y_real)/np.squeeze(td['Exposure'])
# rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# rf.fit(real_features, y_rep)

# # Wrangle Test Data
# test2 = back_from_dummies(test)
# test2['ClaimNb'] = test2['ClaimNb'].astype('int')
# y_test, X_test = dmatrices('ClaimNb ~ CarAge + DriverAge + Power + Brand + Gas + Region + DensityCat',
#                  data=test2,
#                  return_type='dataframe')
# test2['Exposure'] = test2['ExposureCat'].astype('float32')/11
# y_test_resp = np.squeeze(y_test)/np.squeeze(test2['Exposure'])


# # make predictions on test data with models trained on train data
# real_pois_preds = poisson_mod.predict(X_test)
# real_predictions = rf.predict(X_test)
# importances_real = rf.feature_importances_

