## 0. Imports, Paths and Helper Functions

### 0.1 Imports

In [1]:
import json
import psycopg2
import itertools
import pickle



import numpy             as np
import pandas            as pd
import seaborn           as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as ms


from IPython.core.display  import HTML
from IPython.display       import Image
from tabulate              import tabulate
from sklearn.preprocessing import RobustScaler, StandardScaler,MinMaxScaler


from scipy                 import stats           as ss


### 0.2 Paths

In [2]:
# Data root and its stage sub-folders. Paths are built by plain string
# concatenation elsewhere in the notebook, hence the trailing slashes.
data = '../data/'
raw, interim, processed = 'raw/', 'interim/', 'processed/'

# Folder that receives the pickled scalers and encoders.
modules_path = '../models/'

# Folder for report figures.
images_path = '../reports/figures/'

### 0.3 Helper Functions

In [3]:
def cramer_v(x,y):
    """Bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.

    Returns
    -------
    float
        Association strength in ``[0, 1]``. ``nan`` is returned when the
        statistic is undefined, i.e. when the corrected number of rows or
        columns leaves no degree of freedom (e.g. a single-category input).
    """
    cm = pd.crosstab(x,y).values
    n = cm.sum()
    r,k = cm.shape

    chi2 = ss.chi2_contingency(cm)[0]

    # The bias correction applies to phi^2 = chi2 / n, not to chi2 itself.
    # Subtracting it from chi2 and dividing by n afterwards shrinks the
    # correction by a factor of n and lets V exceed 1.
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - (k-1)*(r-1)/(n-1))

    kcorr = k - (k-1)**2/(n-1)
    rcorr = r - (r-1)**2/(n-1)

    # With only one row or one column the denominator is zero and the
    # association is undefined; return NaN explicitly instead of dividing.
    denominator = min(kcorr-1, rcorr-1)
    if not denominator > 0:
        return np.nan

    return np.sqrt(phi2corr / denominator)


def categorical_feature_correlation(df_cat):
    """Build the pairwise Cramér's V matrix for the columns of ``df_cat``.

    Parameters
    ----------
    df_cat : pandas.DataFrame
        Frame whose columns are all passed pairwise to ``cramer_v``.

    Returns
    -------
    pandas.DataFrame
        Square frame with the column names of ``df_cat`` on both axes; each
        cell holds ``cramer_v`` of the corresponding pair of columns.
    """
    columns = df_cat.columns.to_list()

    # One list per column, filled in column order so it lines up with the index.
    association = {
        first: [cramer_v(df_cat[first], df_cat[second]) for second in columns]
        for first in columns
    }

    return pd.DataFrame(association, index=columns)


def cat_info(ax,total,size = 25,file=False):
    """Label every bar of ``ax`` with its share of ``total`` and show the plot.

    Parameters
    ----------
    ax : matplotlib axes
        Axes whose ``patches`` (the bars) are annotated.
    total : number
        Denominator used to turn each bar height into a percentage.
    size : int, optional
        Font size of the labels.
    file : bool, optional
        Accepted for compatibility; not used by this function.
    """
    for bar in ax.patches:
        bar_height = bar.get_height()
        # Horizontal centre of the bar.
        x_center = bar.get_x() + bar.get_width() / 2.
        label = '{:1.2f}%'.format(bar_height / total * 100)
        ax.text(x_center, bar_height, label, ha="center", size=size)

    plt.show()
    
    
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [4]:
# Apply the plotting and pandas display configuration from jupyter_settings().
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 5.0 Data Preparation

In [5]:
# Load 'fe-filtering.csv' from the interim data folder as the working frame.
df5 = pd.read_csv(data+interim+'fe-filtering.csv')

### 5.1 - Creating train, validation and test datasets

In [6]:
# Separate the features from the 'response' target.
x = df5.drop(['response'],axis=1)
y = df5['response'].copy()

# Hold out 20% of the rows for validation. The seed is fixed so that a
# restart-and-run-all reproduces the same split and the same saved files.
x_train, x_validation,y_train,y_validation = ms.train_test_split(
    x, y, test_size=0.20, random_state=42
)

# From here on df5 only contains the rows that were not held out.
# NOTE(review): this overwrites the loaded frame, so re-running this cell
# without reloading df5 splits the already reduced data again.
df5 = pd.concat([x_train,y_train],axis=1)

In [7]:
# Split the remaining (non-validation) rows once more into train and test.
x = df5.drop(['response'],axis=1)
y = df5['response']

# Fixed seed so the split is reproducible across runs.
x_train,x_test,y_train,y_test = ms.train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Keep only the training rows in df5. Without this, df5 still contains the
# test rows, so the 'train.csv' written from df5 and every scaler/encoder
# fitted on df5 further down would leak the held-out test set.
df5 = pd.concat([x_train,y_train],axis=1)

In [8]:
# Re-attach the target to the held-out feature frames before writing them.
validation_split = pd.concat([x_validation, y_validation], axis=1)
test_split = pd.concat([x_test, y_test], axis=1)

# Persist the three splits to the interim data folder (df5 is the train file).
df5.to_csv(data + interim + 'train.csv', index=False)
validation_split.to_csv(data + interim + 'validation.csv', index=False)
test_split.to_csv(data + interim + 'test.csv', index=False)

### 5.2 Rescaling and Standardization

In [9]:

# One scaler per numeric feature: RobustScaler for annual_premium,
# MinMaxScaler for age and vintage.
annual_premium_scaler = RobustScaler()
age_scaler = MinMaxScaler()
vintage_scaler = MinMaxScaler()

# Each scaler is fitted on df5, applied in place, and pickled so the same
# transformation can be reused later. The files are opened with `with` so
# the handles are flushed and closed deterministically.
df5['age'] = age_scaler.fit_transform(df5[['age']])
with open(modules_path+'age_scaler.pkl','wb') as scaler_file:
    pickle.dump(age_scaler, scaler_file)


df5['annual_premium'] = annual_premium_scaler.fit_transform(df5[['annual_premium']])
with open(modules_path+'annual_premium_scaler.pkl','wb') as scaler_file:
    pickle.dump(annual_premium_scaler, scaler_file)


df5['vintage'] = vintage_scaler.fit_transform(df5[['vintage']])
with open(modules_path+'vintage_scaler.pkl','wb') as scaler_file:
    pickle.dump(vintage_scaler, scaler_file)

### 5.2 Variables Encoding

In [10]:
# Columns are replaced with plain `df5[col] = ...` rather than
# `df5.loc[:, col] = ...`: since pandas 2.0 the .loc form writes into the
# existing column in place, so a string column would keep dtype `object`
# after being mapped to numbers.

# gender - customer's sex ---> fixed mapping (OneHotEncoding/LabelEncoding is
# still an open question)
gender_encoder = {'male':0,'female':1}

df5['gender'] = df5['gender'].map(gender_encoder)

# region_code - customer's code ---> frequency encoding
# (share of df5 rows in each category)
region_code_encoder = df5.groupby('region_code').size()/len(df5)
df5['region_code'] = df5['region_code'].map(region_code_encoder)
with open(modules_path+'region_code_encoder.pkl','wb') as encoder_file:
    pickle.dump(region_code_encoder, encoder_file)

# vehicle_age - categorizes the vehicle's age ---> frequency encoding
vehicle_age_encoder = df5.groupby('vehicle_age').size()/len(df5)
df5['vehicle_age'] = df5['vehicle_age'].map(vehicle_age_encoder)
with open(modules_path+'vehicle_age_encoder.pkl','wb') as encoder_file:
    pickle.dump(vehicle_age_encoder, encoder_file)

# policy_sales_channel - communication channel with customers ---> frequency encoding
policy_sales_channel_encoder = df5.groupby('policy_sales_channel').size()/len(df5)
df5['policy_sales_channel'] = df5['policy_sales_channel'].map(policy_sales_channel_encoder)
with open(modules_path+'policy_sales_channel_encoder.pkl','wb') as encoder_file:
    pickle.dump(policy_sales_channel_encoder, encoder_file)

# gender, driving_license, vehicle_damage, previously_insured ---> use OHE in the next cycle
# Features under frequency encoding ---> use target encoding in the next cycle.


### 5.3 Validation Data Preparation

In [11]:
# Rescaling and Standardization
# Reuse the already fitted scalers (transform only, no refit). DataFrames are
# passed, as at fit time, so the column names match what the scalers were
# fitted with; passing `.values` triggers scikit-learn's feature-name
# mismatch warning. The numeric result is the same.

x_validation['age'] = age_scaler.transform(x_validation[['age']])
x_validation['annual_premium'] = annual_premium_scaler.transform(x_validation[['annual_premium']])
x_validation['vintage'] = vintage_scaler.transform(x_validation[['vintage']])

In [12]:
# Encoding
# Apply the encoders built on the training frame. Plain column assignment is
# used instead of `.loc[:, col] = ...` so each column takes the dtype of the
# mapped values (since pandas 2.0 the .loc form writes in place and a string
# column would stay `object`). Categories absent from an encoder become NaN.

x_validation['gender'] = x_validation['gender'].map(gender_encoder)
x_validation['region_code'] = x_validation['region_code'].map(region_code_encoder)
x_validation['vehicle_age'] = x_validation['vehicle_age'].map(vehicle_age_encoder)
x_validation['policy_sales_channel'] = x_validation['policy_sales_channel'].map(policy_sales_channel_encoder)

In [13]:
x_validation.isna().sum() # count NaNs per column; TODO: investigate their origin in the next cycle

id                      0
gender                  0
age                     0
region_code             0
policy_sales_channel    5
driving_license         0
vehicle_age             0
vehicle_damage          0
previously_insured      0
annual_premium          0
vintage                 0
dtype: int64

In [14]:
# Replace the remaining NaNs with 0. Rebinding the name instead of using
# inplace=True keeps the step explicit and avoids mutating the split frame.
x_validation = x_validation.fillna(0)

### 5.4 Saving Processed Data

In [15]:
# Copy of the prepared validation features with the target re-attached
# (aligned on the index).
validation = x_validation.assign(response=y_validation)

# Write the prepared train and validation frames to the processed folder.
processed_dir = data + processed
df5.to_csv(processed_dir + 'train_processed.csv', index=False)
validation.to_csv(processed_dir + 'validation_processed.csv', index=False)