In [None]:
# Notebook-wide plot styling: apply the jupyterthemes 'monokai' theme via jtplot.
from jupyterthemes import jtplot
jtplot.style(theme='monokai')

In [1]:
#dependencies
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import datetime as dt

In [37]:
print('loading data...')
# The same column positions (29, 30, 32) are parsed as dates in every file.
data1, data2, data3 = (
    pd.read_csv(csv_path, parse_dates=[29, 30, 32])
    for csv_path in (
        '../data/train_month_1.csv',
        '../data/train_month_2.csv',
        '../data/train_month_3_with_target.csv',
    )
)
print('data loaded...\n')


# A column is "constant" when the three snapshots hold identical values for it
# (Series.equals compares the full column, element by element).
constant_all = [
    name
    for name in data1.columns
    if data1[name].equals(data2[name]) and data2[name].equals(data3[name])
]

print('-' * 65)
print('features that are constant across the three previous months (in sample):')
print(constant_all)

def type_caster_and_transformer(data, time):
    """
    Fix up the dtypes of one monthly snapshot and tag it with its month.

    The frame is modified in place and also returned, so callers may ignore
    the return value.

    Steps:
      * indicator columns (and ``target`` when present) are cast to bool;
      * code-like / categorical customer columns are cast to object;
      * customer attributes that should not change over the three months are dropped;
      * a ``month`` column holding ``int(time)`` is added.

    Parameters
    ----------
    data : pandas.DataFrame
        Snapshot containing every column named below.
    time : value accepted by ``int()``
        Month identifier stored in the new ``month`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Note: ``astype(bool)`` turns missing values into True.
    """
    flag_columns = [
        'homebanking_active', 'has_homebanking',
        'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap',
        'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance',
        'has_personal_loan', 'has_mortgage_loan', 'has_current_account',
        'has_pension_saving', 'has_savings_account',
        'has_savings_account_starter', 'has_current_account_starter',
        'customer_self_employed',
    ]
    categorical_columns = [
        'customer_occupation_code', 'customer_postal_code', 'customer_education',
        'customer_children', 'customer_relationship', 'customer_gender',
    ]
    # expected to be identical in each of the last three months
    time_invariant_columns = [
        'customer_since_all', 'customer_since_bank',
        'customer_gender', 'customer_birth_date',
    ]

    # indicators -> bool; the label only exists in the snapshot that carries it
    data[flag_columns] = data[flag_columns].astype(bool)
    if 'target' in data:
        data['target'] = data['target'].astype(bool)

    # codes and categories -> generic object dtype
    data[categorical_columns] = data[categorical_columns].astype(object)

    data.drop(columns=time_invariant_columns, inplace=True)

    # remember which month this snapshot describes
    data['month'] = int(time)
    return data

# Cast and trim every monthly snapshot in place; the second argument becomes the
# `month` column (-2 and -1 for the two earlier files, 0 for the file with the target).
for frame, month_offset in ((data1, -2), (data2, -1), (data3, 0)):
    type_caster_and_transformer(frame, month_offset)

# Only the month-3 file carries the target. pd.concat(axis=1) aligns on the row
# index, not on client_id, so make sure every row refers to the same client
# before copying the labels onto the earlier months.
for frame in (data1, data2):
    assert frame['client_id'].equals(data3['client_id']), (
        'client_id rows differ between months; target cannot be attached by row position'
    )

data1 = pd.concat([data1, data3.target], axis=1)
data2 = pd.concat([data2, data3.target], axis=1)

loading data...
data loaded...

-----------------------------------------------------------------
features that are constant across the three previous months (in sample):
['client_id', 'customer_since_all', 'customer_since_bank', 'customer_gender', 'customer_birth_date', 'customer_postal_code', 'customer_occupation_code', 'customer_education']


In [38]:
# Long layout: stack the three monthly frames on top of each other and renumber the rows.
data = pd.concat((data1, data2, data3), ignore_index=True)

Unnamed: 0,client_id,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,...,visits_distinct_so,visits_distinct_so_areas,customer_postal_code,customer_occupation_code,customer_self_employed,customer_education,customer_children,customer_relationship,month,target
0,910df42ad36243aa4ce16324cd7b15b0,False,False,False,False,False,False,True,False,False,...,1.0,1.0,3630,9.0,False,0.0,,,-2,False
1,4e19dc3a54323c5bbfc374664b950cd1,True,True,False,False,False,False,False,False,False,...,1.0,1.0,2460,9.0,False,,mature,couple,-2,False
2,f5d08db1b86c0cb0f566bf446cff1fb4,True,True,False,False,False,False,True,False,False,...,1.0,1.0,2660,9.0,False,,,single,-2,False
3,26170ecf63653e215c52f4262c1c4859,False,False,False,False,False,False,True,False,False,...,1.0,1.0,6600,9.0,False,,,,-2,False
4,c078009957dffb64f20e61b41220a976,False,False,False,False,False,False,False,False,False,...,1.0,1.0,8550,9.0,False,,mature,couple,-2,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191086,0a58f2eb841ddac0626dacac6ca69524,True,True,True,False,False,False,False,False,False,...,2.0,1.0,8000,9.0,False,,no,couple,0,False
191087,193be2222be99bf04f42193b5cdfb95d,False,False,True,False,False,False,True,False,True,...,2.0,1.0,2020,9.0,False,1.0,,,0,False
191088,fa9f074ec8cad610ccaec2270021490e,False,True,False,False,False,True,True,False,True,...,3.0,1.0,1070,9.0,False,3.0,,single,0,False
191089,5236064e97c655b0ad99ed2155e1139e,False,False,False,False,False,False,False,False,False,...,1.0,1.0,2560,9.0,False,,no,couple,0,False


In [None]:
# Wide layout: one row per client_id. Columns that data1 and data2 have in common
# are suffixed `_2` and `_1` respectively; data3 is then joined on client_id.
data = (
    data1
    .merge(data2, on='client_id', suffixes=('_2', '_1'))
    .merge(data3, on='client_id')
)
data.head()

In [None]:
var = 'has_personal_loan'

# One column per month: `<var>_2`, `<var>_1` and the unsuffixed `<var>`.
# client_id is an ordinary column of the merged frame, so it is selected
# explicitly; reset_index() would only add a positional `index` column and
# melt() would then fail on the missing `client_id` id variable.
df = data[['client_id', var + '_2', var + '_1', var, 'target']]
df_melt = df.melt(id_vars=['client_id', 'target'], var_name='time', value_name='value')

# NOTE(review): a violin of a True/False flag is rarely informative and the title
# says "(%)" -- consider kind='point' (mean share per month) instead; confirm intent.
g = sb.catplot(data=df_melt, x='time', y='value', hue='target', kind='violin')
g.set(title=f"{var} (%)");

In [None]:
cols_binary = ['homebanking_active', 'has_homebanking',
       'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap',
       'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance',
       'has_personal_loan', 'has_mortgage_loan', 'has_current_account',
       'has_pension_saving', 'has_savings_account',
       'has_savings_account_starter', 'has_current_account_starter']

# For every indicator, plot its mean (share of True) per month, split by target.
# NOTE(review): `duplicates` is not defined in the cells visible here; it has to
# exist in the kernel (presumably a collection of column names to skip), otherwise
# this loop raises NameError on a fresh run.
for var in cols_binary:
    if var in duplicates:
        continue
    # client_id is a regular column of the merged frame, so select it directly
    # instead of relying on reset_index() (which would only add `index`).
    df = data[['client_id', var + '_2', var + '_1', var, 'target']]
    df_melt = df.melt(id_vars=['client_id', 'target'], var_name='time', value_name='value')
    # catplot is figure-level and creates its own figure; calling plt.figure()
    # first would just leave an empty figure behind for every variable.
    g = sb.catplot(data=df_melt,
                   x='time',
                   y='value',
                   hue='target',
                   kind='point',
                   dodge=True)
    g.set(title=f'{var}(%)')


In [None]:
from numpy import median

cols_cont = ['bal_insurance_21', 'bal_insurance_23', 'cap_life_insurance_fixed_cap',
       'cap_life_insurance_decreasing_cap', 'prem_fire_car_other_insurance',
       'bal_personal_loan', 'bal_mortgage_loan', 'bal_current_account',
       'bal_pension_saving', 'bal_savings_account',
       'bal_savings_account_starter', 'bal_current_account_starter']

# For every balance/amount column, plot its distribution per month, split by target.
# NOTE(review): `duplicates` is not defined in the cells visible here; it has to
# exist in the kernel (presumably a collection of column names to skip), otherwise
# this loop raises NameError on a fresh run.
for var in cols_cont:
    if var in duplicates:
        continue
    # client_id is a regular column of the merged frame, so select it directly
    # instead of relying on reset_index() (which would only add `index`).
    df = data[['client_id', var + '_2', var + '_1', var, 'target']]
    df_melt = df.melt(id_vars=['client_id', 'target'], var_name='time', value_name='value')
    # catplot is figure-level and creates its own figure; calling plt.figure()
    # first would just leave an empty figure behind for every variable.
    # NOTE(review): the title says "mean" but a violin shows the whole distribution.
    g = sb.catplot(data=df_melt, x='time', y='value', hue='target', kind='violin')
    g.set(title=f'{var} mean')
        