# Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
%matplotlib inline
import pandas as pd 
from tabulate import tabulate
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

In [2]:
train1 = pd.read_csv('train_month_3_with_target.csv')
train2 = pd.read_csv('train_month_2.csv')
train3 = pd.read_csv('train_month_1.csv')
test1 = pd.read_csv('test_month_3.csv')
test2 = pd.read_csv('test_month_2.csv')
test3 = pd.read_csv('test_month_1.csv')

copy_df = pd.read_csv('train_month_3_with_target.csv')

# Define functions

# Data pre-processing

## Dataset Cleaning

In [3]:
#train1.info()
train1.describe()
train1.isna().any()
train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63697 entries, 0 to 63696
Data columns (total 40 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   client_id                          63697 non-null  object 
 1   homebanking_active                 63697 non-null  int64  
 2   has_homebanking                    63697 non-null  int64  
 3   has_insurance_21                   63697 non-null  int64  
 4   has_insurance_23                   63697 non-null  int64  
 5   has_life_insurance_fixed_cap       63697 non-null  int64  
 6   has_life_insurance_decreasing_cap  63697 non-null  int64  
 7   has_fire_car_other_insurance       63697 non-null  int64  
 8   has_personal_loan                  63697 non-null  int64  
 9   has_mortgage_loan                  63697 non-null  int64  
 10  has_current_account                63697 non-null  int64  
 11  has_pension_saving                 63697 non-null  int

In [4]:
train1['customer_since_all'] = pd.to_datetime(train1['customer_since_all'])
train1['customer_since_bank'] = pd.to_datetime(train1['customer_since_bank'])
train1['customer_birth_date'] = pd.to_datetime(train1['customer_birth_date'])

cat_vars = []
cont_vars = []
columns = list(train1.drop(columns = ['client_id']).columns)
for i in columns:
    if (len(train1.loc[:,i].unique()) >= 20):
        cont_vars.append(i)
    else:
        cat_vars.append(i)
        
print('Continious variables are:',cont_vars)
print('------------------')
print('Categorical variables are:',cat_vars)
# Here we have our list of categorical and continious variables

Continious variables are: ['bal_insurance_21', 'bal_insurance_23', 'cap_life_insurance_fixed_cap', 'cap_life_insurance_decreasing_cap', 'prem_fire_car_other_insurance', 'bal_personal_loan', 'bal_mortgage_loan', 'bal_current_account', 'bal_pension_saving', 'bal_savings_account', 'bal_savings_account_starter', 'bal_current_account_starter', 'customer_since_all', 'customer_since_bank', 'customer_birth_date', 'customer_postal_code']
------------------
Categorical variables are: ['homebanking_active', 'has_homebanking', 'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap', 'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance', 'has_personal_loan', 'has_mortgage_loan', 'has_current_account', 'has_pension_saving', 'has_savings_account', 'has_savings_account_starter', 'has_current_account_starter', 'visits_distinct_so', 'visits_distinct_so_areas', 'customer_gender', 'customer_occupation_code', 'customer_self_employed', 'customer_education', 'customer_children', 

### Categorical features unique values - checking consistency

In [5]:
# To check we will print all unique values
for col in cat_vars:
    print(col,train1[col].unique())

homebanking_active [0 1]
has_homebanking [0 1]
has_insurance_21 [0 1]
has_insurance_23 [0 1]
has_life_insurance_fixed_cap [0 1]
has_life_insurance_decreasing_cap [0 1]
has_fire_car_other_insurance [1 0]
has_personal_loan [0 1]
has_mortgage_loan [0 1]
has_current_account [1 0]
has_pension_saving [0 1]
has_savings_account [1 0]
has_savings_account_starter [0 1]
has_current_account_starter [0 1]
visits_distinct_so [1. 2. 3. 4. 6. 5. 7.]
visits_distinct_so_areas [1. 2. 3. 5. 4. 6.]
customer_gender [1 2]
customer_occupation_code [ 9. nan  7.  8.  4.  5.  0.  6.  3.  1.  2.]
customer_self_employed [0 1]
customer_education [ 0. nan  2.  1.  4.  3.  5.  6.]
customer_children [nan 'mature' 'no' 'young' 'preschool' 'adolescent' 'grownup' 'onebaby'
 'yes']
customer_relationship [nan 'couple' 'single']
target [0 1]


In [6]:
# looks good, should transform into categorical

### Missing Data

In [7]:
missing_df = train1.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/train1.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%
29,customer_since_all,234,0.367364
30,customer_since_bank,249,0.390913
34,customer_occupation_code,2002,3.143005
38,customer_relationship,14899,23.390427
37,customer_children,23364,36.679906
36,customer_education,47125,73.983076


In [8]:
missing_df = train2.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/train1.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%
29,customer_since_all,234,0.367364
30,customer_since_bank,249,0.390913
34,customer_occupation_code,2002,3.143005
38,customer_relationship,14476,22.726345
37,customer_children,23065,36.210497
36,customer_education,47125,73.983076


In [9]:
missing_df = train3.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/train1.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%
29,customer_since_all,234,0.367364
30,customer_since_bank,249,0.390913
34,customer_occupation_code,2002,3.143005
38,customer_relationship,14456,22.694946
37,customer_children,23056,36.196367
36,customer_education,47125,73.983076


In [10]:
for x in ['customer_since_all','customer_since_bank','customer_occupation_code','customer_relationship',
         'customer_children','customer_education']:
    print(str(x) + ": " + str(train2.loc[train1[x].isna(),x].value_counts().sum()))
    print(str(x) + ": " + str(train3.loc[train1[x].isna(),x].value_counts().sum()))
    print("-----------------------------------------")

customer_since_all: 0
customer_since_all: 0
-----------------------------------------
customer_since_bank: 0
customer_since_bank: 0
-----------------------------------------
customer_occupation_code: 0
customer_occupation_code: 0
-----------------------------------------
customer_relationship: 1000
customer_relationship: 1015
-----------------------------------------
customer_children: 977
customer_children: 984
-----------------------------------------
customer_education: 0
customer_education: 0
-----------------------------------------


In [11]:
# Fix customer relationship and customer children for training set
train1['customer_children2'] = train2['customer_children']
train1['customer_children3'] = train3['customer_children']

train1.customer_children = np.where(train1.customer_children.isnull(), train1.customer_children2, train1.customer_children)
train1.customer_children = np.where(train1.customer_children.isnull(), train1.customer_children3, train1.customer_children)


train1['customer_relationship2'] = train2['customer_relationship']
train1['customer_relationship3'] = train3['customer_relationship']

train1.customer_relationship = np.where(train1.customer_relationship.isnull(), train1.customer_relationship2, train1.customer_relationship)
train1.customer_relationship = np.where(train1.customer_relationship.isnull(), train1.customer_relationship3, train1.customer_relationship)

train2['customer_children'] = train1['customer_children']
train3['customer_children'] = train1['customer_children']

train2['customer_relationship'] = train1['customer_relationship']
train3['customer_relationship'] = train1['customer_relationship']

train1 = train1.drop(columns = ['customer_relationship2','customer_relationship3',
                                'customer_children2','customer_children3'])

In [12]:
for x in ['customer_since_all','customer_since_bank','customer_occupation_code','customer_relationship',
         'customer_children','customer_education']:
    print(str(x) + ": " + str(test2.loc[test2[x].isna(),x].value_counts().sum()))
    print(str(x) + ": " + str(test3.loc[test2[x].isna(),x].value_counts().sum()))
    print("-----------------------------------------")

customer_since_all: 0
customer_since_all: 0
-----------------------------------------
customer_since_bank: 0
customer_since_bank: 0
-----------------------------------------
customer_occupation_code: 0
customer_occupation_code: 0
-----------------------------------------
customer_relationship: 0
customer_relationship: 16
-----------------------------------------
customer_children: 0
customer_children: 9
-----------------------------------------
customer_education: 0
customer_education: 0
-----------------------------------------


In [13]:
# Fix customer relationship and customer children for training set
test1['customer_children2'] = test2['customer_children']
test1['customer_children3'] = test3['customer_children']

test1.customer_children = np.where(test1.customer_children.isnull(), test1.customer_children2, test1.customer_children)
test1.customer_children = np.where(test1.customer_children.isnull(), test1.customer_children3, test1.customer_children)


test1['customer_relationship2'] = test2['customer_relationship']
test1['customer_relationship3'] = test3['customer_relationship']

test1.customer_relationship = np.where(test1.customer_relationship.isnull(), test1.customer_relationship2, test1.customer_relationship)
test1.customer_relationship = np.where(test1.customer_relationship.isnull(), test1.customer_relationship3, test1.customer_relationship)

test2['customer_children'] = test1['customer_children']
test3['customer_children'] = test1['customer_children']

test2['customer_relationship'] = test1['customer_relationship']
test3['customer_relationship'] = test1['customer_relationship']

test1 = test1.drop(columns = ['customer_relationship2','customer_relationship3',
                                'customer_children2','customer_children3'])

In [14]:
#Check how many churners are for the missing rows in customer_since_all & 
train1['target'].loc[(train1['customer_since_all'].isna()) & (train1['customer_since_bank'].isna())].sum()
# We will drop columns with missing customer_since_all and customer_since_bank since they are very few churners among the NaN values

12

In [15]:
df_list=[train1,train2,train3,test1,test2,test3]

In [16]:
def drop_rows(x):
    x.drop(x[x['customer_since_all'].isna()].index,inplace = True)
    x.drop(x[x['customer_since_bank'].isna()].index,inplace = True)

In [17]:
for x in df_list:
    drop_rows(x)

In [18]:
missing_df = train1.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/train1.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%
34,customer_occupation_code,1980,3.120666
38,customer_relationship,13832,21.80053
37,customer_children,22305,35.154772
36,customer_education,46944,73.988148


Before we do anything with any of the columns with remainder of the missing values, we will select features

### hide temp explore

In [19]:
# sns.countplot(x='customer_education',data=train1,hue='target',palette="coolwarm_r")

In [20]:
# for x in range(0,10):
#     print(f'job_code {x}', len(train1.loc[(train1['customer_occupation_code']==x)]))
    
# sns.histplot(x='customer_occupation_code',data=train1,hue='target',palette="coolwarm_r")

In [21]:
# sns.histplot(x='customer_occupation_code',data=train1,hue='target',palette="coolwarm_r",multiple = "fill")

In [22]:


# a = len(train1.loc[(train1['customer_relationship']=='couple') & (train1['target']== 0)])+len(train1.loc[(train1['customer_relationship']=='couple') & (train1['target']== 1)]) 

# b = len(train1.loc[(train1['customer_relationship']=='single') & (train1['target']== 0)])+ len(train1.loc[(train1['customer_relationship']=='single') & (train1['target']== 1)]) 
# table=[['Relationship','0','1'],
#         [str(len(train1.loc[(train1['customer_relationship']=='couple')])) + ' Couple',round(len(train1.loc[(train1['customer_relationship']=='couple') & (train1['target']== 0)])/a,3 ),round(len(train1.loc[(train1['customer_relationship']=='couple') & (train1['target']== 1)])/a,3) ],
#         [str(len(train1.loc[(train1['customer_relationship']=='single')])) + ' Single',round(len(train1.loc[(train1['customer_relationship']=='single') & (train1['target']== 0)])/b,3) ,round(len(train1.loc[(train1['customer_relationship']=='single') & (train1['target']== 1)])/b,3) ]]
# print(tabulate(table, headers='firstrow'))



In [23]:
# rep = {'mature':1,'no':0, 'young':1,'preschool':1,'adolescent':1,'grownup':1,'onebaby':1
#  ,'yes':1}
# train1['customer_children']= train1['customer_children'].replace(rep) 

In [24]:
# c = len(train1.loc[(train1['customer_children']==0) & (train1['target']== 0)])+len(train1.loc[(train1['customer_children']==0) & (train1['target']== 1)]) 

# d = len(train1.loc[(train1['customer_children']==1) & (train1['target']== 0)])+ len(train1.loc[(train1['customer_children']==1) & (train1['target']== 1)]) 
# table=[['Children','0','1'],
#         [str(len(train1.loc[(train1['customer_children']==0)])) + ' Without kids',round(len(train1.loc[(train1['customer_children']==0) & (train1['target']== 0)])/c,2 ),round(len(train1.loc[(train1['customer_children']==0) & (train1['target']== 1)])/c,2) ],
#         [str(len(train1.loc[(train1['customer_children']==1)])) + ' With kids',round(len(train1.loc[(train1['customer_children']==1) & (train1['target']== 0)])/d,2) ,round(len(train1.loc[(train1['customer_children']==1) & (train1['target']== 1)])/d,2) ]]
# print(tabulate(table, headers='firstrow'))

In [25]:
# train1['customer_children'] = pd.Categorical(train1['customer_children'])
# sns.histplot(x='customer_children',data=train1,hue='target',palette="coolwarm_r")

In [26]:
# plot = sns.histplot(x='customer_children',data=train1,hue='target',palette="coolwarm_r",multiple = "fill")
# plot.set(ylim=(0,0.06))

In [27]:
# thjis var seems useful what do we do about missing values

In [28]:
# train1_test = train1.copy()
# # train1_test['customer_children'].isna()
# train1_test['customer_children'] = pd.to_numeric(train1_test['customer_children'])
# train1_test.loc[train1_test['customer_children'].isna(),'customer_children'] = 2

In [29]:
# train1_test['customer_children'].replace({0:'no children',1:'children',2:'missing'})
# train1_test['customer_children'] = pd.Categorical(train1_test['customer_children'])

In [30]:
# plot = sns.histplot(x='customer_children',data=train1_test,hue='target',palette="coolwarm_r",multiple = "fill")
# plot.set(ylim=(0,0.06))

## Create date column

In [31]:
def striptime(x):
    x['customer_since_all'] = pd.to_datetime(x['customer_since_all'])
    x['customer_since_bank'] = pd.to_datetime(x['customer_since_bank'])
    x['customer_birth_date'] = pd.to_datetime(x['customer_birth_date'])
    x['Birth_year'] = x['customer_birth_date'].dt.strftime('%Y').astype(str).astype(int)
    x['Year_since_all'] = x['customer_since_all'].dt.strftime('%Y').astype(str).astype(int)
    x['Year_since_bank'] = x['customer_since_bank'].dt.strftime('%Y').astype(str).astype(int)
    x.drop(['customer_since_all','customer_since_bank','customer_birth_date'], axis=1, inplace=True)

In [32]:
# def drop_col(x):
#     x.drop(['customer_children','customer_relationship','customer_occupation_code',
#             'customer_education',],axis=1, inplace=True) 
#     x.dropna(axis=0, inplace=True) 

In [33]:
for x in df_list:
    x = striptime(x)
#     print(x.shape)

## Feature Engineering

### Create Age variables

First, let's extract the customer's Age and drop Birth_year

In [34]:
def client_age(x):
    x['Age'] = x['Birth_year'].apply(lambda x: 2018 -x)
for x in df_list:
    client_age(x)

- We could see if there is a significant (large) difference in say balance or savings as this could be an indicator that the client is about to churn. (can just be stored as a boolean). 
- can also compute a boolean regarding the change in some services that the client has. Say he dropped in insurrance 21 last month or smtg. also as a boolean 

### Removing outliers based on Birth year/Age

We have noticed some of the clients are born in the 80s. Therefore we will assign a threshhold of 100years for the client's age.  In addition, the client's birth year cannot greater than the year he started using the banks services.

In [35]:
def remove_out(x):
    x.drop(x[x['Birth_year'] < 1919].index, inplace = True)
    x.drop(x.loc[x['Birth_year'] > (x['Year_since_all'] | x['Year_since_bank'])].index, inplace=True)

Additionally, customers aged 18 years old cannot own mature children or grownups. Hence they will be considered as outliers.

In [36]:
def remove_out2(x):
    x.drop(x.loc[(x['customer_children'] =="preschool") & (x['Age'] <21  )].index, inplace=True)
    x.drop(x.loc[(x['customer_children'] =="young") & (x['Age'] <28  )].index, inplace=True)
    x.drop(x.loc[(x['customer_children'] =="adolescent") & (x['Age'] <32  )].index, inplace=True)
    x.drop(x.loc[(x['customer_children'] =="grownup") & (x['Age'] <36  )].index, inplace=True)
    x.drop(x.loc[(x['customer_children'] =="mature") & (x['Age'] <42 )].index, inplace = True)

In [37]:
for x in df_list:
    remove_out(x)
    remove_out2(x)

### Create client since variable

In [38]:
def client_since(x):
    x['Year_since_all'] = x['Year_since_all'].apply(lambda x: 2018 -x)
    x['Year_since_bank'] = x['Year_since_bank'].apply(lambda x: 2018 -x)
    
for x in df_list:
    client_since(x)

### Create difference variables

In [39]:
#creates a new column in the dataset with a boolean regarding if the values between the 3 columns are the same. 

def change(orig1,orig2,orig3, cols_bool,cols_cont,key ='client_id'):
    columns= [key] + cols_bool + cols_cont
    merged_1 = pd.merge(orig1,orig2[columns], how = 'left',on = key, suffixes=['','-1'])
    merged = pd.merge(merged_1,orig3[columns], how = 'left',on = key, suffixes=['','-2'])
    to_drop =[]
    for var in cols_bool:
        merged['ch_{}'.format(var)] = np.where((merged[var]==merged['{}-1'.format(var)])&(
                                   merged[var] == merged['{}-2'.format(var)]),0,1)
        merged['ch_{}'.format(var)] = pd.Categorical(merged['ch_{}'.format(var)])
        merged[var] = pd.Categorical(merged[var])
        to_drop+=['{}-1'.format(var),'{}-2'.format(var)]
    for var in cols_cont:
        merged['diff_mth1_{}'.format(var)] = -merged[var]+merged['{}-1'.format(var)]
        merged['diff_mth2_{}'.format(var)] = -merged[var]+merged['{}-2'.format(var)]  
        to_drop+=['{}-1'.format(var),'{}-2'.format(var)]
    merged.drop(to_drop,axis=1,inplace=True)
    return merged

In [40]:
# create a new column regarding if there was a change in any var that starts w has_... in the last couple months
# will be 1 if there was any change in the last 2 months 0 otherwise
to_bool = ['homebanking_active', 'has_homebanking',
       'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap',
       'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance',
       'has_personal_loan', 'has_mortgage_loan', 'has_current_account',
       'has_pension_saving', 'has_savings_account',
       'has_savings_account_starter', 'has_current_account_starter']
to_diff_cont = ['bal_insurance_21', 'bal_insurance_23','bal_personal_loan', 
        'bal_mortgage_loan', 'bal_current_account',
        'bal_pension_saving', 'bal_savings_account',
        'bal_savings_account_starter', 'bal_current_account_starter',
        'cap_life_insurance_fixed_cap','cap_life_insurance_decreasing_cap']
train_s = change(train1,train2,train3, to_bool,to_diff_cont)
train_s.columns

Index(['client_id', 'homebanking_active', 'has_homebanking',
       'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap',
       'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance',
       'has_personal_loan', 'has_mortgage_loan', 'has_current_account',
       'has_pension_saving', 'has_savings_account',
       'has_savings_account_starter', 'has_current_account_starter',
       'bal_insurance_21', 'bal_insurance_23', 'cap_life_insurance_fixed_cap',
       'cap_life_insurance_decreasing_cap', 'prem_fire_car_other_insurance',
       'bal_personal_loan', 'bal_mortgage_loan', 'bal_current_account',
       'bal_pension_saving', 'bal_savings_account',
       'bal_savings_account_starter', 'bal_current_account_starter',
       'visits_distinct_so', 'visits_distinct_so_areas', 'customer_gender',
       'customer_postal_code', 'customer_occupation_code',
       'customer_self_employed', 'customer_education', 'customer_children',
       'customer_relationship',

In [41]:
train_s.isna().sum()

client_id                                      0
homebanking_active                             0
has_homebanking                                0
has_insurance_21                               0
has_insurance_23                               0
                                              ..
diff_mth2_bal_current_account_starter          0
diff_mth1_cap_life_insurance_fixed_cap         0
diff_mth2_cap_life_insurance_fixed_cap         0
diff_mth1_cap_life_insurance_decreasing_cap    0
diff_mth2_cap_life_insurance_decreasing_cap    0
Length: 77, dtype: int64

In [42]:
train1['cap_life_insurance_fixed_cap'].unique()

array([     0,   3120,  27730, 100000,   1180,   2480,   1610,   3970,
        63000,  12550,  16960,  40000,   1110,  82350,   2890,  18720,
        49580,  76450,  24790,   2810,  10000,  54540,   1730,    580,
        62000,   3130,  27210,  52330,  12390,    810,  50000,  20700,
         3010,   3500,  16410,  38790,   9300,   2900,  69570,   3720,
       125000,   2030,  12240,  12500, 171050,   1800,    670,  25010,
         5000,   2440,  22400,    250,   1430,  37180,    430,   4310,
        25000,    460,   6200,   7640,  22500,   7080,    530, 140000,
        27160,  11850,    620,  59490,   3360,  56890,   1240,  25510,
       104000,   9000,   9080,  10310,  85430, 127880, 200000,  91100,
        14230,  41980,  30230,     90,   9340,   4960,   2090,  20000,
        71690,  47830,  43380,    960, 150000,    990,  16110,   5040,
       220000,   8250,   3200,  19900,  10860,   3400,  49470,   3620,
         1990,   3060,   1260,   2260,   8750,  14390,   7680,   2450,
      

In [43]:
test_s = change(test1,test2,test3, to_bool, to_diff_cont)

## Feature selection

### Categorical vars

In [44]:
cat_variables = ['homebanking_active', 'has_homebanking',
       'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap',
       'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance',
       'has_personal_loan', 'has_mortgage_loan', 'has_current_account',
       'has_pension_saving', 'has_savings_account',
       'has_savings_account_starter', 'has_current_account_starter','visits_distinct_so', 'visits_distinct_so_areas', 'customer_gender',
       'customer_self_employed','ch_homebanking_active',
       'ch_has_homebanking', 'ch_has_insurance_21', 'ch_has_insurance_23',
       'ch_has_life_insurance_fixed_cap',
       'ch_has_life_insurance_decreasing_cap',
       'ch_has_fire_car_other_insurance', 'ch_has_personal_loan',
       'ch_has_mortgage_loan', 'ch_has_current_account',
       'ch_has_pension_saving', 'ch_has_savings_account',
       'ch_has_savings_account_starter', 'ch_has_current_account_starter','customer_children']
for var in cat_variables:
    train_s[var] = pd.Categorical(train_s[var]) 
    test_s[var] = pd.Categorical(test_s[var])

In [45]:
# sns.set(rc={'figure.figsize':(5,3)}) 
# fig, axes = plt.subplots(11, 3, figsize=(100, 60))

# i = 0
# j = 0
# for variable in cat_variables:
#     sns.histplot(ax=axes[i, j],data=train_s, x=variable, hue = "target" , multiple="stack",hue_order = [0,1])
#     j = j + 1
#     if (j > 2):
#         j = 0
#         i = i + 1

In [46]:
# fig, axes = plt.subplots(11, 3, figsize=(100, 60))

# i = 0
# j = 0
# for variable in cat_variables:
#     sns.histplot(ax=axes[i, j],data=train_s, x=variable, hue = "target" , multiple="fill",hue_order = [0,1])
#     j = j + 1
#     if (j > 2):
#         j = 0
#         i = i + 1

In [47]:
# sns.set(rc={'figure.figsize':(5,3)}) 
# fig, axes = plt.subplots(11, 3, figsize=(100, 60))

# i = 0
# j = 0
# for variable in cat_variables:
#     plot = sns.histplot(ax=axes[i, j],data=train_s, x=variable, hue = "target" , multiple="fill",hue_order = [0,1])
#     plot.set(ylim=(0, 0.1))
#     j = j + 1
#     if (j > 2):
#         j = 0
#         i = i + 1

In [48]:
vars_to_explore = ['has_insurance_23','has_insurance_21','has_fire_car_other_insurance',
                   'ch_has_insurance_21','ch_has_savings_account','ch_has_current_account',
                   'ch_homebanking_active','has_pension_saving','has_life_insurance_fixed_cap',
                   'customer_postal_code','has_current_account_starter','ch_has_current_account_starter',
                   'has_savings_account_starter','visits_distinct_so_areas','ch_has_life_insurance_decreasing_cap',
                   'ch_has_mortgage_loan','ch_has_savings_account_starter','ch_has_fire_car_other_insurance',
                   'ch_has_insurance_23','visits_distinct_so','ch_has_homebanking','ch_has_personal_loan',
                   'ch_has_pension_saving','ch_has_current_account_starter']

In [49]:
keep_cat = []
for x in cat_variables:
    if x not in vars_to_explore:
        keep_cat += [x]

In [50]:
from scipy.stats import chi2_contingency 
from scipy.stats import chi2

In [51]:
remove_cat = []
for x in vars_to_explore:
    test = pd.crosstab(train_s['target'],train_s[x],margins = False)
    stat, p, dof, expected = chi2_contingency(test)
    critical = chi2.ppf(0.99, dof)
    if abs(stat) >= critical:
        keep_cat = keep_cat + [x]
        print(x + ': Dependent (reject H0)')
    else:
        remove_cat = remove_cat + [x]
        print(x + ': Independent (fail to reject H0)')

has_insurance_23: Independent (fail to reject H0)
has_insurance_21: Dependent (reject H0)
has_fire_car_other_insurance: Dependent (reject H0)
ch_has_insurance_21: Independent (fail to reject H0)
ch_has_savings_account: Independent (fail to reject H0)
ch_has_current_account: Independent (fail to reject H0)
ch_homebanking_active: Independent (fail to reject H0)
has_pension_saving: Independent (fail to reject H0)
has_life_insurance_fixed_cap: Independent (fail to reject H0)
customer_postal_code: Dependent (reject H0)
has_current_account_starter: Dependent (reject H0)
ch_has_current_account_starter: Independent (fail to reject H0)
has_savings_account_starter: Dependent (reject H0)
visits_distinct_so_areas: Dependent (reject H0)
ch_has_life_insurance_decreasing_cap: Dependent (reject H0)
ch_has_mortgage_loan: Independent (fail to reject H0)
ch_has_savings_account_starter: Independent (fail to reject H0)
ch_has_fire_car_other_insurance: Dependent (reject H0)
ch_has_insurance_23: Independent 

In [52]:
print(keep_cat)

['homebanking_active', 'has_homebanking', 'has_life_insurance_decreasing_cap', 'has_personal_loan', 'has_mortgage_loan', 'has_current_account', 'has_savings_account', 'customer_gender', 'customer_self_employed', 'ch_has_life_insurance_fixed_cap', 'customer_children', 'has_insurance_21', 'has_fire_car_other_insurance', 'customer_postal_code', 'has_current_account_starter', 'has_savings_account_starter', 'visits_distinct_so_areas', 'ch_has_life_insurance_decreasing_cap', 'ch_has_fire_car_other_insurance', 'visits_distinct_so']


### Continious vars

In [53]:
cont_vars = ['bal_insurance_21', 'bal_insurance_23','bal_personal_loan', 
'bal_mortgage_loan', 'bal_current_account',
'bal_pension_saving', 'bal_savings_account',
'bal_savings_account_starter', 'bal_current_account_starter',
'cap_life_insurance_fixed_cap','cap_life_insurance_decreasing_cap',
'diff_mth1_bal_insurance_21', 'diff_mth2_bal_insurance_21',
'diff_mth1_bal_insurance_23', 'diff_mth2_bal_insurance_23',
'diff_mth1_bal_personal_loan', 'diff_mth2_bal_personal_loan',
'diff_mth1_bal_mortgage_loan', 'diff_mth2_bal_mortgage_loan',
'diff_mth1_bal_current_account', 'diff_mth2_bal_current_account',
'diff_mth1_bal_pension_saving', 'diff_mth2_bal_pension_saving',
'diff_mth1_bal_savings_account', 'diff_mth2_bal_savings_account',
'diff_mth1_bal_savings_account_starter',
'diff_mth2_bal_savings_account_starter',
'diff_mth1_bal_current_account_starter',
'diff_mth2_bal_current_account_starter',
'diff_mth1_cap_life_insurance_fixed_cap',
'diff_mth2_cap_life_insurance_fixed_cap',
'diff_mth1_cap_life_insurance_decreasing_cap',
'diff_mth2_cap_life_insurance_decreasing_cap','Age','Year_since_all','Year_since_bank']

In [54]:
# sns.set(rc={'figure.figsize':(5,3)}) 
# fig, axes = plt.subplots(12, 3, figsize=(100, 60))

# i = 0
# j = 0
# for variable in cont_vars:
#     sns.histplot(ax=axes[i, j],data=train_s, x=variable, hue = "target" , multiple="stack",hue_order = [0,1],
#                 bins = 50)
#     j = j + 1
#     if (j > 2):
#         j = 0
#         i = i + 1

In [55]:
# sns.set(rc={'figure.figsize':(5,3)}) 
# fig, axes = plt.subplots(12, 3, figsize=(100, 60))

# i = 0
# j = 0
# for variable in cont_vars:
#     plot = sns.histplot(ax=axes[i, j],data=train_s, x=variable, hue = "target" , multiple="fill",hue_order = [0,1],
#                         bins = 50)
#     plot.set(ylim=(0, 0.06))
#     j = j + 1
#     if (j > 2):
#         j = 0
#         i = i + 1

In [56]:
keep_cont = ['bal_savings_account','diff_mth2_bal_savings_account','bal_current_account',
             'diff_mth1_bal_current_account','diff_mth1_bal_savings_account','diff_mth2_bal_current_account',
             'Age','Year_since_all','Year_since_bank']

In [57]:
final_vars = keep_cat + keep_cont
len(final_vars)

29

In [177]:
train_final = train_s[['target']+final_vars]

In [160]:
test_final = test_s[['client_id'] + final_vars]

## Final training set missing data

In [161]:
missing_df = train_final.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/train_final.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%
12,customer_children,22147,37.89634


In [162]:
missing_df = test_final.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/test_final.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%
11,customer_children,9553,38.193667


In [163]:
rep = {'onebaby':'yes','preschool':'yes', 'young':'yes','adolescent':'yes','grownup':'yes','mature':'yes',
      np.nan:'missing','nan':'yes'}
train_final['customer_children'] = train_final['customer_children'].astype(str).replace(rep)
test_final['customer_children'] = test_final['customer_children'].astype(str).replace(rep)


In [164]:
missing_df = test_final.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['%']=(missing_df['missing values'])/test_final.shape[0]*100
missing_df = missing_df[missing_df['missing values'] >0].sort_values('%')
missing_df

Unnamed: 0,variable,missing values,%


In [165]:
# the old missing values that shouldn't be there have been fixed
# train_final = train_final.dropna()
# test_final = test_final.dropna()

In [166]:
print("From the initial data set, we lost " + str(round((copy_df['target'].sum()-train_final['target'].sum())/copy_df['target'].sum() *100)) +"%" + " churners after cleaning the data set. We will move on to the modeling step.")

From the initial data set, we lost 6% churners after cleaning the data set. We will move on to the modeling step.


## PCA - not used

In [67]:
## Dimension reductioin

In [68]:
# #check if df is factorable 
# train_fa = train_s.drop(['client_id','target'],axis=1)
# chi_square_value,p_value=calculate_bartlett_sphericity(train_fa)
# chi_square_value, p_value

In [69]:
# _,kmo_model=calculate_kmo(train_fa)

In [70]:
# fa = FactorAnalyzer(n_factors=30,rotation=None )
# fa.fit(train_fa )
# # Check Eigenvalues
# ev, v = fa.get_eigenvalues()
# ev

In [71]:
# # Create scree plot
# plt.scatter(range(1,train_fa.shape[1]+1),ev)
# plt.plot(range(1,train_fa.shape[1]+1),ev)
# plt.title('Scree Plot')
# plt.xlabel('Factors')
# plt.ylabel('Eigenvalue')
# plt.grid()
# plt.show()

In [72]:
#  #Create factor analysis object and perform factor analysis
# fa = FactorAnalyzer(20, rotation="varimax")
# fa.fit(train_fa)

In [73]:
# pd.DataFrame(fa.loadings_, index=train_fa.columns)

In [74]:
# check variance explained 
# pd.DataFrame(fa.get_factor_variance(), index=['SS Loadings','Proportion Var','Cumulative Var'])

## Prepare training Data

In [75]:
train_final.columns

Index(['target', 'homebanking_active', 'has_homebanking',
       'has_life_insurance_decreasing_cap', 'has_personal_loan',
       'has_mortgage_loan', 'has_current_account', 'has_savings_account',
       'customer_gender', 'customer_self_employed',
       'ch_has_life_insurance_fixed_cap', 'customer_children',
       'has_insurance_21', 'has_fire_car_other_insurance',
       'customer_postal_code', 'has_current_account_starter',
       'has_savings_account_starter', 'visits_distinct_so_areas',
       'ch_has_life_insurance_decreasing_cap',
       'ch_has_fire_car_other_insurance', 'visits_distinct_so',
       'bal_savings_account', 'diff_mth2_bal_savings_account',
       'bal_current_account', 'diff_mth1_bal_current_account',
       'diff_mth1_bal_savings_account', 'diff_mth2_bal_current_account', 'Age',
       'Year_since_all', 'Year_since_bank'],
      dtype='object')

In [179]:
# create matrix with interation terms
y = train_final.iloc[:,0]
X = pd.get_dummies(train_final,drop_first=True).iloc[:,1:]

In [192]:
test_final = pd.get_dummies(test_final, columns=['homebanking_active', 'has_homebanking',
       'has_life_insurance_decreasing_cap', 'has_personal_loan',
       'has_mortgage_loan', 'has_current_account', 'has_savings_account',
       'customer_gender', 'customer_self_employed',
       'ch_has_life_insurance_fixed_cap', 'customer_children',
       'has_insurance_21', 'has_fire_car_other_insurance',
       'customer_postal_code', 'has_current_account_starter',
       'has_savings_account_starter', 'visits_distinct_so_areas',
       'ch_has_life_insurance_decreasing_cap',
       'ch_has_fire_car_other_insurance', 'visits_distinct_so'] ,drop_first=True)

## Prepare testing Data

In [None]:
final_vars

# Model Creation

## Selecting best models

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings("ignore")



from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,f1_score, ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,plot_roc_curve,auc,precision_recall_curve,plot_precision_recall_curve,average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=4,stratify=y)

In [129]:
# Classifiers to use
models = []
# models.append(['KNeigbors',KNeighborsClassifier()])
# models.append(['RandomForest',RandomForestClassifier(random_state=4)])
# models.append(['AdaBoostClassifier',AdaBoostClassifier(random_state=4)])
# models.append(['LogisticsRegression', LogisticRegression(solver = 'saga',max_iter = 1000)])
# models.append(['GradientBoost',GradientBoostingClassifier(learning_rate=0.1,random_state=4)])
models.append(['XGBClassifier',xgb.XGBClassifier(eval_metric='logloss', random_state=4, n_jobs =4,use_label_encoder=False)])


We first fit the models without adding or changing any parameters with and without cross-validation.\
Our future goal is to improve the model for correctly predicting true positives. Hence, we are interested in the precision score of our models. \

In [130]:
lst_1 = []
for m in range(len(models)):
    lst_2 = []
    model = models[m][1]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]
    cm = confusion_matrix(y_test,y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    AUC_cv = cross_val_score(estimator= model, X = X_train,y = y_train, cv=5,scoring = 'roc_auc')
    Precision_cv = cross_val_score(estimator= model, X = X_train,y = y_train, cv=5,scoring = 'precision')
    Recall_cv = cross_val_score(estimator= model, X = X_train,y = y_train, cv=5,scoring = 'recall')
    F1_cv = cross_val_score(estimator= model, X = X_train,y = y_train, cv=5,scoring = 'f1')
    recall =  tp/(tp + fn) # more imp
    specificity = tn/(tn+fp) # most imp
    precision = tp/(tp+fp) # least imp
    accuracy = (tp + tn)/(tp + tn + fp + fn)
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
    AUC = auc(false_positive_rate, true_positive_rate)
    
    print(models[m][0],':')
    print(cm)
    print('AUC: {:.3f}'.format(AUC))
    print('10-CV AUC: {:.3f}'.format(AUC_cv.mean())) 
    print('10-CV Accuracy Standard Deviation: {:.3f}'.format(AUC_cv.std())) 

    
    
    lst_2.append(models[m][0])
    lst_2.append(AUC)
    lst_2.append(AUC_cv.mean())
    lst_2.append(AUC_cv.std())

    lst_2.append(Precision_cv.mean())
    lst_2.append(Precision_cv.std())
    lst_2.append(Recall_cv.mean())
    lst_2.append(Recall_cv.std())
    lst_2.append(F1_cv.mean())
    lst_2.append(F1_cv.std())    
    lst_1.append(lst_2)

XGBClassifier :
[[16053    10]
 [ 1462     8]]
AUC: 0.812
10-CV AUC: 0.697
10-CV Accuracy Standard Deviation: 0.020


In [131]:
df2 = pd.DataFrame(lst_1,columns=['Model','AUC','10-CV AUC','10-CV AUC std','10-CV Precision','10-CV Precisionstd','10-CV Recall','10-CV Recall std','F1','F1-std'])

df2.sort_values(by=['AUC'],inplace=True,ascending=False)
df2.reset_index(drop = True).round(decimals = 3)

Unnamed: 0,Model,AUC,10-CV AUC,10-CV AUC std,10-CV Precision,10-CV Precisionstd,10-CV Recall,10-CV Recall std,F1,F1-std
0,XGBClassifier,0.812,0.697,0.02,0.162,0.092,0.004,0.003,0.008,0.005


From the above matrix and the table shown above, XGBClassifier works best in catching True & False positives. However, the models accuracy drops by 30% after cross validation. Additionally, the low AUC score after 10 folds cv shows that we are overfitting the data. \
The main reason that we are having this issue is because we have an imbalanced target variable. Therefore we will try the following techniques to improve the model:
- Over sampling churned targets
- Smart sampling churned targets
- Adding misclassification costs (We will start with the inverse class distribution)

Our aim forward is to improve True positives/False positives ratio .\
  

# Wissam

##  Sampling techniques

### Over Sampling

In [126]:
models = []

#models.append(['AdaBoostClassifier',AdaBoostClassifier(random_state=4)])

# models.append(['GradientBoost',GradientBoostingClassifier(learning_rate=0.1,random_state=4)])
models.append(['XGBClassifier',xgb.XGBClassifier(eval_metric='logloss',random_state=4, n_jobs =4,use_label_encoder=False)])

In [124]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=4)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [127]:
lst_1 = []
for m in range(len(models)):
    lst_2 = []
    model = models[m][1]
    model.fit(X_resampled,y_resampled)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]
    cm = confusion_matrix(y_test,y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    AUC_cv = cross_val_score(estimator= model, X = X_resampled,y = y_resampled, cv=5,scoring = 'roc_auc')
    Precision_cv = cross_val_score(estimator= model, X = X_resampled,y = y_resampled, cv=5,scoring = 'precision')
    Recall_cv = cross_val_score(estimator= model, X = X_resampled,y = y_resampled, cv=5,scoring = 'recall')
    F1_cv = cross_val_score(estimator= model, X = X_resampled,y = y_resampled, cv=5,scoring = 'f1')
    recall =  tp/(tp + fn) # more imp
    specificity = tn/(tn+fp) # most imp
    precision = tp/(tp+fp) # least imp
    accuracy = (tp + tn)/(tp + tn + fp + fn)
    
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
    AUC = auc(false_positive_rate, true_positive_rate)
    
    print(models[m][0],':')
    print(cm)
    print('AUC: {:.3f}'.format(AUC))
    print('10-CV AUC: {:.3f}'.format(AUC_cv.mean())) 
    print('10-CV Accuracy Standard Deviation: {:.3f}'.format(AUC_cv.std())) 

    
    
    lst_2.append(models[m][0])
    lst_2.append(AUC)
    lst_2.append(AUC_cv.mean())
    lst_2.append(AUC_cv.std())

    lst_2.append(Precision_cv.mean())
    lst_2.append(Precision_cv.std())
    lst_2.append(Recall_cv.mean())
    lst_2.append(Recall_cv.std())
    lst_2.append(F1_cv.mean())
    lst_2.append(F1_cv.std())    
    lst_1.append(lst_2)

XGBClassifier :
[[15059  1004]
 [  969   501]]
AUC: 0.792
10-CV AUC: 0.981
10-CV Accuracy Standard Deviation: 0.002


In [128]:
df2 = pd.DataFrame(lst_1,columns=['Model','AUC','10-CV AUC','10-CV AUC std','10-CV Precision','10-CV Precisionstd','10-CV Recall','10-CV Recall std','F1','F1-std'])

df2.sort_values(by=['AUC'],inplace=True,ascending=False)
df2.reset_index(drop = True).round(decimals = 3)

Unnamed: 0,Model,AUC,10-CV AUC,10-CV AUC std,10-CV Precision,10-CV Precisionstd,10-CV Recall,10-CV Recall std,F1,F1-std
0,XGBClassifier,0.792,0.981,0.002,0.917,0.005,0.941,0.007,0.929,0.006


### Smart Sampling

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=4,stratify=y)

In [112]:
from imblearn.over_sampling import SMOTENC

In [118]:
cat_col_index = list(range(10,37))
sm = SMOTENC(categorical_features=cat_col_index, random_state=4, sampling_strategy=0.2)
X_trainres, y_trainres = sm.fit_resample(X_train, y_train)

In [None]:
y_trainres.sum()/len(y_trainres)

In [None]:
y_train.sum()/len(y_train)

In [139]:
for x in [0.1,0.15,0.2,0.25,0.3]: 
    cat_col_index = list(range(10,37))
    sm = SMOTENC(categorical_features=cat_col_index, random_state=4, sampling_strategy=x)
    X_trainres, y_trainres = sm.fit_resample(X_train, y_train)

    lst_1 = []
    for m in range(len(models)):
        lst_2 = []
        model = models[m][1]
        model.fit(X_trainres,y_trainres)
        y_test= model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:,1]
        cm = confusion_matrix(y_test,y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        AUC_cv = cross_val_score(estimator= model, X = X_trainres,y = y_trainres, cv=5,scoring = 'roc_auc')
        
        recall =  tp/(tp + fn) # more imp
        specificity = tn/(tn+fp) # most imp
        precision = tp/(tp+fp) # least imp
        accuracy = (tp + tn)/(tp + tn + fp + fn)
        
        false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
        AUC = auc(false_positive_rate, true_positive_rate)
        
        print(models[m][0],':')
        print(cm)
        # print('AUC: {:.3f}'.format(AUC))
        # print('10-CV AUC: {:.3f}'.format(AUC_cv.mean())) 
        # print('10-CV Accuracy Standard Deviation: {:.3f}'.format(AUC_cv.std())) 
        
        lst_2.append(models[m][0])
        lst_2.append(AUC)
        lst_2.append(AUC_cv.mean())
        lst_2.append(AUC_cv.std())
        lst_1.append(lst_2)
        df2 = pd.DataFrame(lst_1,columns=['Model','AUC','10-CV AUC','10-CV Acc std'])

        df2.sort_values(by=['AUC'],inplace=True,ascending=False)
        df2.reset_index(drop = True).round(decimals = 3)

XGBClassifier :
[[17395    13]
 [  120     5]]
XGBClassifier :
[[17274    12]
 [  241     6]]
XGBClassifier :
[[17127    13]
 [  388     5]]
XGBClassifier :
[[17037    12]
 [  478     6]]
XGBClassifier :
[[16932    12]
 [  583     6]]


In [101]:
df2 = pd.DataFrame(lst_1,columns=['Model','AUC','10-CV AUC','10-CV Acc std'])

df2.sort_values(by=['AUC'],inplace=True,ascending=False)
df2.reset_index(drop = True).round(decimals = 3)

Unnamed: 0,Model,AUC,10-CV AUC,10-CV Acc std
0,RandomForest,1.0,0.985,0.015
1,AdaBoostClassifier,1.0,0.865,0.021
2,XGBClassifier,1.0,0.964,0.025


In summary, the models are performing better on the test set after sampling the data. So far, XGBClassifier is the best candidate to model our data.\

Note: Over-sampling and smart sampling gave the same results. Hence, we will move forward with over-sampling for its quick computation time.

# Tune Parameters

## Testing on X_train y_train

In [140]:
clf = XGBClassifier(eval_metric='logloss',random_state=4, n_jobs =4,use_label_encoder=False)
grid_values = {'scale_pos_weight':[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'precision')
grid_clf_acc.fit(X_train, y_train)



GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=4,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=4, reg_alpha=None,
                                   

In [141]:
y_pred_acc = grid_clf_acc.predict(X_test)

# New Model Evaluation metrics 
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))

#Logistic Regression (Grid Search) Confusion matrix
confusion_matrix(y_test,y_pred_acc)

Accuracy Score : 0.9616722751383107
Precision Score : 0.31718061674008813
Recall Score : 0.12224108658743633
F1 Score : 0.17647058823529413


array([[16789,   155],
       [  517,    72]], dtype=int64)

In [142]:
print('\n Best hyperparameters:')
print(grid_clf_acc.best_params_)


 Best hyperparameters:
{'scale_pos_weight': 6}


In [143]:
print('\n Best hyperparameters:')
print(grid_clf_acc.best_params_)


 Best hyperparameters:
{'scale_pos_weight': 6}


## Testing on X_resampled y_resampled

In [144]:
clf = XGBClassifier(eval_metric='logloss',random_state=4, n_jobs =4,use_label_encoder=False)
grid_values = {'scale_pos_weight':[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'precision')
grid_clf_acc.fit(X_resampled, y_resampled)



y_pred_acc = grid_clf_acc.predict(X_test)

# New Model Evaluation metrics 
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))

#Logistic Regression (Grid Search) Confusion matrix
confusion_matrix(y_test,y_pred_acc)

Accuracy Score : 0.702218673358809
Precision Score : 0.0752017608217168
Recall Score : 0.6960950764006791
F1 Score : 0.13573911604039066


array([[11902,  5042],
       [  179,   410]], dtype=int64)

In [145]:
print('\n Best hyperparameters:')
print(grid_clf_acc.best_params_)


 Best hyperparameters:
{'scale_pos_weight': 5}


## Testing on SMOTE

In [146]:
clf = XGBClassifier(eval_metric='logloss',random_state=4, n_jobs =4,use_label_encoder=False)
grid_values = {'scale_pos_weight':[5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'precision')
grid_clf_acc.fit(X_trainres, y_trainres)



y_pred_acc = grid_clf_acc.predict(X_test)

# New Model Evaluation metrics 
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))

#Logistic Regression (Grid Search) Confusion matrix
confusion_matrix(y_test,y_pred_acc)

Accuracy Score : 0.8817087777334169
Precision Score : 0.21409318444358877
Recall Score : 0.9439728353140917
F1 Score : 0.34902699309478974


array([[14903,  2041],
       [   33,   556]], dtype=int64)

In [147]:
print('\n Best hyperparameters:')
print(grid_clf_acc.best_params_)


 Best hyperparameters:
{'scale_pos_weight': 5}


# Wissam 2.0 to be improved

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# To oversample and undersample data
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
# creating a list of numerical variables
numerical_features = numeric_columns
# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# creating a list of categorical variables
categorical_features = cat_columns.remove('Churn')
# creating a transformer for categorical variables, which will first apply simple imputer and 
#then do one hot encoding for categorical variables
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# handle_unknown = "ignore", allows model to handle any unknown category in the test data
# combining categorical transformer and numerical transformer using a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data 
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes
model = Pipeline(
    steps=[
        ("CT", NPSTF("MultipleLines")),
        ("pre", preprocessor),
        ("class balance", NearMiss(version=1)),
        (
            "XGB",
            XGBClassifier(random_state=1,subsample= 0.9, reg_lambda= 5, n_estimators= 50, \
                          learning_rate= 0.1, gamma= 1, eval_metric='logloss'),
        )
    ]
)

In [None]:
# Separating target variable and other variables
## Encoding Existing and Attrited customers to 0 and 1 respectively, for analysis.
data["Churn"].replace("No", 0, inplace=True)
data["Churn"].replace("Yes", 1, inplace=True)
X = data.drop(columns="Churn")
Y = data["Churn"]
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
# Fit the model on training data
model.fit(X_train, y_train)