### Libraries & Data Import 

In [2]:
import pdb
import pandas as pd
import numpy as np
import os
import random
from scipy import stats
from scipy.stats import gaussian_kde
from datetime import datetime, timedelta
from faker import Faker
np.random.seed(3101)
random_state = np.random.RandomState(3101)
fake = Faker()
pd.set_option('display.float_format', lambda x: '{:.6f}'.format(x))

In [8]:
bank_df_train = pd.read_csv('./data/main/train.csv')
n_train = bank_df_train.shape[0]

### Data Cleaning

#### Remove Irrelevant Columns 

In [9]:
bank_df_train = bank_df_train.drop(["id", "CustomerId", "Surname", "CreditScore", "Geography", "HasCrCard"],axis = 1)

### Adding new columns 

#### Customer Id 

In [20]:
cust_id = np.arange(1,n_train+1,1)
bank_df_train['CustomerId'] = cust_id

<bound method NDFrame.head of         Gender       Age  Tenure       Balance  NumOfProducts  IsActiveMember  \
0         Male 33.000000       3      0.000000              2        0.000000   
1         Male 33.000000       1      0.000000              2        1.000000   
2         Male 40.000000      10      0.000000              2        0.000000   
3         Male 34.000000       2 148882.540000              1        1.000000   
4         Male 33.000000       5      0.000000              2        1.000000   
...        ...       ...     ...           ...            ...             ...   
165029  Female 33.000000       2      0.000000              1        1.000000   
165030    Male 35.000000       3      0.000000              1        0.000000   
165031    Male 31.000000       5      0.000000              1        1.000000   
165032  Female 30.000000       7 161533.000000              1        1.000000   
165033    Male 31.000000       1      0.000000              1        0.000000  

In [None]:
#### Service Support Freq (per mth)

Includes missed calls 

In [11]:
support_freq = pd.read_csv("./data/support_frequency.csv")
kde = gaussian_kde(support_freq['no_of_cases'])
bank_df_train['ServiceSupportFrequency'] = abs(kde.resample(n_train).flatten()/12).astype(int)

print(bank_df_train['ServiceSupportFrequency'].describe())


count   165034.000000
mean        13.336537
std         12.496536
min          0.000000
25%          4.000000
50%          9.000000
75%         20.000000
max         69.000000
Name: ServiceSupportFrequency, dtype: float64


#### Net Promoter Score (NPS)
Measure of customer satisfaction and loyalty.
Scaled from 1 to 10 


In [12]:
nps = pd.read_csv('./data/NPS.csv')

nps_data = nps.groupby('Customer Name').agg({'NPS':'mean'}).reset_index()
kde = gaussian_kde(nps_data['NPS'])
bank_df_train['NPS'] = abs(kde.resample(n_train).flatten()).astype(int)
bank_df_train['NPS'] = np.ceil(bank_df_train['NPS']/12*10)

print(bank_df_train['NPS'].describe())

count   165034.000000
mean         5.737539
std          2.978096
min          0.000000
25%          4.000000
50%          7.000000
75%          8.000000
max         10.000000
Name: NPS, dtype: float64


#### Education
Education level might influence financial behavior and churn.


In [27]:
education_marital = pd.read_csv("./data/education_marital.csv")
education = education_marital['Education_Level'].unique()
edu, counts = np.unique(education, return_counts=True)
value_to_index = {value: i for i, value in enumerate(edu)}
numerical_data = [value_to_index[value] for value in education]
kde = gaussian_kde(numerical_data)
x_values = np.unique(numerical_data)
pdf_values = kde(x_values)
pmf = pdf_values / np.sum(pdf_values)
resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)
resampled_values = [edu[index] for index in resampled_indices]

bank_df_train['Education'] = resampled_values
bank_df_train['Education'].replace({'Unknown': 'PSLE'}, inplace=True)
bank_df_train['Education'].replace({'Uneducated': 'O/N'}, inplace=True)
bank_df_train['Education'].replace({'High School': 'A'}, inplace=True)
bank_df_train['Education'].replace({'College': 'Diploma'}, inplace=True)
bank_df_train['Education'].replace({'Graduate': 'Bachelors'}, inplace=True)
bank_df_train['Education'].replace({'Post-Graduate': 'Masters'}, inplace=True)
bank_df_train['Education'].replace({'Doctorate': np.random.choice(['PHD', 'Post-Doc'])}, inplace=True)

print(bank_df_train['Education'].unique())

['Bachelors' 'Masters' 'A' 'Post-Doc' 'PSLE' 'Diploma' 'O/N']


#### Employment Status
Indicates financial stability, affecting churn likelihood

In [28]:
employment = pd.read_csv("./data/employment.csv")

employment_data = employment['job'].unique()

employment_status, counts = np.unique(employment_data, return_counts=True)
value_to_index = {value: i for i, value in enumerate(employment_status)}
numerical_data = [value_to_index[value] for value in employment_data]

kde = gaussian_kde(numerical_data)

x_values = np.unique(numerical_data)
pdf_values = kde(x_values)
pmf = pdf_values / np.sum(pdf_values)

resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)

resampled_values = [employment_status[index] for index in resampled_indices]

bank_df_train['EmploymentStatus'] = resampled_values

bank_df_train['EmploymentStatus'].replace({'admin.': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'blue-collar': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'entrepreneur': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'housemaid': 'Parttime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'management': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'retired': 'Retired'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'student': 'Student'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'services': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'self-employed': 'Self-employed'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'technician': 'Parttime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'unemployed': 'Unemployed'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'unknown': 'Parttime'}, inplace=True)

print(bank_df_train['EmploymentStatus'].unique())

['Fulltime' 'Unemployed' 'Retired' 'Student' 'Self-employed' 'Parttime']


#### Marital Status
Can impact financial decision-making and churn behavior. 

In [30]:
education_marital = pd.read_csv("./data/education_marital.csv")

marital_status = education_marital['Marital_Status'].unique()

marital, counts = np.unique(marital_status, return_counts=True)
value_to_index = {value: i for i, value in enumerate(marital)}
numerical_data = [value_to_index[value] for value in marital_status]

kde = gaussian_kde(numerical_data)

x_values = np.unique(numerical_data)
pdf_values = kde(x_values)

pmf = pdf_values / np.sum(pdf_values)

resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)

resampled_values = [marital[index] for index in resampled_indices]

bank_df_train['MaritalStatus'] = resampled_values

bank_df_train['MaritalStatus'].replace({'Unknown': 'Widowed'}, inplace=True)

print(bank_df_train['MaritalStatus'].unique())

['Single' 'Married' 'Divorced' 'Widowed']
