### Libraries & Data Import 

In [64]:
import pdb
import pandas as pd
import numpy as np
import os
import random
from scipy import stats
from scipy.stats import gaussian_kde
from datetime import datetime, timedelta
from faker import Faker
np.random.seed(3101)
random_state = np.random.RandomState(3101)
fake = Faker()
pd.set_option('display.float_format', lambda x: '{:.6f}'.format(x))

In [65]:
bank_df_train = pd.read_csv('./data/main/train.csv')
n_train = bank_df_train.shape[0]

### Data Cleaning

#### Remove Irrelevant Columns 

In [66]:
bank_df_train = bank_df_train.drop(["id", "CustomerId", "Surname", "CreditScore", "Geography", "HasCrCard", "IsActiveMember"],axis = 1)

### Adding new columns 

#### Customer Id 

In [67]:
cust_id = np.arange(1,n_train+1,1)
bank_df_train['CustomerId'] = cust_id

In [68]:
#### Service Support Freq (per mth)

Includes missed calls 

In [69]:
support_freq = pd.read_csv("./data/support_frequency.csv")
kde = gaussian_kde(support_freq['no_of_cases'])
bank_df_train['ServiceSupportFrequency'] = abs(kde.resample(n_train).flatten()/12).astype(int)

print(bank_df_train['ServiceSupportFrequency'].describe())


count   165034.000000
mean        13.348552
std         12.490492
min          0.000000
25%          4.000000
50%          9.000000
75%         20.000000
max         73.000000
Name: ServiceSupportFrequency, dtype: float64


#### Net Promoter Score (NPS)
Measure of customer satisfaction and loyalty.
Scaled from 1 to 10 


In [70]:
nps = pd.read_csv('./data/NPS.csv')

nps_data = nps.groupby('Customer Name').agg({'NPS':'mean'}).reset_index()
kde = gaussian_kde(nps_data['NPS'])
bank_df_train['NPS'] = abs(kde.resample(n_train).flatten()).astype(int)
bank_df_train['NPS'] = np.ceil(bank_df_train['NPS']/12*10)

print(bank_df_train['NPS'].describe())

count   165034.000000
mean         5.737745
std          2.986255
min          0.000000
25%          4.000000
50%          7.000000
75%          8.000000
max         10.000000
Name: NPS, dtype: float64


#### Education
Education level might influence financial behavior and churn.


In [71]:
education_marital = pd.read_csv("./data/education_marital.csv")
education = education_marital['Education_Level'].unique()
edu, counts = np.unique(education, return_counts=True)
value_to_index = {value: i for i, value in enumerate(edu)}
numerical_data = [value_to_index[value] for value in education]
kde = gaussian_kde(numerical_data)
x_values = np.unique(numerical_data)
pdf_values = kde(x_values)
pmf = pdf_values / np.sum(pdf_values)
resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)
resampled_values = [edu[index] for index in resampled_indices]

bank_df_train['Education'] = resampled_values
bank_df_train['Education'].replace({'Unknown': 'PSLE'}, inplace=True)
bank_df_train['Education'].replace({'Uneducated': np.random.choice(['A', 'Diploma'])}, inplace=True)
bank_df_train['Education'].replace({'High School': 'Bachelors'}, inplace=True)
bank_df_train['Education'].replace({'College': 'O/N'}, inplace=True)
bank_df_train['Education'].replace({'Graduate': 'Masters'}, inplace=True)
bank_df_train['Education'].replace({'Post-Graduate': 'PHD'}, inplace=True)
bank_df_train['Education'].replace({'Doctorate': 'Post-Doc'}, inplace=True)

print(bank_df_train['Education'].unique())

['Bachelors' 'Masters' 'Diploma' 'O/N' 'PHD' 'Post-Doc' 'PSLE']


In [72]:
bank_df_train['Education'].value_counts()

Education
Bachelors    27923
Masters      26974
PHD          26673
Diploma      23896
Post-Doc     23838
O/N          17868
PSLE         17862
Name: count, dtype: int64

#### Employment Status
Indicates financial stability, affecting churn likelihood

In [73]:
employment = pd.read_csv("./data/employment.csv")

employment_data = employment['job'].unique()

employment_status, counts = np.unique(employment_data, return_counts=True)
value_to_index = {value: i for i, value in enumerate(employment_status)}
numerical_data = [value_to_index[value] for value in employment_data]

kde = gaussian_kde(numerical_data)

x_values = np.unique(numerical_data)
pdf_values = kde(x_values)
pmf = pdf_values / np.sum(pdf_values)

resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)

resampled_values = [employment_status[index] for index in resampled_indices]

bank_df_train['EmploymentStatus'] = resampled_values

bank_df_train['EmploymentStatus'].replace({'admin.': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'blue-collar': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'entrepreneur': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'housemaid': 'Parttime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'management': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'retired': 'Retired'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'student': 'Student'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'services': 'Fulltime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'self-employed': 'Self-employed'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'technician': 'Parttime'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'unemployed': 'Unemployed'}, inplace=True)
bank_df_train['EmploymentStatus'].replace({'unknown': 'Parttime'}, inplace=True)

print(bank_df_train['EmploymentStatus'].unique())

['Parttime' 'Fulltime' 'Self-employed' 'Student' 'Retired' 'Unemployed']


#### Marital Status
Can impact financial decision-making and churn behavior. 

In [74]:
education_marital = pd.read_csv("./data/education_marital.csv")

marital_status = education_marital['Marital_Status'].unique()

marital, counts = np.unique(marital_status, return_counts=True)
value_to_index = {value: i for i, value in enumerate(marital)}
numerical_data = [value_to_index[value] for value in marital_status]

kde = gaussian_kde(numerical_data)

x_values = np.unique(numerical_data)
pdf_values = kde(x_values)

pmf = pdf_values / np.sum(pdf_values)

resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)

resampled_values = [marital[index] for index in resampled_indices]

bank_df_train['MaritalStatus'] = resampled_values

bank_df_train['MaritalStatus'].replace({'Unknown': 'Widowed'}, inplace=True)

print(bank_df_train['MaritalStatus'].unique())

['Married' 'Single' 'Widowed' 'Divorced']


#### Housing Status
Reflects stability and long-term commitment, influencing churn. 

In [75]:
housing = pd.read_csv("./data/housing.csv")

housing_status = housing['House_Ownership'].unique()

house, counts = np.unique(housing_status, return_counts=True)
value_to_index = {value: i for i, value in enumerate(house)}
numerical_data = [value_to_index[value] for value in housing_status]

kde = gaussian_kde(numerical_data)

x_values = np.unique(numerical_data)
pdf_values = kde(x_values)

pmf = pdf_values / np.sum(pdf_values)

resampled_indices = np.random.choice(x_values, size=n_train, p=pmf)

resampled_values = [house[index] for index in resampled_indices]

bank_df_train['HousingStatus'] = resampled_values

print(bank_df_train['HousingStatus'].unique())

['owned' 'norent_noown' 'rented']


#### Number of Dependents
Impacts financial priorities and risk tolerance, affecting churn.

In [76]:
dependants_df = pd.read_csv('./data/education+dependents+maritalstatus/BankChurners.csv')

kde = gaussian_kde(dependants_df['Dependent_count'])

bank_df_train['Dependants'] = np.maximum(kde.resample(len(bank_df_train)).flatten(), 0)
bank_df_train['Dependants'] = np.round(bank_df_train['Dependants']).astype(int)

print(bank_df_train['Dependants'].describe())
print(bank_df_train['Dependants'].unique())

count   165034.000000
mean         2.343687
std          1.305842
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max          6.000000
Name: Dependants, dtype: float64
[3 4 2 0 1 5 6]


#### Marketing Offers Accepted
Indicates responsiveness to incentives, affecting churn. Range 0-1, (percentage of marketing offers they accept, e.g.5 offer, 4 accepted, the value of the column 0.8)

In [77]:
marketing_df = pd.read_csv('./data/marketing offers/marketing_campaign.csv', sep=';')

marketing_df["MarketingOffersAcceptance"] = (marketing_df['AcceptedCmp1'] + marketing_df['AcceptedCmp2'] + marketing_df['AcceptedCmp3'] + marketing_df['AcceptedCmp4'] + marketing_df['AcceptedCmp5']) / 5

kde = gaussian_kde(marketing_df['MarketingOffersAcceptance'])

bank_df_train['MarketingOffersAcceptance'] = np.maximum(kde.resample(len(bank_df_train)).flatten(), 0)

print(bank_df_train['MarketingOffersAcceptance'].describe())

count   165034.000000
mean         0.068896
std          0.133310
min          0.000000
25%          0.000000
50%          0.009626
75%          0.046468
max          0.878459
Name: MarketingOffersAcceptance, dtype: float64


#### Preferred Payment Methods 
Reflects preferred banking channels and engagement level. 
faker

In [78]:
transaction_channel_df = pd.read_csv('./data/main_payment_method/WA_Fn-UseC_-Telco-Customer-Churn.csv')
methods = transaction_channel_df['PaymentMethod'].unique().tolist()
faked_data = [random.choice(methods) for _ in range(len(bank_df_train))]

bank_df_train['PaymentMethod'] = faked_data

bank_df_train['PaymentMethod'].replace({'Bank transfer (automatic)': 'Intrabank transfer (GXS Savings Account only)'}, inplace=True)
bank_df_train['PaymentMethod'].replace({'Electronic check': 'PayNow'}, inplace=True)
bank_df_train['PaymentMethod'].replace({'Credit card (automatic)': 'Debit card'}, inplace=True)
bank_df_train['PaymentMethod'].replace({'Mailed check': 'FAST'}, inplace=True)

print(bank_df_train['PaymentMethod'].unique())

['PayNow' 'Intrabank transfer (GXS Savings Account only)' 'Debit card'
 'FAST']


#### Brand Satisfaction
Provides direct feedback on satisfaction levels, predicting churn. 

In [79]:
cust_satisfaction_df = pd.read_csv('./data/satisfaction score/Customer-Churn-Records.csv')

kde = gaussian_kde(cust_satisfaction_df['Satisfaction Score'])

fake_data = np.maximum(kde.resample(len(bank_df_train)).flatten(), 0)
fake_data = np.minimum(fake_data.flatten(), 5)
fake_data = np.round(fake_data).astype(int)

bank_df_train['BrandSatisfaction'] = fake_data

print(bank_df_train['BrandSatisfaction'].describe())

count   165034.000000
mean         3.011573
std          1.410494
min          0.000000
25%          2.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: BrandSatisfaction, dtype: float64


#### Feature Satisfaction
Scale on 1 to 5


In [80]:
feature_df = pd.read_csv('./data/feature_and_support_satisfaction/Customer-survey-data.csv')
feature_df = feature_df.dropna()
kde = gaussian_kde(feature_df['How satisfied were you with your overall delivery experience at Ali?                    1-5 where 1 = extremely dissatisfied and 5 = extremely satisfied'])
bank_df_train['FeatureSatisfaction'] = kde.resample(len(bank_df_train)).flatten().astype(int)
bank_df_train['FeatureSatisfaction'] = bank_df_train['FeatureSatisfaction']
bank_df_train['FeatureSatisfaction'] = bank_df_train['FeatureSatisfaction'].astype(int)
print(bank_df_train['FeatureSatisfaction'].unique())
print(bank_df_train['FeatureSatisfaction'].describe())

[0 1 4 5 2 3]
count   165034.000000
mean         2.816517
std          1.508244
min          0.000000
25%          2.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: FeatureSatisfaction, dtype: float64


#### Support Satisfaction
Scale on 1 to 5


In [81]:
feature_df = pd.read_csv('./data/feature_and_support_satisfaction/Customer-survey-data.csv')
feature_df = feature_df.dropna()
kde = gaussian_kde(feature_df['How satisfied were you with the speed of delivery at Alis?                                1-5 where 1 = extremely dissatisfied and 5 = extremely satisfied'])
bank_df_train['SupportSatisfaction'] = kde.resample(len(bank_df_train)).flatten().astype(int)

print(bank_df_train['SupportSatisfaction'].unique())
print(bank_df_train['SupportSatisfaction'].describe())

[3 4 1 5 2 0]
count   165034.000000
mean         2.825278
std          1.491990
min          0.000000
25%          2.000000
50%          3.000000
75%          4.000000
max          5.000000
Name: SupportSatisfaction, dtype: float64
