### Import Necessary Libraries

In [1]:
## Data Analysis
import numpy as np
import pandas as pd

## Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

### Load the Dataset

In [2]:
df = pd.read_csv('BankChurners.csv')
df = df.iloc[:,0:-2]                ### Remove the last two columns as the are unneccessary.
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [3]:
print(f"The Data has {df.shape[0]} rows and {df.shape[1]} columns")

The Data has 10127 rows and 21 columns


### Feature Engineering

##### Dropping the 'CLIENTNUM' Column

In [4]:
df.drop('CLIENTNUM', axis=1, inplace=True)

##### Encoding the Target Variable

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Attrition_Flag'] = le.fit_transform(df['Attrition_Flag'])

##### Encoding the Genders Column

In [6]:
df['Gender'] = le.fit_transform(df['Gender'])

##### Dealing with the "Unknown" category in Education column

In [7]:
df['Education_Level'].unique()

array(['High School', 'Graduate', 'Uneducated', 'Unknown', 'College',
       'Post-Graduate', 'Doctorate'], dtype=object)

In [8]:
df['Education_Level'].value_counts()

Graduate         3128
High School      2013
Unknown          1519
Uneducated       1487
College          1013
Post-Graduate     516
Doctorate         451
Name: Education_Level, dtype: int64

- Here we will not be grouping the Categories. This is done so because if we group then the new category will overtake another Category

##### Dealing with the "Unknown" category in Marital Status column

In [10]:
## We have seen that in our EDA Section the Marital Status Unknowns can be replaced with Divorced as the Revolving Balance on these 2 Categories 
# are close, also the Count of Clients are also somewhat close to each other. Also in the Univariate Analysis we have seen that both the Categories 
# have the same percentage.

df['Marital_Status'].replace('Unknown', 'Divorced', inplace=True)

##### Dealing with the "Unknown" category in Income Category column

In [11]:
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

In [12]:
df['Income_Category'].value_counts()

Less than $40K    3561
$40K - $60K       1790
$80K - $120K      1535
$60K - $80K       1402
Unknown           1112
$120K +            727
Name: Income_Category, dtype: int64

In [13]:
for i in range(len(df)):
    if df['Card_Category'][i] == 'Blue':
        if df['Income_Category'][i] == 'Unknown':
            df['Income_Category'][i] = 'Less than $40K'
        
    elif df['Card_Category'][i] == 'Gold':
        if df['Income_Category'][i] == 'Unknown':
            df['Income_Category'][i] = '$60K - $80K'
        
    elif df['Card_Category'][i] == 'Silver':
        if df['Income_Category'][i] == 'Unknown':
            df['Income_Category'][i] = '$40K - $60K'
        
    elif df['Card_Category'][i] == 'Platinum':
        if df['Income_Category'][i] == 'Unknown':
            df['Income_Category'][i] = '$80K - $120K'
        
    else:
        pass

##### Dropping Unnecessary Columns

In [15]:
df.drop(columns=['Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1'], inplace=True)
df.shape

(10127, 18)

#### Outlier Treatment

##### Remove Outliers in Customer Age

In [17]:
Upper_Limit  = df["Customer_Age"].quantile(0.99)

df = df[(df["Customer_Age"] < Upper_Limit)]

In [19]:
df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,1,45,1,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1144,42,0.061
1,1,49,0,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1291,33,0.105
2,1,51,1,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,1887,20,0.0
3,1,40,0,4,High School,Divorced,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1171,20,0.76
4,1,40,1,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,816,28,0.0


### Feature Encoding

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
df_one = df.copy()
df_lab = df.copy()

#### LABEL ENCODING

In [22]:
categorical_features = [feature for feature in df.columns if df[feature].dtypes == 'O']
label_encoder = LabelEncoder()

for feature in categorical_features:
    trial = df_lab[[feature]]
    print(feature)
    trial[feature+' encode'] = label_encoder.fit_transform(trial[feature].values)
    trial = trial.drop_duplicates(feature)
    print(trial)
    print()

Education_Level
   Education_Level  Education_Level encode
0      High School                       3
1         Graduate                       2
4       Uneducated                       5
6          Unknown                       6
12         College                       0
16   Post-Graduate                       4
20       Doctorate                       1

Marital_Status
  Marital_Status  Marital_Status encode
0        Married                      1
1         Single                      2
3       Divorced                      0

Income_Category
  Income_Category  Income_Category encode
0     $60K - $80K                       2
1  Less than $40K                       4
2    $80K - $120K                       3
5     $40K - $60K                       1
6         $120K +                       0

Card_Category
    Card_Category  Card_Category encode
0            Blue                     0
6            Gold                     1
7          Silver                     3
443      Platinum   

In [23]:
for feature in categorical_features:
    ## Encode labels in all Categorical Columns.
    df_lab[feature]= label_encoder.fit_transform(df_lab[feature])

df_lab.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,1,45,1,3,3,1,2,0,39,5,1,3,12691.0,777,11914.0,1144,42,0.061
1,1,49,0,5,2,2,4,0,44,6,1,2,8256.0,864,7392.0,1291,33,0.105
2,1,51,1,3,2,1,3,0,36,4,1,0,3418.0,0,3418.0,1887,20,0.0
3,1,40,0,4,3,0,4,0,34,3,4,1,3313.0,2517,796.0,1171,20,0.76
4,1,40,1,3,5,1,2,0,21,5,1,0,4716.0,0,4716.0,816,28,0.0


#### One-Hot ENCODING

In [24]:
df_one = pd.get_dummies(df_one, drop_first=True)
df_one.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,...,Education_Level_Unknown,Marital_Status_Married,Marital_Status_Single,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,1,45,1,3,39,5,1,3,12691.0,777,...,0,1,0,0,1,0,0,0,0,0
1,1,49,0,5,44,6,1,2,8256.0,864,...,0,0,1,0,0,0,1,0,0,0
2,1,51,1,3,36,4,1,0,3418.0,0,...,0,1,0,0,0,1,0,0,0,0
3,1,40,0,4,34,3,4,1,3313.0,2517,...,0,0,0,0,0,0,1,0,0,0
4,1,40,1,3,21,5,1,0,4716.0,0,...,0,1,0,0,1,0,0,0,0,0


### Feature Scaling

In [25]:
from sklearn.preprocessing import MinMaxScaler

In [26]:
numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'O' and df[feature].nunique() > 10]
numerical_features

['Customer_Age',
 'Months_on_book',
 'Credit_Limit',
 'Total_Revolving_Bal',
 'Avg_Open_To_Buy',
 'Total_Trans_Amt',
 'Total_Trans_Ct',
 'Avg_Utilization_Ratio']

In [27]:
## Try this to get the columns that need not be scaled.
# [feature for feature in df.columns if feature not in numerical_features or feature in categorical_features] --> Label Encoded 
# [feature for feature in df_one.columns if feature not in numerical_features] --> One Hot Encoded 

In [28]:
scaler = MinMaxScaler()

# Transform the df_lab
df_lab = pd.concat([df_lab[['Attrition_Flag', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 
                            'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon']].reset_index(drop= True), 
       pd.DataFrame(scaler.fit_transform(df_lab[numerical_features]), columns= numerical_features)], axis= 1) ## Converting to Dataframe as after transform its an array.

df_lab.head()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Customer_Age,Months_on_book,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,1,1,3,3,1,2,0,5,1,3,0.5,0.604651,0.34019,0.308701,0.345116,0.035273,0.248062,0.061061
1,1,0,5,2,2,4,0,6,1,2,0.605263,0.72093,0.206112,0.343266,0.214093,0.043452,0.178295,0.105105
2,1,1,3,2,1,3,0,4,1,0,0.657895,0.534884,0.05985,0.0,0.098948,0.076611,0.077519,0.0
3,1,0,4,3,0,4,0,3,4,1,0.368421,0.488372,0.056676,1.0,0.022977,0.036775,0.077519,0.760761
4,1,1,3,5,1,2,0,5,1,0,0.368421,0.186047,0.099091,0.0,0.136557,0.017025,0.139535,0.0


In [29]:
scaler = MinMaxScaler()

# Transform the df_one.
df_one = pd.concat([df_one[['Attrition_Flag', 'Gender', 'Dependent_count', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 
                            'Education_Level_Doctorate', 'Education_Level_Graduate', 'Education_Level_High School', 'Education_Level_Post-Graduate', 
                            'Education_Level_Uneducated', 'Marital_Status_Married', 'Marital_Status_Single', 'Income_Category_$40K - $60K',
                            'Income_Category_$60K - $80K', 'Income_Category_$80K - $120K', 'Income_Category_Less than $40K', 'Card_Category_Gold', 
                            'Card_Category_Platinum', 'Card_Category_Silver']].reset_index(drop= True), 
       pd.DataFrame(scaler.fit_transform(df_one[numerical_features]), columns= numerical_features)], axis= 1) ## Converting to Dataframe as after transform its an array.

df_one.head()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,...,Card_Category_Platinum,Card_Category_Silver,Customer_Age,Months_on_book,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Trans_Amt,Total_Trans_Ct,Avg_Utilization_Ratio
0,1,1,3,5,1,3,0,0,1,0,...,0,0,0.5,0.604651,0.34019,0.308701,0.345116,0.035273,0.248062,0.061061
1,1,0,5,6,1,2,0,1,0,0,...,0,0,0.605263,0.72093,0.206112,0.343266,0.214093,0.043452,0.178295,0.105105
2,1,1,3,4,1,0,0,1,0,0,...,0,0,0.657895,0.534884,0.05985,0.0,0.098948,0.076611,0.077519,0.0
3,1,0,4,3,4,1,0,0,1,0,...,0,0,0.368421,0.488372,0.056676,1.0,0.022977,0.036775,0.077519,0.760761
4,1,1,3,5,1,0,0,0,0,0,...,0,0,0.368421,0.186047,0.099091,0.0,0.136557,0.017025,0.139535,0.0


### Feature Selection (Label Encoded Data)

In [30]:
## Capture the Independent Variables
X_lab = df_lab.drop(columns= ['Attrition_Flag'], axis= 1)

## Capture the Dependent Variable
y_lab = df_lab['Attrition_Flag']

#### STEP 1: Identify Correlated Features

In [31]:
# With the following Function we can select Highly Correlated Features it will remove the first feature that is Correlated with anything other Feature

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_lab, 0.7)
corr_features

{'Avg_Open_To_Buy', 'Months_on_book', 'Total_Trans_Ct'}

- We will not be removing these for the initial analysis. If there is an issue with these then we will comeback and remove them with the code below.

In [91]:
# X_lab.drop(corr_features, axis=1, inplace=True)
# X_lab.head()

Unnamed: 0,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Customer_Age,Credit_Limit,Total_Revolving_Bal,Total_Trans_Amt,Avg_Utilization_Ratio
0,1,3,3,1,2,0,5,1,3,0.5,0.34019,0.308701,0.035273,0.061061
1,0,5,2,2,4,0,6,1,2,0.605263,0.206112,0.343266,0.043452,0.105105
2,1,3,2,1,3,0,4,1,0,0.657895,0.05985,0.0,0.076611,0.0
3,0,4,3,0,4,0,3,4,1,0.368421,0.056676,1.0,0.036775,0.760761
4,1,3,5,1,2,0,5,1,0,0.368421,0.099091,0.0,0.017025,0.0


#### Use Ensemble Method

In [32]:
from sklearn.ensemble import ExtraTreesClassifier

## Genetic Disorder
model = ExtraTreesClassifier()
model.fit(X_lab,y_lab)

ExtraTreesClassifier()

In [33]:
print(model.feature_importances_)

[0.01799185 0.03084684 0.02954756 0.02523729 0.02269467 0.00674558
 0.08384451 0.04570963 0.05456961 0.0395949  0.03682338 0.04091442
 0.1207468  0.03955687 0.15794732 0.19435205 0.05287671]


In [34]:
ranked_features = pd.Series(model.feature_importances_, index= X_lab.columns)
ranked_features.nlargest(19)

Total_Trans_Ct              0.194352
Total_Trans_Amt             0.157947
Total_Revolving_Bal         0.120747
Total_Relationship_Count    0.083845
Contacts_Count_12_mon       0.054570
Avg_Utilization_Ratio       0.052877
Months_Inactive_12_mon      0.045710
Credit_Limit                0.040914
Customer_Age                0.039595
Avg_Open_To_Buy             0.039557
Months_on_book              0.036823
Dependent_count             0.030847
Education_Level             0.029548
Marital_Status              0.025237
Income_Category             0.022695
Gender                      0.017992
Card_Category               0.006746
dtype: float64

### Feature Selection (One-Hot Encoded Data)

In [35]:
## Capture the Independent Variables
X_one = df_one.drop(columns= ['Attrition_Flag'], axis= 1)

## Capture the Dependent Variable
y_one = df_one['Attrition_Flag']

#### STEP 1: Identify Correlated Features

In [36]:
# With the following Function we can select Highly Correlated Features it will remove the first feature that is Correlated with anything other Feature

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_one, 0.7)
corr_features

{'Avg_Open_To_Buy',
 'Income_Category_Less than $40K',
 'Marital_Status_Single',
 'Months_on_book',
 'Total_Trans_Ct'}

In [37]:
# X_one.drop(corr_features, axis=1, inplace=True)
# X_one.head()

#### Use Ensemble Method

In [38]:
from sklearn.ensemble import ExtraTreesClassifier

## Genetic Disorder
model = ExtraTreesClassifier()
model.fit(X_one,y_one)

ExtraTreesClassifier()

In [39]:
print(model.feature_importances_)

[0.01470539 0.03202896 0.07326464 0.04471163 0.05272345 0.0065986
 0.01497859 0.01246735 0.00721572 0.01145957 0.01563558 0.01418028
 0.00905466 0.00717552 0.00803028 0.01020966 0.00249456 0.00072094
 0.00531727 0.03969528 0.03713805 0.04190308 0.11034632 0.04003072
 0.14750912 0.18269819 0.05770657]


In [40]:
ranked_features = pd.Series(model.feature_importances_, index= X_one.columns)
ranked_features.nlargest(19)

Total_Trans_Ct                    0.182698
Total_Trans_Amt                   0.147509
Total_Revolving_Bal               0.110346
Total_Relationship_Count          0.073265
Avg_Utilization_Ratio             0.057707
Contacts_Count_12_mon             0.052723
Months_Inactive_12_mon            0.044712
Credit_Limit                      0.041903
Avg_Open_To_Buy                   0.040031
Customer_Age                      0.039695
Months_on_book                    0.037138
Dependent_count                   0.032029
Marital_Status_Married            0.015636
Education_Level_Graduate          0.014979
Gender                            0.014705
Marital_Status_Single             0.014180
Education_Level_High School       0.012467
Education_Level_Uneducated        0.011460
Income_Category_Less than $40K    0.010210
dtype: float64

### Feature Selection - Conclusion

- Since the methods used other than correlation didn't yield a good result we will go with the correlation method.

In [41]:
label_data = X_lab.copy()
label_data['Attrition_Flag'] = y_lab

onehot_data = X_one.copy()
onehot_data['Attrition_Flag'] = y_one

In [42]:
# Convert DataFrame to csv.
label_data.to_csv('label.csv', index= False)
onehot_data.to_csv('onehot.csv', index= False)
print('Both csv is ready')

Both csv is ready
