# IMPORTS

## Libraries

In [2]:
import warnings

import numpy  as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats as ss

from IPython.display      import Image
from IPython.core.display import HTML


# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and runtime warnings raised by the
# libraries imported above; narrow the filter if one specific warning is the target.
warnings.filterwarnings("ignore")

## Load Dataset

In [3]:
# Load the df03 Feather snapshot (path is relative to the notebook's working directory).
# dfRaw is left as loaded; the transformations below run on a copy of it.
dfRaw = pd.read_feather('00-Data/FeatherData/df03.feather')

In [4]:
# Preview the first five rows of the loaded frame.
dfRaw.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Origin
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,True,True,101348.88,True,Latin
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,False,True,112542.58,False,Latin
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,True,False,113931.57,True,Latin
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,False,False,93826.63,False,Latin
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,True,True,79084.1,False,Latin


## Helper Functions

#### Jupyter Settings

In [5]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML('<style>.container { width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    sns.set()

In [6]:
# Apply the plotting and display configuration defined in jupyter_settings().
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


#### Functions

In [6]:
def cramerV(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Builds the contingency table of ``x`` against ``y``, takes the chi-squared
    statistic from ``scipy.stats.chi2_contingency`` and applies the bias
    correction to phi² (= chi² / n) and to the table dimensions before
    forming V.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Categorical observations of equal length (anything ``pd.crosstab``
        accepts as row / column input).

    Returns
    -------
    float
        Association strength between 0 (none) and 1 (perfect). ``nan`` when
        the statistic is undefined, e.g. when either variable has fewer than
        two observed categories.
    """
    cm = pd.crosstab(x, y).to_numpy()
    n = cm.sum()
    r, k = cm.shape

    # A single row or column leaves no degrees of freedom: V is undefined.
    if min(r, k) < 2:
        return np.nan

    # Note: for 2x2 tables scipy applies Yates' continuity correction by default.
    chi2 = ss.chi2_contingency(cm)[0]

    # The correction term belongs to phi2 (= chi2 / n), not to chi2 itself;
    # subtracting it from chi2 under-corrects and can push V above 1.
    phi2 = chi2 / n
    phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))

    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)

    return np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))

# DATA PREPARATION

In [7]:
# Work on a copy so dfRaw stays exactly as loaded.
df04 = dfRaw.copy()

## ~~Normalization~~

## Rescaling

In [9]:
# Integer, float and boolean columns, minus the columns listed in NotNumerial
NotNumerial = ['RowNumber', 'CustomerId']
numAttributesAndBool = (
    df04
    .select_dtypes(include=['int64', 'float64', 'bool'])
    .loc[:, lambda frame: ~frame.columns.isin(NotNumerial)]
)
numAttributesAndBool.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,42,2,0.0,1,True,True,101348.88,True
1,608,41,1,83807.86,1,False,True,112542.58,False
2,502,42,8,159660.8,3,True,False,113931.57,True
3,699,39,1,0.0,2,False,False,93826.63,False
4,850,43,2,125510.82,1,True,True,79084.1,False


In [22]:
import bisect

def boundaries(num, breakpoints=(10, 20, 30, 45, 60, 70, 80, 120), result='01234567'):
    """Return the life-stage label for an age.

    ``breakpoints`` holds the inclusive upper age (in whole years) of each
    bucket, in ascending order. With the defaults: up to 10 'Child',
    11-20 'Teenager', 21-30 'Young', 31-45 'Adult', 46-60 'Midlife',
    61-70 'Mature Adulthood', 71-80 'Senior', above 80 'Late Adulthood'.

    Parameters
    ----------
    num : int or float
        Age to classify.
    breakpoints : sequence of numbers, optional
        Sorted upper bounds of the buckets (list or tuple).
    result : str, optional
        Unused; kept only so existing calls that pass it keep working.

    Returns
    -------
    str or float
        The bucket label. Ages beyond the last breakpoint fall into the last
        bucket instead of raising ``KeyError``. A NaN age is returned
        unchanged so missing values stay missing.
    """
    ageMapping = (
        'Child',
        'Teenager',
        'Young',
        'Adult',
        'Midlife',
        'Mature Adulthood',
        'Senior',
        'Late Adulthood',
    )

    # NaN compares unequal to itself; bisect would otherwise sort it past
    # every breakpoint and label it as the oldest bucket.
    if num != num:
        return num

    # `num - 1` with right-insertion makes each breakpoint an inclusive upper
    # bound (age 10 -> bucket 0, age 11 -> bucket 1).
    i = bisect.bisect(breakpoints, num - 1)

    # Clamp so ages above the last breakpoint land in the final bucket.
    return ageMapping[min(i, len(ageMapping) - 1)]




# Fixed example instead of input(): an interactive prompt blocks Restart & Run All
# and fails when the notebook is executed without a stdin-capable frontend.
num = 36
print(boundaries(num))

Please input a number:  36


Adult


In [23]:
# Label every row's Age with its life-stage bucket.
df04['AgeGroup'] = df04['Age'].apply(boundaries)

In [25]:
# Spot-check the AgeGroup column on ten random rows; the fixed seed keeps the
# preview identical across re-runs.
df04.sample(10, random_state=42)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Origin,AgeGroup
2197,2198,15670753,Uvarova,614,Spain,Male,35,2,127283.78,1,True,True,31302.35,False,Latin,Adult
4281,4282,15721251,Watson,554,Spain,Female,41,4,112152.89,1,False,True,36242.19,False,Latin,Adult
449,450,15658169,Cook,778,Spain,Female,47,6,127299.34,2,True,False,124694.99,False,Latin,Midlife
6719,6720,15776629,Christie,650,France,Female,39,4,0.0,2,False,False,186275.7,False,Latin,Adult
8783,8784,15617052,Watson,782,France,Male,34,9,0.0,1,True,False,183021.06,True,Latin,Adult
5644,5645,15783522,Mitchell,738,Spain,Female,37,8,100565.94,1,True,True,128799.86,False,Latin,Adult
7724,7725,15673591,Oluchukwu,842,France,Male,44,3,141252.18,4,False,True,128521.16,True,Latin,Adult
2223,2224,15615575,Vial,722,France,Male,34,8,0.0,2,True,True,133447.49,False,Latin,Adult
7531,7532,15642063,Kelechi,692,France,Male,40,6,163505.16,1,False,False,90424.09,False,Latin,Adult
3552,3553,15730161,Marcelo,833,France,Female,39,3,0.0,2,True,False,1710.89,False,Latin,Adult


# Convert to .feather

In [26]:
# Save the frame built in this notebook (df04). `df03` is never defined here --
# the input is loaded as dfRaw -- so the old call raised NameError on a fresh kernel.
# NOTE(review): the file name follows the dfNN stage pattern (df03 in -> df04 out)
# instead of the previous 'df01.feather'; confirm downstream readers expect it.
df04.to_feather('00-Data/FeatherData/df04.feather')