In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Convert the Binary Flag Columns (Exited, IsActiveMember, HasCrCard) into Object Columns

In [27]:
def convert_to_categorical(df, columns=("Exited", "IsActiveMember", "HasCrCard")):
    """
    Return a new dataframe in which 1 and 0 are replaced by "yes" and "no",
    respectively, in the given binary flag columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or iterable of str, optional
        Names of the 0/1 columns to convert. Defaults to the Exited,
        IsActiveMember and HasCrCard columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` where the converted columns have been moved to the
        end (in the order given by ``columns``) and hold "yes"/"no" strings.
        Values other than 1 or 0 are not in the mapping and become NaN.

    Raises
    ------
    KeyError
        If any of ``columns`` is missing from ``df``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(columns, str):
        columns = [columns]
    columns = list(columns)

    to_string_map = {1: "yes", 0: "no"}
    # Drop first and re-assign afterwards so the converted columns end up
    # as the right-most columns of the result.
    new_df = df.drop(columns=columns)
    new_df = new_df.assign(**{col: df[col].map(to_string_map) for col in columns})
    return new_df

In [21]:
# Load the full training dataset from the Excel workbook into a DataFrame.
# The path is relative to the notebook's working directory.
full_df = pd.read_excel("./Test and Submission Data/train.xlsx")

In [28]:
# Recode the 0/1 flag columns (Exited, IsActiveMember, HasCrCard) as "yes"/"no"
# strings so that they are treated as object columns and therefore left out of
# the covariance matrix.
for_var_df = full_df.pipe(convert_to_categorical)

In [32]:
# Report the (rows, columns) size of the raw training frame ...
print(full_df.shape)
# ... and preview its first rows as the cell's displayed output.
full_df.head()

(164911, 11)


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1,0,181449.97,0
1,627,France,Male,33.0,1,0.0,2,1,1,49503.5,0
2,678,France,Male,40.0,10,0.0,2,1,0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1,1,84560.88,0
4,716,Spain,Male,33.0,5,0.0,2,1,1,15068.83,0


In [29]:
# Preview the converted frame: the three flag columns now hold "yes"/"no"
# and sit at the end of the column list.
for_var_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Exited,IsActiveMember,HasCrCard
0,668,France,Male,33.0,3,0.0,2,181449.97,no,no,yes
1,627,France,Male,33.0,1,0.0,2,49503.5,no,yes,yes
2,678,France,Male,40.0,10,0.0,2,184866.69,no,no,yes
3,581,France,Male,34.0,2,148882.54,1,84560.88,no,yes,yes
4,716,Spain,Male,33.0,5,0.0,2,15068.83,no,yes,yes


# Summary Statistics of Variables

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; the object-typed columns are omitted from this table.
for_var_df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
count,164911.0,164911.0,164911.0,164911.0,164911.0,164911.0
mean,656.468914,38.124278,5.020599,55481.138955,1.55442,112579.798396
std,80.091065,8.86502,2.806237,62819.233625,0.547171,50290.229424
min,350.0,18.0,0.0,0.0,1.0,11.58
25%,597.0,32.0,3.0,0.0,1.0,74639.87
50%,659.0,37.0,5.0,0.0,2.0,117948.0
75%,710.0,42.0,7.0,119948.09,2.0,155155.25
max,850.0,92.0,10.0,250898.09,4.0,199992.48


In [44]:
# A frame holding only the columns whose dtype is object.
cat_vars_df = for_var_df[["Exited", "IsActiveMember", "HasCrCard", "Geography", "Gender"]]
# Print the number of rows in each level of every categorical column.
# size() counts the rows of each group directly, whereas count() reports the
# non-null values of the *other* columns, which mislabels the result and would
# undercount if the column picked happened to contain missing values.
for col in cat_vars_df.columns:
    print(cat_vars_df.groupby(by=col).size())

Exited
no     130002
yes     34909
Name: HasCrCard, dtype: int64
IsActiveMember
no     82836
yes    82075
Name: HasCrCard, dtype: int64
HasCrCard
no      40582
yes    124329
Name: IsActiveMember, dtype: int64
Geography
France     94132
Germany    34587
Spain      36192
Name: IsActiveMember, dtype: int64
Gender
Female    71836
Male      93075
Name: IsActiveMember, dtype: int64
