# Exploratory Data Analysis (EDA)

### Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### UDFs

In [2]:
def count_plot(x, dataframe, ax=None, **kwargs):
    """Draw a count plot of column ``x`` with bars ordered by frequency.

    Parameters
    ----------
    x : str
        Name of the column in ``dataframe`` whose values are counted.
    dataframe : pandas.DataFrame
        Data containing column ``x``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; seaborn uses the current axes when omitted.
    **kwargs
        Only ``mode`` is consulted. ``mode='horizontal'`` puts the categories
        on the y-axis; any other value (or no ``mode``) gives vertical bars.

    Returns
    -------
    None
    """
    # Most frequent category first.
    order = dataframe[x].value_counts().index

    # .get() tolerates unrelated keywords instead of raising KeyError.
    if kwargs.get('mode') == 'horizontal':
        drawn_ax = sns.countplot(y=x, data=dataframe, ax=ax, order=order)
    else:
        drawn_ax = sns.countplot(x=x, data=dataframe, ax=ax, order=order)

    # Remove the frame of the axes that was actually drawn on. plt.box()
    # acts on pyplot's *current* axes, which can be a different subplot
    # when ``ax`` is passed explicitly.
    drawn_ax.set_frame_on(False)
    return None

    
def pie_plot(df_col, fig_size, title):
    """Show a pie chart of ``df_col`` on a new figure.

    Parameters
    ----------
    df_col : pandas.Series
        Wedge sizes are taken from ``df_col.values`` and the legend entries
        from ``df_col.index``.
    fig_size : tuple
        Figure size passed to ``plt.subplots``.
    title : str
        Title placed above the chart.

    Returns
    -------
    None
    """
    _, pie_ax = plt.subplots(figsize=fig_size)
    pie_ax.pie(df_col.values, startangle=90, shadow=False, autopct='%1.2f%%')
    # Equal aspect ratio keeps the pie circular.
    pie_ax.axis('equal')
    pie_ax.legend(labels=df_col.index, bbox_to_anchor=(1.05, 1))
    pie_ax.set_title(title)
    plt.show()


def stacked_vBar_plot(dataframe, value, index, column, xlabel, fig_size, scale='linear', with_percent=True):
    """Show a 100%-stacked vertical bar chart of ``column`` shares per ``index`` group.

    Rows are counted per (``index``, ``column``) combination and each count is
    converted to a percentage of its ``index`` group's row total.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.
    value : str
        Column passed to ``pivot_table`` as the values to count.
    index : str
        Column whose values become the bars (x-axis groups).
    column : str
        Column whose values become the stacked segments.
    xlabel : str
        Label for the x-axis.
    fig_size : tuple
        Figure size in inches.
    scale : str, default 'linear'
        Matplotlib scale name applied to the y-axis.
        NOTE(review): this parameter used to be accepted but ignored; it is
        assumed to mean the y-axis scale -- confirm.
    with_percent : bool, default True
        Annotate every non-empty segment with its percentage.

    Returns
    -------
    None

    Notes
    -----
    The legend title ('NPS Type') and the y-axis label are hard-coded.
    """
    counts = dataframe.pivot_table(values=[value], index=[index], columns=[column], aggfunc=len, margins=True)
    # Divide each cell by its row total (the trailing 'All' margin column).
    row_percent = counts.div(counts.iloc[:, -1], axis=0).mul(100, axis=0).round(2)
    # Drop the margin column and the margin row before plotting.
    shares = row_percent.iloc[:, :-1].drop('All')

    # Take the labels from the columns actually plotted. MultiIndex levels
    # are not pruned by slicing, so reading ``levels[1]`` and removing 'All'
    # depends on level ordering and fails when 'All' is absent.
    legend_labels = shares.columns.get_level_values(1).tolist()

    ax = shares.plot.bar(stacked=True)
    ax.figure.set_size_inches(fig_size)
    ax.grid(False)
    ax.set_yscale(scale)
    ax.legend(labels=legend_labels, bbox_to_anchor=(1.05, 1), title='NPS Type')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('%GT Count of NPS_Type')

    if with_percent:
        for patch in ax.patches:
            width, height = patch.get_width(), patch.get_height()
            # Skip empty or NaN segments: they have no visible bar to label.
            if not height > 0:
                continue
            x, y = patch.get_xy()
            ax.annotate('{:.2f}%'.format(height), (x + width/8, y + height/2))
    ax.set_frame_on(False)
    plt.show()
    return None


def pdf_distribution_plots(df, features, target):
    """Show per-feature KDE curves for the rows with target 0 and target 1.

    One subplot is drawn per feature on a three-column grid; each subplot
    overlays the density of the ``target == 0`` rows and the ``target == 1``
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``features`` and ``target``.
    features : sequence of str
        Columns to plot.
    target : str
        Name of the column compared against 0 and 1.

    Returns
    -------
    None

    Notes
    -----
    Calls ``sns.set_style('whitegrid')``, which changes the global seaborn
    style for later plots as well.
    """
    n_cols = 3
    # Ceiling division gives exactly enough rows. The previous
    # ``len/3 + len%3`` formula allocated a spare row whenever the remainder
    # was 2 (e.g. 5 features -> 3 rows), shrinking every subplot.
    nrow = -(-len(features) // n_cols)

    t0 = df.loc[df[target] == 0]
    t1 = df.loc[df[target] == 1]

    sns.set_style('whitegrid')
    fig = plt.figure(figsize=(15,10))

    for indx, feature in enumerate(features):
        ax = fig.add_subplot(nrow, n_cols, indx+1)
        # Draw on the subplot explicitly rather than on the current axes.
        sns.kdeplot(t0[feature], label="0", legend=True, ax=ax)
        sns.kdeplot(t1[feature], label="1", legend=True, ax=ax)
        ax.set_ylabel('Density', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=15)
        ax.legend(loc='best')

    plt.subplots_adjust(left=None, bottom=None, right=None, top=None,wspace= 0.3, hspace=0.5)
    plt.show()
    return None


def clipping(dataframe, num_cols):
    """Return a copy of ``dataframe`` with ``num_cols`` clipped to the IQR fences.

    For each column the values are limited to
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    num_cols : iterable of str
        Numeric columns to clip.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with the listed columns clipped. Missing values
        stay missing.
    """
    df_copy = dataframe.copy()
    for col in num_cols:
        # Series.quantile skips NaN. np.percentile returns NaN as soon as the
        # column holds one missing value, which silently disabled clipping.
        p25, p75 = df_copy[col].quantile([0.25, 0.75])
        iqr = p75 - p25
        df_copy[col] = df_copy[col].clip(lower=p25 - 1.5*iqr, upper=p75 + 1.5*iqr)
    return df_copy


def get_category(df, col, binsnum, labels, qcut = False):
    """Return a copy of ``df`` with a binned version of ``col`` in ``<col>_CAT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    col : str
        Numeric column to bin.
    binsnum : int or sequence
        Passed as ``q`` to ``pd.qcut`` or as ``bins`` to ``pd.cut``.
    labels : sequence or False
        Bin labels, passed through to ``pd.qcut`` / ``pd.cut``.
    qcut : bool, default False
        ``True`` uses quantile-based bins (``pd.qcut``); ``False`` uses
        equal-width bins (``pd.cut``).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the object-dtype column ``col + '_CAT'``. An existing
        column of that name is overwritten, so the call can be repeated.
    """
    if qcut:
        binned = pd.qcut(df[col], q=binsnum, labels=labels)  # quantile cut
    else:
        binned = pd.cut(df[col], bins=binsnum, labels=labels)  # equal-length cut

    name = col + '_CAT'
    # Assign positionally instead of joining on the index: a join duplicates
    # rows when the index has repeated labels and raises when the ``_CAT``
    # column already exists (e.g. when a notebook cell is re-run).
    return df.assign(**{name: binned.astype(object).to_numpy()})

### Configurations

In [3]:
# Raise the display limits so wide or long frames are shown in full.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', 50,
)

### Load Datasets

In [4]:
# Load the application records and preview the first rows.
application_df = pd.read_csv('./data/application_record.csv')

application_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [5]:
# Load the credit records (one row per ID and MONTHS_BALANCE) and preview them.
credit_df = pd.read_csv('./data/credit_record.csv')

credit_df.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [6]:
# Report (rows, columns) of both raw datasets.
print("Application dataset shape: {}".format(application_df.shape))
print("Credit dataset shape: {}".format(credit_df.shape))

Application dataset shape: (438557, 18)
Credit dataset shape: (1048575, 3)


In [7]:
# Number of distinct IDs; a value below the row count means duplicated IDs exist.
print("Unique IDs in application dataset: {}".format(len(set(application_df['ID']))))

Unique IDs in application dataset: 438510


In [8]:
# Number of distinct IDs that have at least one credit record.
print("Unique IDs in credit dataset: {}".format(len(set(credit_df['ID']))))

Unique IDs in credit dataset: 45985


In [9]:
# IDs present in both datasets, i.e. the rows an inner join on ID can keep.
print("Common IDs in both datasets: {}".format(len(set(application_df['ID']).intersection(set(credit_df['ID'])))))

Common IDs in both datasets: 36457


##### Drop Duplicated IDs

In [23]:
ids = application_df["ID"]

# Every row whose ID occurs more than once (all occurrences), grouped by ID.
app_df_dup = application_df[ids.duplicated(keep=False)].sort_values("ID")

app_df_dup.shape

(94, 18)

In [24]:
# Keep only the first row seen for each ID.
application_df = application_df[~application_df.duplicated(subset='ID')]

print("Application dataset shape: {}".format(application_df.shape))

Application dataset shape: (438510, 18)


In [None]:
# Missing values per column of the application data.
application_df.isna().sum()

In [None]:
# Distinct values per column of the application data.
application_df.nunique()

In [None]:
# Missing values per column of the credit data.
credit_df.isna().sum()

In [None]:
# Distinct values per column of the credit data.
credit_df.nunique()

### Target Label Creation

To identify *high-risk* and *low-risk* credit users, the following rule is used.

- **High Risk (Class 1):** If during any month, user is late on payments by 30 days or more.
- **Low Risk (Class 0):** If, in every month, the user has paid off the balance, had no loan, or was only 1-29 days past due.

In [None]:
# Monthly flag: STATUS '0', 'C' and 'X' count as low risk (0); every other
# status marks that month as high risk (1).
credit_df['target_status'] = np.where(credit_df['STATUS'].isin(['0', 'C', 'X']), 0, 1)

# target=1 (high risk) iff there is at least one month where user is late on payments by 30 days or more
# Use the groupby .max() reduction directly: passing the builtin max to
# agg() is deprecated in recent pandas.
target_df = credit_df.groupby('ID', as_index=False)['target_status'].max()

target_df.head()

In [None]:
# Number of IDs in each target class.
target_df.groupby('target_status')['ID'].count()

In [None]:
# Share of each target class (fractions summing to 1).
target_df['target_status'].value_counts(normalize=True)

In [None]:
# Attach the target label to the application rows; the inner join keeps only
# IDs that appear in both frames.
merged_df = application_df.merge(target_df, on='ID', how='inner')

merged_df.head()

In [None]:
# Store the 0/1 target as a categorical column.
merged_df['target_status'] = merged_df['target_status'].astype("category")

In [None]:
# (rows, columns) of the merged dataset.
merged_df.shape

### Data Pre-processing

In [None]:
# Distinct IDs in the merged data; should equal the row count if IDs are unique.
merged_df['ID'].nunique()

In [None]:
# Missing values per column of the merged data.
merged_df.isna().sum()

In [None]:
# Frequency of each occupation (value_counts leaves out missing values by default).
merged_df['OCCUPATION_TYPE'].value_counts()

In [None]:
# Label missing occupations as 'Other'. The result is assigned back because
# calling fillna(inplace=True) on a column selection is chained assignment:
# under pandas Copy-on-Write it updates a temporary object and leaves
# merged_df unchanged.
merged_df['OCCUPATION_TYPE'] = merged_df['OCCUPATION_TYPE'].fillna(value='Other')

merged_df.isna().sum()

In [None]:
# Column dtypes of the merged data.
merged_df.dtypes

##### Feature Engineering

In [None]:
# Split total income into three quantile-based bands, stored in AMT_INCOME_TOTAL_CAT.
merged_df = get_category(
    merged_df,
    col='AMT_INCOME_TOTAL',
    binsnum=3,
    labels=["low","medium", "high"],
    qcut=True,
)

merged_df.head()

##### Data Transformation

In [None]:
# Education levels present in the data.
merged_df['NAME_EDUCATION_TYPE'].unique()

In [None]:
# The raw DAYS_* values in the data preview are negative; flip the sign.
# NOTE: this cell is not idempotent -- re-running it flips the signs again.
merged_df['DAYS_BIRTH'] = -merged_df['DAYS_BIRTH']

# After the sign flip, floor DAYS_EMPLOYED at 0 so no negative durations remain.
merged_df['DAYS_EMPLOYED'] = (-merged_df['DAYS_EMPLOYED']).clip(lower=0)

# Ordinal education code. Series.map with an explicit dict yields a numeric
# column (levels missing from the dict become NaN), whereas replace() on a
# string column relies on an implicit downcast that recent pandas deprecates.
education_order = {
    'Lower secondary': 0,
    'Secondary / secondary special': 1,
    'Incomplete higher': 2,
    'Higher education': 3,
    'Academic degree': 4,
}
merged_df['NAME_EDUCATION_TYPE_ENCODED'] = merged_df['NAME_EDUCATION_TYPE'].map(education_order)

In [None]:
# Columns treated as categorical (including the binary flags).
cat_features = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'FLAG_MOBIL',
    'FLAG_WORK_PHONE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'OCCUPATION_TYPE',
    'AMT_INCOME_TOTAL_CAT',
]

# Columns treated as numeric.
num_features = [
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
]

# Numeric columns first, then the categorical ones.
all_features = [*num_features, *cat_features]

##### Statistics

In [None]:
# Cast the family-member count to int (the data preview shows it as a float, e.g. 2.0).
merged_df['CNT_FAM_MEMBERS'] = merged_df['CNT_FAM_MEMBERS'].astype(int)

In [None]:
# Summary statistics of the numeric features, formatted with thousands
# separators and two decimals. Formatting column by column with Series.map
# avoids DataFrame.applymap, which is deprecated in recent pandas.
merged_df[num_features].describe().apply(lambda column: column.map('{:,.2f}'.format))

In [None]:
# Convert the categorical features to the pandas "category" dtype, then
# summarise them with describe().
merged_df[cat_features] = merged_df[cat_features].astype("category")
merged_df[cat_features].describe()

##### Outlier Handling

In [None]:
handle_outliers = 'yes'

# Clip the numeric features to their IQR fences only when the switch is
# 'yes'; any other value leaves merged_df as it is.
if handle_outliers == 'yes':
    merged_df = clipping(dataframe=merged_df, num_cols=num_features)

### Data Visualization - Univariate Analysis

##### Target label

In [None]:
feature = 'target_status'
unique_id = 'ID'

# Left panel: counts per class. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### CODE_GENDER

In [None]:
feature = 'CODE_GENDER'
unique_id = 'ID'

# Left panel: counts per gender code. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### FLAG_OWN_CAR

In [None]:
feature = 'FLAG_OWN_CAR'
unique_id = 'ID'

# Left panel: counts per car-ownership flag. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### FLAG_OWN_REALTY

In [None]:
feature = 'FLAG_OWN_REALTY'
unique_id = 'ID'

# Left panel: counts per realty-ownership flag. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### NAME_INCOME_TYPE

In [None]:
feature = 'NAME_INCOME_TYPE'
unique_id = 'ID'

# Left panel: horizontal counts per income type. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.1)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0], mode='horizontal')

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

# The explode tuple needs one entry per wedge (five here).
axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0, 0, 0.1, 0.2),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### NAME_EDUCATION_TYPE

In [None]:
feature = 'NAME_EDUCATION_TYPE'
unique_id = 'ID'

# Left panel: horizontal counts per education level. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.01)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0], mode='horizontal')

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

# The explode tuple needs one entry per wedge (five here).
axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0, 0, 0.1, 0.2),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### NAME_FAMILY_STATUS

In [None]:
feature = 'NAME_FAMILY_STATUS'
unique_id = 'ID'

# Left panel: horizontal counts per family status. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.01)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0], mode='horizontal')

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

# The explode tuple needs one entry per wedge (five here).
axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0, 0, 0.1, 0.2),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### NAME_HOUSING_TYPE

In [None]:
feature = 'NAME_HOUSING_TYPE'
unique_id = 'ID'

# Left panel: horizontal counts per housing type. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
fig1.subplots_adjust(hspace=0.4, wspace=0)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0], mode='horizontal')

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

# The explode tuple needs one entry per wedge (six here).
axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0, 0, 0.3, 0.2, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### FLAG_MOBIL

In [None]:
feature = 'FLAG_MOBIL'
unique_id = 'ID'

# Left panel: counts per mobile-phone flag. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### FLAG_WORK_PHONE

In [None]:
feature = 'FLAG_WORK_PHONE'
unique_id = 'ID'

# Left panel: counts per work-phone flag. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### FLAG_PHONE

In [None]:
feature = 'FLAG_PHONE'
unique_id = 'ID'

# Left panel: counts per phone flag. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### FLAG_EMAIL

In [None]:
feature = 'FLAG_EMAIL'
unique_id = 'ID'

# Left panel: counts per e-mail flag. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### OCCUPATION_TYPE

In [None]:
feature = 'OCCUPATION_TYPE'
unique_id = 'ID'

# Left panel: horizontal counts per occupation. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
fig1.subplots_adjust(hspace=0.4, wspace=0)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0], mode='horizontal')

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

axes1[1].pie(
    target_status_df['Count'].values,
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

##### AMT_INCOME_TOTAL_CAT

In [None]:
feature = 'AMT_INCOME_TOTAL_CAT'
unique_id = 'ID'

# Left panel: counts per income band. Right panel: the same distribution as shares.
fig1, axes1 = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
fig1.subplots_adjust(hspace=0.4, wspace=0.4)

count_plot(x=feature, dataframe=merged_df, ax=axes1[0])

# Frequency table: row count and percentage per value, largest first.
target_status_df = (
    merged_df.groupby([feature])[[unique_id]]
    .count()
    .rename(columns={unique_id: 'Count'})
    .assign(Percentage=lambda table: round(100*table['Count']/table['Count'].sum(), 2))
    .sort_values(by=['Percentage'], ascending=False)
)

# The explode tuple needs one entry per wedge (three here).
axes1[1].pie(
    target_status_df['Count'].values,
    explode=(0, 0, 0.1),
    autopct='%1.2f%%',
    shadow=False,
    startangle=90,
)
axes1[1].axis('equal')
axes1[1].legend(labels=target_status_df['Count'].index, bbox_to_anchor=(1.05, 1))
axes1[1].title.set_text('Normalized Frequency of\n{}'.format(feature))

plt.show()

#### Histograms

In [None]:
# One histogram with a KDE curve per numeric feature, on a two-column grid.
# Rows needed = ceil(number of features / 2).
nrow = (len(num_features) + 1) // 2

fig = plt.figure(figsize=(15,15))

for i, feature in enumerate(num_features):
    ax = fig.add_subplot(nrow, 2, i+1)
    sns.histplot(x=feature, data=merged_df, kde=True, ax=ax)
    ax.set_xlabel(feature, fontsize=12)

# Tighten the spacing between subplots, then render.
fig.tight_layout()
plt.show()

In [None]:
# One box plot per numeric feature, on a two-column grid.
# Rows needed = ceil(number of features / 2).
nrow = (len(num_features) + 1) // 2

fig = plt.figure(figsize=(15,15))

for i, feature in enumerate(num_features):
    ax = fig.add_subplot(nrow, 2, i+1)
    sns.boxplot(x=feature, data=merged_df, ax=ax)
    ax.set_xlabel(feature, fontsize=12)

# Tighten the spacing between subplots, then render.
fig.tight_layout()
plt.show()

### Data Visualization - Bivariate Analysis

#### Histograms

In [None]:
# Histogram with KDE curves per numeric feature, split by target class.
# Rows needed = ceil(number of features / 2).
nrow = (len(num_features) + 1) // 2

fig = plt.figure(figsize=(20,20))

for i, feature in enumerate(num_features):
    ax = fig.add_subplot(nrow, 2, i+1)
    sns.histplot(x=feature, hue='target_status', data=merged_df, kde=True, ax=ax)
    ax.set_xlabel(feature, fontsize=12)

# Tighten the spacing between subplots, then render.
fig.tight_layout()
plt.show()

In [None]:
# KDE curves of every numeric feature for target 0 vs. target 1.
# pdf_distribution_plots() calls plt.show() itself, so no extra call is needed.
pdf_distribution_plots(df=merged_df, features=num_features, target='target_status')

##### Boxplots

In [None]:
# Box plot per numeric feature with one box per target class.
# Rows needed = ceil(number of features / 2).
nrow = (len(num_features) + 1) // 2

fig = plt.figure(figsize=(20,10))

for i, feature in enumerate(num_features):
    ax = fig.add_subplot(nrow, 2, i+1)
    sns.boxplot(x=feature, y="target_status", data=merged_df, ax=ax)
    ax.set_xlabel(feature, fontsize=12)

# Tighten the spacing between subplots, then render.
fig.tight_layout()
plt.show()

In [None]:
# Income distribution within each education level.
fig = plt.figure(figsize=(15,8))

sns.boxplot(x="AMT_INCOME_TOTAL", y="NAME_EDUCATION_TYPE", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Income distribution for each car-ownership flag value.
fig = plt.figure(figsize=(15,8))

sns.boxplot(x="AMT_INCOME_TOTAL", y="FLAG_OWN_CAR", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Income distribution within each income type.
fig = plt.figure(figsize=(15,8))

sns.boxplot(x="AMT_INCOME_TOTAL", y="NAME_INCOME_TYPE", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Income distribution within each occupation.
fig = plt.figure(figsize=(20,10))

sns.boxplot(x="AMT_INCOME_TOTAL", y="OCCUPATION_TYPE", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Income distribution within each derived income band.
fig = plt.figure(figsize=(15,8))

sns.boxplot(x="AMT_INCOME_TOTAL", y="AMT_INCOME_TOTAL_CAT", data=merged_df, ax=fig.gca())

plt.show()

##### Correlation

In [None]:
# Pairwise correlations of the numeric features: heat map first, table after.
corr = merged_df[num_features].corr()

fig = plt.figure(figsize=(10, 8))
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    ax=fig.gca(),
)
plt.show()

corr

##### Scatter Plots

In [None]:
# Number of children against family size.
fig = plt.figure()

ax = sns.scatterplot(x="CNT_CHILDREN", y="CNT_FAM_MEMBERS", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Days employed against days since birth.
fig = plt.figure()

sns.scatterplot(x="DAYS_EMPLOYED", y="DAYS_BIRTH", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Same scatter, restricted to rows where DAYS_EMPLOYED is exactly 0.
fig = plt.figure()

sns.scatterplot(
    x="DAYS_EMPLOYED",
    y="DAYS_BIRTH",
    data=merged_df.loc[merged_df['DAYS_EMPLOYED'] == 0],
    ax=fig.gca(),
)

plt.show()

In [None]:
# Total income against days since birth.
fig = plt.figure()

sns.scatterplot(x="AMT_INCOME_TOTAL", y="DAYS_BIRTH", data=merged_df, ax=fig.gca())

plt.show()

##### Clustered Bar Charts

In [None]:
# Counts per gender code, split by target class.
fig = plt.figure()

sns.countplot(x="CODE_GENDER", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per car-ownership flag, split by target class.
fig = plt.figure()

sns.countplot(x="FLAG_OWN_CAR", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per realty-ownership flag, split by target class.
fig = plt.figure()

sns.countplot(x="FLAG_OWN_REALTY", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per income type, split by target class.
fig = plt.figure(figsize=(8,5))

sns.countplot(x="NAME_INCOME_TYPE", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per derived income band, split by target class.
fig = plt.figure()

sns.countplot(x="AMT_INCOME_TOTAL_CAT", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per education level, split by target class.
fig = plt.figure(figsize=(12,5))

sns.countplot(x="NAME_EDUCATION_TYPE", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per family status, split by target class.
fig = plt.figure(figsize=(12,5))

sns.countplot(x="NAME_FAMILY_STATUS", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per housing type, split by target class.
fig = plt.figure(figsize=(12,5))

sns.countplot(x="NAME_HOUSING_TYPE", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per work-phone flag, split by target class.
fig = plt.figure(figsize=(8,5))

sns.countplot(x="FLAG_WORK_PHONE", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per phone flag, split by target class.
fig = plt.figure(figsize=(8,5))

sns.countplot(x="FLAG_PHONE", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per e-mail flag, split by target class.
fig = plt.figure(figsize=(8,5))

sns.countplot(x="FLAG_EMAIL", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

In [None]:
# Counts per occupation (horizontal bars), split by target class.
fig = plt.figure(figsize=(15,10))

sns.countplot(y="OCCUPATION_TYPE", hue="target_status", data=merged_df, ax=fig.gca())

plt.show()

### Data Visualization - Multivariate Analysis

#### Bubble Chart

In [None]:
# Income against days employed; marker size and colour both encode family size.
fig = plt.figure(figsize=(20,10))

sns.scatterplot(
    x="AMT_INCOME_TOTAL",
    y="DAYS_EMPLOYED",
    hue="CNT_FAM_MEMBERS",
    size="CNT_FAM_MEMBERS",
    sizes=(20, 200),
    data=merged_df.loc[merged_df['DAYS_EMPLOYED'] >= 0],
    ax=fig.gca(),
)

plt.show()

In [None]:
# Income against days employed for rows with DAYS_EMPLOYED > 0; marker size
# and colour both encode the ordinal education code.
fig = plt.figure(figsize=(20,10))

employed_rows = merged_df['DAYS_EMPLOYED'] > 0
sns.scatterplot(
    x="AMT_INCOME_TOTAL",
    y="DAYS_EMPLOYED",
    hue="NAME_EDUCATION_TYPE_ENCODED",
    size="NAME_EDUCATION_TYPE_ENCODED",
    sizes=(20, 300),
    data=merged_df[employed_rows],
    ax=fig.gca(),
)

plt.show()

#### Pairplot

In [None]:
# Pairwise relationships between the remaining columns, coloured by target class.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that call only produced an extra empty figure).
sns.pairplot(
    data=merged_df.drop(columns=['ID', 'NAME_EDUCATION_TYPE_ENCODED']),
    hue="target_status",
    markers=["o", "D"],
)

plt.show()