Second Task: Exploratory Data Analysis - EDA

1. Univariate Analysis
2. Bivariate Analysis
3. Multivariate Analysis

# Librairies

In [2]:
# data manipulation
import pandas as pd
import numpy as np
import seaborn as sns
import datetime


# settings & warning handling
import warnings
pd.options.display.max_columns
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

In [3]:
# read in csv file as a DataFrame
data = pd.read_csv("Preprocessed_Data_Improved.csv")

In [4]:
# explore the first 5 rows
data.head()

Unnamed: 0.1,Unnamed: 0,CreditScore,FirstTimeHomebuyer,MSA,MIP,Units,OCLTV,DTI,OrigUPB,LTV,...,LoanPurpose,OrigLoanTerm,NumBorrowers,SellerName,ServicerName,EverDelinquent,MonthsDelinquent,MonthsInRepayment,Parsed_FirstPaymentDate,Parsed_MaturityDate
0,0,711.841009,0,16974,25,1,89,27.0,117000,89.0,...,P,360,2,FL,Major Banks,0,0,52,1999-02-01,2029-01-01
1,1,711.841009,0,19740,0,1,73,17.0,109000,73.0,...,N,360,1,FT,Non-Bank Servicers,0,0,144,1999-02-01,2029-01-01
2,2,711.841009,0,29940,0,1,75,16.0,88000,75.0,...,N,360,2,FL,Major Banks,0,0,67,1999-02-01,2029-01-01
3,3,711.841009,0,31084,0,1,76,14.0,160000,76.0,...,N,360,2,GM,Major Banks,0,0,35,1999-02-01,2029-01-01
4,4,711.841009,0,35644,0,1,78,18.0,109000,78.0,...,N,360,2,GM,Major Banks,0,0,54,1999-02-01,2029-01-01


In [None]:
# explore the last 5 rows
data.tail()

In [None]:
# size of the dataset
data.shape

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">1. Univariate Statistics</p> 

In [None]:
# checking the columns names
data.columns

In [None]:
# looking for more infos about the dataset
data.info()

We can use the .describe() method from pandas to see basic stats like count, mean, standard deviation, minimum, maximum, quantiles...

In [None]:
data.describe().T

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 
From the count column we can see that there is no missing values in our train dataset. This sounds good ;) <br>
Let's be sure about this assumption for the test, original and synthetic datasets. <br>
To achieve this, we will define our own function for univariate statistics with more informations such as the mode, skewness, kurtosis...
</b>
</p> 

In [None]:
# fonction to calculate univariate stats like pandas describe method
def univariate_stats(df):
    #df.drop('id', axis=1, inplace=True)
    output_df = pd.DataFrame(columns=['Count', 'Missing', 'Unique', 'Dtype', 'IsNumeric', 'Mode', 'Mean', 'Min', '25%', 'Median', '75%', 'Max', 'Std', 'Skew', 'Kurt'])
    
    for col in df:
        if is_numeric_dtype(df[col]):
            output_df.loc[col] = [df[col].count(), df[col].isnull().sum(), df[col].nunique(), df[col].dtype, is_numeric_dtype(df[col]), df[col].mode().values[0], df[col].mean(), df[col].min(), df[col].quantile(.25), df[col].median(), df[col].quantile(.75), df[col].max(), df[col].std(), df[col].skew(), df[col].kurt() ]
        else:
            output_df.loc[col] = [df[col].count(), df[col].isnull().sum(), df[col].nunique(), df[col].dtype, is_numeric_dtype(df[col]), df[col].mode().values[0], '-', '-', '-', '-', '-', '-', '-', '-', '-' ]

    return output_df.sort_values(by=['IsNumeric', 'Unique'], ascending=False)


pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


# Call the function to check univariate stats on the dataset
univariate_stats(data)

In [None]:
# List of numerical columns and categorical columns

numeric_cols = train.select_dtypes(include=['float64']).columns.tolist()
categ_cols   = train.select_dtypes(include=['object']).columns.tolist()
target       = 'Age'
numeric_cols
categ_cols

In [None]:
<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 
Let's visualize some graphics to gain more insights. <br>
        <br><br>
PS : Don't care about the graphics of the id column. We'll remove it in the feature engineering part ;)
</b>
</p>

In [None]:
def plot_histograms(df_train, df_test, original, synthetic,target_col, n_cols=3):
    n_rows = (len(df_train.columns) - 1) // n_cols + 1

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows))
    axes = axes.flatten()

    for i, var_name in enumerate(df_train.columns.tolist()):
        ax = axes[i]
        sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train')      # plot train data
        sns.distplot(original[var_name], kde=True, ax=ax, label='Original')   # plot original data
        sns.distplot(synthetic[var_name], kde=True, ax=ax, label='Synthetic')   # plot original data
        if var_name != target_col:
            sns.distplot(df_test[var_name], kde=True, ax=ax, label='Test')    # plot test data
        
        ax.set_title(f'{var_name} Distribution (Train vs Test)')
        ax.legend()

    plt.tight_layout()
    plt.show()
        
plot_histograms(train[numeric_cols], test[numeric_cols], synthetic, original[numeric_cols], target, n_cols=4)

In [None]:
def plot_distribution(df, hue, title='', drop_cols=[]):
    sns.set_style('whitegrid')

    cols = df.columns.drop([hue] + drop_cols)
    n_cols = 2
    n_rows = (len(cols) - 1) // n_cols + 1

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows))

    for i, var_name in enumerate(cols):
        row = i // n_cols
        col = i % n_cols

        ax = axes[row, col]
        sns.histplot(data=df, x=var_name, kde=True, ax=ax, hue=hue) # sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train')
        ax.set_title(f'{var_name} Distribution')

    fig.suptitle(f'{title} Distribution Plot by {hue}', fontweight='bold', fontsize=16)
    plt.tight_layout()
    plt.show()
    
plot_distribution(train, hue='Sex', title='Train data')

In [None]:
def plot_boxplot(df, hue, title='', drop_cols=[], n_cols=3):
    sns.set_style('whitegrid')

    cols = df.columns.drop([hue] + drop_cols)
    n_rows = (len(cols) - 1) // n_cols + 1

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows))

    for i, var_name in enumerate(cols):
        row = i // n_cols
        col = i % n_cols

        ax = axes[row, col]
        sns.boxplot(data=df, x=hue, y=var_name, ax=ax, showmeans=True, 
                    meanprops={"marker":"s","markerfacecolor":"white", "markeredgecolor":"blue", "markersize":"5"})
        ax.set_title(f'{var_name} by {hue}')
        ax.set_xlabel('')

    fig.suptitle(f'{title} Boxplot by {hue}', fontweight='bold', fontsize=16)
    plt.tight_layout()
    plt.show()
    
plot_boxplot(train, hue='Sex', title='Train data', n_cols=2)

In [None]:
def plot_violinplot(df, hue, title='', drop_cols=[], n_cols=2):
    sns.set_style('whitegrid')

    cols = df.columns.drop([hue] + drop_cols)
    n_rows = (len(cols) - 1) // n_cols + 1

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows))

    for i, var_name in enumerate(cols):
        row = i // n_cols
        col = i % n_cols

        ax = axes[row, col]
        sns.violinplot(data=df, x=hue, y=var_name, ax=ax, inner='quartile')
        ax.set_title(f'{var_name} Distribution')

    fig.suptitle(f'{title} Violin Plot by {hue}', fontweight='bold', fontsize=16)
    plt.tight_layout()
    plt.show()
    
plot_violinplot(train, hue='Sex', title='Train data', n_cols=2)

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">2. Bivariate Statistics</p> 

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 Recall: There are three types of bivariate analysis.</b><br>       
<ul> 
<li> <b>Numerical - Numerical: Pearson's Correlation</b> </li> <br>
The correlation represents the strength of a linear relationship between two numerical variables. If there is no correlation between the two variables, there is no tendency to change along with the values of the second quantity.  <br>  <br>
<li> <b>Categorical - Numerical: one-way ANOVA(3 + groups) or t-test (exactly 2 groups)</b>       </li> <br>
The ANOVA test is used to determine whether there is a significant difference among the averages of more than two groups that are statistically different from each other. <br><br>
<li> <b>Categorical - Categorical: Chi-square Test</b>       </li> <br>
It is calculated based on the difference between expected frequencies and the observed frequencies in one or more categories of the frequency table.
</ul>
</p> 

In [None]:
# this just an intermediate function that will be used in bivstats for one-way ANOVA
def anova(df, feature, label):
    import pandas as pd
    import numpy as np
    from scipy import stats
    
    groups = df[feature].unique()
    df_grouped = df.groupby(feature)
    group_labels = []
    for g in groups:
        g_list = df_grouped.get_group(g)
        group_labels.append(g_list[label])
        
    return stats.f_oneway(*group_labels)

# function to calculate bivariate stats; Pearson' correlation, p-value and one-way ANOVA
def bivstats(df, label):
    from scipy import stats
    import pandas as pd
    import numpy as np
    
    # Create an empty DataFrame to store output
    output_df = pd.DataFrame(columns=['Stat', '+/-', 'Effect size', 'p-value'])
    
    for col in df:
        if col != label:
            if df[col].isnull().sum() == 0:
                if is_numeric_dtype(df[col]):   # Only calculate r, 
                    r, p = stats.pearsonr(df[label], df[col])
                    output_df.loc[col] = ['r', np.sign(r), abs(round(r, 3)), round(p,6)]
                    
                else:
                    F, p = anova(df[[col, label]], col, label)
                    output_df.loc[col] = ['F', '', round(F, 3), round(p,6)]
                    
            else:
                output_df.loc[col] = [np.nan, np.nan, np.nan, np.nan]

    return output_df.sort_values(by=['Effect size', 'Stat'], ascending=[False, False])
 
pd.options.display.float_format = '{:.5f}'.format
bivstats(train, target)

In [None]:
def plot_heatmap(df, title):
    # Create a mask for the diagonal elements
    mask = np.zeros_like(df.astype(float).corr())
    mask[np.triu_indices_from(mask)] = True

    # Set the colormap and figure size
    colormap = plt.cm.RdBu_r
    plt.figure(figsize=(8, 8))

    # Set the title and font properties
    plt.title(f'{title} correlation of features', fontweight='bold', y=1.02, size=8)

    # Plot the heatmap with the masked diagonal elements
    sns.heatmap(df.astype(float).corr(), linewidths=0.1, vmax=1.0, vmin=-1.0, 
                square=True, cmap=colormap, linecolor='white', annot=True, annot_kws={"size": 14, "weight": "bold"},
                mask=mask)

plot_heatmap(train[numeric_cols + [target]], title='Train data')

In [None]:
<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 
Since the features in our datasets are strongly correlated to each others, applying PCA could be a good idea. <br>
        Let's visualize some graphics to gain more insights.
</b>
</p> 

In [None]:
def plot_scatter_with_fixed_col(df, fixed_col, hue=False, drop_cols=[], size=5, title=''):
    sns.set_style('whitegrid')
    
    if hue:
        cols = df.columns.drop([hue, fixed_col] + drop_cols)
    else:
        cols = df.columns.drop([fixed_col] + drop_cols)
    n_cols = 2
    n_rows = (len(cols) - 1) // n_cols + 1
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(size, size/n_cols*n_rows), sharex=False, sharey=False)
    fig.suptitle(f'{title} Set Scatter Plot with Target Column by {hue}', fontsize=20, fontweight='bold', y=1)

    for i, col in enumerate(cols):
        n_row = i // n_cols
        n_col = i % n_cols
        ax = axes[n_row, n_col]

        ax.set_xlabel(f'{col}', fontsize=12)
        ax.set_ylabel(f'{fixed_col}', fontsize=12)

        # Plot the scatterplot
        if hue:
            sns.scatterplot(data=df, x=col, y=fixed_col, hue=hue, ax=ax,
                            s=40, edgecolor='gray', alpha=0.3, palette='bright')
            ax.legend(title=hue, title_fontsize=12, fontsize=12) # loc='upper right'
        else:
            sns.scatterplot(data=df, x=col, y=fixed_col, ax=ax,
                            s=40, edgecolor='gray', alpha=0.3)

        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.set_title(f'{col}', fontsize=16)
    
    plt.tight_layout(pad=0.5, h_pad=0.5, w_pad=0.5)
    plt.show()
    
plot_scatter_with_fixed_col(train, fixed_col=target, hue='Sex', size=10, title='Train data')

In [None]:
# sns.pairplot(data=train, vars=['Age', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight'], hue='Sex')
# plt.show()