[Reference](https://medium.com/data-and-beyond/mastering-exploratory-data-analysis-eda-everything-you-need-to-know-7e3b48d63a95)

# Pure Understanding of Original Data

In [1]:
def column_summary(df, top_n=10):
    """Build a one-row-per-column profile of a DataFrame.

    Args:
        df: DataFrame to profile.
        top_n: Maximum number of distinct values reported per column in
            ``distinct_values_counts``. Defaults to 10.

    Returns:
        DataFrame with the columns ``col_name``, ``col_dtype``,
        ``num_of_nulls``, ``num_of_non_nulls``, ``num_of_distinct_values``
        and ``distinct_values_counts`` (dict of value -> count, most frequent
        first, holding at most ``top_n`` entries).
    """
    summary_data = []

    for col_name in df.columns:
        column = df[col_name]

        # value_counts() is already ordered by descending frequency, so
        # head(top_n) serves both the "few distinct values" case (nothing is
        # cut off) and the "top-N only" case without any re-sorting.
        distinct_values_counts = column.value_counts().head(top_n).to_dict()

        summary_data.append({
            'col_name': col_name,
            'col_dtype': column.dtype,
            'num_of_nulls': column.isnull().sum(),
            'num_of_non_nulls': column.notnull().sum(),
            'num_of_distinct_values': column.nunique(),
            'distinct_values_counts': distinct_values_counts,
        })

    return pd.DataFrame(summary_data)

# Example usage:
# `df` is the raw DataFrame under analysis; it is not created in this part of
# the file and must already exist when this cell runs.
# NOTE(review): `display` is presumably IPython/Jupyter's rich display helper - confirm.
summary_df = column_summary(df)
display(summary_df)

In [2]:
# Raise pandas' display limits so long / wide frames are not truncated when printed.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# First look at the raw data: leading rows, summary statistics, and the
# number of fully duplicated rows.
print(df.head())
print(df.describe())
print(df.duplicated().sum())

In [3]:
# Univariate analysis: one plot per numeric column.
numerical_columns = df.select_dtypes(include=[np.number]).columns

for column in numerical_columns:
    # Heuristic: more than 10 distinct values is treated as continuous,
    # anything else as discrete / ordinal.
    is_continuous = len(df[column].unique()) > 10

    plt.figure(figsize=(8, 6))

    if is_continuous:
        # Histogram with a KDE overlay for continuous variables.
        sns.histplot(df[column], kde=True)
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
    else:
        # One bar per distinct value for discrete variables.
        ax = sns.countplot(x=column, data=df)
        plt.title(f'Count of {column}')
        plt.xlabel(column)
        plt.ylabel('Count')

        # Write each bar's count just above the top of the bar.
        for p in ax.patches:
            bar_height = p.get_height()
            bar_centre = p.get_x() + p.get_width() / 2.
            ax.annotate(f'{bar_height:.0f}',
                        (bar_centre, bar_height),
                        ha='center', va='center',
                        xytext=(0, 5),
                        textcoords='offset points')

    plt.show()

# Transformation of Original Data

In [4]:
### Rename the columns to familiar names
# Only appropriate when the original column names do not have to be kept,
# i.e. there is no pre-existing naming format or the names do not follow a
# conventional one. Names would normally follow the feature mart / department
# convention so that they are understood across the board.

# Mapping from the normalised (lower snake_case) source names to the working names.
new_col_dict = {
    'pc': 'c_pc',
    'incm_typ': 'c_incm_typ',
    'gn_occ': 'c_occ',
    'num_prd': 'prod_nos',
    'casatd_cnt': 'casa_td_nos',
    'mthcasa': 'casa_bal_avg_mth',
    'maxcasa': 'casa_bal_max_yr',
    'mincasa': 'casa_bal_min_yr',
    'drvcr': 'dr_cr_ratio_yr',
    'mthtd': 'td_bal_avg',
    'maxtd': 'td_bal_max',
    'asset_value': 'asset_tot_val',
    'hl_tag': 'loan_home_tag',
    'al_tag': 'loan_auto_tag',
    'pur_price_avg': 'prop_pur_price',
    'ut_ave': 'ut_avg',
    'maxut': 'ut_max',
    'n_funds': 'funds_nos',
    'cc_ave': 'cc_out_bal_avg_mth',
    'max_mth_trn_amt': 'cc_txn_amt_max_mth',
    'min_mth_trn_amt': 'cc_txn_amt_min_mth',
    'avg_trn_amt': 'cc_txn_amt_avg_mth',
    'ann_trn_amt': 'cc_txn_amt_yr',
    'ann_n_trx': 'cc_txn_nos_yr',
}

# Work on a renamed copy so the raw `df` keeps its original column names:
# first lower-case and replace spaces with underscores, then apply the mapping.
df_l1 = (
    df.rename(columns=lambda name: name.lower().replace(' ', '_'))
      .rename(columns=new_col_dict)
)

In [5]:
# Box plot of casa_bal_max_yr split by c_incm_typ, drawn on a log-scaled y axis.
# The calls below act on pyplot's implicit current figure, so their order matters.
sns.set(style="whitegrid")

# Create the figure first so the boxplot is drawn onto it
plt.figure(figsize=(10, 6))  # Figure size passed to matplotlib
sns.boxplot(x='c_incm_typ', y='casa_bal_max_yr', data=df_l1)

# Axis labels, title and y-axis scale
plt.xlabel('Income Type')
plt.ylabel('casa_bal_max_yr')
plt.title('Boxplot of casa_bal_max_yr by Income Type')
# NOTE(review): a log axis cannot show non-positive values - confirm casa_bal_max_yr is always > 0.
plt.yscale('log')

# Final layout tweaks, then render
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent clipping of labels
plt.show()

In [6]:
# Cross-check missing values between the purchase price and the home-loan tag.
new_df = df_l1[['prop_pur_price', 'loan_home_tag']]

# Rows without a home-loan tag that nevertheless carry a purchase price.
null_loan_home = new_df[new_df['loan_home_tag'].isnull()]
not_null_count = len(null_loan_home[null_loan_home['prop_pur_price'].notnull()])
print("Number of rows where 'loan_home_tag' is null, but 'prop_pur_price' is not null:", not_null_count)

# The reverse direction: no purchase price, but a home-loan tag is present.
null_loan_home = new_df[new_df['prop_pur_price'].isnull()]
not_null_count = len(null_loan_home[null_loan_home['loan_home_tag'].notnull()])
print("Number of rows where 'prop_pur_price' is null, but 'loan_home_tag' is not null:", not_null_count)

# Summary statistics restricted to rows whose home-loan tag equals 1.
condition = new_df['loan_home_tag'] == 1
new_df[condition].describe()

In [7]:
# Column groups used for the correlation analysis.
numerical_cols = ['c_age', 'prod_nos',
                  'casa_td_nos', 'casa_bal_avg_mth', 'casa_bal_max_yr', 'casa_bal_min_yr',
                  'dr_cr_ratio_yr', 'td_bal_avg', 'td_bal_max', 'asset_tot_val',
                  'prop_pur_price', 'ut_avg', 'ut_max', 'funds_nos',
                  'cc_out_bal_avg_mth', 'cc_txn_amt_max_mth', 'cc_txn_amt_min_mth', 'cc_txn_amt_avg_mth',
                  'cc_txn_amt_yr', 'cc_txn_nos_yr', 'cc_lmt']
categorical_cols = ['c_edu_encoded', 'c_hse_encoded', 'c_pc', 'c_incm_typ', 'c_occ_encoded',
                    'loan_home_tag', 'loan_auto_tag']

# Pairwise correlation of the numerical columns of df_l2.
# NOTE(review): within this file df_l2 is first assigned in a later cell
# (the "Base Settings" cell); confirm the intended execution order.
correlation_matrix = df_l2[numerical_cols].corr()

# Create the heatmap, annotating every cell with its value to 2 decimals
plt.figure(figsize=(20, 16))  # Set the size of the plot
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")

# Set title
plt.title('Correlation Heatmap')

# Show the plot
plt.tight_layout()
plt.show()

# Largest off-diagonal correlation: keep only the strict upper triangle
# (k=1 excludes the diagonal) so each pair is considered once.
# NOTE(review): this takes the largest signed value, so a strong negative
# correlation would not be reported - confirm whether abs() was intended.
upper_triangular = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
max_correlation = upper_triangular.max().max()
print(f"Maximum pairwise correlation: {max_correlation:.2f}")

In [8]:
def corr_v(df_input, col1, col2):
    """Return a sentence reporting the correlation between two columns.

    Args:
        df_input: DataFrame holding both columns.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        A human-readable string containing the value of
        ``df_input[col1].corr(df_input[col2])``.
    """
    first, second = df_input[col1], df_input[col2]
    return f"Correlation value between {col1} and {col2} is: {first.corr(second)}"

# Print the pairwise correlation for a hand-picked set of column pairs in df_l2.
for _col_a, _col_b in (
    ('casa_bal_avg_mth', 'casa_bal_max_yr'),
    ('td_bal_avg', 'td_bal_max'),
    ('ut_avg', 'ut_max'),
    ('cc_txn_amt_max_mth', 'cc_txn_amt_yr'),
    ('cc_txn_amt_avg_mth', 'cc_txn_amt_yr'),
):
    print(corr_v(df_l2, _col_a, _col_b))

In [9]:
def iv_woe(data, target, bins=10, show_woe=False):
    """Compute Weight of Evidence (WoE) and Information Value (IV) per feature.

    Every column of ``data`` except ``target`` is treated as an independent
    variable. Numeric columns (dtype kind in ``'bifc'``) with more than 10
    distinct values are quantile-binned with ``pd.qcut``; all other columns
    are grouped on their raw values.

    Args:
        data: DataFrame containing the independent variables and ``target``.
        target: Name of the event column; it is summed per group to count
            events, so it is expected to hold 0/1 values.
        bins: Number of quantile bins requested for binned columns
            (duplicate bin edges are dropped).
        show_woe: When truthy, print the per-group WoE table of each variable.

    Returns:
        Tuple ``(newDF, woeDF)``: ``newDF`` has one row per variable with the
        columns ``Variable`` and ``IV``; ``woeDF`` stacks the per-group tables
        of all variables. Both are empty DataFrames when ``data`` has no
        column other than ``target``.
    """
    # Collected per variable and concatenated once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies everything each time.
    iv_frames, woe_frames = [], []

    # Extract column names
    cols = data.columns

    # Run WoE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        print("Processing variable:", ivars)
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        # Number of rows and number of events in each group (bin)
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']

        # Share of all events falling in each group. Counts are floored at 0.5
        # so that a group without events does not produce log(0) below.
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

        # Non-events per group and their share, with the same 0.5 floor.
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

        # WoE is the natural log of (% of events / % of non-events).
        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        iv_total = d['IV'].sum()
        print("Information value of " + ivars + " is " + str(round(iv_total, 6)))
        iv_frames.append(
            pd.DataFrame({"Variable": [ivars], "IV": [iv_total]}, columns=["Variable", "IV"])
        )
        woe_frames.append(d)

        # Show WoE table
        if show_woe:
            print(d)

    if not iv_frames:
        # Nothing to analyse: mirror the empty frames the caller expects.
        return pd.DataFrame(), pd.DataFrame()

    newDF = pd.concat(iv_frames, axis=0)
    woeDF = pd.concat(woe_frames, axis=0)
    return newDF, woeDF

# Column groups fed into the IV / WoE screen.
numerical_cols = [
    'c_age', 'prod_nos', 'casa_td_nos',
    'casa_bal_avg_mth', 'casa_bal_max_yr', 'casa_bal_min_yr',
    'dr_cr_ratio_yr', 'td_bal_avg', 'td_bal_max',
    'asset_tot_val', 'prop_pur_price', 'ut_avg',
    'ut_max', 'funds_nos', 'cc_out_bal_avg_mth',
    'cc_txn_amt_max_mth', 'cc_txn_amt_min_mth', 'cc_txn_amt_avg_mth',
    'cc_txn_amt_yr', 'cc_txn_nos_yr', 'cc_lmt',
]
categorical_cols = [
    'c_edu_encoded', 'c_hse_encoded', 'c_pc', 'c_incm_typ',
    'c_occ_encoded', 'loan_home_tag', 'loan_auto_tag',
]
dependent_col = ['c_seg_encoded']
all_cols = [*numerical_cols, *categorical_cols, *dependent_col]

# Score every feature against the segment flag, printing each WoE table.
IVDF, woeDF = iv_woe(df_l2[all_cols], target='c_seg_encoded', bins=10, show_woe=True)

# Strongest predictors (highest IV) first.
sorted_IVDF = IVDF.sort_values('IV', ascending=False)
display(sorted_IVDF)

In [10]:
# Base Settings
# NOTE(review): the '*_encoded' columns and 'c_seg_encoded' listed below are
# not created by the rename step that builds df_l1 in this file - confirm
# they are added to df_l1 before this cell runs.
df_l2 = df_l1.copy()
numerical_cols = ['c_age', 'prod_nos',
                  'casa_td_nos', 'casa_bal_avg_mth', 'casa_bal_max_yr', 'casa_bal_min_yr',
                  'dr_cr_ratio_yr', 'td_bal_avg', 'td_bal_max', 'asset_tot_val',
                  'prop_pur_price', 'ut_avg', 'ut_max', 'funds_nos',
                  'cc_out_bal_avg_mth', 'cc_txn_amt_max_mth', 'cc_txn_amt_min_mth', 'cc_txn_amt_avg_mth',
                  'cc_txn_amt_yr', 'cc_txn_nos_yr', 'cc_lmt']
categorical_cols = ['c_edu_encoded', 'c_hse_encoded', 'c_pc', 'c_incm_typ', 'c_occ_encoded',
                    'loan_home_tag', 'loan_auto_tag']
dependent_col = ['c_seg_encoded']
independent_col = numerical_cols + categorical_cols
all_cols = numerical_cols + categorical_cols + dependent_col

# Settings Train / Test Split.
# We will not be doing Train / Validation / Test split as this is for feature importance only.
from sklearn.model_selection import train_test_split

# Splitting into Training and Holdout Test Sets
# Ensure stratification for now. We will adjust the ratio only later if required.
X_train, X_test, y_train, y_test = train_test_split(
    df_l2[independent_col], df_l2[dependent_col],
    stratify=df_l2[dependent_col], test_size=0.2, random_state=88)

# Standard Scaler for Numerical Columns (when necessary) Eg. Logistic Regression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_cols)],
    remainder='passthrough')  # Pass through categorical features unchanged

# The transformer outputs the scaled numerical columns first and the
# passthrough columns after them, which is the order of independent_col.
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=independent_col)

# Only transform the hold-out set: the scaler must keep the mean / std learned
# from the training data. Re-fitting on X_test would scale the two sets
# differently and leak test statistics into the preprocessing.
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=independent_col)

# Flatten the single-column target frames into 1-D arrays for the estimators.
y_train_transformed = y_train.values.ravel()
y_test_transformed = y_test.values.ravel()

In [11]:
# Function for getting feature importance sorted.
def feature_importance_sorted(classification_model_input, X_train, y_train, feature_importance_input=None):
    """Rank features by importance, highest first.

    Args:
        classification_model_input: Unfitted estimator exposing ``fit`` and
            ``feature_importances_``; it is fitted on ``X_train`` / ``y_train``.
            Pass ``None`` to rank precomputed importances instead.
        X_train: Training features; ``X_train.columns`` supplies the feature names.
        y_train: Training target, used only when a model is given.
        feature_importance_input: Precomputed importances, one per column of
            ``X_train`` in column order. Used only when no model is given.

    Returns:
        DataFrame with the columns ``Feature``, ``Importance`` and ``rank``
        (1 = highest importance value).

    Raises:
        ValueError: If neither a model nor importances are supplied, or if the
            number of importances differs from the number of columns.
    """
    if classification_model_input is not None:
        classification_model_input.fit(X_train, y_train)
        feature_importances = classification_model_input.feature_importances_
    elif feature_importance_input is not None:
        feature_importances = feature_importance_input
    else:
        raise ValueError(
            "Provide either classification_model_input or feature_importance_input."
        )

    # zip() would silently drop features on a length mismatch, so check first.
    feature_importances = list(feature_importances)
    if len(feature_importances) != len(X_train.columns):
        raise ValueError(
            f"Got {len(feature_importances)} importances for "
            f"{len(X_train.columns)} features."
        )

    feature_importances_sorted = sorted(
        zip(X_train.columns, feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for feature_name, importance in feature_importances_sorted:
        print(f"Feature {feature_name}: {importance}")

    df_feature_importances = pd.DataFrame(feature_importances_sorted, columns=['Feature', 'Importance'])
    df_feature_importances['rank'] = range(1, len(df_feature_importances) + 1)
    return df_feature_importances

# All estimators receive the 1-D target array (y_train_transformed) built
# alongside the train / test split, so every model sees the same target shape.
# The tree-based models are seeded with the same random_state (88) used for
# the split, so the importance rankings are reproducible between runs.

# Decision Tree Classifier Feature Importance
from sklearn.tree import DecisionTreeClassifier
dtc_fi = feature_importance_sorted(DecisionTreeClassifier(random_state=88), X_train, y_train_transformed)

# Random Forest Classifier Feature Importance
from sklearn.ensemble import RandomForestClassifier
rfc_fi = feature_importance_sorted(RandomForestClassifier(random_state=88), X_train, y_train_transformed)

# XGB Feature Importance
import xgboost as xgb
xgb_fi = feature_importance_sorted(xgb.XGBClassifier(random_state=88), X_train, y_train_transformed)

# Logistic Regression coefficients as importance.
# Fit on the standardised features: coefficients of unscaled columns depend on
# each column's units and cannot be compared with one another.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train_transformed_df, y_train_transformed)
feature_importances = lr.coef_[0]  # Assuming binary classification
# NOTE(review): coefficients are ranked by signed value, so strongly negative
# coefficients end up last - confirm whether ranking by magnitude is wanted.
lr_fi = feature_importance_sorted(None, X_train_transformed_df, y_train_transformed, feature_importances)

In [12]:
# Give each model's importance / rank columns a model-specific name so the
# four tables can sit side by side after merging.
dtc_fi = dtc_fi.rename(columns=dict(Importance='imp_dtc', rank='rank_dtc'))
rfc_fi = rfc_fi.rename(columns=dict(Importance='imp_rfc', rank='rank_rfc'))
xgb_fi = xgb_fi.rename(columns=dict(Importance='imp_xgb', rank='rank_xgb'))
lr_fi = lr_fi.rename(columns=dict(Importance='imp_lr', rank='rank_lr'))

# Left-join everything onto the decision-tree table, one model at a time,
# keyed on the feature name.
merged_df = dtc_fi.merge(rfc_fi, on='Feature', how='left')
merged_df = merged_df.merge(xgb_fi, on='Feature', how='left')
merged_df = merged_df.merge(lr_fi, on='Feature', how='left')

merged_df

In [13]:
# Split the customers on the segment flag: 1 goes to aff_df, 0 to norm_df.
aff_df = df_l2.loc[df_l2['c_seg_encoded'].eq(1)]
norm_df = df_l2.loc[df_l2['c_seg_encoded'].eq(0)]

# Also keep a 20% sample of norm_df. Per the original author's note, norm_df
# is about 5x bigger than aff_df; not much change is anticipated from using
# the smaller sample, it is just being tried.
norm_df_2 = norm_df.sample(frac=0.2, random_state=88)

from scipy.stats import ttest_ind
def individual_t_test(df_1, df_2, listoffeatures, alpha_val):
    """Run an independent two-sample t-test per feature between two frames.

    The test is run with ``equal_var=False`` (Welch's t-test). Missing values
    are dropped per feature before testing: ``ttest_ind`` propagates NaN by
    default, which would turn the statistic and p-value into NaN and label
    the feature 'Insignificant' without any warning.

    Args:
        df_1: DataFrame holding the first group.
        df_2: DataFrame holding the second group.
        listoffeatures: Names of the continuous columns to test.
        alpha_val: Significance level; a p-value below it is 'Significant'.

    Returns:
        DataFrame with one row per feature and the columns ``feature``,
        ``t_stat`` and ``p_value`` (both formatted as strings with 3
        decimals) and ``significance``.
    """
    results = []
    for feature in listoffeatures:
        fea_1 = df_1[feature].dropna()
        fea_2 = df_2[feature].dropna()

        t_stat, p_val = ttest_ind(fea_1, fea_2, equal_var=False)

        results.append({
            'feature': feature,
            't_stat': f'{t_stat:.3f}',
            'p_value': f'{p_val:.3f}',
            'significance': 'Significant' if p_val < alpha_val else 'Insignificant',
        })

    return pd.DataFrame(results)

# Per-feature t-tests at alpha = 0.05: aff_df against the full norm_df.
# The result is a bare expression, so it is only shown when it is the last
# statement of a notebook cell.
individual_t_test(aff_df, norm_df, numerical_cols, 0.05)

# Same tests against the 20% sample norm_df_2.
individual_t_test(aff_df, norm_df_2, numerical_cols, 0.05)

In [14]:
# Re-create the working frame and the column groups, then draw one box plot
# per numerical feature, split by the segment flag c_seg_encoded.
df_l2 = df_l1.copy()
numerical_cols = ['c_age', 'prod_nos',
                  'casa_td_nos', 'casa_bal_avg_mth', 'casa_bal_max_yr', 'casa_bal_min_yr',
                  'dr_cr_ratio_yr', 'td_bal_avg', 'td_bal_max', 'asset_tot_val',
                  'prop_pur_price', 'ut_avg', 'ut_max', 'funds_nos',
                  'cc_out_bal_avg_mth', 'cc_txn_amt_max_mth', 'cc_txn_amt_min_mth', 'cc_txn_amt_avg_mth',
                  'cc_txn_amt_yr', 'cc_txn_nos_yr', 'cc_lmt']
categorical_cols = ['c_edu_encoded', 'c_hse_encoded', 'c_pc', 'c_incm_typ', 'c_occ_encoded',
                    'loan_home_tag', 'loan_auto_tag']
dependent_col = ['c_seg_encoded']
independent_col = numerical_cols + categorical_cols
all_cols = numerical_cols + categorical_cols + dependent_col

for feature in numerical_cols:
    # A new figure per feature; the seaborn call draws onto it.
    plt.figure(figsize=(8, 6))
    boxplot = sns.boxplot(x='c_seg_encoded', y=feature, data=df_l2)
    plt.title(f'Box Plot of {feature} by AFFLUENT / NORMAL')

    # Switch to a log y-axis when the feature's maximum exceeds 1000.
    # NOTE(review): a log axis cannot show non-positive values - confirm these
    # features have no zeros / negatives that matter for the plot.
    if df_l2[feature].max() > 1000:
        boxplot.set_yscale('log')

    plt.xlabel('Customer Type')
    plt.ylabel(feature)
    plt.show()