# Exploratory Data Analysis

Import libraries/packages + cleaned data

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../'))
from src.feature_lists import get_feature_lists
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
# Load the cleaned dataset; the relative path is resolved against the current working directory
df = pd.read_parquet('../data/raw/Cleaned_ORN.parquet')

Find NAs

In [None]:
# Report every feature that contains NA or 999 entries
total_nines = 0
for col in df.columns:
    num_na = df[col].isna().sum()
    num_nines = (df[col] == 999).sum()  # some use 999 to indicate blank
    total_nines += num_nines
    # A column is listed when it has either kind of missing-value marker
    if num_na > 0 or num_nines > 0:
        print('-'*10 + col + '-'*10)
        print(f'NA entries: {num_na}')
        print(f'999 entries: {num_nines}')
print('-'*15 + 'total' + '-'*15)
print(f'TOTAL NA entries: {df.isna().sum().sum()}')
print(f'TOTAL 999 entries: {total_nines}')


Classify features by data type

In [None]:
# Column groupings are supplied by the helper imported from src
feature_lists = get_feature_lists()
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    feature_lists[group]
    for group in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

In [None]:
# Print how often each category occurs in every nominal feature
for col in nominal_cols:
    category_counts = df[col].value_counts()
    print(category_counts)

Plot histograms and Q-Q plots for each numerical feature

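Note: some columns still contain missing entries (to be imputed later). The Shapiro-Wilk test statistic comes out as NA whenever the tested values include missing entries, so it is only meaningful when computed on the non-missing values of a column.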

In [None]:
for feature in numerical_cols:
    # The Shapiro-Wilk test and the Q-Q fit both yield NaN when the input holds
    # NaN, so the analysis runs on the observed (non-missing) values only.
    observed = df[feature].dropna()
    if len(observed) < 3:
        # scipy.stats.shapiro requires at least three data points
        print(f"{feature}: skipped, only {len(observed)} non-missing value(s)")
        continue

    stat, p_val = stats.shapiro(observed)
    print(f"{feature} (n={len(observed)}) Shapiro-Wilk Test Statistic: {stat:.4f}, p-value: {p_val:.4f}")
    plt.figure(figsize=(12, 6))

    # Histogram with KDE
    plt.subplot(1, 2, 1)
    sns.histplot(observed, kde=True, bins=20, color='blue')
    plt.title(f'Histogram of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Count')

    # Q-Q Plot against the normal distribution
    plt.subplot(1, 2, 2)
    stats.probplot(observed, dist="norm", plot=plt)
    plt.title(f'Q-Q Plot of {feature}')

    # Adjust layout and display
    plt.tight_layout()
    plt.show()


Get numerical-numerical correlations

In [None]:
# Spearman rank correlation between every pair of numerical features
num_df = df[numerical_cols]
correlation_matrix = num_df.corr(method='spearman')
# Values > 0.7 considered HIGH

plt.figure(figsize=(8, 6))
spearman_ax = sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
spearman_ax.set_title("Correlation Matrix")
plt.show()

Get categorical-categorical correlation (including ordinal)

In [None]:
# Function to calculate Cramér's V with bias correction
# Code adapted from:
# https://github.com/manindersingh120996/chi2_and_CrammerV_Corelation/blob/main/CrammerrV_correlation.py
def cramerV(label, x):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Args:
        label: First categorical variable (anything ``pd.crosstab`` accepts).
        x: Second categorical variable, paired element-wise with ``label``.

    Returns:
        The bias-corrected statistic as a float. ``nan`` when the two
        variables share no observations; ``0.0`` when either variable has
        fewer than two observed levels or the corrected table is degenerate.
    """
    contingency = pd.crosstab(label, x)
    if contingency.size == 0:
        # No paired observations: chi2_contingency would raise on an empty table
        return np.nan

    r, k = contingency.shape
    if r < 2 or k < 2:
        # A constant variable carries no association; this also covers n == 1,
        # where the (n - 1) denominators below would be zero
        return 0.0

    # NOTE: chi2_contingency applies Yates' continuity correction to 2x2 tables by default
    chi2 = chi2_contingency(contingency)[0]
    n = contingency.to_numpy().sum()
    phi2 = chi2 / n
    # Bias correction: shrink phi^2 and the effective table dimensions
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator == 0:
        return 0.0
    return float(np.sqrt(phi2corr / denominator))

def create_cramer_matrix(df, columns_of_interest):
    """Build the pairwise Cramér's V matrix for the given categorical columns.

    Args:
        df: DataFrame holding the columns to compare.
        columns_of_interest: Iterable of column names in ``df``.

    Returns:
        A float DataFrame indexed and labelled by ``columns_of_interest``,
        with 1.0 on the diagonal and Cramér's V everywhere else.
    """
    columns = list(columns_of_interest)
    # Identity start: every column is perfectly associated with itself
    cramer_matrix = pd.DataFrame(np.eye(len(columns)), index=columns, columns=columns)
    # Cramér's V is symmetric, so each unordered pair is computed once and mirrored
    for position, col1 in enumerate(columns):
        for col2 in columns[position + 1:]:
            association = cramerV(df[col1], df[col2])
            cramer_matrix.loc[col1, col2] = association
            cramer_matrix.loc[col2, col1] = association
    return cramer_matrix.astype(float)
    
# Pairwise Cramér's V over all categorical variables (binary, nominal and ordinal)
cat_cols = binary_cols + nominal_cols + ordinal_cols
cramer_matrix = create_cramer_matrix(df, columns_of_interest=cat_cols)

# Render the association matrix as an annotated heatmap
plt.figure(figsize=(20, 8))
cramer_ax = sns.heatmap(data=cramer_matrix, cmap='coolwarm', fmt='.2f', annot=True)
cramer_ax.set_title("Cramér's V Correlation Matrix")
plt.show()

Get categorical-numerical correlation

Note that length, ischemic time, and ALB may have NA values due to missing data

In [None]:
def correlation_ratio(categories, values):
    """Return the correlation ratio (eta squared) of ``values`` grouped by ``categories``.

    The result is the between-group sum of squares divided by the total sum
    of squares. Pairs in which either the category or the value is missing
    are ignored, so a few missing entries do not turn the result into NaN.

    Args:
        categories: Category label of each observation.
        values: Numeric measurement of each observation, paired with ``categories``.

    Returns:
        The ratio as a float; ``nan`` when no complete pair exists and ``0.0``
        when the observed values have zero variance.
    """
    categories = np.asarray(categories)
    values = np.asarray(values, dtype=float)

    # Pairwise deletion of observations with a missing category or value
    complete = ~(pd.isna(categories) | np.isnan(values))
    categories = categories[complete]
    values = values[complete]
    if values.size == 0:
        return np.nan

    mean_total = values.mean()
    denominator = np.sum((values - mean_total) ** 2)
    if denominator == 0:
        # Constant values: nothing to explain, and 0/0 would otherwise give NaN
        return 0.0

    numerator = 0.0
    for cat in np.unique(categories):
        group = values[categories == cat]
        numerator += group.size * (group.mean() - mean_total) ** 2
    return float(numerator / denominator)

# Correlation ratio of every categorical column (rows) against every numerical column (columns)
corr_matrix = pd.DataFrame(
    {
        num_col: [correlation_ratio(df[cat_col], df[num_col]) for cat_col in cat_cols]
        for num_col in numerical_cols
    },
    index=cat_cols,
    columns=numerical_cols,
).astype(float)

# Scale the figure with the number of columns and rows being displayed
heatmap_size = (len(numerical_cols) * 1.5, len(cat_cols) * 0.7)
plt.figure(figsize=heatmap_size)
eta_ax = sns.heatmap(
    corr_matrix,
    cmap="YlOrRd",
    fmt=".2f",
    annot=True,
    cbar_kws={'label': 'Correlation Ratio (η²)'},
)
eta_ax.set_title('Correlation Ratio Heatmap (Categorical vs Numerical)')
eta_ax.set_xlabel('Numerical Columns')
eta_ax.set_ylabel('Categorical Columns')
plt.tight_layout()
plt.show()
