# Exploratory Data Analysis

In [None]:
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import BASE_PATH

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Load the processed dataset; the first spreadsheet column becomes the index.
df = pd.read_excel(
    BASE_PATH.joinpath("data", "processed", "fully_cleaned_tongue_data.xlsx"),
    index_col=0,
)

In [None]:
# Report every feature that has missing values, followed by the overall total.
# Only NaN entries are counted here; 999 entries are not checked by this cell.
na_per_column = df.isna().sum()
for name, missing in na_per_column[na_per_column > 0].items():
    print("-" * 10 + name + "-" * 10)
    print(f"NA entries: {missing}")
print("-" * 15 + "total" + "-" * 15)
print(f"TOTAL NA entries: {na_per_column.sum()}")

Classify features by data type

In [None]:
# Group the columns by feature type using the helper imported from src.
feature_lists = get_feature_lists(df)
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    feature_lists[key]
    for key in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

Plot histograms and Q-Q plots for each numerical feature

In [None]:
# For every numerical feature, draw a histogram (with KDE) next to a normal
# Q-Q plot so its distribution can be inspected visually.
for feature in numerical_cols:
    # probplot does not skip NaN: missing values would be sorted into the
    # sample, shift the theoretical quantiles and turn the least-squares
    # reference line into NaN, so they are dropped before plotting.
    values = df[feature].dropna()

    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Histogram with KDE
    sns.histplot(values, kde=True, bins=50, color="blue", ax=hist_ax)
    hist_ax.set_title(f"Histogram of {feature}")
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel("Count")

    # Q-Q plot against the normal distribution
    stats.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title(f"Q-Q Plot of {feature}")

    # Adjust layout and display
    fig.tight_layout()
    plt.show()

Get numerical-numerical correlations

In [None]:
# Pairwise Spearman rank correlation between the numerical features.
# Values > 0.7 considered HIGH
num_df = df[numerical_cols]
correlation_matrix = num_df.corr(method="spearman")

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Matrix")
plt.show()

Get categorical-categorical correlation (including ordinal)

In [None]:
# Function to calculate Cramér's V with bias correction
# Code adopted from:
# https://github.com/manindersingh120996/chi2_and_CrammerV_Corelation/blob/main/CrammerrV_correlation.py
def cramerV(label, x):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    label, x : array-like (e.g. pandas Series)
        Paired categorical observations; they are cross-tabulated with
        ``pd.crosstab`` to build the contingency table.

    Returns
    -------
    float
        The bias-corrected Cramér's V. ``0.0`` is returned when the statistic
        is degenerate (either variable shows fewer than two levels in the
        table, or the corrected dimension term is zero). ``nan`` is returned
        when the contingency table is empty, i.e. there is nothing to measure.
    """
    confusion_matrix = pd.crosstab(label, x)
    r, k = confusion_matrix.shape

    # Empty table: chi2_contingency would raise, and no association exists
    # to report, so signal "undefined" instead of aborting the caller.
    if r == 0 or k == 0:
        return float("nan")
    # A variable with a single observed level cannot be associated with
    # anything; this also covers n == 1, which would divide by zero below.
    if r < 2 or k < 2:
        return 0.0

    # correction=False: Yates' continuity correction (scipy's default for 2x2
    # tables) shrinks chi2, so two perfectly associated binary variables
    # would otherwise score below 1.
    chi2 = chi2_contingency(confusion_matrix, correction=False)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n

    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # The corrected dimension collapses to zero when every observation is its
    # own level (k == n or r == n); report no association in that case.
    if denominator <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denominator))


def create_cramer_matrix(df, columns_of_interest):
    """Build the matrix of pairwise Cramér's V values for the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to compare.
    columns_of_interest : sequence of column labels
        Columns of ``df`` to include; they label both axes of the result.

    Returns
    -------
    pandas.DataFrame
        Square float matrix indexed and labelled by ``columns_of_interest``,
        with 1.0 on the diagonal and ``cramerV`` of each column pair elsewhere.
    """
    columns = list(columns_of_interest)
    # Start from all ones so the diagonal (a feature with itself) is 1.0.
    values = np.ones((len(columns), len(columns)), dtype=float)

    for i, col1 in enumerate(columns):
        for j, col2 in enumerate(columns[i + 1 :], start=i + 1):
            if col1 == col2:
                # Same label listed twice: keep the self-association of 1.0.
                continue
            # The statistic is symmetric in its two arguments, so each
            # unordered pair is computed once and mirrored across the diagonal.
            values[i, j] = values[j, i] = cramerV(df[col1], df[col2])

    return pd.DataFrame(values, index=columns, columns=columns)


# Cramér's V between every pair of categorical features
# (binary, nominal and ordinal alike).
cat_cols = binary_cols + nominal_cols + ordinal_cols
cramer_matrix = create_cramer_matrix(df, cat_cols)

# Draw the matrix as an annotated heatmap.
plt.figure(figsize=(100, 20))
cramer_ax = sns.heatmap(cramer_matrix, annot=True, fmt=".2f", cmap="coolwarm")
cramer_ax.set_title("Cramér's V Correlation Matrix")
plt.show()

Note that categorical-numerical correlations will contain NA values because of missing data, so they are not worth exploring until the data has been imputed.