## Prep

In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import warnings
import seaborn as sns
from scipy.stats import pearsonr


# Notebook-wide settings: silence every warning, and let pandas display
# data frames without truncating the number of columns or rows.
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the processed data set (path is relative to the notebook's working
# directory); every later cell derives its frames from `dat`.
dat = pd.read_csv(
    filepath_or_buffer="../data_processed/Y_BaselineX_processed_full.csv",
)

In [None]:
# Outcome (Y) frames for the different follow-up periods.

# Whole follow-up: an untouched copy of the input data.
dat_tot = dat.copy()

# First period: add `event_10y`, which is 1 when cvda == 1 and cvdatt is at
# most 3650, and 0 otherwise (rows with missing values compare False -> 0).
dat_10y = dat.assign(
    event_10y=lambda frame: np.where(
        frame['cvda'].eq(1) & frame['cvdatt'].le(3650), 1, 0
    )
)

# Later period: keep only rows with cvdatt above 3650 and expose `cvda`
# under the name `event_over_10y`.
dat_over_10y = (
    dat.copy()
    .loc[lambda frame: frame['cvdatt'] > 3650]
    .rename(columns={'cvda': 'event_over_10y'})
)

In [None]:
# Pick the variables that go into the correlation plots and give them
# short display labels. `site` and `race` are selected but keep their names.
plot_columns = [
    'event_over_10y', 'F1_PC2', 'A_S1FAV', 'A_S1PAI', 'G_bla_rk',
    'chdiet', 'chphysact', 'income',
    'site', 'race',
]
display_labels = {
    'event_over_10y': 'Y  ',
    'F1_PC2': 'Nb SES',
    'A_S1FAV': 'Nb unf food store',
    'A_S1PAI': 'Nb phys act fac',
    'G_bla_rk': 'Nb rac seg (Black)',
    'chdiet': 'Ind nut categ',
    'chphysact': 'Ind phys act categ',
    'income': 'Family income',
}
dat_plt = dat_over_10y[plot_columns].rename(columns=display_labels)

## Plot

In [None]:
# Integer-typed copy of the plotting frame (every column is cast to int).
# NOTE(review): the cells visible below keep working on `dat_plt`, not on
# `df_ordinal` - confirm whether this copy is meant to be used.
df_ordinal = dat_plt.astype(dtype=int)

In [None]:
# Spearman rank correlation between every pair of plotted variables,
# drawn as an annotated heatmap (coefficients shown with two decimals).
corr_matrix = dat_plt.corr(method='spearman')

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Spearman's Rank-Order Correlation Matrix")
plt.show()

In [None]:
# Spearman correlation heatmap with a "*" on every cell whose correlation is
# significant at the 5 % level.
from itertools import combinations

from scipy.stats import spearmanr

correlation_matrix = dat_plt.corr(method='spearman')

# p-values are indexed by the columns of the correlation matrix so that the
# positional annotation below always lines up with the plotted cells.
# The diagonal is left as NaN and is therefore never starred.
variables = correlation_matrix.columns
p_values = pd.DataFrame(index=variables, columns=variables, dtype=float)

# The p-values must come from the same test as the plotted coefficients
# (Spearman, not Pearson). The statistic is symmetric, so each pair is
# computed once and mirrored. nan_policy='omit' drops incomplete pairs,
# matching the pairwise-complete handling of DataFrame.corr.
for col1, col2 in combinations(variables, 2):
    _, p = spearmanr(dat_plt[col1], dat_plt[col2], nan_policy='omit')
    p_values.loc[col1, col2] = p
    p_values.loc[col2, col1] = p

# Create a heatmap of the correlation matrix.
# The figure and font sizes are deliberately huge (poster-sized output).
# Note that sns.set changes the global plotting style for later cells too.
plt.figure(figsize=(80, 60))
sns.set(font_scale=14)
ax = sns.heatmap(correlation_matrix, annot=False, fmt=".2f", cmap="Greens", cbar=True,
                 xticklabels=correlation_matrix.columns,
                 yticklabels=correlation_matrix.columns)

# Mark the significant cells (p < 0.05); heatmap cell (i, j) is centred at
# x = j + 0.5, y = i + 0.5. NaN p-values compare False and stay unmarked.
significant = p_values.lt(0.05).to_numpy()
for i, j in zip(*np.nonzero(significant)):
    ax.text(j + 0.5, i + 0.5, "*", ha="center", va="center", fontsize=115)

plt.show()
