In [None]:
import sys
from pathlib import Path

# Make the project root (the parent of the notebook's working directory)
# importable so that project modules can be imported from this notebook.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import numpy as np
from src.data.dataset import BaseDataset

In [None]:
# Build the project dataset object; no constructor arguments are passed.
dataset = BaseDataset()

In [None]:
# Alias the dataset's `df` attribute; the cells below use it as a pandas
# DataFrame (`.corr()`, `.columns`, `.iloc`).
df = dataset.df

In [None]:
import pandas as pd

print("Correlation Matrix")
# NOTE(review): the full-matrix printout below is disabled, so the heading
# above is emitted with no matrix following it.
# print(df.corr())
# print()

def get_redundant_pairs(df):
    '''Return the column-label pairs on or below the correlation-matrix diagonal.

    For every column position ``i`` the result contains ``(cols[i], cols[j])``
    for each ``j <= i``: all self-pairs plus one orientation of every distinct
    pair of columns.
    '''
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    '''Return the `n` column pairs with the largest absolute correlation.

    Args:
        df: DataFrame whose pairwise column correlations (`df.corr()`) are ranked.
        n: Maximum number of pairs to return.

    Returns:
        Series indexed by (column, column) pairs holding the absolute
        correlation values, sorted in descending order. Self-pairs are
        excluded and each distinct pair appears once.
    '''
    corr_matrix = df.corr().abs()
    # Derive the labels to drop from the correlation matrix itself, so every
    # label handed to `drop` is guaranteed to exist in the unstacked index.
    labels_to_drop = list(get_redundant_pairs(corr_matrix))
    pairs = corr_matrix.unstack().drop(labels=labels_to_drop)
    # 'quicksort' is the documented spelling of this sort kind.
    ranked = pairs.sort_values(kind='quicksort', ascending=False)
    return ranked.iloc[:n]
def get_bottom_correlations(df, n=10):
    '''Return the `n` column pairs with the lowest signed correlation.

    Args:
        df: DataFrame whose pairwise column correlations (`df.corr()`) are ranked.
        n: Maximum number of pairs to return.

    Returns:
        Series indexed by (column, column) pairs holding the signed
        correlation values, sorted in ascending order (most negative first).
        Self-pairs are excluded and each distinct pair appears once.
    '''
    corr_matrix = df.corr()
    # Derive the labels to drop from the correlation matrix itself, so every
    # label handed to `drop` is guaranteed to exist in the unstacked index.
    labels_to_drop = list(get_redundant_pairs(corr_matrix))
    pairs = corr_matrix.unstack().drop(labels=labels_to_drop)
    # 'quicksort' is the documented spelling of this sort kind.
    ranked = pairs.sort_values(kind='quicksort', ascending=True)
    return ranked.iloc[:n]
# Report the ten strongest pairwise correlations by absolute value.
print("Top Absolute Correlations")
corr = get_top_abs_correlations(df, n=10)
print(corr)

In [None]:
# Report the ten lowest signed correlations. A bare assignment renders
# nothing in a notebook cell, so the result is printed explicitly.
print("Bottom Correlations")
corr = get_bottom_correlations(df, 10)
print(corr)

In [None]:
# Correlation matrix of the columns from position 3 onward; the first three
# columns are skipped (presumably non-feature columns — TODO confirm).
corr_df = df.iloc[:,3:].corr()

In [None]:
import matplotlib.pyplot as plt

# Minimum absolute correlation a feature must reach with at least one other
# feature in order to be included in the heatmap.
threshold = 0.8
thershold = threshold  # original (misspelled) name, kept in case other cells still read it

# Blank the diagonal on a copy: self-correlation must not count towards the
# threshold, and `corr_df` is left untouched so this cell can be re-run.
off_diag = corr_df.mask(np.eye(len(corr_df), dtype=bool))

# Take the absolute value BEFORE the max so that strong negative correlations
# qualify too, and reuse one mask for rows and columns so `ndf` stays square.
strong = (off_diag.abs().max(axis=1) >= threshold).to_numpy()
ndf = off_diag.loc[strong, strong]
# Put 1 back on the diagonal of the selected sub-matrix for display.
ndf = ndf.mask(np.eye(len(ndf), dtype=bool), 1)

fig, ax = plt.subplots(figsize=(12, 10))
im = ax.imshow(ndf, cmap='plasma')  # alternatives: RdBu, viridis

# Show all ticks and label them with the respective feature names.
ax.set_xticks(np.arange(len(ndf.columns)), labels=ndf.columns)
ax.set_yticks(np.arange(len(ndf.index)), labels=ndf.index)
ax.tick_params(axis='both', labelsize=14)

# Rotate the x tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

fig.colorbar(im, ax=ax)
# ax.set_title("Features correlation")
fig.tight_layout()

# Create the output folder if needed so that saving works on a fresh checkout.
fig_path = Path('../figs/feature_corr.png')
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path)
plt.show()
