In [None]:
import numpy as np
import scipy.stats

## **Correlative Tests**

In [None]:
import sys
import os
sys.path.append('../utilities')
from scipy import stats
import U_config as cfg

# Lookup tables mapping each variable name to the cfg location of its processed data.
# `both_vars` holds sea-ice concentration plus the ERA5 fields; `all_vars` extends it
# with the two drift components and sea-ice thickness.
both_vars = {
    "sic": cfg.SIC_PROCESSED,
    **dict.fromkeys(
        (
            "sea_surface_temperature",
            "2m_temperature",
            "mean_sea_level_pressure",
            "geopotential",
            "surface_net_solar_radiation",
            "surface_solar_radiation_upwards",
            "surface_solar_radiation_downwards",
            "10m_u_component_of_wind",
            "10m_v_component_of_wind",
        ),
        cfg.ERA5_PROCESSED,
    ),
}
all_vars = {
    **both_vars,
    "u_drift": cfg.SID_U_PROCESSED,
    "v_drift": cfg.SID_V_PROCESSED,
    "sit": cfg.SIT_PROCESSED,
}

# Matching accumulators: one independent, initially empty list per variable name.
data_subset = {name: [] for name in both_vars}
data_all = {name: [] for name in all_vars}

def gather_data(yr_start, yr_end, month):
    """Build one flattened sample column per variable for a calendar month.

    For every variable, the ``{year}_{month}.npy`` array of each year in
    ``yr_start..yr_end`` (inclusive) is loaded and cropped with ``[:100, :100]``.
    The yearly arrays are stacked, reduced with ``scipy.stats.mode`` (default
    axis, i.e. across the stacked years) and flattened into a DataFrame column.

    Parameters
    ----------
    yr_start : int
        First year to load (inclusive).
    yr_end : int
        Last year to load (inclusive).
    month : int
        Calendar month number; zero-padded to two digits in the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``SIC, SST, T2M, MSL, GEO, S_NET, S_UP, S_DOWN, 10u, 10v,
        uDRIFT, vDRIFT, SIT`` (in that order), one row per flattened grid cell.

    Raises
    ------
    ValueError
        If ``yr_end`` is smaller than ``yr_start`` (nothing would be loaded).
    """
    if yr_end < yr_start:
        raise ValueError(f"yr_end ({yr_end}) must not be smaller than yr_start ({yr_start})")

    month_tag = f"{month:02d}"

    # (variable name, processed-data root from cfg, column label in the result)
    sources = (
        ("sic", cfg.SIC_PROCESSED, "SIC"),
        ("sea_surface_temperature", cfg.ERA5_PROCESSED, "SST"),
        ("2m_temperature", cfg.ERA5_PROCESSED, "T2M"),
        ("mean_sea_level_pressure", cfg.ERA5_PROCESSED, "MSL"),
        ("geopotential", cfg.ERA5_PROCESSED, "GEO"),
        ("surface_net_solar_radiation", cfg.ERA5_PROCESSED, "S_NET"),
        ("surface_solar_radiation_upwards", cfg.ERA5_PROCESSED, "S_UP"),
        ("surface_solar_radiation_downwards", cfg.ERA5_PROCESSED, "S_DOWN"),
        ("10m_u_component_of_wind", cfg.ERA5_PROCESSED, "10u"),
        ("10m_v_component_of_wind", cfg.ERA5_PROCESSED, "10v"),
        ("u_drift", cfg.SID_U_PROCESSED, "uDRIFT"),
        ("v_drift", cfg.SID_V_PROCESSED, "vDRIFT"),
        ("sit", cfg.SIT_PROCESSED, "SIT"),
    )

    columns = {}
    for var, root, label in sources:
        # ERA5 variables live in a per-variable sub-folder of the ERA5 root;
        # every other product is read straight from its own root.
        folder = os.path.join(root, var) if root == cfg.ERA5_PROCESSED else root

        # Iterate over the function's own year arguments rather than the
        # notebook-level `start` / `end` globals.
        yearly = [
            np.load(os.path.join(folder, f'{year}_{month_tag}.npy'))[:100, :100]
            for year in range(yr_start, yr_end + 1)
        ]

        # NOTE(review): the per-cell mode across years is kept as in the original;
        # for continuous fields whose yearly values never repeat, every value ties
        # and scipy picks one of them - confirm a mode (not mean/median) is intended.
        columns[label] = stats.mode(np.array(yearly))[0].ravel()

    return pd.DataFrame(columns)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Per-month results, keyed by month name.
sample_set = {}
monthly_linear_important = {}
monthly_non_linear_important = {}

# Year range of the analysis; kept as notebook-level names because other cells read them.
start = 2011
end = 2019

# NOTE: plot_correlation and plot_annual_vals are defined in cells further down,
# so those cells have to be executed before this one on a fresh kernel.
# Only the first two months are processed here.
for i, month in enumerate(months[:2]):
    sample_set[month] = gather_data(start, end, i + 1)
    pearson, spearman = plot_correlation(sample_set[month], month, start, end)
    monthly_linear_important[month] = pearson
    monthly_non_linear_important[month] = spearman

plot_annual_vals(monthly_linear_important, monthly_non_linear_important, start, end)

In [None]:
def _save_correlation_heatmap(data_frame, method, title, out_path):
    """Compute one correlation matrix, draw its lower triangle as a heatmap, save it, and return the matrix."""
    matrix = data_frame.corr(method)
    plt.figure(figsize=(8, 6))
    # Mask the diagonal and upper triangle so each variable pair is shown once.
    # Builtin `bool` is used because the `np.bool` alias is missing in NumPy 1.24-1.26.
    mask = np.triu(np.ones_like(matrix, dtype=bool))
    sns.heatmap(matrix, mask=mask, vmin=-1, vmax=1, annot=True, cmap="RdBu", fmt='.2f', annot_kws={"fontsize": 10})
    plt.title(title, fontsize=12)
    plt.savefig(out_path)
    return matrix


def plot_correlation(data_frame, month, start, end):
    """Plot Spearman and Pearson correlation heatmaps for one month and rank variables against SIC.

    Two figures are created and written under ``../plots/correlation/``:
    ``{start}-{end}-{month}_non_linear.png`` (Spearman) and
    ``{start}-{end}-{month}_linear.png`` (Pearson).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Sample table whose columns are correlated pairwise; must contain a
        ``'SIC'`` column.
    month : str
        Month label used in the figure titles and file names.
    start, end :
        Year range shown in the figure titles and file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(linear_list, non_linear_list)``: the ``'SIC'`` column of the Pearson
        and Spearman matrices respectively, each sorted in descending order.
    """
    # Non-linear correlations:
    s_correlation = _save_correlation_heatmap(
        data_frame,
        "spearman",
        f"Spearman Correlation for {month} {start}-{end}\n(Non-Linear Correlation)",
        f"../plots/correlation/{start}-{end}-{month}_non_linear.png",
    )

    # Linear correlation
    correlation = _save_correlation_heatmap(
        data_frame,
        "pearson",
        f"Pearson Correlation for {month} {start}-{end}\n(Linear Correlation)",
        f"../plots/correlation/{start}-{end}-{month}_linear.png",
    )

    non_linear_list = s_correlation[['SIC']].sort_values(by='SIC', ascending=False)
    linear_list = correlation[['SIC']].sort_values(by='SIC', ascending=False)
    return linear_list, non_linear_list

In [None]:
def _monthly_sic_table(by_month):
    """Stack per-month SIC rankings into a table with one row per month."""
    table = pd.DataFrame()
    for month_name, ranking in by_month.items():
        # Each ranking becomes one column; transposed below so months are rows.
        table[month_name] = ranking
    return table.T


def plot_annual_vals(linear, non_linear, yr_start, yr_end):
    """Plot month-by-variable heatmaps of the SIC correlation coefficients.

    Two figures are created and written under ``../plots/correlation/``:
    ``{yr_start}-{yr_end}_annual_pearson.png`` and
    ``{yr_start}-{yr_end}_annual_spearman.png``.

    Parameters
    ----------
    linear : dict
        Month name -> single-column SIC ranking of Pearson coefficients, as
        returned by ``plot_correlation``.
    non_linear : dict
        Month name -> single-column SIC ranking of Spearman coefficients.
    yr_start, yr_end :
        Year range shown in the figure titles and file names.

    Notes
    -----
    The function reads only its ``linear`` / ``non_linear`` arguments and plots
    exactly the months present in them, in insertion order. A partially filled
    result set is therefore plotted instead of raising ``KeyError`` for the
    months that have not been computed.
    """
    vals_linear = _monthly_sic_table(linear)
    vals_non_linear = _monthly_sic_table(non_linear)

    plt.figure(figsize=(8, 4))
    heatmap = sns.heatmap(vals_linear, vmin=-1, vmax=1, annot=True, cmap='RdBu', cbar_kws={'label': 'Pearson Correlation Coefficient'}, fmt='.2f')
    heatmap.set_title(f'SIC-Variable Linear Correlation for {yr_start}-{yr_end}', fontdict={'fontsize':12}, pad=16)
    plt.savefig(f'../plots/correlation/{yr_start}-{yr_end}_annual_pearson.png')

    plt.figure(figsize=(8, 4))
    heatmap = sns.heatmap(vals_non_linear, vmin=-1, vmax=1, annot=True, cmap='RdBu', cbar_kws={'label': 'Spearman Correlation Coefficient'}, fmt='.2f')
    heatmap.set_title(f'SIC-Variable Non-Linear Correlation for {yr_start}-{yr_end}', fontdict={'fontsize':12}, pad=16)
    plt.savefig(f'../plots/correlation/{yr_start}-{yr_end}_annual_spearman.png')

In [None]:
# Draw the annual summary heatmaps from the per-month SIC rankings collected earlier.
plot_annual_vals(
    linear=monthly_linear_important,
    non_linear=monthly_non_linear_important,
    yr_start=start,
    yr_end=end,
)