# Pipeline

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import sklearn as skl
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.linear_model import RidgeCV
import sklearn.cluster
from sklearn.metrics import pairwise_distances 
import scipy.spatial 

from sklearn.manifold import TSNE

import seaborn as sns
sns.set_theme(style="darkgrid")

import warnings
warnings.filterwarnings('ignore')

In [2]:
from plotting_functions import plot_violinplot_of_measure

In [3]:
# load data
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on a
# machine where that exact file exists; consider a configurable data directory.
data_path = '/glade/u/home/cassiacai/marine_heatwaves/notebooks/one_val_data.csv'
# The cleaning cell reads row 0 of every column as a stringified list "[v0, v1, ...]".
df = pd.read_csv(data_path) # shape: (1,13)

In [None]:
# cleaning
# Row 0 of every column of `df` holds one string of the form "[v0, v1, ...]".
# Strip the brackets, split on ', ' and convert every token to float.
measures = list(df.columns)
print(measures)

parsed_measures = {
    name: [float(token) for token in df[name][0][1:-1].split(', ')]
    for name in measures
}

# One row per measure. The number of events comes from the data itself instead of
# the previously hard-coded 1203, so a regenerated CSV with a different event count
# still loads. Unequal list lengths raise ValueError here.
measure_np = np.array([parsed_measures[name] for name in measures], dtype=float)

# create attributes dataframe (one row per event, one column per measure)
attributes_df = pd.DataFrame({name: measure_np[row] for row, name in enumerate(measures)})
# Position of the `when_small` / `when_large` step within the event, as a fraction of
# its duration (the +1 turns the stored offset into a 1-based step count).
# NOTE(review): assumes `when_*` are 0-based offsets from the first timestep — confirm.
attributes_df['small_in_lifetime'] = (attributes_df.when_small + 1) / attributes_df.duration
attributes_df['large_in_lifetime'] = (attributes_df.when_large + 1) / attributes_df.duration
print(attributes_df.shape) # shape: (1203, 15) for the original CSV

### Visualize the distribution of measures

In [None]:
# remove heatwaves that are anomalously long
# (the cut-off is the 99.5th percentile of `duration`, despite the "999" in the name)
rem_perc_999_duration = np.percentile(attributes_df['duration'], 99.5)
attributes_df = attributes_df.loc[attributes_df['duration'] <= rem_perc_999_duration]
print(attributes_df.shape)

# remove length 1 heatwaves
attributes_df = attributes_df.loc[attributes_df['duration'] > 1]

# separate by season, keyed on the `month` column
atdf_wi = attributes_df[attributes_df['month'].isin([12, 1, 2])]  # winter 12, 1, 2
atdf_sp = attributes_df[attributes_df['month'].between(3, 5)]     # spring 3, 4, 5
atdf_su = attributes_df[attributes_df['month'].between(6, 8)]     # summer 6, 7, 8
atdf_f = attributes_df[attributes_df['month'].between(9, 11)]     # fall 9, 10, 11

In [None]:
# # helper function to plot violin plots of all measures to visualize how the measures are distributed
# # also in plotting_functions.py
# def plot_violinplot_of_measure(atdf_wi: pd.DataFrame, 
#                                atdf_sp: pd.DataFrame, 
#                                atdf_su: pd.DataFrame, 
#                                atdf_f: pd.DataFrame, 
#                                measure: str):
#     '''
#     Helper function to plot violin plot of measure by season
    
#     Parameters
#     ----------
#     atdf_wi : pd.DataFrame
#         A dataframe of attributes that are separated by season
#     atdf_sp : pd.DataFrame
#         A dataframe of attributes that are separated by season
#     atdf_su : pd.DataFrame
#         A dataframe of attributes that are separated by season
#     atdf_f : pd.DataFrame
#         A dataframe of attributes that are separated by season
#     measure : str
#     '''
#     data = [atdf_wi[measure], atdf_sp[measure], atdf_su[measure], atdf_f[measure]]
#     percentiles = np.zeros((4,3)) # quartile1, medians, quartile3
    
#     for i in range(4):
#         percentiles[i] = np.percentile(data[i], [25, 50, 75])
    
#     plt.rcParams["figure.figsize"] = (10,5)
#     plt.title(measure,fontsize=20)
#     parts = plt.violinplot(
#             data, showmeans=False, showmedians=False,
#             showextrema=False)
    
#     for pc in parts['bodies']:
#         pc.set_facecolor('orange')
#         pc.set_edgecolor('black')
#         pc.set_alpha(1)

#     inds = np.arange(1, 5)
#     plt.scatter(inds, percentiles[:,1],marker='o', color='white', s=30, zorder=3) # median
#     plt.vlines(inds, percentiles[:,0], percentiles[:,2],color='k', linestyle='-', lw=3)
#     plt.xticks(np.arange(1, 5), labels=['winter','spring','summer','fall'], fontsize=15)
#     plt.yticks(fontsize=15)
#     plt.ylabel('observed values', fontsize=15)
#     plt.show()

In [None]:
def plot_in_lifetime(dataframe):
    """Plot the gap between `large_in_lifetime` and `small_in_lifetime`.

    Only rows with ``duration > 1`` are used. Two side-by-side panels are drawn:
    a 20-bin histogram of ``large_in_lifetime - small_in_lifetime`` (left) and a
    scatter of that same difference against ``duration`` (right).

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must provide the columns ``duration``, ``large_in_lifetime`` and
        ``small_in_lifetime``.

    Notes
    -----
    Sets the global ``figure.figsize`` rcParam to (15, 5) and calls ``plt.show()``.
    """
    multi_step = dataframe[dataframe['duration'] > 1]
    timing_gap = multi_step['large_in_lifetime'] - multi_step['small_in_lifetime']

    plt.rcParams["figure.figsize"] = (15,5)

    # left panel: distribution of the timing gap
    plt.subplot(1, 2, 1)
    plt.xlim(-1, 1)
    plt.hist(timing_gap, bins=20, color='blue')
    plt.ylabel('frequency', fontsize=15)

    # right panel: timing gap against event duration (y axis clipped to 0..30)
    plt.subplot(1, 2, 2)
    plt.ylim(0, 30)
    plt.xlim(-1, 1)
    plt.scatter(timing_gap, multi_step['duration'], alpha=0.3, c='blue')
    plt.ylabel('duration', fontsize=15)
    plt.show()

In [None]:
# all remaining heatwaves together (after the duration filters, before the season split)
plot_in_lifetime(attributes_df)

In [None]:
# same figure once per season: winter, spring, summer, fall
for season_df in (atdf_wi, atdf_sp, atdf_su, atdf_f):
    plot_in_lifetime(season_df)

In [None]:
# Start from every attribute column, then leave out the ones at positions 9..12.
# NOTE(review): the selection is positional, so which columns are skipped depends
# on the CSV column order — confirm if the input file changes.
measures = atdf_su.columns.tolist()
print(len(measures))
measures = measures[:9] + measures[13:]
print(len(measures))

In [None]:
# one seasonal violin plot per retained measure
for measure_name in measures:
    plot_violinplot_of_measure(atdf_wi, atdf_sp, atdf_su, atdf_f, str(measure_name))

In [None]:
# make new attributes
# Lookup table timestep -> calendar month: the 1..12 cycle repeated 167 times, with
# the first element and the last 11 elements trimmed, so index 0 maps to month 2.
# NOTE(review): presumably the first timestep of the record is a February — confirm.
bymonth = np.tile(np.arange(1, 13), 167)[1:-11]

# absolute timestep at which each event reaches `when_large` / `when_small`
event_start = attributes_df['first_timestep']
attributes_df['timestep_maxspatextime'] = event_start + attributes_df['when_large']
attributes_df['timestep_minspatextime'] = event_start + attributes_df['when_small']

# calendar month of those two timesteps
max_step_idx = attributes_df['timestep_maxspatextime'].to_numpy().astype(int)
min_step_idx = attributes_df['timestep_minspatextime'].to_numpy().astype(int)
attributes_df['month_maxspatextime'] = bymonth[max_step_idx]
attributes_df['month_minspatextime'] = bymonth[min_step_idx]

In [None]:
# sanity check: table size and first rows of the attribute table at this point
print(attributes_df.shape)
attributes_df.head()

In [None]:
# 'first_timestep' and 'month' are extremely strong attributes, so they are
# excluded from the analysis. 'max_spatial_extent_time' is dropped as redundant.
# NOTE(review): the original note does not say which column it duplicates
# (presumably `when_large`) — confirm.
attributes_df = attributes_df.drop(
    columns=['first_timestep', 'month', 'max_spatial_extent_time']
)
print(attributes_df.shape)  # (n_events, n_attributes)

# attribute names, in column order, used later as axis labels
metrics = attributes_df.columns.tolist()

In [None]:
f, ax = plt.subplots(figsize=(22, 8))

# pairwise correlation between all attribute columns
corr = attributes_df.corr()

# Mask only the strictly-upper triangle so the lower triangle AND the diagonal are
# drawn. `np.bool` was removed in NumPy 1.24; the builtin `bool` is the portable dtype.
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

hm = sns.heatmap(corr.round(2), mask=mask, annot=True, ax=ax, cmap="coolwarm", fmt='.2f',
                 linewidths=.05, vmin=-1, vmax=1)
f.subplots_adjust(top=0.93)
t = f.suptitle('Attributes Correlation Heatmap', fontsize=14)

In [None]:
# scaling: learn each column's min/max, then rescale every attribute with them
scaler = preprocessing.MinMaxScaler()
scaler.fit(attributes_df)
features_normal = scaler.transform(attributes_df)

In [None]:
# X: scaled feature matrix (one row per event); N: number of events
X = features_normal
N = int(X.shape[0])

# subtract the per-column mean so every attribute is centered on zero
mean_data = X.mean(axis=0)
centered_X = X - mean_data
print(centered_X.shape)

In [None]:
# elbow plot
# For K = 1..9 fit K-means on the centered features and record the within-cluster
# sum of squares (WSS): the summed squared distance of every sample to the centroid
# of the cluster it was assigned to.
KK = np.arange(1,10)
WSS = []

for n_clusters in KK:
    KMM = skl.cluster.KMeans(n_clusters=n_clusters, random_state=0)
    KMM.fit(centered_X)
    centroids = KMM.cluster_centers_
    # NOTE: KMM / centroids / Y_pred are left holding the K=9 fit after this loop;
    # the later K-means cell reassigns KMM and Y_pred before they are used again.
    Y_pred = KMM.labels_
    # compute WSS in one vectorised step instead of a Python loop over samples:
    # centroids[Y_pred] is the (N, n_features) array of each sample's own centroid
    residuals = centered_X - centroids[Y_pred]
    WSS.append(float(np.sum(residuals**2)))

In [None]:
# WSS as a function of the number of clusters; look for the "elbow"
plt.rcParams.update({"figure.figsize": (6, 6)})
plt.plot(KK, WSS)
plt.gca().set(xlabel='K', ylabel='WSS(K)')
plt.show()

In [None]:
# keep the 5 leading principal components of the centered features
pca = PCA(n_components = 5)
pca.fit(centered_X)
# Frobenius norm of the full centered data matrix. Computing it from
# `pca.singular_values_` would only include the 5 retained singular values and
# therefore understate the norm that the mode-count cell compares against.
frobenius_norm = np.linalg.norm(centered_X, 'fro')
print(frobenius_norm)

In [None]:
# 2-norm of the retained singular values. The name is historical: this is NOT the
# nuclear norm, which would be the plain sum of the singular values.
nuclear_norm = np.linalg.norm(pca.singular_values_)
# fraction of the total variance explained by each retained component
sv_scaled = pca.explained_variance_ratio_
# Running total of the above. Normalising by `nuclear_norm**2` instead would divide
# by the energy of the retained components only and always end at exactly 1.0.
sv_total = np.cumsum(sv_scaled)

In [None]:
plt.rcParams["figure.figsize"] = (25,6)

# (series, y-axis label) for the four side-by-side panels
variance_panels = [
    (sv_scaled, 'explained variance ratio'),
    (sv_total, 'cumulative explained variance ratio'),
    # first 30 components only; identical to the previous panel while fewer than
    # 30 components are kept
    (sv_total[:30], 'cumulative explained variance ratio'),
    (np.log(sv_scaled), 'log(explained variance ratio)'),
]

for panel_no, (series, y_label) in enumerate(variance_panels, start=1):
    plt.subplot(1, 4, panel_no)
    plt.plot(series, 'blue')  # x axis is the 0-based component index
    plt.xticks(fontsize=15); plt.yticks(fontsize=15)
    plt.xlabel('PCA component', fontsize=15)
    plt.ylabel(y_label, fontsize=15)

plt.show()

In [None]:
def no_pca_modes(ratio):
    """Return the smallest number of PCA modes that reaches `ratio` of the data norm.

    A single PCA with all components is fitted on the notebook global `centered_X`.
    The result is the smallest ``i`` for which
    ``sqrt(sum(singular_values[:i]**2)) >= ratio * frobenius_norm``, where
    `frobenius_norm` is the notebook global of that name.

    Parameters
    ----------
    ratio : float
        Target fraction of `frobenius_norm`.

    Returns
    -------
    int
        Number of leading PCA modes required (at least 1).

    Raises
    ------
    ValueError
        If even all available modes stay below ``ratio * frobenius_norm``.
    """
    # Fit once with every component: the leading singular values do not change
    # with the number of components requested, so refitting per candidate is wasted work.
    pca_all = PCA().fit(centered_X)
    captured_norm = np.sqrt(np.cumsum(pca_all.singular_values_**2))

    reached = np.flatnonzero(captured_norm >= ratio * frobenius_norm)
    if reached.size == 0:
        raise ValueError(
            'ratio=%r cannot be reached with the %d available PCA modes'
            % (ratio, captured_norm.size)
        )
    return int(reached[0]) + 1

print('Number of PCA modes to keep to approximate X_train up to 60%: ', no_pca_modes(0.6))
print('Number of PCA modes to keep to approximate X_train up to 80%: ',no_pca_modes(0.8))
print('Number of PCA modes to keep to approximate X_train up to 90%: ',no_pca_modes(0.9))

In [None]:
# project the centered features onto the principal components
transformed = pca.fit_transform(centered_X)
# Name the columns PC1..PCn from the actual number of components, so changing
# `n_components` in the PCA cell does not break this cell.
pc_names = ['PC%d' % (k + 1) for k in range(transformed.shape[1])]
transformed_df = pd.DataFrame(data = transformed, columns = pc_names)

In [None]:
plt.rcParams.update({"figure.figsize": (20, 20)})
# Pairwise scatter plots of the principal components. The diagonal panels show
# histograms because `diagonal='hist'` is scatter_matrix's default (made explicit
# here); a variable plotted against itself would only give a straight line.
pd.plotting.scatter_matrix(transformed_df, alpha=0.4, diagonal='hist')
plt.show()

In [None]:
# number of components (rows of the fitted PCA's `components_` array)
n_pcs = len(pca.components_)
print(n_pcs)

In [None]:
plt.rcParams["figure.figsize"] = (10,10)
plt.rcParams["axes.edgecolor"] = "0.15"
plt.rcParams["axes.linewidth"]  = 1.

# |loading| of every attribute on every principal component: (n_attributes, n_components)
abs_loadings = np.abs(pca.components_).T
n_attributes, n_shown_pcs = abs_loadings.shape

plt.xlabel('Principal Component', fontsize=15)
plt.ylabel('Attribute', fontsize=15)
# Tick positions sit at the cell centres and are derived from the data shape
# instead of the hard-coded 5 components / 16 attributes.
plt.xticks(np.arange(n_shown_pcs) + 0.5, np.arange(1, n_shown_pcs + 1), fontsize=15)
plt.yticks(np.arange(n_attributes) + 0.5, metrics, rotation=0, fontsize=15)

levels = np.arange(0, 1.1, .1)
# `plt.cm.get_cmap` was removed in Matplotlib 3.9; `plt.get_cmap` works across versions
cmap = plt.get_cmap('Blues')

plt.pcolormesh(abs_loadings,cmap=cmap,edgecolors='k',norm = mpl.colors.BoundaryNorm(levels, ncolors=cmap.N, clip=False))
cb = plt.colorbar()
cb.ax.tick_params(labelsize=15)
plt.show()

In [None]:
# (rows, columns) of the attribute table that was scaled and fed to PCA / K-means
attributes_df.shape

In [None]:
# number of clusters
K = 5

KMM = skl.cluster.KMeans(n_clusters=K, random_state=0).fit(X)
Y_pred = np.array(KMM.labels_, dtype=float) # predicted clusters by K-means

print(Y_pred.shape)
print(KMM.cluster_centers_.shape)
print(X.shape)

# `attributes_df` still carries its original row labels (rows were removed by the
# duration filters), while a bare DataFrame of labels gets a fresh 0..N-1 index.
# pd.concat(axis=1) aligns on the index, so without an explicit index the labels
# land on the wrong events and NaN rows appear. Give the labels the same index.
labels = pd.DataFrame(KMM.labels_, index=attributes_df.index)
labeledattributes = pd.concat((attributes_df,labels),axis=1)
labeledattributes = labeledattributes.rename({0:'labels'},axis=1)

In [None]:
# Centroid coordinates in scaled feature space, one row per cluster. Previously this
# stored only `.shape`, which the K-means cell already prints and which did not
# match the variable name.
cluster_centers = KMM.cluster_centers_

In [None]:
# display the value stored in `cluster_centers` by the previous cell
cluster_centers

In [None]:
fig, ax = plt.subplots()

# reference lines through the origin of the PC1/PC2 plane
ax.axhline(y=0, c='k', alpha=0.4)
ax.axvline(x=0, c='k', alpha=0.4)

# events in the plane of the first two principal components, coloured by K-means cluster
scatter = ax.scatter(transformed_df['PC1'], transformed_df['PC2'],
                     c=Y_pred, alpha=0.5, edgecolor='k', cmap='tab10')

# produce a legend with the unique colors from the scatter
legend_handles, legend_texts = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_texts,
                    loc="upper right", bbox_to_anchor=(1.2, 1.0), title="Classes")
ax.add_artist(legend1)
ax.set_xlabel('PC1', fontsize=15)
ax.set_ylabel('PC2', fontsize=15)
# plt.xlim(-0.6, 1.5); plt.ylim(-0.6, 1.5)
plt.show()

In [None]:
# pairwise attribute plots coloured by cluster label, histograms on the diagonal
sns.pairplot(data=labeledattributes, hue="labels", diag_kind="hist", palette='tab10')
# sns.pairplot(labeledattributes,hue='labels')
plt.show()

## t-SNE: t-distributed stochastic neighbor embedding

In [None]:
# We want to get TSNE embedding with 2 dimensions
n_components = 2
# t-SNE is stochastic. Fix the seed (same value as the K-means cells) so that
# Restart & Run All reproduces the same embedding and figure.
tsne = TSNE(n_components=n_components, random_state=0)
tsne_result = tsne.fit_transform(centered_X)
print(tsne_result.shape)

#Plot the result of our TSNE with the label color coded
tsne_result_df = pd.DataFrame({'tsne_1': tsne_result[:,0],
                               'tsne_2': tsne_result[:,1],
                               'label': Y_pred})
fig, ax = plt.subplots(1)
sns.scatterplot(x='tsne_1',
                y='tsne_2',
                hue='label',
                data=tsne_result_df,
                ax=ax,s=120,
                alpha=0.5,
                palette='bright')
# identical limits on both axes, padded by 5 units, so the embedding is not distorted
lim = (tsne_result.min()-5,
       tsne_result.max()+5)
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.set_aspect('equal')
ax.legend(bbox_to_anchor=(1.05, 1),
          loc=2, borderaxespad=0.0)
plt.show()

# what are tsne_1 and tsne_2 telling us?

In [None]:
# positional row indices of the events assigned to each of the five clusters
res_list_0, res_list_1, res_list_2, res_list_3, res_list_4 = (
    [row for row, cluster_id in enumerate(Y_pred) if cluster_id == float(wanted)]
    for wanted in range(5)
)

In [None]:
# attribute rows (unscaled) of every cluster, selected by position
group0, group1, group2, group3, group4 = (
    attributes_df.iloc[member_rows, :]
    for member_rows in (res_list_0, res_list_1, res_list_2, res_list_3, res_list_4)
)

In [None]:
# largest and smallest centered feature value among the cluster-0 events (NaNs ignored)
cluster0_values = centered_X[res_list_0]
print(np.nanmax(cluster0_values))
print(np.nanmin(cluster0_values))

In [None]:
# per-attribute (column-wise) sum of the centered features over the cluster-0 rows
print(np.sum(centered_X[res_list_0],axis=0))

In [None]:
# import plotly.express as px
# fig = px.scatter(x='tsne_1', 
#                 y='tsne_2', 
#                 c=labels, 
#                 data=tsne_result_df)
# fig.show()

In [None]:
# length of the column-wise sum, i.e. the number of attribute columns in centered_X
len(np.sum(centered_X[res_list_0],axis=0))

In [None]:
# Per-attribute profile of every cluster: first the column sums, then the column
# means of the centered features. Legend entries '1'..'5' correspond to K-means
# labels 0..4.
cluster_members = [res_list_0, res_list_1, res_list_2, res_list_3, res_list_4]
cluster_colors = ['red', 'blue', 'green', 'orange', 'purple']
# x positions come from the data instead of the hard-coded 16 attributes
feature_positions = np.arange(centered_X.shape[1])

for aggregate in (np.sum, np.mean):
    for cluster_no, (member_rows, color) in enumerate(zip(cluster_members, cluster_colors), start=1):
        plt.scatter(feature_positions, aggregate(centered_X[member_rows], axis=0),
                    label=str(cluster_no), c=color)
    plt.legend()  # the means figure previously had no legend
    plt.show()

In [None]:
# One bar chart per cluster-0 event showing its centered attribute values.
cluster0_X = centered_X[res_list_0]
n_members, n_features = cluster0_X.shape

# Grid size follows the cluster size instead of the hard-coded 23 x 6 = 138 panels;
# the figure height keeps the original 72/23 inches per row.
n_cols = 6
n_rows = max(1, -(-n_members // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(22, 72 * n_rows / 23),
                        facecolor='w', edgecolor='k', squeeze=False)

fig.subplots_adjust(hspace = .3, wspace=.3)

axs = axs.ravel()

for i in range(n_members):
    # bar positions must match the number of attributes per row; the previous
    # np.arange(11) no longer matched the width of centered_X
    axs[i].bar(np.arange(n_features), cluster0_X[i, :])
    axs[i].set_xlim(-0.5, n_features - 0.5); axs[i].set_ylim(-0.1,0.8)
    axs[i].set_title(str(i),fontsize=15)

# hide the unused panels of the last row
for spare_ax in axs[n_members:]:
    spare_ax.set_visible(False)

In [None]:
# Hierarchical Clustering (relationship) --> dendrogram

In [None]:
# Plot an actual picture (barplots)

- https://plotly.com/python/pca-visualization/