# Evaluation of Single-Context Features

In [None]:
# Dataset folder under data/ whose feature file is evaluated
dataset = 'youtube'
# Layer whose single-context feature CSV is loaded in the next cell
layer_name = 'CategoryLayer'

In [None]:
import pandas as pd
from pandas import DataFrame

# Load the single-context feature table for the selected dataset and layer;
# the first CSV column becomes the row index.
df: DataFrame = pd.read_csv(
    f'data/{dataset}/ml_input/single_context/{layer_name}.csv',
    index_col=0,
)

# Evaluation of Cross-Context Features

In [None]:
# Dataset folder under data/ whose feature file is evaluated
dataset = 'youtube'
# Layer whose cross-context feature CSV is loaded in the next cell
layer_name = 'CategoryLayer'

In [None]:
import pandas as pd
from pandas import DataFrame

# Load the cross-context feature table for the selected dataset and layer;
# the first CSV column becomes the row index.
df: DataFrame = pd.read_csv(
    f'data/{dataset}/ml_input/cross_context/{layer_name}.csv',
    index_col=0,
)

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the feature columns only. The last column is the target (y): fitting
# the scaler on it was wasted work and fails if the label is not numeric.
# Scaling is per column, so the feature values are the same as before.
X = StandardScaler().fit_transform(df.iloc[:, :-1])

In [None]:
# Target vector: the last column of the loaded table
y = df.iloc[:, -1]

In [None]:
# First two rows of the standardized features, labelled with the original feature names
pd.DataFrame(X, columns=df.columns[:-1]).head(2)

In [None]:
# First two rows of the unscaled table, for comparison with the standardized preview
df.head(2)

## Principal Components

In [None]:
from sklearn.decomposition import PCA

# PCA cannot extract more components than min(n_samples, n_features), so cap the
# requested 10 to keep this cell working on small inputs.
n_components = min(10, *X.shape)

# Fixed seed: for large inputs scikit-learn may select the randomized SVD solver,
# which would otherwise yield slightly different components on each run.
pca = PCA(n_components=n_components, random_state=0)
# Component scores: one row per observation, one column per component
Xp = pca.fit_transform(X)
# Loadings (c), variance explained by each component (v) and its share of the total (r)
c, v, r = pca.components_, pca.explained_variance_, pca.explained_variance_ratio_

# Loadings as a table: one row per component, one column per feature
loadings = pd.DataFrame(data=c, columns=df.columns[:-1], index=range(n_components))

In [None]:
# Component scores as a table with columns named Z1..Zn
pd.DataFrame(Xp, columns=[f'Z{k}' for k in range(1, n_components + 1)])

In [None]:
# Per feature, the number of components with a non-zero loading, in ascending order
(loadings != 0).sum().sort_values()

Effect of Variables on each Component:

In [None]:
import seaborn as sns
# Loadings are signed, so use a diverging palette centred on zero: with a
# sequential palette, strong negative loadings are drawn like weak effects.
sns.heatmap(c, cmap='RdBu_r', center=0,
            yticklabels=[f'Z{k}' for k in range(1, len(c) + 1)],
            xticklabels=list(df.columns[:-1]))

Variance explained by Components:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Left panel: variance explained per component.
# Right panel: cumulative share of the total variance.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), sharex=True)

ax[0].bar(range(1, len(v) + 1), v)
ax[0].set_ylabel('Variance explained')

ax[1].bar(range(1, len(r) + 1), r.cumsum(), color='green')
ax[1].set_ylabel('Cumulative variance explained')

fig.tight_layout()

Biplots for each pair of components (observation scores as points, feature loadings as arrows):


In [None]:
def biplot(z1, z2, sc, comps, obs, features, colors, title):
    """Draw a PCA biplot for one pair of components on a new 10x10 figure.

    Observations are scattered at their scores on the two components and
    coloured by label. Each feature's loading on the two components is drawn
    as an arrow from the origin, stretched by half the score range of the
    corresponding axis.

    Args:
        z1: Zero-based index of the component shown on the x-axis.
        z2: Zero-based index of the component shown on the y-axis.
        sc: 2-D array of scores, indexed as sc[:, component].
        comps: 2-D array of loadings, indexed as comps[component, feature].
        obs: Iterable with one label per observation. Each label is converted
            with int() and used as an index into a fixed five-colour palette,
            so labels above 4 raise IndexError.
        features: Feature names; entry i labels the arrow of feature i.
        colors: Arrow and text colours; entry i is used for feature i.
        title: Title of the figure.
    """
    x, y = sc[:, z1], sc[:, z2]

    plt.figure(figsize=(10, 10))
    # Components are named Z1..Zn elsewhere in the notebook, so label the axes
    # with the one-based component number rather than the zero-based index.
    plt.xlabel("Z{}".format(z1 + 1))
    plt.ylabel("Z{}".format(z2 + 1))

    # Half the score range on each axis; used to stretch the loading arrows
    half_x = (x.max() - x.min()) / 2
    half_y = (y.max() - y.min()) / 2

    # Observations, coloured by their integer label
    label_palette = ['red', 'orange', 'green', 'magenta', 'blue']
    plt.scatter(x, y, c=[label_palette[int(label)] for label in obs])

    # One (loading on z1, loading on z2) row per feature
    pairs = comps[[z1, z2], :].T

    for i, (load_x, load_y) in enumerate(pairs):
        tip_x, tip_y = load_x * half_x, load_y * half_y
        plt.arrow(0, 0, tip_x, tip_y, ec=colors[i],
                  head_width=0.1, head_length=0.1, fc=colors[i])
        # Put the feature name slightly beyond the arrow tip
        plt.text(tip_x * 1.2, tip_y * 1.2, features[i],
                 color=colors[i])

    plt.title(title)
    plt.grid()

In [None]:
import itertools

# Only the feature columns have loadings; the last column of df is the label,
# so it gets neither a name nor a colour here.
feature_names = df.columns[:-1]
feature_colors = plt.cm.rainbow(np.linspace(0, 1, len(feature_names)))
for z1, z2 in itertools.combinations(range(n_components), 2):
    biplot(z1, z2, Xp, c, y, feature_names, colors=feature_colors, title=f"z1={z1} z2={z2}")
    # Render and release each figure as soon as it is drawn, so the number of
    # open figures does not grow with every component pair.
    plt.show()
    plt.close()

In [None]:
# Pairwise relationships between the columns of df, coloured by the evolution_label column
sns.pairplot(data=df, hue="evolution_label")