# Number of Clusters

In [None]:
import os
import json

layer = 'ViewsLayer'
# Walk the raw cluster directory and report the number of entries stored in
# each JSON file found there.
for r, _, files in os.walk('data/youtube/raw/clusters/'):
    for f in files:
        # Build the path from the directory currently being walked (r) so that
        # files inside subdirectories are opened where they actually live.
        with open(os.path.join(r, f), 'r', encoding='utf-8') as fh:
            clusters = json.load(fh)
        print(f"{f}: {len(clusters)}")


In [None]:
# Length of the JSON content loaded from the last file read by the loop above
# (`clusters` is overwritten on every iteration).
len(clusters)

# Class distribution

In [None]:
import pandas as pd
from pandas import DataFrame

layer = 'LikesLayer'

# Load the single-context ML input table for the selected layer; the first
# CSV column is used as the DataFrame index.
df: DataFrame = pd.read_csv(
    f'data/youtube/ml_input/single_context/{layer}.csv',
    index_col=0,
)

In [None]:
# Count how often each distinct value occurs in the 'evolution_label' column.
df['evolution_label'].value_counts()

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardise the feature columns only. The last column is the label (y), so
# it is excluded before scaling instead of being fitted and then sliced away.
X: np.ndarray = StandardScaler().fit_transform(df.iloc[:, :-1])  # all except y
y: pd.Series  = df[df.columns[-1]]

In [None]:
# Show class sizes: number of samples per distinct label in y
y.value_counts()

In [None]:
def undersample(X, y, strategy='not minority', random_state=42) -> tuple:
    '''Randomly undersample classes according to ``strategy``.

    With the default strategy the larger classes are reduced so that all
    class sizes equal the minority class size.

    :param X: feature matrix
    :param y: class labels, one per row of X
    :param strategy: passed to RandomUnderSampler as ``sampling_strategy``;
        callers may also pass a dict mapping class label to target size
    :param random_state: seed for the sampler, 42 by default for
        reproducible results
    :returns: tuple ``(X_undersampled, y_undersampled)``
    '''
    # Imported lazily so imblearn is only required when resampling is used.
    from imblearn.under_sampling import RandomUnderSampler

    rus = RandomUnderSampler(random_state=random_state, sampling_strategy=strategy)
    X_undersampled, y_undersampled = rus.fit_resample(X, y)

    return X_undersampled, y_undersampled

In [None]:
def oversample(X, y, random_state=42) -> tuple:
    '''Oversample with SMOTE so all class sizes equal the majority class size.

    This relies on SMOTE's default sampling strategy.

    :param X: feature matrix
    :param y: class labels, one per row of X
    :param random_state: seed for SMOTE, 42 by default for reproducible
        results
    :returns: tuple ``(X_oversampled, y_oversampled)``
    '''
    # Imported lazily so imblearn is only required when resampling is used.
    from imblearn.over_sampling import SMOTE

    sm = SMOTE(random_state=random_state)
    X_oversampled, y_oversampled = sm.fit_resample(X, y)

    return X_oversampled, y_oversampled

In [None]:
import pandas as pd

def sample_median_size(X, y: pd.Series) -> tuple:
    '''Resample so that every class ends up with the median class size.

    Classes larger than the median are undersampled down to it, then the
    remaining smaller classes are oversampled up to it.

    :param X: feature matrix
    :param y: class labels, one per row of X
    :returns: tuple ``(X, y)`` with the resampled data
    '''
    # Compute the class sizes once instead of once per class.
    class_counts = y.value_counts()
    median = int(class_counts.median())

    # Never request more samples than a class has: classes below the median
    # keep their current size during the undersampling step.
    sampling_sizes = {
        label: min(median, count) for label, count in class_counts.items()
    }

    # undersample the larger classes to median size
    X, y = undersample(X, y, strategy=sampling_sizes)

    # oversample the smaller classes to median size (the largest remaining
    # class now has exactly the median size)
    X, y = oversample(X, y)

    return X, y

In [None]:
# Rebalance the data set with sample_median_size.
X_s, y_s = sample_median_size(X, y)

# Class sizes after resampling
y_s.value_counts()