# Cross-Context Data

In [47]:
use_case = 'youtube'
layer_name = 'DislikesLayer' 
reference_layer_name = 'ViewsLayer'

approach = 'cross_context'

In [48]:
import pandas as pd
from pandas import DataFrame

df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)

In [None]:
df

# Training

In [49]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> '(training_data, test_data)':
    if shuffle:
        dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe[:training_size].reset_index(drop=True)
    test = dataframe[training_size:].reset_index(drop=True)

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]
  
    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({len(test)/(len(test)+len(train))} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, \n"\
          f"\tTraining = {collections.Counter(y_train)}, \n"\
              f"\tTest = {collections.Counter(y_test)}")
    # try:
    #     print(f"Label Majority Class: Training = {stat.mode(Y_train)}, Test = {stat.mode(Y_test)}\n")
    # except stat.StatisticsError:
    #     print(f"Label Majority Class: no unique mode; found 2 equally common values")

    return train, test

training, testing = split_data(df, shuffle=False)


Working with: 150576 training points + 37644 test points (0.2 test ratio).
Label Occurrences: Total = Counter({-1.0: 92785, 3.0: 28481, 4.0: 28129, 1.0: 15471, 2.0: 14911, 0.0: 8443}), 
	Training = Counter({-1.0: 74172, 3.0: 22831, 4.0: 22576, 1.0: 12339, 2.0: 11941, 0.0: 6717}), 
	Test = Counter({-1.0: 18613, 3.0: 5650, 4.0: 5553, 1.0: 3132, 2.0: 2970, 0.0: 1726})


In [40]:
training

Unnamed: 0,n_nodes,n_clusters,entropy,sizes_min,sizes_max,sizes_avg,sizes_sum,relative_sizes_min,relative_sizes_max,relative_sizes_avg,...,relative_sizes_avg.1,relative_sizes_sum.1,center_dist_min.1,center_dist_max.1,center_dist_avg.1,center_dist_sum.1,time_f1.1,time_f2.1,cluster_id,evolution_label
0,9086.0,7014.0,12.628675,1.0,8.0,1.295409,9086.0,0.000110,0.000880,0.000143,...,0.000146,1.0,0.0,1.623864e+06,398.484073,2.723639e+06,-0.568065,0.822984,8982.0,0.0
1,9162.0,7015.0,12.624013,1.0,9.0,1.306058,9162.0,0.000109,0.000982,0.000143,...,0.000142,1.0,0.0,1.002264e+06,257.336409,1.810876e+06,-0.120537,0.992709,6696.0,0.0
2,10040.0,6569.0,12.472724,1.0,15.0,1.528391,10040.0,0.000100,0.001494,0.000152,...,0.000144,1.0,0.0,1.961570e+06,1696.761731,1.180098e+07,0.568065,-0.822984,19590.0,2.0
3,10180.0,6946.0,12.566904,1.0,12.0,1.465592,10180.0,0.000098,0.001179,0.000144,...,0.000307,1.0,0.0,2.364056e+05,369.147185,1.201943e+06,0.239316,-0.970942,17205.0,3.0
4,8939.0,6733.0,12.565393,1.0,9.0,1.327640,8939.0,0.000112,0.001007,0.000149,...,0.000145,1.0,0.0,3.759299e+05,249.743865,1.722733e+06,-0.354605,0.935016,11094.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484987,9745.0,7290.0,12.678274,1.0,12.0,1.336763,9745.0,0.000103,0.001231,0.000137,...,0.000137,1.0,0.0,3.185698e+06,1957.954065,1.433418e+07,0.992709,0.120537,1090.0,0.0
484988,8986.0,7100.0,12.665515,1.0,7.0,1.265634,8986.0,0.000111,0.000779,0.000141,...,0.000143,1.0,0.0,6.851683e+04,95.392507,6.659351e+05,0.120537,0.992709,1920.0,0.0
484989,9810.0,7368.0,12.686830,1.0,11.0,1.331433,9810.0,0.000102,0.001121,0.000136,...,0.000136,1.0,0.0,2.643188e+06,1598.417071,1.178193e+07,0.822984,0.568065,11539.0,0.0
484990,10292.0,6955.0,12.573513,1.0,11.0,1.479799,10292.0,0.000097,0.001069,0.000144,...,0.000143,1.0,0.0,1.531671e+06,1005.152850,7.038080e+06,0.464723,-0.885456,14745.0,4.0


In [50]:
def remove_empty_community_class(df):
    '''Removes evolution_label -1 from dataset indicating the community stays empty.'''
    # res = df.loc[df['evolution_label'] != -1.0]
    # res = res.reset_index(drop=True)
    # return res
    df['evolution_label'] = df['evolution_label'].replace(-1.0, 0)
    return df

training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)

## Standardization

In [51]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_Y = training[training.columns[-1]]

test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]

In [None]:
train_Y

In [None]:
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Balancing of Training Data

In [52]:
train_Y.value_counts()

0.0    80889
3.0    22831
4.0    22576
1.0    12339
2.0    11941
Name: evolution_label, dtype: int64

In [53]:
from processing import DataSampler

sampler = DataSampler()
train_X, train_Y = sampler.sample_median_size(train_X, train_Y)

In [44]:
train_Y.value_counts()

0.0    78675
3.0    78675
2.0    78675
4.0    78675
1.0    78675
Name: evolution_label, dtype: int64

## Principal Components

In [54]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)

train_Xp = pca.fit_transform(train_X)
test_Xp = pca.transform(test_X)

## Evaluation Reports

In [28]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Prints all reports.
    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test_X for the corresponding classifier at idx
    :param test_Y: true classes
    :param titles: list of titles for the classifiers at idx
    """
    for clf, test_X, title in zip(clfs, test_Xs, titles):
        pred_Y = clf.predict(test_X)        
        print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

In [101]:
import pickle
from pathlib import Path

def export_model(model, model_name):
    fpath = f'data/{use_case}/ml_output/{approach}/{layer_name}'
    Path(fpath).mkdir(parents=True, exist_ok=True)
    with open(f'{fpath}/{layer_name}_{reference_layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

# Naive Bayes
Working best with _Xp_

Parameters: 
- priors: _prior probabilities of classes_, None
- var\_smoothing: \[0 , 1\], _1E-9_

In [59]:
from sklearn.naive_bayes import GaussianNB
priors = np.array([8,2,2,1,1]) / (8+2+2+1+1)
smoothing = 1E-9

clf = GaussianNB(priors=priors, var_smoothing=smoothing)
clf.fit(train_X, train_Y)

clf_p = GaussianNB(priors=priors, var_smoothing=smoothing)
clf_p.fit(train_Xp, train_Y)

print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(clf, 'nb_x')
export_model(clf_p, 'nb_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.70      0.60      0.64     20339
         1.0       0.24      0.25      0.25      3132
         2.0       0.19      0.71      0.30      2970
         3.0       0.20      0.09      0.12      5650
         4.0       0.18      0.11      0.14      5553

    accuracy                           0.43     37644
   macro avg       0.30      0.35      0.29     37644
weighted avg       0.47      0.43      0.43     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.65      0.86      0.74     20339
         1.0       0.28      0.70      0.40      3132
         2.0       0.26      0.21      0.23      2970
         3.0       0.00      0.00      0.00      5650
         4.0       0.24      0.02      0.04      5553

    accuracy                           0.54     37644
   macro avg       0.29      0.36      0.28     37644
weighted avg       0.43      0.54      0.46     37644


# Support Vector Machine
Parameters:
- C (regularization): <1, _1_, >1
- kernel: _linear_, rbf, poly, sigmoid
- gamma (for rbf, poly, sigmoid): scale, auto, float, def=scale
- class\_weight: _None_, balanced, dict, def=None

In [103]:
from sklearn.svm import SVC
c = 10
kernel = 'linear'
gamma = 'scale'
weights = None

svc = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights)
svc.fit(train_X, train_Y)

svc_p = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights)
svc_p.fit(train_Xp, train_Y)

print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(svc, 'svc_x')
export_model(svc_p, 'svc_xp')

In [98]:
print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

         0.0       0.73      0.56      0.64     20339
         1.0       0.28      0.35      0.31      3132
         2.0       0.27      0.62      0.37      2970
         3.0       0.22      0.12      0.16      5650
         4.0       0.22      0.32      0.26      5553

    accuracy                           0.45     37644
   macro avg       0.34      0.39      0.35     37644
weighted avg       0.51      0.45      0.46     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.73      0.57      0.64     20339
         1.0       0.29      0.32      0.30      3132
         2.0       0.26      0.64      0.37      2970
         3.0       0.22      0.14      0.17      5650
         4.0       0.21      0.29      0.24      5553

    accuracy                           0.45     37644
   macro avg       0.34      0.39      0.35     37644
weighted avg       0.50      0.45      0.46     37644


# K-nearest Neighbors
Parameters:
- n\_neighbors: _30_
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: _50_ (no difference)

In [60]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc.fit(train_X, train_Y)

knnc_p = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,  algorithm=algo, leaf_size=leaf_size)
knnc_p.fit(train_Xp, train_Y)

print_report([knnc, knnc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(knnc, 'knn_x')
export_model(knnc_p, 'knn_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.74      0.53      0.62     20339
         1.0       0.35      0.46      0.40      3132
         2.0       0.33      0.44      0.38      2970
         3.0       0.22      0.30      0.25      5650
         4.0       0.22      0.28      0.24      5553

    accuracy                           0.45     37644
   macro avg       0.37      0.40      0.38     37644
weighted avg       0.52      0.45      0.47     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.74      0.53      0.62     20339
         1.0       0.35      0.46      0.40      3132
         2.0       0.33      0.44      0.38      2970
         3.0       0.22      0.30      0.25      5650
         4.0       0.22      0.28      0.24      5553

    accuracy                           0.45     37644
   macro avg       0.37      0.40      0.38     37644
weighted avg       0.52      0.45      0.47     37644


# Decision Tree
Working well with _Xp_

Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max\_depth: default=_None_
- min\_samples\_leaf (to construct leaf): _2_, default=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-5_, default=0
- ccp\_alpha (max allowed cost after pruning): _0_, default=0 ie. nopruning

In [77]:
from sklearn.tree import DecisionTreeClassifier 
criterion = 'gini'
splitter = 'random'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-5 # impurity improvement needed to split
ccp_alpha = 0

seed=42

dtc = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc.fit(train_X, train_Y)

dtc_p = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc_p.fit(train_Xp, train_Y)

print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(dtc, 'dt_x')
export_model(dtc_p, 'dt_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.74      0.58      0.65     20339
         1.0       0.37      0.44      0.40      3132
         2.0       0.35      0.44      0.39      2970
         3.0       0.22      0.29      0.25      5650
         4.0       0.23      0.28      0.25      5553

    accuracy                           0.47     37644
   macro avg       0.38      0.41      0.39     37644
weighted avg       0.53      0.47      0.49     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.74      0.58      0.65     20339
         1.0       0.37      0.38      0.37      3132
         2.0       0.34      0.48      0.40      2970
         3.0       0.23      0.26      0.24      5650
         4.0       0.22      0.32      0.26      5553

    accuracy                           0.47     37644
   macro avg       0.38      0.40      0.39     37644
weighted avg       0.52      0.47      0.49     37644


# Random Forest
Parameters:
- n\_estimators: _100_ def=100
- criterion: _gini_, entropy
- max\_depth: _None_ def=None
- min\_samples\_leaf (to construct leaf): _2_ def=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-5_ default=0
- bootstrap (if bootstraped sample is used): _True_ def=True

In [93]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 50
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease= 1E-5
bootstrap=True

seed=42

rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
rfc.fit(train_X, train_Y)

rfc_p = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
rfc_p.fit(train_Xp, train_Y)

print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(rfc, 'rf_x')
export_model(rfc_p, 'rf_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.73      0.46      0.56     20339
         1.0       0.34      0.42      0.37      3132
         2.0       0.33      0.45      0.38      2970
         3.0       0.21      0.31      0.25      5650
         4.0       0.20      0.31      0.24      5553

    accuracy                           0.41     37644
   macro avg       0.36      0.39      0.36     37644
weighted avg       0.51      0.41      0.44     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.73      0.43      0.54     20339
         1.0       0.34      0.42      0.37      3132
         2.0       0.32      0.43      0.36      2970
         3.0       0.20      0.32      0.25      5650
         4.0       0.20      0.31      0.24      5553

    accuracy                           0.40     37644
   macro avg       0.36      0.38      0.35     37644
weighted avg       0.50      0.40      0.43     37644


# Boosting
50% accuracy, 51% f1

Parameters:
- base\_estimator: None
- n\_estimators: 50
- algorithm: samme.r
- learning\_rate: .3

In [95]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

base_estimator = None# SVC(kernel='linear')
n_estimators= 50
algo = 'SAMME.R'
learning_rate = .3

bc = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc.fit(train_X, train_Y)

bc_p = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc_p.fit(train_Xp, train_Y)

print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(bc, 'boost_x')
export_model(bc_p, 'boost_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.73      0.67      0.70     20339
         1.0       0.45      0.50      0.47      3132
         2.0       0.29      0.38      0.33      2970
         3.0       0.24      0.16      0.19      5650
         4.0       0.24      0.33      0.28      5553

    accuracy                           0.51     37644
   macro avg       0.39      0.41      0.39     37644
weighted avg       0.52      0.51      0.51     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.72      0.67      0.70     20339
         1.0       0.37      0.46      0.41      3132
         2.0       0.31      0.40      0.35      2970
         3.0       0.22      0.16      0.19      5650
         4.0       0.23      0.30      0.26      5553

    accuracy                           0.50     37644
   macro avg       0.37      0.40      0.38     37644
weighted avg       0.51      0.50      0.50     37644


In [12]:
print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

         0.0       0.72      0.66      0.69     20207
         1.0       0.45      0.55      0.49      3112
         2.0       0.31      0.34      0.32      2975
         3.0       0.26      0.17      0.21      5745
         4.0       0.23      0.33      0.27      5605

    accuracy                           0.50     37644
   macro avg       0.39      0.41      0.40     37644
weighted avg       0.52      0.50      0.51     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.72      0.66      0.69     20207
         1.0       0.41      0.46      0.43      3112
         2.0       0.33      0.43      0.37      2975
         3.0       0.28      0.08      0.13      5745
         4.0       0.22      0.40      0.29      5605

    accuracy                           0.50     37644
   macro avg       0.39      0.41      0.38     37644
weighted avg       0.52      0.50      0.50     37644
