# Cross-Context Data

In [93]:
use_case = 'youtube'
layer_name = 'DislikesLayer' 
reference_layer_name = 'ViewsLayer'

approach = 'cross_context'

In [94]:
import pandas as pd
from pandas import DataFrame

df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)

In [None]:
df

# Training

In [95]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> '(training_data, test_data)':
    if shuffle:
        dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe[:training_size].reset_index(drop=True)
    test = dataframe[training_size:].reset_index(drop=True)

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]
  
    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({len(test)/(len(test)+len(train))} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, "\
          f"Training = {collections.Counter(y_train)}, Test = {collections.Counter(y_test)}")

    return train, test

training, testing = split_data(df, shuffle=False)


Working with: 150576 training points + 37644 test points (0.2 test ratio).
Label Occurrences: Total = Counter({0.0: 101228, 3.0: 28481, 4.0: 28129, 1.0: 15471, 2.0: 14911}), Training = Counter({0.0: 81021, 3.0: 22736, 4.0: 22524, 1.0: 12359, 2.0: 11936}), Test = Counter({0.0: 20207, 3.0: 5745, 4.0: 5605, 1.0: 3112, 2.0: 2975})


In [4]:
training

Unnamed: 0,n_nodes,n_clusters,entropy,sizes_min,sizes_max,sizes_avg,sizes_sum,relative_sizes_min,relative_sizes_max,relative_sizes_avg,...,relative_sizes_avg.1,relative_sizes_sum.1,center_dist_min.1,center_dist_max.1,center_dist_avg.1,center_dist_sum.1,time_f1.1,time_f2.1,cluster_id,evolution_label
0,9016.0,7037.0,12.629915,1.0,9.0,1.281228,9016.0,0.000111,0.000998,0.000142,...,0.000141,1.0,0.0,6.041375e+04,93.662142,6.650012e+05,6.432491e-16,1.000000e+00,1870.0,0.0
1,10292.0,6955.0,12.573513,1.0,11.0,1.479799,10292.0,0.000097,0.001069,0.000144,...,0.000143,1.0,0.0,1.531671e+06,1005.152850,7.038080e+06,4.647232e-01,-8.854560e-01,3035.0,0.0
2,9162.0,7015.0,12.624013,1.0,9.0,1.306058,9162.0,0.000109,0.000982,0.000143,...,0.000142,1.0,0.0,1.002264e+06,257.336409,1.810876e+06,-1.205367e-01,9.927089e-01,4702.0,1.0
3,9016.0,7037.0,12.629915,1.0,9.0,1.281228,9016.0,0.000111,0.000998,0.000142,...,0.000141,1.0,0.0,6.041375e+04,93.662142,6.650012e+05,6.432491e-16,1.000000e+00,119.0,0.0
4,9162.0,7015.0,12.624013,1.0,9.0,1.306058,9162.0,0.000109,0.000982,0.000143,...,0.000142,1.0,0.0,1.002264e+06,257.336409,1.810876e+06,-1.205367e-01,9.927089e-01,3332.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150571,10202.0,7002.0,12.584552,1.0,10.0,1.457012,10202.0,0.000098,0.000980,0.000143,...,0.000144,1.0,0.0,2.372543e+06,1149.232271,7.982567e+06,3.546049e-01,-9.350162e-01,4856.0,0.0
150572,9748.0,7321.0,12.685151,1.0,8.0,1.331512,9748.0,0.000103,0.000821,0.000137,...,0.000139,1.0,0.0,1.831231e+06,880.062845,6.342613e+06,1.000000e+00,-1.608123e-16,5259.0,3.0
150573,9729.0,7371.0,12.688347,1.0,10.0,1.319902,9729.0,0.000103,0.001028,0.000136,...,0.000140,1.0,0.0,2.935547e+06,2033.327409,1.453626e+07,8.854560e-01,4.647232e-01,3115.0,3.0
150574,9004.0,6856.0,12.590117,1.0,7.0,1.313302,9004.0,0.000111,0.000777,0.000146,...,0.000151,1.0,0.0,6.657900e+04,101.640684,6.731662e+05,5.680647e-01,8.229839e-01,2253.0,0.0


## Standardization

In [96]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_Y = training[training.columns[-1]]

test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]

In [130]:
train_Y.value_counts()

0.0    81021
3.0    22736
4.0    22524
1.0    12359
2.0    11936
Name: evolution_label, dtype: int64

In [134]:
test_Y.value_counts()

0.0    20207
3.0     5745
4.0     5605
1.0     3112
2.0     2975
Name: evolution_label, dtype: int64

In [None]:
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Two-stage approach
### 1. Stage: Change Prediction

In [126]:
import pandas as pd

def prepare_stage1_data(X, y: pd.Series) -> ('X', 'y'):
    '''Simplify dataset classes to 0 -> 0, other -> 1.'''
    y_stg1 = y.apply(lambda elem: 0 if elem == 0 else 1)
    return X, y_stg1

In [129]:
train_X_stg1, train_Y_stg1 = prepare_stage1_data(train_X, train_Y)
train_Y_stg1.value_counts()

0    81021
1    69555
Name: evolution_label, dtype: int64

In [135]:
test_X_stg1, test_Y_stg1 = prepare_stage1_data(test_X, test_Y)
test_Y_stg1.value_counts()

0    20207
1    17437
Name: evolution_label, dtype: int64

### 2. Stage: Change Type Prediction

In [137]:
import pandas as pd

def prepare_stage2_data(X, y, columns) -> ('X', 'y'):
    '''Remove class 0 from dataset.'''
    xy = pd.DataFrame(data=X, columns=columns)
    xy['evolution_label'] = y

    # remove class 0
    tmp = xy.loc[xy['evolution_label'] != 0.0].reset_index(drop=True)
    X_stg2 = tmp[tmp.columns[:-1]]
    Y_stg2 = tmp[tmp.columns[-1]]
    
    return X_stg2, Y_stg2

In [140]:
train_X_stg2, train_Y_stg2 = prepare_stage2_data(train_X, train_Y, columns=training.columns[:-1])
train_Y_stg2.value_counts()

3.0    22736
4.0    22524
1.0    12359
2.0    11936
Name: evolution_label, dtype: int64

In [142]:
test_X_stg2, test_Y_stg2 = prepare_stage2_data(test_X, test_Y, columns=testing.columns[:-1])
test_Y_stg2.value_counts()

3.0    5745
4.0    5605
1.0    3112
2.0    2975
Name: evolution_label, dtype: int64

## Balancing of Training Data

In [8]:
train_Y_stg1.value_counts()

0.0    20207
3.0     5745
4.0     5605
1.0     3112
2.0     2975
Name: evolution_label, dtype: int64

In [150]:
train_Y_stg2.value_counts()

3.0    10000
4.0    10000
2.0    10000
1.0    10000
Name: evolution_label, dtype: int64

In [144]:
from processing import DataSampler
sampler = DataSampler()

In [145]:
# balancing by downsampling

train_X_stg1, train_Y_stg1 = sampler.sample_fixed_size(train_X_stg1, train_Y_stg1, size=20000)
train_Y_stg1.value_counts()

1    20000
0    20000
Name: evolution_label, dtype: int64

In [146]:
# balancing by downsampling 

train_X_stg2, train_Y_stg2 = sampler.sample_fixed_size(train_X_stg2, train_Y_stg2, size=10000)
train_Y_stg2.value_counts()

3.0    10000
4.0    10000
2.0    10000
1.0    10000
Name: evolution_label, dtype: int64

## Principal Components

In [46]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)

train_Xp = pca.fit_transform(train_X_stg2)
test_Xp = pca.transform(test_X_stg2)

## Evaluation Reports

In [151]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Prints all reports.
    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test_X for the corresponding classifier at idx
    :param test_Y: true classes
    :param titles: list of titles for the classifiers at idx
    """
    for clf, test_X, title in zip(clfs, test_Xs, titles):
        pred_Y = clf.predict(test_X)        
        print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

In [152]:
import pickle 
def export_model(model, model_name):
    return
    
    with open(f'data/{use_case}/ml_output/{approach}/{layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

In [182]:
def predict_all(clf1, clf2, test_X) -> 'pred_y':
    '''Runs the two-stage approach by predicting first with clf1 then clf2.'''
    # STG 1
    pred_Y_stg1 = clf1.predict(test_X)   

    # merge original X with predicted change Y1
    test_xy = pd.DataFrame(data=test_X, columns=testing.columns[:-1])
    test_xy['evolution_label'] = pred_Y_stg1
    
    # create new test set with from all predicted change=1
    test_xy_stg2 = test_xy.loc[test_xy['evolution_label'] == 1.0]
    test_X_stg2 = test_xy_stg2[test_xy_stg2.columns[:-1]]

    # STG 2
    pred_Y_stg2 = clf2.predict(test_X_stg2)

    # merge stg2 X with predicted change type Y2
    test_xy_stg2 = test_X_stg2
    test_xy_stg2['evolution_label'] = pred_Y_stg2

    # merge results based on original index (pred class 0 stays 0)
    test_xy['evolution_label'].update(test_xy_stg2['evolution_label'])
    pred_Y = test_xy['evolution_label']

    return pred_Y

# Naive Bayes
Working best with _Xp_

Parameters: 
- priors: _prior probabilities of classes_, none
- var\_smoothing: \[_0_ , 1\]

In [50]:
from sklearn.naive_bayes import GaussianNB
priors = None #np.array([8,2,2,1,1]) / (8+2+2+1+1)
smoothing = 1E-9

clf = GaussianNB(priors=priors, var_smoothing=smoothing)
clf.fit(train_X, train_Y)

clf_p = GaussianNB(priors=priors, var_smoothing=smoothing)
clf_p.fit(train_Xp, train_Y)

print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(clf, 'nb_x')
export_model(clf_p, 'nb_xp')

### X ###
               precision    recall  f1-score   support

         1.0       0.39      0.15      0.22      3112
         2.0       0.25      0.82      0.38      2975
         3.0       0.49      0.34      0.40      5745
         4.0       0.50      0.23      0.31      5605

    accuracy                           0.35     17437
   macro avg       0.41      0.38      0.33     17437
weighted avg       0.43      0.35      0.34     17437

### Xp ###
               precision    recall  f1-score   support

         1.0       0.33      0.72      0.45      3112
         2.0       0.30      0.28      0.29      2975
         3.0       0.49      0.49      0.49      5745
         4.0       0.51      0.19      0.28      5605

    accuracy                           0.40     17437
   macro avg       0.41      0.42      0.38     17437
weighted avg       0.43      0.40      0.38     17437



# Support Vector Machine
Parameters:
- kernel: _linear_, rbf, poly, sigmoid
- C (regularization): <1, _1_, >1
- class\_weight: _None_, balanced

In [30]:
from sklearn.svm import SVC
c = 1
kernel = 'linear'

svc = SVC(kernel='linear', C=c)
svc.fit(train_X, train_Y)

svc_p = SVC(kernel='linear', C=c)
svc_p.fit(train_Xp, train_Y)

print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(svc, 'svc_x')
export_model(svc_p, 'svc_xp')

KeyboardInterrupt: 

# K-nearest Neighbors
Parameters:
- n\_neighbors: _30_
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: _50_ (no difference)

In [153]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc1 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc1.fit(train_X_stg1, train_Y_stg1)

knnc2 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,  algorithm=algo, leaf_size=leaf_size)
knnc2.fit(train_X_stg2, train_Y_stg2)


KNeighborsClassifier(leaf_size=50, n_neighbors=30)

In [183]:
pred_Y = predict_all(knnc1, knnc2, test_X)

In [184]:
print(sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

              precision    recall  f1-score   support

         0.0       0.70      0.79      0.74     20207
         1.0       0.38      0.48      0.42      3112
         2.0       0.35      0.42      0.38      2975
         3.0       0.26      0.16      0.20      5745
         4.0       0.23      0.15      0.18      5605

    accuracy                           0.54     37644
   macro avg       0.38      0.40      0.38     37644
weighted avg       0.51      0.54      0.52     37644



# Decision Tree
Working well with _Xp_

Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max_depth: default=None
- min_samples_leaf (to construct leaf): default=1
- min_impurity_decrease (split if the impurity is then decreased by): default=0
- ccp_alpha (max allowed cost after pruning): default=0/nopruning

In [116]:
from sklearn.tree import DecisionTreeClassifier 
criterion = 'entropy'
splitter = 'random'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0 # impurity improvement needed to split
ccp_alpha = 0

seed=42

dtc = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc.fit(train_X, train_Y)

dtc_p = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc_p.fit(train_Xp, train_Y)

print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(dtc, 'dt_x')
export_model(dtc_p, 'dt_xp')

### X ###
               precision    recall  f1-score   support

         1.0       0.37      0.52      0.43      3112
         2.0       0.35      0.38      0.37      2975
         3.0       0.48      0.48      0.48      5745
         4.0       0.48      0.35      0.40      5605

    accuracy                           0.43     17437
   macro avg       0.42      0.43      0.42     17437
weighted avg       0.44      0.43      0.43     17437

### Xp ###
               precision    recall  f1-score   support

         1.0       0.33      0.54      0.41      3112
         2.0       0.29      0.35      0.32      2975
         3.0       0.47      0.44      0.45      5745
         4.0       0.48      0.29      0.36      5605

    accuracy                           0.39     17437
   macro avg       0.39      0.41      0.39     17437
weighted avg       0.42      0.39      0.39     17437



In [55]:
print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

         0.0       0.69      0.37      0.48     20207
         1.0       0.28      0.41      0.33      3112
         2.0       0.26      0.39      0.31      2975
         3.0       0.19      0.30      0.23      5745
         4.0       0.18      0.28      0.22      5605

    accuracy                           0.35     37644
   macro avg       0.32      0.35      0.32     37644
weighted avg       0.47      0.35      0.38     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.69      0.37      0.48     20207
         1.0       0.28      0.40      0.33      3112
         2.0       0.26      0.39      0.31      2975
         3.0       0.19      0.30      0.23      5745
         4.0       0.18      0.29      0.22      5605

    accuracy                           0.35     37644
   macro avg       0.32      0.35      0.32     37644
weighted avg       0.47      0.35      0.38     37644


# Random Forest
Parameters:


In [117]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'entropy'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease= 0
bootstrap=True

rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap)
rfc.fit(train_X, train_Y)

rfc_p = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap)
rfc_p.fit(train_Xp, train_Y)

print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(rfc, 'rf_x')
export_model(rfc_p, 'rf_xp')

### X ###
               precision    recall  f1-score   support

         1.0       0.41      0.45      0.43      3112
         2.0       0.38      0.43      0.41      2975
         3.0       0.48      0.45      0.46      5745
         4.0       0.47      0.45      0.46      5605

    accuracy                           0.45     17437
   macro avg       0.44      0.44      0.44     17437
weighted avg       0.45      0.45      0.45     17437

### Xp ###
               precision    recall  f1-score   support

         1.0       0.41      0.44      0.42      3112
         2.0       0.38      0.43      0.40      2975
         3.0       0.47      0.45      0.46      5745
         4.0       0.47      0.45      0.46      5605

    accuracy                           0.44     17437
   macro avg       0.43      0.44      0.44     17437
weighted avg       0.45      0.44      0.44     17437



# Boosting
Parameters:


In [118]:
from sklearn.ensemble import AdaBoostClassifier
base_estimator= SVC(kernel='linear')
n_estimators= 100
algo = 'SAMME'
learning_rate = .3

bc = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc.fit(train_X, train_Y)

bc_p = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc_p.fit(train_Xp, train_Y)

print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(bc, 'boost_x')
export_model(bc_p, 'boost_xp')

# Pipeline Approac

In [None]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc.fit(train_X, train_Y)


knnc2 =