# Cross-Context Data

In [1]:
use_case = 'youtube'
layer_name = 'DislikesLayer' 
reference_layer_name = 'ViewsLayer'

approach = 'cross_context'

In [2]:
import pandas as pd
from pandas import DataFrame

df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)

In [None]:
df

# Training

In [3]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> '(training_data, test_data)':
    if shuffle:
        dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe[:training_size].reset_index(drop=True)
    test = dataframe[training_size:].reset_index(drop=True)

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]
  
    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({len(test)/(len(test)+len(train))} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, "\
          f"Training = {collections.Counter(y_train)}, Test = {collections.Counter(y_test)}")
    # try:
    #     print(f"Label Majority Class: Training = {stat.mode(Y_train)}, Test = {stat.mode(Y_test)}\n")
    # except stat.StatisticsError:
    #     print(f"Label Majority Class: no unique mode; found 2 equally common values")

    return train, test

training, testing = split_data(df, shuffle=False)


Working with: 150576 training points + 37644 test points (0.2 test ratio).
Label Occurrences: Total = Counter({0.0: 101228, 3.0: 28481, 4.0: 28129, 1.0: 15471, 2.0: 14911}), Training = Counter({0.0: 81021, 3.0: 22736, 4.0: 22524, 1.0: 12359, 2.0: 11936}), Test = Counter({0.0: 20207, 3.0: 5745, 4.0: 5605, 1.0: 3112, 2.0: 2975})


In [None]:
training

## Standardization

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_Y = training[training.columns[-1]]

test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]

In [None]:
train_Y

In [None]:
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Balancing of Training Data

In [5]:
test_Y.value_counts()

0.0    20207
3.0     5745
4.0     5605
1.0     3112
2.0     2975
Name: evolution_label, dtype: int64

In [6]:
from processing import DataSampler

sampler = DataSampler()
train_X, train_Y = sampler.sample_median_size(train_X, train_Y)

In [7]:
train_Y.value_counts()

1.0    22524
3.0    22524
4.0    22524
2.0    22524
0.0    22524
Name: evolution_label, dtype: int64

## Principal Components

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)

train_Xp = pca.fit_transform(train_X)
test_Xp = pca.transform(test_X)

## Evaluation Reports

In [9]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Prints all reports.
    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test_X for the corresponding classifier at idx
    :param test_Y: true classes
    :param titles: list of titles for the classifiers at idx
    """
    for clf, test_X, title in zip(clfs, test_Xs, titles):
        pred_Y = clf.predict(test_X)        
        print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

In [10]:
import pickle 
def export_model(model, model_name):
    return
    
    with open(f'data/{use_case}/ml_output/{approach}/{layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

# Naive Bayes
Working best with _Xp_

Parameters: 
- priors: _prior probabilities of classes_, none
- var\_smoothing: \[_0_ , 1\]

In [17]:
from sklearn.naive_bayes import GaussianNB
priors = np.array([8,2,2,1,1]) / (8+2+2+1+1)
smoothing = 1E-9

clf = GaussianNB(priors=priors, var_smoothing=smoothing)
clf.fit(train_X, train_Y)

clf_p = GaussianNB(priors=priors, var_smoothing=smoothing)
clf_p.fit(train_Xp, train_Y)

print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(clf, 'nb_x')
export_model(clf_p, 'nb_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.71      0.57      0.63     20207
         1.0       0.29      0.18      0.22      3112
         2.0       0.18      0.78      0.29      2975
         3.0       0.19      0.14      0.16      5745
         4.0       0.20      0.07      0.11      5605

    accuracy                           0.41     37644
   macro avg       0.31      0.35      0.28     37644
weighted avg       0.48      0.41      0.42     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.65      0.87      0.74     20207
         1.0       0.29      0.63      0.39      3112
         2.0       0.29      0.32      0.30      2975
         3.0       0.00      0.00      0.00      5745
         4.0       0.28      0.02      0.03      5605

    accuracy                           0.55     37644
   macro avg       0.30      0.37      0.29     37644
weighted avg       0.44      0.55      0.46     37644


# Support Vector Machine
Parameters:
- kernel: _linear_, rbf, poly, sigmoid
- C (regularization): <1, _1_, >1
- class\_weight: _None_, balanced

In [None]:
from sklearn.svm import SVC
c = 1
kernel = 'linear'

svc = SVC(kernel='linear', C=c)
svc.fit(train_X, train_Y)

svc_p = SVC(kernel='linear', C=c)
svc_p.fit(train_Xp, train_Y)

print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(svc, 'svc_x')
export_model(svc_p, 'svc_xp')

# K-nearest Neighbors
Parameters:
- n\_neighbors: _30_
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: _50_ (no difference)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc.fit(train_X, train_Y)

knnc_p = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,  algorithm=algo, leaf_size=leaf_size)
knnc_p.fit(train_Xp, train_Y)

print_report([knnc, knnc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(knnc, 'knn_x')
export_model(knnc_p, 'knn_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.73      0.53      0.62     20207
         1.0       0.36      0.49      0.42      3112
         2.0       0.34      0.44      0.38      2975
         3.0       0.22      0.30      0.25      5745
         4.0       0.21      0.27      0.24      5605

    accuracy                           0.45     37644
   macro avg       0.37      0.41      0.38     37644
weighted avg       0.52      0.45      0.47     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.73      0.53      0.62     20207
         1.0       0.36      0.49      0.42      3112
         2.0       0.34      0.44      0.38      2975
         3.0       0.22      0.30      0.25      5745
         4.0       0.21      0.27      0.24      5605

    accuracy                           0.45     37644
   macro avg       0.37      0.41      0.38     37644
weighted avg       0.51      0.45      0.47     37644


# Decision Tree
Working well with _Xp_

Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max_depth: default=None
- min_samples_leaf (to construct leaf): default=1
- min_impurity_decrease (split if the impurity is then decreased by): default=0
- ccp_alpha (max allowed cost after pruning): default=0/nopruning

In [12]:
from sklearn.tree import DecisionTreeClassifier 
criterion = 'entropy'
splitter = 'random'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0 # impurity improvement needed to split
ccp_alpha = 0

seed=42

dtc = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc.fit(train_X, train_Y)

dtc_p = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc_p.fit(train_Xp, train_Y)

print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(dtc, 'dt_x')
export_model(dtc_p, 'dt_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.70      0.53      0.60     20207
         1.0       0.32      0.49      0.38      3112
         2.0       0.30      0.39      0.34      2975
         3.0       0.20      0.28      0.23      5745
         4.0       0.20      0.20      0.20      5605

    accuracy                           0.43     37644
   macro avg       0.34      0.38      0.35     37644
weighted avg       0.48      0.43      0.45     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.68      0.51      0.58     20207
         1.0       0.29      0.50      0.37      3112
         2.0       0.25      0.34      0.29      2975
         3.0       0.19      0.26      0.22      5745
         4.0       0.19      0.17      0.18      5605

    accuracy                           0.41     37644
   macro avg       0.32      0.36      0.33     37644
weighted avg       0.47      0.41      0.43     37644


In [None]:
print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Random Forest
Parameters:


In [13]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'entropy'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease= 0
bootstrap=True

rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap)
rfc.fit(train_X, train_Y)

rfc_p = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap)
rfc_p.fit(train_Xp, train_Y)

print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(rfc, 'rf_x')
export_model(rfc_p, 'rf_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.71      0.41      0.52     20207
         1.0       0.33      0.43      0.37      3112
         2.0       0.31      0.41      0.35      2975
         3.0       0.20      0.31      0.24      5745
         4.0       0.19      0.31      0.24      5605

    accuracy                           0.38     37644
   macro avg       0.35      0.37      0.34     37644
weighted avg       0.49      0.38      0.41     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.71      0.41      0.52     20207
         1.0       0.33      0.43      0.37      3112
         2.0       0.31      0.41      0.35      2975
         3.0       0.20      0.31      0.24      5745
         4.0       0.19      0.31      0.24      5605

    accuracy                           0.38     37644
   macro avg       0.35      0.37      0.35     37644
weighted avg       0.49      0.38      0.41     37644


# Boosting
50% accuracy, 51% f1

Parameters:
- base\_estimator: None
- n\_estimators: 50
- algorithm: samme.r
- learning\_rate: .3

In [16]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

base_estimator = SVC(kernel='linear')
n_estimators= 50
algo = 'SAMME'
learning_rate = .3

bc = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc.fit(train_X, train_Y)

bc_p = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc_p.fit(train_Xp, train_Y)

print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(bc, 'boost_x')
export_model(bc_p, 'boost_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.76      0.33      0.46     20207
         1.0       0.17      0.20      0.18      3112
         2.0       0.31      0.45      0.37      2975
         3.0       0.20      0.41      0.27      5745
         4.0       0.17      0.28      0.22      5605

    accuracy                           0.33     37644
   macro avg       0.32      0.33      0.30     37644
weighted avg       0.50      0.33      0.37     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.76      0.31      0.44     20207
         1.0       0.20      0.53      0.29      3112
         2.0       0.23      0.42      0.30      2975
         3.0       0.20      0.28      0.24      5745
         4.0       0.17      0.25      0.20      5605

    accuracy                           0.32     37644
   macro avg       0.31      0.36      0.29     37644
weighted avg       0.50      0.32      0.35     37644


In [12]:
print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

         0.0       0.72      0.66      0.69     20207
         1.0       0.45      0.55      0.49      3112
         2.0       0.31      0.34      0.32      2975
         3.0       0.26      0.17      0.21      5745
         4.0       0.23      0.33      0.27      5605

    accuracy                           0.50     37644
   macro avg       0.39      0.41      0.40     37644
weighted avg       0.52      0.50      0.51     37644

### Xp ###
               precision    recall  f1-score   support

         0.0       0.72      0.66      0.69     20207
         1.0       0.41      0.46      0.43      3112
         2.0       0.33      0.43      0.37      2975
         3.0       0.28      0.08      0.13      5745
         4.0       0.22      0.40      0.29      5605

    accuracy                           0.50     37644
   macro avg       0.39      0.41      0.38     37644
weighted avg       0.52      0.50      0.50     37644
