# Cross-Context Data

In [None]:
# Dataset selection: the use case and the pair of layers whose joined CSV is loaded below.
use_case = "youtube"
layer_name = "DislikesLayer"
reference_layer_name = "ViewsLayer"

# Approach name; it is a folder component of the model export path.
approach = "cross_context"

In [None]:
import pandas as pd
from pandas import DataFrame

# Load the cross-context ML input for the configured layer pair; the first CSV column becomes the index.
df: DataFrame = pd.read_csv(
    f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv',
    index_col=0,
)

In [None]:
# Show the loaded cross-context frame as the cell output.
df

# Training

In [75]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False, random_state=None) -> '(training_data, test_data)':
    """Split a frame row-wise into a training part and a test part.

    The last column of ``dataframe`` is treated as the label; it is only used
    for the printed label statistics and stays part of both returned frames.

    :param dataframe: frame whose last column holds the label
    :param test_dataset_frac: fraction of rows that goes to the test part
    :param shuffle: shuffle the rows before splitting
    :param random_state: seed forwarded to ``DataFrame.sample`` when shuffling;
        ``None`` keeps the previous, unseeded behavior
    :return: ``(train, test)`` frames, each with a fresh 0..n-1 index
    """
    if shuffle:
        dataframe = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    # iloc makes the positional intent explicit regardless of the index type.
    train = dataframe.iloc[:training_size].reset_index(drop=True)
    test = dataframe.iloc[training_size:].reset_index(drop=True)

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]

    # Guard the ratio so a frame without rows does not raise ZeroDivisionError.
    total = len(train) + len(test)
    test_ratio = len(test) / total if total else 0.0

    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({test_ratio} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, "\
          f"Training = {collections.Counter(y_train)}, Test = {collections.Counter(y_test)}")

    return train, test

# Shuffle the rows, then split with the default test fraction (0.2) into training and test frames.
# NOTE(review): the shuffle is not seeded, so the split (and every count below) changes between runs.
training, testing = split_data(df, shuffle=True)


Working with: 150576 training points + 37644 test points (0.2 test ratio).
Label Occurrences: Total = Counter({0.0: 101228, 3.0: 28481, 4.0: 28129, 1.0: 15471, 2.0: 14911}), Training = Counter({0.0: 81133, 3.0: 22724, 4.0: 22481, 1.0: 12393, 2.0: 11845}), Test = Counter({0.0: 20095, 3.0: 5757, 4.0: 5648, 1.0: 3078, 2.0: 3066})


In [None]:
# Show the training frame produced by the split.
training

## Standardization

In [4]:
from sklearn.preprocessing import StandardScaler
# Standardize with statistics learned from the training frame only; the test
# frame is transformed with those same statistics.
# NOTE(review): the scaler is fitted on every column, label included. The scaled
# label column is sliced off again, but the fitted scaler expects it as input.
scaler = StandardScaler()

train_X = scaler.fit_transform(training)[:, :-1]  # scaled features, label column dropped
train_Y = training.iloc[:, -1]                    # unscaled label (last column)

test_X = scaler.transform(testing)[:, :-1]        # scaled features, label column dropped
test_Y = testing.iloc[:, -1]                      # unscaled label (last column)

In [None]:
# Class distribution of the training labels.
train_Y.value_counts()

In [None]:
# Class distribution of the test labels.
test_Y.value_counts()

In [None]:
# View the standardized training features under their original column names (all columns but the last).
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Two-stage approach
### 1. Stage: Change Prediction

In [5]:
import pandas as pd

def prepare_stage1_data(X, y: pd.Series) -> ('X', 'y'):
    '''Binarize the labels for stage 1: class 0 stays 0, every other class becomes 1.

    X is handed back untouched; only the labels are rewritten.
    '''
    def _to_change_flag(label):
        # 0 means "no change"; anything else counts as "change".
        if label == 0:
            return 0
        return 1

    binary_labels = y.apply(_to_change_flag)
    return X, binary_labels

In [6]:
# Stage-1 training set: same features, labels collapsed to change (1) / no change (0).
train_X_stg1, train_Y_stg1 = prepare_stage1_data(train_X, train_Y)
train_Y_stg1.value_counts()

0    81021
1    69555
Name: evolution_label, dtype: int64

In [7]:
# Stage-1 test set: same features, labels collapsed to change (1) / no change (0).
test_X_stg1, test_Y_stg1 = prepare_stage1_data(test_X, test_Y)
test_Y_stg1.value_counts()

0    20207
1    17437
Name: evolution_label, dtype: int64

### 2. Stage: Change Type Prediction

In [8]:
import pandas as pd

def prepare_stage2_data(X, y, columns) -> ('X', 'y'):
    '''Remove class 0 from dataset.

    :param X: feature matrix, one row per sample
    :param y: labels, in the same row order as X
    :param columns: feature column names used for the returned frame
    :return: (features as DataFrame, labels as Series) restricted to the rows
        whose label is not 0, both re-indexed from 0
    '''
    xy = pd.DataFrame(data=X, columns=columns)
    # Attach the labels by position. Assigning a Series directly would align it
    # on its index, which turns labels into NaN whenever y does not carry the
    # same index as the freshly built frame.
    xy['evolution_label'] = pd.Series(y).to_numpy()

    # remove class 0
    changed = xy.loc[xy['evolution_label'] != 0.0].reset_index(drop=True)
    X_stg2 = changed[changed.columns[:-1]]
    Y_stg2 = changed[changed.columns[-1]]

    return X_stg2, Y_stg2

In [9]:
# Stage-2 training set: only the rows whose label is not 0, keeping the original change-type labels.
train_X_stg2, train_Y_stg2 = prepare_stage2_data(train_X, train_Y, columns=training.columns[:-1])
train_Y_stg2.value_counts()

3.0    22736
4.0    22524
1.0    12359
2.0    11936
Name: evolution_label, dtype: int64

In [10]:
# Stage-2 test set: only the rows whose label is not 0, keeping the original change-type labels.
test_X_stg2, test_Y_stg2 = prepare_stage2_data(test_X, test_Y, columns=testing.columns[:-1])
test_Y_stg2.value_counts()

3.0    5745
4.0    5605
1.0    3112
2.0    2975
Name: evolution_label, dtype: int64

## Balancing of Training Data

In [11]:
# Stage-1 class counts before balancing.
train_Y_stg1.value_counts()

0    81021
1    69555
Name: evolution_label, dtype: int64

In [12]:
# Stage-2 class counts before balancing.
train_Y_stg2.value_counts()

3.0    22736
4.0    22524
1.0    12359
2.0    11936
Name: evolution_label, dtype: int64

In [13]:
from processing import DataSampler
# Project-local sampling helper, used in the cells below to balance the training sets.
sampler = DataSampler()

In [14]:
# Balance the stage-1 training data by downsampling with a fixed size of 20000.
# NOTE(review): judging by the recorded output this gives 20000 rows per class — confirm against DataSampler.
# The result overwrites the stage-1 names, so re-running this cell samples from already-sampled data.

train_X_stg1, train_Y_stg1 = sampler.sample_fixed_size(train_X_stg1, train_Y_stg1, size=20000)
train_Y_stg1.value_counts()

1    20000
0    20000
Name: evolution_label, dtype: int64

In [15]:
# Balance the stage-2 training data by downsampling with a fixed size of 10000.
# NOTE(review): judging by the recorded output this gives 10000 rows per class — confirm against DataSampler.
# The result overwrites the stage-2 names, so re-running this cell samples from already-sampled data.

train_X_stg2, train_Y_stg2 = sampler.sample_fixed_size(train_X_stg2, train_Y_stg2, size=10000)
train_Y_stg2.value_counts()

3.0    10000
4.0    10000
2.0    10000
1.0    10000
Name: evolution_label, dtype: int64

## Principal Components

In [16]:
from sklearn.decomposition import PCA

# Stage 1: fit an 8-component PCA on the balanced training features and project the test features with it.
pca = PCA(n_components=8)
train_Xp_stg1 = pca.fit_transform(train_X_stg1)
test_Xp_stg1 = pca.transform(test_X_stg1)

# Stage 2: a separate 8-component PCA fitted on the stage-2 training features.
# The name `pca` is reused, so the stage-1 projection object is no longer reachable afterwards.
pca = PCA(n_components=8)
train_Xp_stg2 = pca.fit_transform(train_X_stg2)
test_Xp_stg2 = pca.transform(test_X_stg2)

## Evaluation Reports

In [17]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Print one classification report per classifier.

    The three lists are walked in parallel: the classifier at position i
    predicts on the test features at position i and is reported under the
    title at position i. All reports are scored against the same labels.

    :param clfs: fitted classifiers to evaluate
    :param test_Xs: test features, one entry per classifier
    :param test_Y: true classes shared by all classifiers
    :param titles: headings, one entry per classifier
    """
    for model, features, heading in zip(clfs, test_Xs, titles):
        predicted = model.predict(features)
        report = sklearn.metrics.classification_report(y_true=test_Y, y_pred=predicted)
        print(f"### {heading} ###\n", report)

In [18]:
import pickle 
def export_model(model, model_name, enabled=False):
    '''Pickle ``model`` into the ml_output folder of the configured use case.

    Exporting is switched off by default, which matches the earlier behavior
    where an unconditional ``return`` made the write unreachable. Pass
    ``enabled=True`` to actually write the file.

    :param model: the fitted model object to serialize
    :param model_name: suffix of the file name, e.g. 'nb_x'
    :param enabled: write the file only when True
    :return: None
    '''
    if not enabled:
        return

    # Target folder is built from the notebook-level use_case / approach / layer_name settings.
    with open(f'data/{use_case}/ml_output/{approach}/{layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

In [19]:
def predict_all(clf1, clf2, test_X, columns=None) -> 'pred_y':
    '''Runs the two-stage approach by predicting first with clf1 then clf2.

    Stage 1 (clf1) decides change (1) vs. no change (0) for every row.
    Stage 2 (clf2) is asked only about the rows stage 1 marked as change, and
    its answer replaces the 1 for those rows; rows predicted 0 stay 0.

    :param clf1: fitted stage-1 classifier
    :param clf2: fitted stage-2 classifier
    :param test_X: test features, one row per sample
    :param columns: feature column names for the frame handed to clf2;
        defaults to the feature columns of the notebook-level ``testing`` frame
    :return: Series named 'evolution_label' with one prediction per row
    '''
    if columns is None:
        columns = testing.columns[:-1]

    features = pd.DataFrame(data=test_X, columns=columns)

    # STG 1
    pred_Y_stg1 = clf1.predict(test_X)
    pred_Y = pd.Series(pred_Y_stg1, index=features.index, name='evolution_label')

    # rows that stage 1 predicted as change=1
    changed = pred_Y == 1
    if not changed.any():
        # Nothing to refine; also avoids calling clf2 on an empty feature set.
        return pred_Y

    # STG 2
    pred_Y_stg2 = clf2.predict(features.loc[changed])

    # Widen the dtype first so stage-2 labels of another type are stored without loss.
    pred_Y = pred_Y.astype(np.result_type(pred_Y.dtype, np.asarray(pred_Y_stg2).dtype))

    # Write the stage-2 labels straight into the result through the boolean
    # mask; this avoids assigning into a slice and chained in-place updates.
    pred_Y.loc[changed] = pred_Y_stg2

    return pred_Y

In [20]:
# Point the generic names used by the classifier cells below at the stage-1 data.
# NOTE(review): this overwrites train_X / train_Y / test_X / test_Y; from here on
# train_Y and test_Y are the binarized stage-1 labels, not the original multi-class labels.
train_X, train_Xp, train_Y = train_X_stg1, train_Xp_stg1, train_Y_stg1
test_X, test_Xp, test_Y = test_X_stg1, test_Xp_stg1, test_Y_stg1

# Naive Bayes
Working best with _Xp_

Parameters: 
- priors: _prior probabilities of classes_, none
- var\_smoothing: \[0, 1\] _1E-9_

Stage 1: 68% accuracy/f1 score (Xp)

In [30]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes settings.
priors = None  # alternatives tried: [.5,.5], np.array([8,2,2,1,1]) / (8+2+2+1+1)
smoothing = 1E-9

# One model on the standardized features (X), one on the principal components (Xp).
clf = GaussianNB(priors=priors, var_smoothing=smoothing).fit(train_X, train_Y)
clf_p = GaussianNB(priors=priors, var_smoothing=smoothing).fit(train_Xp, train_Y)

print_report(
    [clf, clf_p],
    [test_X, test_Xp],
    test_Y,
    ["X", "Xp"],
)

for fitted, suffix in ((clf, 'nb_x'), (clf_p, 'nb_xp')):
    export_model(fitted, suffix)
del fitted, suffix  # keep the notebook namespace as it was

### X ###
               precision    recall  f1-score   support

           0       0.64      0.52      0.58     20207
           1       0.55      0.66      0.60     17437

    accuracy                           0.59     37644
   macro avg       0.59      0.59      0.59     37644
weighted avg       0.60      0.59      0.59     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.71      0.69      0.70     20207
           1       0.65      0.67      0.66     17437

    accuracy                           0.68     37644
   macro avg       0.68      0.68      0.68     37644
weighted avg       0.68      0.68      0.68     37644



# Support Vector Machine
Parameters:
- C (regularization): <1, _1_, >1, def=1
- kernel: _linear_, rbf, poly, sigmoid, def=rbf
- gamma (for rbf, poly, sigmoid): scale, auto, float, def=scale
- class\_weight: _None_, balanced, dict, def=None

Stage 1: 69% accuracy/f1

In [83]:
from sklearn.svm import SVC
# Support vector machine settings.
c = 1
kernel = 'linear'
gamma = 'auto'
weights = None

# Progress markers are printed because fitting takes a while.
print('x.')
svc = SVC(
    C=c,
    kernel=kernel,
    gamma=gamma,
    class_weight=weights,
).fit(train_X, train_Y)

print('xp.')
svc_p = SVC(
    C=c,
    kernel=kernel,
    gamma=gamma,
    class_weight=weights,
).fit(train_Xp, train_Y)

print('report.')
print_report(
    [svc, svc_p],
    [test_X, test_Xp],
    test_Y,
    ["X", "Xp"],
)

export_model(svc, 'svc_x')
export_model(svc_p, 'svc_xp')

x.
xp.
report.
### X ###
               precision    recall  f1-score   support

           0       0.71      0.69      0.70     20207
           1       0.65      0.68      0.67     17437

    accuracy                           0.69     37644
   macro avg       0.68      0.69      0.68     37644
weighted avg       0.69      0.69      0.69     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.71      0.69      0.70     20207
           1       0.66      0.68      0.67     17437

    accuracy                           0.69     37644
   macro avg       0.69      0.69      0.69     37644
weighted avg       0.69      0.69      0.69     37644



In [84]:
# Re-print the report for the already fitted SVM models (no re-training).
print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

           0       0.71      0.69      0.70     20207
           1       0.65      0.68      0.67     17437

    accuracy                           0.69     37644
   macro avg       0.68      0.69      0.68     37644
weighted avg       0.69      0.69      0.69     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.71      0.69      0.70     20207
           1       0.66      0.68      0.67     17437

    accuracy                           0.69     37644
   macro avg       0.69      0.69      0.69     37644
weighted avg       0.69      0.69      0.69     37644



# K-nearest Neighbors
Parameters:
- n\_neighbors: _30_
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: _50_ (no difference)

Stage 1: 70% accuracy, 70% f1 score

In [40]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbors settings; the stage-2 model fitted further down reuses them.
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

# Stage-1 models: standardized features (X) vs. principal components (Xp).
knnc1 = KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algo,
    leaf_size=leaf_size,
).fit(train_X_stg1, train_Y_stg1)

knnc1_p = KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algo,
    leaf_size=leaf_size,
).fit(train_Xp_stg1, train_Y_stg1)

print_report(
    [knnc1, knnc1_p],
    [test_X, test_Xp],
    test_Y,
    ["X", "Xp"],
)


### X ###
               precision    recall  f1-score   support

           0       0.70      0.79      0.74     20207
           1       0.71      0.60      0.65     17437

    accuracy                           0.70     37644
   macro avg       0.70      0.69      0.69     37644
weighted avg       0.70      0.70      0.70     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.70      0.79      0.74     20207
           1       0.71      0.60      0.65     17437

    accuracy                           0.70     37644
   macro avg       0.70      0.69      0.69     37644
weighted avg       0.70      0.70      0.70     37644



In [None]:
# Stage-2 k-NN on the balanced change-type data, reusing the settings of the stage-1 cell.
knnc2 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,  algorithm=algo, leaf_size=leaf_size)
knnc2.fit(train_X_stg2, train_Y_stg2)

In [None]:
# Run both k-NN stages over the standardized test features.
pred_Y = predict_all(knnc1, knnc2, test_X)

In [None]:
# Score the two-stage prediction against the original multi-class labels.
# `test_Y` was rebound to the binarized stage-1 labels earlier in the notebook,
# so the label column is read from the test frame directly.
print(sklearn.metrics.classification_report(y_true=testing[testing.columns[-1]], y_pred=pred_Y))

# Decision Tree
Working well with _Xp_

Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max\_depth: _None_ default=None
- min\_samples\_leaf (to construct leaf): _2_ default=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _0_ default=0
- ccp\_alpha (max allowed cost after pruning): _1E-2_ default=0/nopruning

Stage 1: 69% accuracy/f1 with Xp

In [58]:
from sklearn.tree import DecisionTreeClassifier 
# Decision tree settings.
criterion = 'gini'
splitter = 'random'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0  # impurity improvement needed to split
ccp_alpha = 1E-2

seed = 42

# One tree on the standardized features (X), one on the principal components (Xp).
dtc = DecisionTreeClassifier(
    criterion=criterion,
    splitter=splitter,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    ccp_alpha=ccp_alpha,
    random_state=seed,
).fit(train_X, train_Y)

dtc_p = DecisionTreeClassifier(
    criterion=criterion,
    splitter=splitter,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    ccp_alpha=ccp_alpha,
    random_state=seed,
).fit(train_Xp, train_Y)

print_report(
    [dtc, dtc_p],
    [test_X, test_Xp],
    test_Y,
    ["X", "Xp"],
)

export_model(dtc, 'dt_x')
export_model(dtc_p, 'dt_xp')

### X ###
               precision    recall  f1-score   support

           0       0.65      0.87      0.75     20207
           1       0.76      0.46      0.57     17437

    accuracy                           0.68     37644
   macro avg       0.70      0.67      0.66     37644
weighted avg       0.70      0.68      0.67     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.69      0.79      0.74     20207
           1       0.71      0.59      0.65     17437

    accuracy                           0.70     37644
   macro avg       0.70      0.69      0.69     37644
weighted avg       0.70      0.70      0.70     37644



In [50]:
# Re-print the report for the already fitted decision trees (no re-training).
print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

           0       0.65      0.87      0.75     20207
           1       0.75      0.47      0.58     17437

    accuracy                           0.68     37644
   macro avg       0.70      0.67      0.66     37644
weighted avg       0.70      0.68      0.67     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.70      0.76      0.72     20207
           1       0.69      0.62      0.65     17437

    accuracy                           0.69     37644
   macro avg       0.69      0.69      0.69     37644
weighted avg       0.69      0.69      0.69     37644



# Random Forest
Parameters:
- n\_estimators: _100_ def=100
- criterion: _gini_, entropy
- max\_depth: _None_ def=None
- min\_samples\_leaf (to construct leaf): _2_ def=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-2_ default=0
- bootstrap (whether bootstrapped samples are used): _True_ def=True

Stage 1: 69% accuracy/f1

In [71]:
from sklearn.ensemble import RandomForestClassifier
# Random forest settings.
n_estimators = 100
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-2
bootstrap = True

seed = 42

# One forest on the standardized features (X), one on the principal components (Xp).
rfc = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    bootstrap=bootstrap,
    random_state=seed,
).fit(train_X, train_Y)

rfc_p = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    bootstrap=bootstrap,
    random_state=seed,
).fit(train_Xp, train_Y)

print_report(
    [rfc, rfc_p],
    [test_X, test_Xp],
    test_Y,
    ["X", "Xp"],
)

export_model(rfc, 'rf_x')
export_model(rfc_p, 'rf_xp')

### X ###
               precision    recall  f1-score   support

           0       0.68      0.81      0.74     20207
           1       0.72      0.57      0.63     17437

    accuracy                           0.69     37644
   macro avg       0.70      0.69      0.69     37644
weighted avg       0.70      0.69      0.69     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.68      0.80      0.74     20207
           1       0.71      0.57      0.63     17437

    accuracy                           0.69     37644
   macro avg       0.70      0.68      0.68     37644
weighted avg       0.70      0.69      0.69     37644



In [62]:
# Re-print the report for the already fitted random forests (no re-training).
print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

           0       0.68      0.81      0.74     20207
           1       0.72      0.56      0.63     17437

    accuracy                           0.69     37644
   macro avg       0.70      0.68      0.68     37644
weighted avg       0.70      0.69      0.69     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.68      0.80      0.74     20207
           1       0.71      0.57      0.63     17437

    accuracy                           0.69     37644
   macro avg       0.70      0.68      0.68     37644
weighted avg       0.70      0.69      0.69     37644



# Boosting
Parameters:
- base\_estimator: object, _None(DT)_
- n\_estimators: _50_ def=50
- learning\_rate: _1_ def=1.0
- algorithm: SAMME, _SAMME.R_

Stage 1: 71% accuracy/f1

In [96]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost settings.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to `estimator`
# and deprecated the 'SAMME.R' algorithm — check the installed version before re-running.
base_estimator = None  # alternative tried: SVC(kernel='linear')
n_estimators = 50
algo = 'SAMME.R'
learning_rate = 1

seed = 42

# One ensemble on the standardized features (X), one on the principal components (Xp).
bc = AdaBoostClassifier(
    base_estimator=base_estimator,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    algorithm=algo,
    random_state=seed,
).fit(train_X, train_Y)

bc_p = AdaBoostClassifier(
    base_estimator=base_estimator,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    algorithm=algo,
    random_state=seed,
).fit(train_Xp, train_Y)

print_report(
    [bc, bc_p],
    [test_X, test_Xp],
    test_Y,
    ["X", "Xp"],
)

export_model(bc, 'boost_x')
export_model(bc_p, 'boost_xp')

### X ###
               precision    recall  f1-score   support

           0       0.70      0.82      0.75     20207
           1       0.73      0.59      0.65     17437

    accuracy                           0.71     37644
   macro avg       0.72      0.70      0.70     37644
weighted avg       0.71      0.71      0.71     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.69      0.82      0.75     20207
           1       0.73      0.58      0.65     17437

    accuracy                           0.71     37644
   macro avg       0.71      0.70      0.70     37644
weighted avg       0.71      0.71      0.70     37644



In [91]:
# Re-print the report for the already fitted AdaBoost models (no re-training).
print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

           0       0.70      0.81      0.75     20207
           1       0.73      0.60      0.66     17437

    accuracy                           0.71     37644
   macro avg       0.72      0.70      0.70     37644
weighted avg       0.71      0.71      0.71     37644

### Xp ###
               precision    recall  f1-score   support

           0       0.69      0.81      0.75     20207
           1       0.73      0.59      0.65     17437

    accuracy                           0.71     37644
   macro avg       0.71      0.70      0.70     37644
weighted avg       0.71      0.71      0.70     37644



# Pipeline Approach
Pipeline built from the best classifier for each stage

In [None]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc.fit(train_X, train_Y)


knnc2 =