# Cross-Context Data

In [85]:
# Experiment configuration.
# use_case / layer_name / reference_layer_name select the input CSV,
# approach names the output sub-folder used when models are exported.
use_case, approach = "youtube", "cross_context"
layer_name, reference_layer_name = "ViewsLayer", "CountryLayer"

In [86]:
import pandas as pd
from pandas import DataFrame

# Cross-context ML input for the configured layer pair; the first CSV column becomes the index.
ml_input_csv = f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv'
df: DataFrame = pd.read_csv(ml_input_csv, index_col=0)

  mask |= (ar1 == a)


In [None]:
# Preview the loaded frame.
df

# Training

In [87]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False, random_state=None) -> '(training_data, test_data)':
    '''
    Split a frame row-wise into a leading training part and a trailing test part.

    The last column is treated as the label, but only for the printed class
    statistics; both returned frames still contain every column.

    :param dataframe: data to split, label in the last column
    :param test_dataset_frac: fraction of rows that goes into the test part, within [0, 1]
    :param shuffle: shuffle all rows before splitting
    :param random_state: seed forwarded to DataFrame.sample when shuffling;
        None keeps the previous unseeded behaviour
    :returns: (train, test), each with a fresh 0..n-1 index
    :raises ValueError: if test_dataset_frac lies outside [0, 1]
    '''
    if not 0 <= test_dataset_frac <= 1:
        raise ValueError(f"test_dataset_frac must be within [0, 1], got {test_dataset_frac}")

    if shuffle:
        dataframe = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe.iloc[:training_size].reset_index(drop=True)
    test = dataframe.iloc[training_size:].reset_index(drop=True)

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]

    # Guard the ratio so an empty frame is reported instead of raising ZeroDivisionError.
    total = len(test) + len(train)
    test_ratio = len(test) / total if total else 0.0

    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({test_ratio} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, "\
          f"Training = {collections.Counter(y_train)}, Test = {collections.Counter(y_test)}")

    return train, test

# Shuffle all rows, then hold out the default 20 % as the test set.
# The shuffle is unseeded here, so the split differs between runs.
training, testing = split_data(df, shuffle=True)


Working with: 801936 training points + 200484 test points (0.2 test ratio).
Label Occurrences: Total = Counter({0.0: 668685, 3.0: 160491, 4.0: 156733, 1.0: 8279, 2.0: 8232}), Training = Counter({0.0: 535042, 3.0: 128236, 4.0: 125467, 1.0: 6619, 2.0: 6572}), Test = Counter({0.0: 133643, 3.0: 32255, 4.0: 31266, 1.0: 1660, 2.0: 1660})


In [None]:
# Preview the training split (features plus label column).
training

## Standardization

In [88]:
from sklearn.preprocessing import StandardScaler

# The label is the last column. Fit the scaler on the feature columns only, so the
# fitted scaler can later be applied to unlabeled data with the same features.
# StandardScaler standardizes every column independently, so the feature values
# are the same as when the label column was scaled along and then dropped.
feature_columns = training.columns[:-1]
label_column = training.columns[-1]

scaler = StandardScaler()

train_X = scaler.fit_transform(training[feature_columns])
train_Y = training[label_column]

# The test set reuses the statistics learned from the training set.
test_X = scaler.transform(testing[feature_columns])
test_Y = testing[label_column]

In [None]:
# Class distribution of the training labels.
train_Y.value_counts()

In [89]:
# Class distribution of the test labels.
test_Y.value_counts()

0.0    133643
3.0     32255
4.0     31266
1.0      1660
2.0      1660
Name: evolution_label, dtype: int64

In [None]:
# View the standardized training features with the original feature column names.
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Two-stage approach
### 1. Stage: Change Prediction

In [None]:
import pandas as pd

def prepare_stage1_data(X, y: pd.Series) -> ('X', 'y'):
    '''
    Simplify dataset classes to 0 -> 0, other -> 1.

    :param X: features, returned unchanged
    :param y: class labels; not modified
    :returns: (X, binary labels as int64 with the index and name of y)
    '''
    # Vectorized comparison instead of a Python-level lambda per element.
    y_stg1 = (y != 0).astype('int64')
    return X, y_stg1

In [None]:
# Stage-1 training set: features unchanged, labels binarised (0 stays 0, everything else becomes 1).
train_X_stg1, train_Y_stg1 = prepare_stage1_data(train_X, train_Y)
train_Y_stg1.value_counts()

In [None]:
# Stage-1 test set: same binarisation applied to the test labels.
test_X_stg1, test_Y_stg1 = prepare_stage1_data(test_X, test_Y)
test_Y_stg1.value_counts()

### 2. Stage: Change Type Prediction

In [None]:
import pandas as pd

def prepare_stage2_data(X, y, columns) -> ('X', 'y'):
    '''
    Remove class 0 from dataset.

    :param X: feature matrix, one row per sample
    :param y: labels in the same row order as X
    :param columns: feature names for the columns of X
    :returns: (features, labels) restricted to rows whose label is not 0,
        both with a fresh 0..n-1 index
    '''
    # Attach labels by position. Assigning a Series directly would align on its
    # index and silently produce NaN labels whenever y does not carry a 0..n-1 index.
    labels = pd.Series(y).to_numpy()

    xy = pd.DataFrame(data=X, columns=columns)
    xy['evolution_label'] = labels

    # remove class 0
    changed = xy.loc[xy['evolution_label'] != 0.0].reset_index(drop=True)
    X_stg2 = changed[changed.columns[:-1]]
    Y_stg2 = changed[changed.columns[-1]]

    return X_stg2, Y_stg2

In [None]:
# Stage-2 training set: only rows whose label is not 0, with named feature columns.
train_X_stg2, train_Y_stg2 = prepare_stage2_data(train_X, train_Y, columns=training.columns[:-1])
train_Y_stg2.value_counts()

In [None]:
# Stage-2 test set: only rows whose true label is not 0.
test_X_stg2, test_Y_stg2 = prepare_stage2_data(test_X, test_Y, columns=testing.columns[:-1])
test_Y_stg2.value_counts()

## Balancing of Training Data

In [None]:
# Stage-1 class counts before balancing.
train_Y_stg1.value_counts()

In [None]:
# Stage-2 class counts before balancing.
train_Y_stg2.value_counts()

In [None]:
from processing import DataSampler
# Sampler used by the balancing cells below (DataSampler comes from the `processing` module).
sampler = DataSampler()

In [None]:
# balancing by downsampling
# NOTE(review): size=20000 is presumably the number of samples kept per class —
# confirm against DataSampler.sample_fixed_size.
# The stage-1 training names are overwritten, so re-running this cell samples
# from the already reduced data rather than from the full training set.

train_X_stg1, train_Y_stg1 = sampler.sample_fixed_size(train_X_stg1, train_Y_stg1, size=20000)
train_Y_stg1.value_counts()

In [None]:
# balancing by downsampling
# NOTE(review): size=10000 is presumably the number of samples kept per class —
# confirm against DataSampler.sample_fixed_size.
# The stage-2 training names are overwritten, so re-running this cell samples
# from the already reduced data rather than from the full training set.

train_X_stg2, train_Y_stg2 = sampler.sample_fixed_size(train_X_stg2, train_Y_stg2, size=10000)
train_Y_stg2.value_counts()

## Principal Components

In [None]:
from sklearn.decomposition import PCA

# One projection per stage, each fitted on that stage's (balanced) training data
# and then applied to the matching test data. Both fitted transformers keep their
# own name, so the stage-1 projection is no longer lost when stage 2 is fitted.
pca_stg1 = PCA(n_components=8)
train_Xp_stg1 = pca_stg1.fit_transform(train_X_stg1)
test_Xp_stg1 = pca_stg1.transform(test_X_stg1)

pca_stg2 = PCA(n_components=8)
train_Xp_stg2 = pca_stg2.fit_transform(train_X_stg2)
test_Xp_stg2 = pca_stg2.transform(test_X_stg2)

# Backward-compatible alias: `pca` used to end up bound to the stage-2 projection.
pca = pca_stg2

## Evaluation Reports

In [None]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Print one classification report per classifier.

    The three lists are walked in parallel; entry i of each belongs together.
    :param clfs: fitted classifiers to evaluate
    :param test_Xs: test features to feed to the classifier at the same position
    :param test_Y: true classes, shared by all classifiers
    :param titles: heading printed above the report at the same position
    """
    for classifier, features, heading in zip(clfs, test_Xs, titles):
        predicted = classifier.predict(features)
        report = sklearn.metrics.classification_report(y_true=test_Y, y_pred=predicted)
        print(f"### {heading} ###\n", report)

In [None]:
import pickle 
def export_model(model, model_name, enabled=False):
    '''
    Pickle a fitted model to the ml_output folder of the current use case.

    Exporting is switched off by default, which matches the previous behaviour
    where an unconditional early return made the write unreachable. Pass
    enabled=True to actually write the file.

    :param model: object to serialize with pickle
    :param model_name: suffix of the file name, e.g. 'nb_x'
    :param enabled: write the file only if True
    :returns: None
    '''
    if not enabled:
        return

    # Target path is built from the notebook-level use_case, approach and layer_name.
    with open(f'data/{use_case}/ml_output/{approach}/{layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

In [None]:
def predict_all(clf1, clf2, test_X) -> 'pred_y':
    '''
    Runs the two-stage approach by predicting first with clf1 then clf2.

    Stage 1 decides change (1) vs. no change (0) for every row. Stage 2 is asked
    for the change type of the rows flagged as changed only; all other rows keep
    the stage-1 prediction.

    :param clf1: fitted stage-1 classifier
    :param clf2: fitted stage-2 classifier
    :param test_X: features; a DataFrame is used with its own column names, anything
        else is labelled with the feature columns of the notebook-level `testing` frame
    :returns: Series named 'evolution_label' holding the combined prediction per row
    '''
    # STG 1
    pred_Y_stg1 = clf1.predict(test_X)

    # labelled copy of the features, so stage 2 receives named columns
    columns = test_X.columns if isinstance(test_X, pd.DataFrame) else testing.columns[:-1]
    features = pd.DataFrame(data=test_X, columns=columns)
    pred_Y = pd.Series(pred_Y_stg1, index=features.index, name='evolution_label')

    # rows predicted as change=1 form the stage-2 input
    changed = pred_Y == 1
    if not changed.any():
        # nothing to refine; also avoids calling clf2 with zero rows
        return pred_Y

    # STG 2
    pred_Y_stg2 = pd.Series(clf2.predict(features.loc[changed]), index=features.index[changed])

    # merge results based on original index (pred class 0 stays 0); builds a new
    # Series instead of updating a column selection in place
    return pred_Y.where(~changed, pred_Y_stg2)

In [None]:
# Make the stage-1 data the active set for the classifier cells below.
# This cell and the following one bind the same names; whichever ran last decides
# which stage the generic train_*/test_* names refer to.
train_X, train_Xp, train_Y = train_X_stg1, train_Xp_stg1, train_Y_stg1
test_X, test_Xp, test_Y = test_X_stg1, test_Xp_stg1, test_Y_stg1

In [None]:
# Make the stage-2 data the active set for the classifier cells below.
# Overrides the stage-1 selection from the previous cell when both are executed.
train_X, train_Xp, train_Y = train_X_stg2, train_Xp_stg2, train_Y_stg2
test_X, test_Xp, test_Y = test_X_stg2, test_Xp_stg2, test_Y_stg2

# Naive Bayes
Working best with _Xp_

Stage 1: 68% accuracy/f1 score (Xp) 
Parameters: 
- priors: prior probabilities of classes, _None_
- var\_smoothing: \[0, 1\] _1E-9_

Stage 2: 40% accuracy, 38% f1 with Xp
Parameters: 
- None
- 1E-9

In [None]:
from sklearn.naive_bayes import GaussianNB

# Hyperparameters shared by both variants.
priors = None  # alternative: np.array([2,2,1,1]) / (2+2+1+1)
smoothing = 1E-9

# Same configuration, fitted once on the active features (X) and once on
# their PCA projection (Xp).
clf = GaussianNB(priors=priors, var_smoothing=smoothing).fit(train_X, train_Y)
clf_p = GaussianNB(priors=priors, var_smoothing=smoothing).fit(train_Xp, train_Y)

print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

for fitted, model_name in ((clf, 'nb_x'), (clf_p, 'nb_xp')):
    export_model(fitted, model_name)

In [None]:
# Re-print the Naive Bayes reports from the already fitted models (no refit).
print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Support Vector Machine
Stage 1: 69% accuracy/f1
Parameters:
- C (regularization): <1, _1_, >1, def=1
- kernel: _linear_, rbf, poly, sigmoid, def=rbf
- gamma (for rbf, poly, sigmoid): scale, auto, float, def=scale
- class\_weight: _None_, balanced, dict, def=None

Stage 2: 44% accuracy/f1
Parameters:
- 10
- rbf
- scale
- None



In [None]:
from sklearn.svm import SVC

# Hyperparameters shared by both variants.
c = 10
kernel = 'rbf'
gamma = 'scale'
weights = None

# The prints act as progress markers around the two fits.
print('x.')
svc = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights).fit(train_X, train_Y)

print('xp.')
svc_p = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights).fit(train_Xp, train_Y)

print('report.')
print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

for fitted, model_name in ((svc, 'svc_x'), (svc_p, 'svc_xp')):
    export_model(fitted, model_name)

In [None]:
# Re-print the SVM reports from the already fitted models (no refit).
print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# K-nearest Neighbors

Stage 1: 70% accuracy, 70% f1 score
Parameters:
- n\_neighbors: _30_
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: _50_ (no difference)

Stage 2: 46% accuracy/f1 
Parameters:
- _20_
- uniform
- auto
- _30_


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameters shared by both variants.
n_neighbors = 20
weights = 'uniform'
algo = 'auto'
leaf_size = 30

knn_params = dict(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)

# Fitted once on the active features (X) and once on their PCA projection (Xp).
knnc1 = KNeighborsClassifier(**knn_params).fit(train_X, train_Y)
knnc1_p = KNeighborsClassifier(**knn_params).fit(train_Xp, train_Y)

print_report([knnc1, knnc1_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Decision Tree
Working well with _Xp_

Stage 1: 69% accuracy/f1 with Xp
Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max\_depth: _None_ default=None
- min\_samples\_leaf (to construct leaf): _2_ default=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _0_ default=0
- ccp\_alpha (max allowed cost after pruning): _1E-2_ default=0 (no pruning)

Stage 2: 43% accuracy/f1 with X
Parameters:
- gini
- random
- None
- 2
- 0
- _0_



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameters shared by both variants.
criterion = 'gini'
splitter = 'random'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0 # impurity improvement needed to split
ccp_alpha = 0

seed=42

dtc_params = dict(
    criterion=criterion,
    splitter=splitter,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    ccp_alpha=ccp_alpha,
    random_state=seed,
)

# Fitted once on the active features (X) and once on their PCA projection (Xp).
dtc = DecisionTreeClassifier(**dtc_params).fit(train_X, train_Y)
dtc_p = DecisionTreeClassifier(**dtc_params).fit(train_Xp, train_Y)

print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

for fitted, model_name in ((dtc, 'dt_x'), (dtc_p, 'dt_xp')):
    export_model(fitted, model_name)

In [None]:
# Re-print the decision tree reports from the already fitted models (no refit).
print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Random Forest
Stage 1: 69% accuracy/f1
Parameters:
- n\_estimators: _100_ def=100
- criterion: _gini_, entropy
- max\_depth: _None_ def=None
- min\_samples\_leaf (to construct leaf): _2_ def=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-2_ default=0
- bootstrap (if bootstrapped sample is used): _True_ def=True

Stage 2: 44% accuracy/f1
Parameters:
- 100
- _entropy_
- None
- 2
- _0_
- True



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters shared by both variants.
n_estimators = 100
criterion = 'entropy'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0
bootstrap=True

seed=42

rfc_params = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    min_impurity_decrease=min_impurity_decrease,
    bootstrap=bootstrap,
    random_state=seed,
)

# Fitted once on the active features (X) and once on their PCA projection (Xp).
rfc = RandomForestClassifier(**rfc_params).fit(train_X, train_Y)
rfc_p = RandomForestClassifier(**rfc_params).fit(train_Xp, train_Y)

print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

for fitted, model_name in ((rfc, 'rf_x'), (rfc_p, 'rf_xp')):
    export_model(fitted, model_name)

In [None]:
# Re-print the random forest reports from the already fitted models (no refit).
print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Boosting
Stage 1: 71% accuracy/f1
Parameters:
- base\_estimator: object, _None(DT)_
- n\_estimators: _50_ def=50
- learning\_rate: _1_ def=1.0
- algorithm: SAMME, _SAMME.R_

Stage 2: 46% accuracy, 45% f1
Parameters:
- None
- 50
- 1
- SAMME.R


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyperparameters shared by both variants.
# NOTE(review): newer scikit-learn releases appear to rename `base_estimator` and
# to drop the 'SAMME.R' algorithm — confirm against the installed version.
base_estimator = None
n_estimators= 50
learning_rate = 1
algo = 'SAMME.R'

seed=42

boost_params = dict(
    base_estimator=base_estimator,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    algorithm=algo,
    random_state=seed,
)

# Fitted once on the active features (X) and once on their PCA projection (Xp).
bc = AdaBoostClassifier(**boost_params).fit(train_X, train_Y)
bc_p = AdaBoostClassifier(**boost_params).fit(train_Xp, train_Y)

print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

for fitted, model_name in ((bc, 'boost_x'), (bc_p, 'boost_xp')):
    export_model(fitted, model_name)

In [None]:
# Re-print the boosting reports from the already fitted models (no refit).
print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Pipeline Approach
Pipeline built from the best classifier of each stage.

## Pipeline with KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Stage-1 KNN settings for the pipeline.
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

# Trained explicitly on the stage-1 data, independent of the active train_*/test_* selection.
knnc1 = KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algo,
    leaf_size=leaf_size,
).fit(train_X_stg1, train_Y_stg1)

print_report([knnc1], [test_X_stg1], test_Y_stg1, ["stg1"])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Stage-2 KNN settings for the pipeline.
n_neighbors = 20
weights = 'uniform'
algo = 'auto'
leaf_size = 30

# Trained explicitly on the stage-2 data, independent of the active train_*/test_* selection.
knnc2 = KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights=weights,
    algorithm=algo,
    leaf_size=leaf_size,
).fit(train_X_stg2, train_Y_stg2)

print_report([knnc2], [test_X_stg2], test_Y_stg2, ["stg2"])

In [None]:
# Evaluate the KNN -> KNN pipeline on the complete test set with the original labels.
# `test_X` / `test_Y` are rebound by the stage-selection cells, so after a
# top-to-bottom run they hold stage-2 data only. `test_X_stg1` is the unfiltered
# test feature matrix and `testing` still carries the original label column.
pred_Y = predict_all(knnc1, knnc2, test_X_stg1)

print(sklearn.metrics.classification_report(y_true=testing[testing.columns[-1]], y_pred=pred_Y))

## Pipeline with Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boosting settings shared by both pipeline stages.
base_estimator = None
n_estimators= 50
learning_rate = 1
algo = 'SAMME.R'

seed=42

pipeline_boost_params = dict(
    base_estimator=base_estimator,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    algorithm=algo,
    random_state=seed,
)

# One model per stage, each trained on that stage's data.
bc1 = AdaBoostClassifier(**pipeline_boost_params).fit(train_X_stg1, train_Y_stg1)
bc2 = AdaBoostClassifier(**pipeline_boost_params).fit(train_X_stg2, train_Y_stg2)

In [None]:
# Evaluate the Boosting -> Boosting pipeline on the complete test set with the original labels.
# `test_X` / `test_Y` are rebound by the stage-selection cells, so after a
# top-to-bottom run they hold stage-2 data only. `test_X_stg1` is the unfiltered
# test feature matrix and `testing` still carries the original label column.
pred_Y = predict_all(bc1, bc2, test_X_stg1)

print(sklearn.metrics.classification_report(y_true=testing[testing.columns[-1]], y_pred=pred_Y))

## Final Decision: KNN then Boosting(DT)

In [91]:
# Final pipeline: KNN for stage 1, boosting for stage 2, evaluated on the complete
# test set with the original labels.
# `test_X` / `test_Y` are rebound by the stage-selection cells, so after a
# top-to-bottom run they hold stage-2 data only. `test_X_stg1` is the unfiltered
# test feature matrix and `testing` still carries the original label column.
pred_Y = predict_all(knnc1, bc2, test_X_stg1)

print(sklearn.metrics.classification_report(y_true=testing[testing.columns[-1]], y_pred=pred_Y))

              precision    recall  f1-score   support

         0.0       0.68      0.61      0.64    133643
         1.0       0.01      0.05      0.01      1660
         2.0       0.01      0.15      0.02      1660
         3.0       0.18      0.21      0.19     32255
         4.0       0.15      0.04      0.06     31266

    accuracy                           0.45    200484
   macro avg       0.21      0.21      0.19    200484
weighted avg       0.51      0.45      0.47    200484

