# Cross-Context Data

In [48]:
use_case = 'youtube'
layer_name = 'TrendDelayLayer' 
reference_layer_name = 'ViewsLayer'

approach = 'cross_context_2stage'

In [49]:
import pandas as pd
from pandas import DataFrame

df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/cross_context/{layer_name}_{reference_layer_name}.csv', index_col=0)

In [None]:
df.head(5)

# Training

In [50]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False, random_state=None) -> '(training_data, test_data)':
    '''Split a frame row-wise into a leading training part and a trailing test part.

    The last column of `dataframe` is treated as the label column; it is only
    used for the label statistics printed below and stays in both returned frames.

    :param dataframe: data to split, label in the last column
    :param test_dataset_frac: fraction of rows that goes into the test part
    :param shuffle: shuffle the rows before splitting
    :param random_state: seed forwarded to DataFrame.sample when shuffling,
        so a shuffled split can be reproduced (None keeps it unseeded)
    :returns: (train, test), both re-indexed from 0
    '''
    if shuffle:
        dataframe = dataframe.sample(frac=1, random_state=random_state).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe[:training_size].reset_index(drop=True)
    test = dataframe[training_size:].reset_index(drop=True)

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]

    # Guard the ratio: a frame without rows would otherwise divide by zero.
    total_size = len(train) + len(test)
    test_ratio = len(test) / total_size if total_size else 0.0

    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({test_ratio} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, "\
          f"Training = {collections.Counter(y_train)}, Test = {collections.Counter(y_test)}")

    return train, test

training, testing = split_data(df, shuffle=False)


Working with: 3504 training points + 876 test points (0.2 test ratio).
Label Occurrences: Total = Counter({-1.0: 2625, 1.0: 540, 2.0: 493, 3.0: 332, 4.0: 304, 0.0: 86}), Training = Counter({-1.0: 2127, 1.0: 430, 2.0: 374, 3.0: 262, 4.0: 240, 0.0: 71}), Test = Counter({-1.0: 498, 2.0: 119, 1.0: 110, 3.0: 70, 4.0: 64, 0.0: 15})


In [None]:
training

In [4]:
def remove_empty_community_class(df):
    '''Merge evolution_label -1 (community stays empty) into class 0.

    No rows are dropped: every -1.0 in the 'evolution_label' column is
    relabelled to 0. The input frame is left untouched and a relabelled copy
    is returned, so re-running the calling cell gives the same result.

    :param df: frame with an 'evolution_label' column
    :returns: copy of `df` with -1.0 labels replaced by 0
    '''
    relabelled = df.copy()
    relabelled['evolution_label'] = relabelled['evolution_label'].replace(-1.0, 0)
    return relabelled

training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)

## Standardization

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the scaler on the feature columns only (everything except the trailing
# label column). StandardScaler works column by column, so the scaled feature
# values are the same as when the label column was scaled along and then cut
# off, but the fitted scaler no longer expects a label column in its input.
train_X = scaler.fit_transform(training.iloc[:, :-1]) # all except y
train_Y = training[training.columns[-1]]

# The test features reuse the statistics learned from the training data.
test_X = scaler.transform(testing.iloc[:, :-1]) # all except y
test_Y = testing[testing.columns[-1]]

## Balancing

In [None]:
train_Y.value_counts()

In [None]:
# NOTE(review): this cell rebinds train_X / train_Y to a resampled set. The
# recorded stage-1 label counts further down (2198 + 1306 = 3504) equal the
# size of the unsampled training split, so the recorded run apparently did not
# execute this cell. Running it changes the input of every later stage;
# confirm whether it is still wanted.
from processing import DataSampler

sampler = DataSampler()
train_X, train_Y = sampler.sample_median_size(train_X, train_Y)

In [None]:
train_Y.value_counts()

In [None]:
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Two-stage approach
### 1. Stage: Change Prediction

In [53]:
import pandas as pd

def prepare_stage1_data(X, y: pd.Series) -> ('X', 'y'):
    '''Simplify dataset classes to 0 -> 0, other -> 1.

    :param X: feature data, passed through unchanged
    :param y: labels; index and name are kept on the result
    :returns: (X, y_stg1) where y_stg1 is an int64 Series of 0/1 flags
    '''
    # Vectorised comparison instead of a Python-level lambda per element.
    y_stg1 = (y != 0).astype('int64')
    return X, y_stg1

In [54]:
train_X_stg1, train_Y_stg1 = prepare_stage1_data(train_X, train_Y)
train_Y_stg1.value_counts()

0    2198
1    1306
Name: evolution_label, dtype: int64

In [55]:
test_X_stg1, test_Y_stg1 = prepare_stage1_data(test_X, test_Y)
test_Y_stg1.value_counts()

0    513
1    363
Name: evolution_label, dtype: int64

### 2. Stage: Change Type Prediction

In [56]:
import pandas as pd

def prepare_stage2_data(X, y, columns) -> ('X', 'y'):
    '''Remove class 0 from dataset.

    Row i of `X` is paired with element i of `y` by position. Assigning a
    Series as a column would align on its index instead, which silently
    produces NaN or misassigned labels whenever `y` is not indexed 0..n-1.

    :param X: feature data (array-like or DataFrame)
    :param y: labels, one per row of X, in the same order
    :param columns: feature column names for X
    :returns: (X_stg2, Y_stg2) with all rows labelled 0 removed, re-indexed from 0
    :raises ValueError: if X and y differ in length
    '''
    xy = pd.DataFrame(data=X, columns=columns)

    labels = pd.Series(y).to_numpy()
    if len(labels) != len(xy):
        raise ValueError(f'X has {len(xy)} rows but y has {len(labels)} labels.')
    xy['evolution_label'] = labels

    # remove class 0
    changed = xy.loc[xy['evolution_label'] != 0.0].reset_index(drop=True)
    X_stg2 = changed.iloc[:, :-1]
    Y_stg2 = changed.iloc[:, -1]

    return X_stg2, Y_stg2

In [57]:
train_X_stg2, train_Y_stg2 = prepare_stage2_data(train_X, train_Y, columns=training.columns[:-1])
train_Y_stg2.value_counts()

1.0    430
2.0    374
3.0    262
4.0    240
Name: evolution_label, dtype: int64

In [58]:
test_X_stg2, test_Y_stg2 = prepare_stage2_data(test_X, test_Y, columns=testing.columns[:-1])
test_Y_stg2.value_counts()

2.0    119
1.0    110
3.0     70
4.0     64
Name: evolution_label, dtype: int64

## Balancing of Training Data

In [59]:
train_Y_stg1.value_counts()

0    2198
1    1306
Name: evolution_label, dtype: int64

In [60]:
train_Y_stg2.value_counts()

1.0    430
2.0    374
3.0    262
4.0    240
Name: evolution_label, dtype: int64

In [61]:
from processing import DataSampler
sampler = DataSampler()

In [62]:
# balancing by downsampling

train_X_stg1, train_Y_stg1 = sampler.sample_fixed_size(train_X_stg1, train_Y_stg1, size=1000)
train_Y_stg1.value_counts()

1    1000
0    1000
Name: evolution_label, dtype: int64

In [63]:
# Balance the stage-2 training set to a fixed 300 samples per class.
# According to the recorded value counts this is not pure downsampling:
# classes 3.0 and 4.0 go from 262 / 240 samples to 300 each.
# NOTE(review): how DataSampler fills up the smaller classes is not visible
# here (presumably sampling with replacement) - confirm.

train_X_stg2, train_Y_stg2 = sampler.sample_fixed_size(train_X_stg2, train_Y_stg2, size=300)
train_Y_stg2.value_counts()

4.0    300
3.0    300
2.0    300
1.0    300
Name: evolution_label, dtype: int64

## Principal Components

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)
pca.fit(train_X)
test_Xp = pca.transform(test_X)

In [None]:
from sklearn.decomposition import PCA

pca1 = PCA(n_components=8)
train_Xp_stg1 = pca1.fit_transform(train_X_stg1)
test_Xp_stg1 = pca1.transform(test_X_stg1)

pca2 = PCA(n_components=8)
train_Xp_stg2 = pca2.fit_transform(train_X_stg2)
test_Xp_stg2 = pca2.transform(test_X_stg2)

## Evaluation Reports

In [17]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Print one classification report per classifier.

    The three lists are walked in parallel: the classifier at position i
    predicts on the feature set at position i and its report is headed by the
    title at position i. All predictions are scored against the same test_Y.
    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test_X for the corresponding classifier at idx
    :param test_Y: true classes
    :param titles: list of titles for the classifiers at idx
    """
    for model, features, heading in zip(clfs, test_Xs, titles):
        report = sklearn.metrics.classification_report(
            y_true=test_Y,
            y_pred=model.predict(features),
        )
        print(f"### {heading} ###\n", report)

In [37]:
import pickle
from pathlib import Path

def export_model(model, model_name):
    '''Pickle `model` into the output folder of the current use case / approach / layer.

    The target folder is built from the notebook-level settings use_case,
    approach and layer_name and is created if missing; the file name combines
    layer_name, reference_layer_name and `model_name`.
    '''
    fpath = f'data/{use_case}/ml_output/{approach}/{layer_name}'
    target_file = f'{fpath}/{layer_name}_{reference_layer_name}_{model_name}.model'

    Path(fpath).mkdir(parents=True, exist_ok=True)
    with open(target_file, 'wb') as sink:
        pickle.dump(model, sink)

In [38]:
def predict_all(clf1, clf2, test_X, columns=None) -> 'pred_y':
    '''Runs the two-stage approach by predicting first with clf1 then clf2.

    Stage 1 (clf1) decides per row between "no change" (0) and "change" (1).
    Only rows predicted as 1 are handed to stage 2 (clf2), whose prediction
    replaces the 1; rows predicted as 0 keep class 0.

    :param clf1: fitted stage-1 classifier (predicts 0 / 1)
    :param clf2: fitted stage-2 classifier (predicts the change type)
    :param test_X: feature data for all rows to predict
    :param columns: feature names used to build the stage-2 input frame;
        defaults to the feature columns of the notebook-level `testing` frame
    :returns: Series named 'evolution_label' with one final prediction per row
    '''
    if columns is None:
        # Previous behaviour: take the feature names from the global test frame.
        columns = testing.columns[:-1]

    # STG 1
    pred_Y_stg1 = np.asarray(clf1.predict(test_X))

    # Stage 2 receives a DataFrame with feature names, as before.
    features = pd.DataFrame(data=test_X, columns=columns)

    # rows predicted as change=1 form the stage-2 test set
    change_mask = pred_Y_stg1 == 1

    merged = pred_Y_stg1
    if change_mask.any():
        # STG 2
        pred_Y_stg2 = np.asarray(clf2.predict(features.loc[change_mask]))

        # Merge by position on a fresh array. This avoids writing into a slice
        # and the chained in-place Series.update, which has no effect once
        # pandas copy-on-write is active. The common dtype keeps stage-2 labels
        # intact (e.g. int 0/1 from stage 1 combined with float labels).
        merged = pred_Y_stg1.astype(np.result_type(pred_Y_stg1, pred_Y_stg2))
        merged[change_mask] = pred_Y_stg2

    pred_Y = pd.Series(merged, index=features.index, name='evolution_label')

    return pred_Y

In [None]:
# Point the generic names at the stage-1 data for single-stage experiments.
# NOTE(review): this rebinds the notebook-wide train_X / test_X / test_Y. The
# predict_all(...) evaluations further down pass test_X and score against
# test_Y, and their recorded reports cover classes 0-4, so they need the
# original multi-class test_Y - do not run this cell before them.
# Also needs train_Xp_stg1 / test_Xp_stg1 from the PCA cell.
train_X, train_Xp, train_Y = train_X_stg1, train_Xp_stg1, train_Y_stg1
test_X, test_Xp, test_Y = test_X_stg1, test_Xp_stg1, test_Y_stg1

In [None]:
# Point the generic names at the stage-2 data for single-stage experiments.
# NOTE(review): this rebinds the notebook-wide train_X / test_X / test_Y to the
# stage-2 subset (class 0 removed). The predict_all(...) evaluations further
# down pass test_X and score against test_Y, and their recorded reports cover
# all 876 test rows - do not run this cell before them.
# Also needs train_Xp_stg2 / test_Xp_stg2 from the PCA cell.
train_X, train_Xp, train_Y = train_X_stg2, train_Xp_stg2, train_Y_stg2
test_X, test_Xp, test_Y = test_X_stg2, test_Xp_stg2, test_Y_stg2

# Naive Bayes
Working best with _Xp_

Stage 1: 68% accuracy/f1 score (Xp) 
Parameters: 
- priors: prior probabilities of classes, _None_
- var\_smoothing: \[0, 1\] _1E-9_

Stage 2: 40% accuracy, 38% f1 with Xp
Parameters: 
- None
- 1E-9

In [64]:
from sklearn.naive_bayes import GaussianNB
# Fixed class priors of 53:27 (0.6625 / 0.3375) for classes 0 / 1 instead of
# priors estimated from the training data passed to fit().
# NOTE(review): where the 53:27 ratio comes from is not shown in this
# notebook - confirm it is still the intended prior.
priors = np.array([53,27]) / (53+27)
smoothing = 1E-9

# Stage-1 model (change vs. no change), trained on the unprojected features
# and exported under the name 'nb1_x'.
clf1 = GaussianNB(priors=priors, var_smoothing=smoothing)
clf1.fit(train_X_stg1, train_Y_stg1)
export_model(clf1, 'nb1_x')

# clf1_p = GaussianNB(priors=priors, var_smoothing=smoothing)
# clf1_p.fit(train_Xp_stg1, train_Y_stg1)
# export_model(clf1_p, 'nb1_xp')

# print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [65]:
from sklearn.naive_bayes import GaussianNB
priors = None 
smoothing = 1E-9

clf2 = GaussianNB(priors=priors, var_smoothing=smoothing)
clf2.fit(train_X_stg2, train_Y_stg2)
export_model(clf2, 'nb2_x')

# clf2_p = GaussianNB(priors=priors, var_smoothing=smoothing)
# clf2_p.fit(train_Xp_stg2, train_Y_stg2)
# export_model(clf2_p, 'nb2_xp')

# print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [66]:
pred_Y = predict_all(clf1, clf2, test_X)
print('NB: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

# pred_Y = predict_all(clf1_p, clf2_p, test_X, pca=True)
# print('NB Xp: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

NB:                precision    recall  f1-score   support

         0.0       0.79      0.65      0.71       513
         1.0       0.33      0.12      0.17       110
         2.0       0.39      0.36      0.38       119
         3.0       0.28      0.16      0.20        70
         4.0       0.18      0.73      0.29        64

    accuracy                           0.51       876
   macro avg       0.39      0.40      0.35       876
weighted avg       0.59      0.51      0.53       876



# Support Vector Machine
Stage 1: 69% accuracy/f1
Parameters:
- C (regularization): <1, _1_, >1, def=1
- kernel: _linear_, rbf, poly, sigmoid, def=rbf
- gamma (for rbf, poly, sigmoid): scale, auto, float, def=scale
- class\_weight: _None_, balanced, dict, def=None

Stage 2: 44% accuracy/f1
Parameters:
- 10
- rbf
- scale
- None



In [67]:
from sklearn.svm import LinearSVC
c = 1
kernel = 'linear'
gamma = 'scale'
weights = None

svc1 = LinearSVC(C=c, dual=False)
svc1.fit(train_X_stg1, train_Y_stg1)
export_model(svc1, 'svc1_x')

# svc1_p = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights)
# svc1_p.fit(train_Xp_stg1, train_Y_stg1)
# export_model(svc1_p, 'svc1_xp')

# print('report.')
# print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [69]:
from sklearn.svm import SVC
c = 10
kernel = 'rbf'
gamma = 'scale'
weights = None

svc2 = SVC(C=c, kernel=kernel, gamma=gamma)# dual=False)
svc2.fit(train_X_stg2, train_Y_stg2)
export_model(svc2, 'svc2_x')

# svc2_p = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights)
# svc2_p.fit(train_Xp_stg2, train_Y_stg2)
# export_model(svc2_p, 'svc2_xp')

# print('report.')
# print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [70]:
pred_Y = predict_all(svc1, svc2, test_X)
print('SVC: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

# pred_Y = predict_all(svc1_p, svc2_p, test_X)
# print('SVC Xp: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

SVC:                precision    recall  f1-score   support

         0.0       0.87      0.78      0.82       513
         1.0       0.56      0.40      0.47       110
         2.0       0.48      0.61      0.54       119
         3.0       0.36      0.36      0.36        70
         4.0       0.28      0.52      0.36        64

    accuracy                           0.65       876
   macro avg       0.51      0.53      0.51       876
weighted avg       0.69      0.65      0.67       876



# K-nearest Neighbors

Stage 1: 70% accuracy, 70% f1 score
Parameters:
- n\_neighbors: _30_
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: _50_ (no difference)

Stage 2: 46% accuracy/f1 
Parameters:
- _20_
- uniform
- auto
- _30_


In [71]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc1 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc1.fit(train_X_stg1, train_Y_stg1)
export_model(knnc1, 'knn1_x')

# knnc1_p = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
# knnc1_p.fit(train_Xp_stg1, train_Y_stg1)
# export_model(knnc1_p, 'knn1_xp')

# print_report([knnc1, knnc1_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [72]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 20
weights = 'uniform'
algo = 'auto'
leaf_size = 30

knnc2 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc2.fit(train_X_stg2, train_Y_stg2)
export_model(knnc2, 'knn2_x')

# knnc2_p = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
# knnc2_p.fit(train_Xp_stg2, train_Y_stg2)
# export_model(knnc2_p, 'knn2_xp')



In [None]:
# print_report([knnc2], [test_X_stg2], test_Y_stg2, ["X"])

In [73]:
pred_Y = predict_all(knnc1, knnc2, test_X)
print('KNN: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

# pred_Y = predict_all(knnc1_p, knnc2_p, test_X, pca=True)
# print('KNN Xp: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

KNN:                precision    recall  f1-score   support

         0.0       0.89      0.76      0.82       513
         1.0       0.50      0.40      0.44       110
         2.0       0.47      0.53      0.50       119
         3.0       0.27      0.44      0.33        70
         4.0       0.31      0.48      0.38        64

    accuracy                           0.64       876
   macro avg       0.49      0.52      0.49       876
weighted avg       0.69      0.64      0.66       876



# Decision Tree
Working well with _Xp_

Stage 1: 69% accuracy/f1 with Xp
Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max\_depth: _None_ default=None
- min\_samples\_leaf (to construct leaf): _2_ default=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _0_ default=0
- ccp\_alpha (complexity parameter for cost-complexity pruning): _1E-2_ default=0 (no pruning)

Stage 2: 43% accuracy/f1 with X
Parameters:
- gini
- random
- None
- 2
- 0
- _0_



In [74]:
from sklearn.tree import DecisionTreeClassifier 
criterion = 'gini'
splitter = 'best'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0 # impurity improvement needed to split
ccp_alpha = 1E-2

seed=42

dtc1 = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc1.fit(train_X_stg1, train_Y_stg1)
export_model(dtc1, 'dt1_x')

# dtc1_p = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
# dtc1_p.fit(train_Xp_stg1, train_Y_stg1)
# export_model(dtc1_p, 'dt1_xp')

# print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [75]:
from sklearn.tree import DecisionTreeClassifier 
criterion = 'gini'
splitter = 'random'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0 # impurity improvement needed to split
ccp_alpha = 0

seed=42

dtc2 = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc2.fit(train_X_stg2, train_Y_stg2)
export_model(dtc2, 'dt2_x')

# dtc2_p = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
# dtc2_p.fit(train_Xp_stg2, train_Y_stg2)
# export_model(dtc2_p, 'dt2_xp')

# print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [76]:
pred_Y = predict_all(dtc1, dtc2, test_X)
print('DT: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

# pred_Y = predict_all(dtc1_p, dtc2_p, test_X, pca=True)
# print('DT Xp: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

DT:                precision    recall  f1-score   support

         0.0       0.79      0.96      0.87       513
         1.0       0.69      0.70      0.69       110
         2.0       0.65      0.54      0.59       119
         3.0       0.33      0.11      0.17        70
         4.0       0.26      0.09      0.14        64

    accuracy                           0.74       876
   macro avg       0.55      0.48      0.49       876
weighted avg       0.68      0.74      0.70       876



# Random Forest
Stage 1: 69% accuracy/f1
Parameters:
- n\_estimators: _100_ def=100
- criterion: _gini_, entropy
- max\_depth: _None_ def=None
- min\_samples\_leaf (to construct leaf): _2_ def=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-2_ default=0
- bootstrap (whether bootstrapped samples are used): _True_ def=True

Stage 2: 44% accuracy/f1
Parameters:
- 100
- _entropy_
- None
- 2
- _0_
- True



In [77]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-2
bootstrap=True

seed=42

rfc1 = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
rfc1.fit(train_X_stg1, train_Y_stg1)
export_model(rfc1, 'rf1_x')

# rfc1_p = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
# rfc1_p.fit(train_Xp_stg1, train_Y_stg1)
# export_model(rfc1_p, 'rf1_xp')

# print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [78]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'entropy'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 0
bootstrap=True

seed=42

rfc2 = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
rfc2.fit(train_X_stg2, train_Y_stg2)
export_model(rfc2, 'rf2_x')

# rfc2_p = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
# rfc2_p.fit(train_Xp_stg2, train_Y_stg2)
# export_model(rfc2_p, 'rf2_xp')

In [43]:
# print_report([rfc2], [test_X_stg2], test_Y_stg2, ["X"])

### X ###
               precision    recall  f1-score   support

         1.0       0.03      0.24      0.05      1624
         2.0       0.03      0.25      0.05      1666
         3.0       0.50      0.30      0.38     31975
         4.0       0.49      0.28      0.36     31408

    accuracy                           0.29     66673
   macro avg       0.26      0.27      0.21     66673
weighted avg       0.47      0.29      0.35     66673



In [79]:
pred_Y = predict_all(rfc1, rfc2, test_X)
print('RF: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

# pred_Y = predict_all(rfc1_p, rfc2_p, test_X, pca=True)
# print('DT Xp: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

RF:                precision    recall  f1-score   support

         0.0       0.94      0.68      0.79       513
         1.0       0.68      0.59      0.63       110
         2.0       0.60      0.68      0.64       119
         3.0       0.27      0.54      0.36        70
         4.0       0.19      0.41      0.25        64

    accuracy                           0.64       876
   macro avg       0.54      0.58      0.54       876
weighted avg       0.76      0.64      0.68       876



# Boosting
Stage 1: 71% accuracy/f1
Parameters:
- base\_estimator: object, _None(DT)_
- n\_estimators: _50_ def=50
- learning\_rate: _1_ def=1.0
- algorithm: SAMME, _SAMME.R_

Stage 2: 46% accuracy, 45% f1
Parameters:
- None
- 50
- 1
- SAMME.R


In [80]:
from sklearn.ensemble import AdaBoostClassifier

base_estimator = None
n_estimators= 50
learning_rate = 1
algo = 'SAMME.R'

seed=42

bc1 = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algo, random_state=seed)
bc1.fit(train_X_stg1, train_Y_stg1)
export_model(bc1, 'boost1_x')

# bc1_p = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algo, random_state=seed)
# bc1_p.fit(train_Xp_stg1, train_Y_stg1)
# export_model(bc1_p, 'boost1_xp')

# print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [81]:
from sklearn.ensemble import AdaBoostClassifier

base_estimator = None
n_estimators= 50
learning_rate = 1
algo = 'SAMME.R'

seed=42

bc2 = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algo, random_state=seed)
bc2.fit(train_X_stg2, train_Y_stg2)
export_model(bc2, 'boost2_x')

# bc2_p = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algo, random_state=seed)
# bc2_p.fit(train_Xp_stg2, train_Y_stg2)
# export_model(bc2_p, 'boost2_xp')

# print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])


In [82]:
pred_Y = predict_all(bc1, bc2, test_X)
print('B: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

# pred_Y = predict_all(bc1_p, bc2_p, test_X, pca=True)
# print('DT Xp: ', sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

B:                precision    recall  f1-score   support

         0.0       0.86      0.83      0.84       513
         1.0       0.54      0.75      0.63       110
         2.0       0.54      0.27      0.36       119
         3.0       0.30      0.27      0.28        70
         4.0       0.26      0.44      0.33        64

    accuracy                           0.67       876
   macro avg       0.50      0.51      0.49       876
weighted avg       0.69      0.67      0.67       876



# Pipeline Approach
Pipeline from best classifier for each stage

## Pipeline with KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 30
weights = 'uniform'
algo = 'auto'
leaf_size = 50

knnc1 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc1.fit(train_X_stg1, train_Y_stg1)

print_report([knnc1], [test_X_stg1], test_Y_stg1, ["stg1"])

In [None]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 20
weights = 'uniform'
algo = 'auto'
leaf_size = 30

knnc2 = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,  algorithm=algo, leaf_size=leaf_size)
knnc2.fit(train_X_stg2, train_Y_stg2)

print_report([knnc2], [test_X_stg2], test_Y_stg2, ["stg2"])

In [None]:
pred_Y = predict_all(knnc1, knnc2, test_X)

print(sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

## Pipeline with Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

base_estimator = None
n_estimators= 50
learning_rate = 1
algo = 'SAMME.R'

seed=42

bc1 = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algo, random_state=seed)
bc1.fit(train_X_stg1, train_Y_stg1)

bc2 = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algo, random_state=seed)
bc2.fit(train_X_stg2, train_Y_stg2)

In [None]:
pred_Y = predict_all(bc1, bc2, test_X)

print(sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

## Final Decision: KNN then Boosting(DT)

In [89]:
# NOTE(review): pred_Y is not computed in this cell - it reports whatever
# predict_all(...) call ran last in the kernel. The call that produced the
# recorded report for this section is not part of the notebook; add the
# intended predict_all(<stage-1 model>, <stage-2 model>, test_X) call here so
# the result can be reproduced.
print(sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

              precision    recall  f1-score   support

         0.0       0.89      0.76      0.82       513
         1.0       0.56      0.40      0.47       110
         2.0       0.49      0.61      0.54       119
         3.0       0.30      0.43      0.35        70
         4.0       0.32      0.55      0.40        64

    accuracy                           0.65       876
   macro avg       0.51      0.55      0.52       876
weighted avg       0.70      0.65      0.67       876



In [91]:
pred_Y = predict_all(rfc1, rfc2, test_X)

print(sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

              precision    recall  f1-score   support

         0.0       0.94      0.68      0.79       513
         1.0       0.68      0.59      0.63       110
         2.0       0.60      0.68      0.64       119
         3.0       0.27      0.54      0.36        70
         4.0       0.19      0.41      0.25        64

    accuracy                           0.64       876
   macro avg       0.54      0.58      0.54       876
weighted avg       0.76      0.64      0.68       876

