# Single-Context Data

In [None]:
# Use-case folder name; interpolated into the data/ input and output paths of this notebook.
use_case = 'youtube'
# Name of the layer; selects the input CSV and is part of the exported model file names.
layer_name = 'LikesLayer' 

# Approach name; used as a folder level in the ml_output path built by export_model.
approach = 'single_context'

In [None]:
import pandas as pd
from pandas import DataFrame

# Load the single-context ML input of the selected layer; the first CSV column becomes the index.
df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)

In [None]:
# Notebook display of the loaded frame.
df

### Simple split

In [None]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> 'Tuple[DataFrame, DataFrame]':
    """
    Splits the dataframe row-wise into a leading training part and a trailing test part
    and prints the resulting sizes and label occurrences.
    The last column is read as the label; it is only used for the printed statistics.
    :param dataframe: data to split
    :param test_dataset_frac: fraction of the rows (between 0 and 1) that goes into the test part
    :param shuffle: if True, the rows are randomly permuted and the index is reset before splitting
    :returns: (training_data, test_data)
    :raises ValueError: if test_dataset_frac is not within [0, 1]
    """
    if not 0 <= test_dataset_frac <= 1:
        raise ValueError(f"test_dataset_frac must be within [0, 1], got {test_dataset_frac}")

    if shuffle:
        dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe[:training_size]
    test = dataframe[training_size:]

    # A frame without columns has no label column to report on.
    if len(dataframe.columns) > 0:
        y_train = train[train.columns[-1]].tolist()
        y_test = test[test.columns[-1]].tolist()
    else:
        y_train, y_test = [], []

    # Guard the ratio against an empty frame (would divide by zero otherwise).
    total = len(train) + len(test)
    test_ratio = len(test) / total if total else 0.0

    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({test_ratio} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train + y_test)}, \n"
          f"\tTraining = {collections.Counter(y_train)}, \n"
          f"\tTest = {collections.Counter(y_test)}")

    return train, test

# training, testing = split_data(df, shuffle=False)

### k-fold cross validation

In [None]:
from pandas import DataFrame
from typing import Iterator, Tuple

def chunks(lst: DataFrame, n) -> Iterator[DataFrame]:
    """
    Lazily cuts lst into consecutive slices of length n.
    The final slice is shorter if len(lst) is not a multiple of n.
    :param lst: sliceable sequence, e.g. a DataFrame (sliced row-wise)
    :param n: number of elements per slice
    :returns: iterator over the slices in their original order
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)


def get_k_folds(dataframe: DataFrame, k: int = 10) -> Iterator[Tuple[DataFrame, DataFrame]]:
    """
    Folds the dataframe k times and returns each fold for training.
    The rows are cut into k consecutive folds in their current order (no shuffling).
    The first k-1 folds hold len(dataframe) // k rows each; the last fold additionally
    takes all remaining rows, so every row is used for testing exactly once.
    :param dataframe: data to fold
    :param k: number of folds, 2 <= k <= len(dataframe)
    :returns: k-1 folds for training, 1 fold for testing
    :raises ValueError: if k < 2 or the dataframe has fewer than k rows
    """
    n_rows = len(dataframe)
    if k < 2:
        raise ValueError(f"k must be at least 2, got k={k}")
    if n_rows < k:
        raise ValueError(f"Cannot split {n_rows} rows into k={k} folds")

    fold_size = n_rows // k
    # k+1 boundaries; the final one is the frame length so the last fold absorbs the remainder.
    bounds = [i * fold_size for i in range(k)] + [n_rows]
    folds = [dataframe[start:end] for start, end in zip(bounds, bounds[1:])]

    if n_rows % k:
        print(f"{n_rows} rows do not split evenly into k={k} folds! "
              f"Last fold holds {len(folds[-1])} rows instead of {fold_size}")

    for i, test_fold in enumerate(folds):
        train_folds = [f for (idx, f) in enumerate(folds) if idx != i]
        yield pd.concat(train_folds), test_fold
                

In [None]:
# Sanity check of get_k_folds on a 10-row toy frame (default k=10).
test = DataFrame([[i, i*4] for i in range(10)], columns=["values", "else"])

# Print each fold index with its training and testing part for visual inspection.
# NOTE: this loop rebinds the notebook-level names `training` and `testing` to toy data.
for idx, (training, testing) in enumerate(get_k_folds(test)):
    print(idx)
    print(training)
    print(testing)
    print()

In [None]:
def remove_empty_community_class(df):
    '''
    Relabels evolution_label -1 (indicating the community stays empty) as 0.
    Despite the function name no rows are removed; only the label value changes.
    The passed frame is left untouched, a modified copy is returned.
    :param df: frame containing an 'evolution_label' column
    :returns: new frame in which every evolution_label of -1.0 is replaced by 0
    '''
    relabeled = df['evolution_label'].replace(-1.0, 0)
    # assign() builds a new frame, so the caller's frame (possibly a slice) is not mutated.
    return df.assign(evolution_label=relabeled)

# training = remove_empty_community_class(training)
# testing = remove_empty_community_class(testing)

In [None]:
# Replace the empty-community label (-1) by 0 on the full dataset.
df = remove_empty_community_class(df)
# Drop the last column, so that the second-to-last column becomes the last one.
df = df[df.columns[:-1]] # remove the new column containing results for regression

In [None]:
# use first result from k-fold: with the default k=10 the first fold is the test part,
# the remaining folds form the training part
training, testing = next(get_k_folds(df))

### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on ALL columns of `training`, including the label in the
# last column; the scaled label column is sliced off again right away. Consider fitting on the
# feature columns only so the scaler can be applied to unlabeled data.
train_X = scaler.fit_transform(training)[:,:-1] # all except y
# Labels are taken unscaled from the last column of the original frame.
train_Y = training[training.columns[-1]]

# The test part is only transformed, using the scaler fitted on the training part.
test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]

In [None]:
# Notebook display of the training labels.
train_Y

In [None]:
# Notebook display of the scaled training features, labeled with the feature column names of df.
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Balancing of Training Data

In [None]:
# Occurrences per label in the training part before sampling.
train_Y.value_counts()

In [None]:
from processing import DataSampler

# Project-local helper; presumably resamples the classes towards the median class size,
# capped at max_size - TODO confirm in processing.DataSampler.
sampler = DataSampler()
# Only the training part is resampled; the test part stays as it is.
train_X, train_Y = sampler.sample_median_size(train_X, train_Y, max_size=10000)

In [None]:
# Occurrences per label in the training part after sampling.
train_Y.value_counts()

## Principal Components

In [None]:
from sklearn.decomposition import PCA

# Reduce the features to 8 principal components.
pca = PCA(n_components=8)

# The projection is fitted on the training features only and then applied to the test features.
train_Xp = pca.fit_transform(train_X)
test_Xp = pca.transform(test_X)

## Evaluation Reports

In [None]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Prints one classification report per classifier.
    Classifier, test features and title at the same idx belong together.
    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test features, one entry per classifier
    :param test_Y: true classes, shared by all classifiers
    :param titles: list of headings printed above the individual reports
    """
    evaluations = zip(clfs, test_Xs, titles)
    for model, features, heading in evaluations:
        report = sklearn.metrics.classification_report(y_true=test_Y, y_pred=model.predict(features))
        print(f"### {heading} ###\n", report)

In [None]:
import pickle 
from pathlib import Path

def export_model(model, model_name, *, enabled=False):
    """
    Pickles the model to data/{use_case}/ml_output/{approach}/{layer_name}/{layer_name}_{model_name}.model.
    Exporting is switched off by default, so the export calls in this notebook do not write any files.
    :param model: object to pickle
    :param model_name: suffix of the model file name, e.g. 'nb_x'
    :param enabled: if False (default) nothing is written and the function returns immediately
    """
    if not enabled:
        return None

    out_dir = Path(f'data/{use_case}/ml_output/{approach}/{layer_name}')
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / f'{layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

# Naive Bayes
Working best with _Xp_

49% accuracy and 43% F1 score

Parameters: 
- priors: prior probabilities of classes, _None_
- var\_smoothing: \[_0_ , 1\]

In [None]:
from sklearn.naive_bayes import GaussianNB
# Hyper-parameters shared by both Gaussian Naive Bayes models.
priors = None #np.array([19,16,16,74,74]) / (19+16+16+74+74)
smoothing = 0
nb_params = {'priors': priors, 'var_smoothing': smoothing}

# Model trained on the standardized features (X).
clf = GaussianNB(**nb_params)
clf.fit(train_X, train_Y)

# Model trained on the principal components (Xp).
clf_p = GaussianNB(**nb_params)
clf_p.fit(train_Xp, train_Y)

print_report(clfs=[clf, clf_p], test_Xs=[test_X, test_Xp], test_Y=test_Y, titles=["X", "Xp"])

export_model(model=clf, model_name='nb_x')
export_model(model=clf_p, model_name='nb_xp')

# Support Vector Machine
Parameters:
- C (regularization): <1, _1_, >1, def=1
- kernel: _linear_, rbf, poly, sigmoid, def=rbf
- gamma (for rbf, poly, sigmoid): scale, auto, float, def=scale
- class\_weight: _None_, balanced, dict, def=None

In [None]:
from sklearn.svm import LinearSVC
# Hyper-parameters shared by both linear SVMs.
c = 1
dual = False
tol = 1E-4
svc_params = {'C': c, 'dual': dual, 'tol': tol}

# Model trained on the standardized features (X).
svc = LinearSVC(**svc_params)
svc.fit(train_X, train_Y)

# Model trained on the principal components (Xp).
svc_p = LinearSVC(**svc_params)
svc_p.fit(train_Xp, train_Y)

print_report(clfs=[svc, svc_p], test_Xs=[test_X, test_Xp], test_Y=test_Y, titles=["X", "Xp"])

export_model(model=svc, model_name='svc_x')
export_model(model=svc_p, model_name='svc_xp')

In [None]:
# Print the reports of both SVMs again (same call as at the end of the training cell).
print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# K-nearest Neighbors
Parameters:
- n\_neighbors: 20
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: 30

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Hyper-parameters shared by both k-nearest-neighbors models.
n_neighbors = 20
weights = 'uniform'
algo = 'auto'
leaf_size = 30
knn_params = {
    'n_neighbors': n_neighbors,
    'weights': weights,
    'algorithm': algo,
    'leaf_size': leaf_size,
}

# Model trained on the standardized features (X).
knnc = KNeighborsClassifier(**knn_params)
knnc.fit(train_X, train_Y)

# Model trained on the principal components (Xp).
knnc_p = KNeighborsClassifier(**knn_params)
knnc_p.fit(train_Xp, train_Y)

print_report(clfs=[knnc, knnc_p], test_Xs=[test_X, test_Xp], test_Y=test_Y, titles=["X", "Xp"])

export_model(model=knnc, model_name='knn_x')
export_model(model=knnc_p, model_name='knn_xp')

# Decision Tree
Working well with _Xp_

Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max\_depth: _10_, default=None
- min\_samples\_leaf (to construct leaf): _1_, default=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-5_, default=0
- ccp\_alpha (max allowed cost after pruning): _1E-3_, default=0, i.e. no pruning


In [None]:
from sklearn.tree import DecisionTreeClassifier 
# Hyper-parameters shared by both decision trees.
criterion = 'gini'
splitter = 'random'
max_depth = 10
min_samples_leaf = 1
min_impurity_decrease = 1E-5 # impurity improvement needed to split
ccp_alpha = 1E-3

# Fixed seed so the random splitter gives repeatable trees.
seed = 42

dt_params = {
    'criterion': criterion,
    'splitter': splitter,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_impurity_decrease': min_impurity_decrease,
    'ccp_alpha': ccp_alpha,
    'random_state': seed,
}

# Model trained on the standardized features (X).
dtc = DecisionTreeClassifier(**dt_params)
dtc.fit(train_X, train_Y)

# Model trained on the principal components (Xp).
dtc_p = DecisionTreeClassifier(**dt_params)
dtc_p.fit(train_Xp, train_Y)

print_report(clfs=[dtc, dtc_p], test_Xs=[test_X, test_Xp], test_Y=test_Y, titles=["X", "Xp"])

export_model(model=dtc, model_name='dt_x')
export_model(model=dtc_p, model_name='dt_xp')

# Random Forest
Parameters:
- n\_estimators: _100_ def=100
- criterion: _gini_, entropy
- max\_depth: _None_ def=None
- min\_samples\_leaf (to construct leaf): _2_ def=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-5_ default=0
- bootstrap (if a bootstrapped sample is used): _True_ def=True


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameters shared by both random forests.
n_estimators = 100
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-5
bootstrap=True

# Fixed seed for repeatable forests.
seed = 42

rf_params = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_impurity_decrease': min_impurity_decrease,
    'bootstrap': bootstrap,
    'random_state': seed,
}

# Model trained on the standardized features (X).
rfc = RandomForestClassifier(**rf_params)
rfc.fit(train_X, train_Y)

# Model trained on the principal components (Xp).
rfc_p = RandomForestClassifier(**rf_params)
rfc_p.fit(train_Xp, train_Y)

print_report(clfs=[rfc, rfc_p], test_Xs=[test_X, test_Xp], test_Y=test_Y, titles=["X", "Xp"])

export_model(model=rfc, model_name='rf_x')
export_model(model=rfc_p, model_name='rf_xp')

In [None]:

# Print the reports of both random forests again (same call as at the end of the training cell).
print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# Boosting
Parameters:
- base\_estimator: None
- n\_estimators: 50
- algorithm: samme.r
- learning\_rate: .3


In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Hyper-parameters shared by both AdaBoost models.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to `estimator` and
# deprecated the 'SAMME.R' algorithm - confirm against the installed version.
base_estimator = None #SVC(kernel='linear')
n_estimators = 50
algo = 'SAMME.R'
learning_rate = .3
boost_params = {
    'base_estimator': base_estimator,
    'n_estimators': n_estimators,
    'algorithm': algo,
    'learning_rate': learning_rate,
}

# Model trained on the standardized features (X).
bc = AdaBoostClassifier(**boost_params)
bc.fit(train_X, train_Y)

# Model trained on the principal components (Xp).
bc_p = AdaBoostClassifier(**boost_params)
bc_p.fit(train_Xp, train_Y)

print_report(clfs=[bc, bc_p], test_Xs=[test_X, test_Xp], test_Y=test_Y, titles=["X", "Xp"])

export_model(model=bc, model_name='boost_x')
export_model(model=bc_p, model_name='boost_xp')