# Single-Context Data

In [1]:
use_case = 'youtube'
layer_name = 'LikesLayer' 

approach = 'single_context'

In [2]:
import pandas as pd
from pandas import DataFrame

df: DataFrame = pd.read_csv(f'data/{use_case}/ml_input/single_context/{layer_name}.csv', index_col=0)

In [None]:
df

# Training

In [3]:
import numpy as np
import collections

def split_data(dataframe, test_dataset_frac=.2, shuffle=False) -> '(training_data, test_data)':
    if shuffle:
        dataframe = dataframe.sample(frac=1).reset_index(drop=True)

    training_size = int(len(dataframe) * (1-test_dataset_frac))

    train = dataframe[:training_size]
    test = dataframe[training_size:]

    y_train = train[train.columns[-1]]
    y_test = test[test.columns[-1]]
  
    print(f"\nWorking with: {len(train)} training points + {len(test)} test points ({len(test)/(len(test)+len(train))} test ratio).")
    print(f"Label Occurrences: Total = {collections.Counter(y_train.tolist() + y_test.tolist())}, \n" \
          f"\tTraining = {collections.Counter(y_train)}, \n" \
          f"\tTest = {collections.Counter(y_test)}")
    # try:
    #     print(f"Label Majority Class: Training = {stat.mode(Y_train)}, Test = {stat.mode(Y_test)}\n")
    # except stat.StatisticsError:
    #     print(f"Label Majority Class: no unique mode; found 2 equally common values")

    return train, test

training, testing = split_data(df, shuffle=False)


Working with: 452659 training points + 113165 test points (0.2000003534668024 test ratio).
Label Occurrences: Total = Counter({-1.0: 313795, 4.0: 93192, 3.0: 92757, 0.0: 24818, 1.0: 20679, 2.0: 20583}), 
	Training = Counter({-1.0: 251137, 4.0: 74536, 3.0: 74105, 0.0: 19812, 1.0: 16650, 2.0: 16419}), 
	Test = Counter({-1.0: 62658, 4.0: 18656, 3.0: 18652, 0.0: 5006, 2.0: 4164, 1.0: 4029})


In [None]:
training

In [4]:
def remove_empty_community_class(df):
    '''Removes evolution_label -1 from dataset indicating the community stays empty.'''
    # res = df.loc[df['evolution_label'] != -1.0]
    # res = res.reset_index(drop=True)
    # return res
    df['evolution_label'] = df['evolution_label'].replace(-1.0, 0)
    return df

training = remove_empty_community_class(training)
testing = remove_empty_community_class(testing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
training

Unnamed: 0,cluster_size,cluster_variance,cluster_density,cluster_import1,cluster_import2,cluster_area,cluster_center_distance,time_f1,time_f2,cluster_size.1,...,cluster_size.2,cluster_variance.2,cluster_density.2,cluster_import1.2,cluster_import2.2,cluster_area.2,cluster_center_distance.2,time_f1.2,time_f2.2,evolution_label
0,1.0,0.0,0.0,0.000091,0.000183,0.0,2.2,0.568065,0.822984,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,7.485107e-01,0.663123,0.0
1,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.992709,0.120537,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,9.927089e-01,-0.120537,0.0
2,1.0,0.0,0.0,0.000084,0.000167,0.0,0.0,0.822984,-0.568065,1.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2.393157e-01,0.970942,4.0
3,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,-0.239316,0.970942,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,6.432491e-16,1.000000,0.0
4,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.239316,0.970942,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,5.680647e-01,-0.822984,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452654,2.0,0.0,0.0,0.000177,0.000163,0.0,0.0,0.663123,-0.748511,3.0,...,1.0,0.0,0.0,0.000086,0.000154,0.0,0.0,4.647232e-01,-0.885456,2.0
452655,1.0,0.0,0.0,0.000126,0.000218,0.0,0.0,0.239316,0.970942,1.0,...,3.0,0.0,0.0,0.000258,0.000156,0.0,0.0,5.680647e-01,-0.822984,3.0
452656,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.464723,-0.885456,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2.393157e-01,-0.970942,4.0
452657,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,-0.663123,0.748511,2.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,-4.647232e-01,0.885456,4.0


## Standardization

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

train_X = scaler.fit_transform(training)[:,:-1] # all except y
train_Y = training[training.columns[-1]]

test_X = scaler.transform(testing)[:,:-1] # all except y
test_Y = testing[testing.columns[-1]]

In [None]:
train_Y

In [None]:
pd.DataFrame(data=train_X, columns=df.columns[:-1])

## Balancing of Training Data

In [6]:
train_Y.value_counts()

0.0    270949
4.0     74536
3.0     74105
1.0     16650
2.0     16419
Name: evolution_label, dtype: int64

In [7]:
from processing import DataSampler

sampler = DataSampler()
train_X, train_Y = sampler.sample_median_size(train_X, train_Y)

In [8]:
train_Y.value_counts()

1.0    74105
3.0    74105
4.0    74105
2.0    74105
0.0    74105
Name: evolution_label, dtype: int64

## Principal Components

In [9]:
from sklearn.decomposition import PCA

pca = PCA(n_components=8)

train_Xp = pca.fit_transform(train_X)
test_Xp = pca.transform(test_X)

## Evaluation Reports

In [10]:
import sklearn.metrics

def print_report(clfs: list, test_Xs: list, test_Y: 'y', titles: list):
    """
    Prints all reports.
    :param clfs: list of classifiers to evaluate
    :param test_Xs: list of test_X for the corresponding classifier at idx
    :param test_Y: true classes:param titles: list of titles for the classifiers at idx
    """
    for clf, test_X, title in zip(clfs, test_Xs, titles):
        pred_Y = clf.predict(test_X)        
        print(f"### {title} ###\n", sklearn.metrics.classification_report(y_true=test_Y, y_pred=pred_Y))

In [11]:
import pickle 
from pathlib import Path

def export_model(model, model_name):
    fpath = f'data/{use_case}/ml_output/{approach}/{layer_name}'
    Path(fpath).mkdir(parents=True, exist_ok=True)
    with open(f'{fpath}/{layer_name}_{model_name}.model', 'wb') as f:
        pickle.dump(model, f)

# Naive Bayes
Working best with _Xp_

49\% accuracy and 43% f1 score

Parameters: 
- priors: prior probabilities of classes, _None_
- var\_smoothing: \[_0_ , 1\]

In [22]:
from sklearn.naive_bayes import GaussianNB
priors = None #np.array([19,16,16,74,74]) / (19+16+16+74+74)
smoothing = 0

clf = GaussianNB(priors=priors, var_smoothing=smoothing)
clf.fit(train_X, train_Y)

clf_p = GaussianNB(priors=priors, var_smoothing=smoothing)
clf_p.fit(train_Xp, train_Y)

print_report([clf, clf_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(clf, 'nb_x')
export_model(clf_p, 'nb_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.16      0.94      0.28      5006
         1.0       0.48      0.35      0.40      4029
         2.0       0.07      0.00      0.00      4164
         3.0       0.78      0.00      0.01     18652
         4.0       1.00      1.00      1.00     18656

    accuracy                           0.49     50507
   macro avg       0.50      0.46      0.34     50507
weighted avg       0.72      0.49      0.43     50507

### Xp ###
               precision    recall  f1-score   support

         0.0       0.14      0.46      0.22      5006
         1.0       0.53      0.17      0.26      4029
         2.0       0.08      0.00      0.00      4164
         3.0       0.62      0.00      0.01     18652
         4.0       0.48      0.86      0.62     18656

    accuracy                           0.38     50507
   macro avg       0.37      0.30      0.22     50507
weighted avg       0.47      0.38      0.27     50507


# Support Vector Machine
Parameters:
Parameters:
- C (regularization): <1, _1_, >1, def=1
- kernel: _linear_, rbf, poly, sigmoid, def=rbf
- gamma (for rbf, poly, sigmoid): scale, auto, float, def=scale
- class\_weight: _None_, balanced, dict, def=None

In [14]:
from sklearn.svm import SVC
c = 1
kernel = 'linear'
gamma = 'auto'
weights = None

svc = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights)
svc.fit(train_X, train_Y)

svc_p = SVC(C=c, kernel=kernel, gamma=gamma, class_weight=weights)
svc_p.fit(train_Xp, train_Y)

print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(svc, 'svc_x')
export_model(svc_p, 'svc_xp')

In [None]:
print_report([svc, svc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

# K-nearest Neighbors
Parameters:
- n\_neighbors: 20
- weights: _uniform_, distance
- algorithm: _auto_, ball_tree, kd_tree, brute
- leaf\_size: 30

In [13]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors = 20
weights = 'uniform'
algo = 'auto'
leaf_size = 30

knnc = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algo, leaf_size=leaf_size)
knnc.fit(train_X, train_Y)

knnc_p = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights,  algorithm=algo, leaf_size=leaf_size)
knnc_p.fit(train_Xp, train_Y)

print_report([knnc, knnc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(knnc, 'knn_x')
export_model(knnc_p, 'knn_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.80      0.48      0.60     67664
         1.0       0.36      0.80      0.49      4029
         2.0       0.26      0.53      0.35      4164
         3.0       0.75      0.58      0.65     18652
         4.0       0.25      0.55      0.35     18656

    accuracy                           0.52    113165
   macro avg       0.48      0.59      0.49    113165
weighted avg       0.66      0.52      0.55    113165

### Xp ###
               precision    recall  f1-score   support

         0.0       0.80      0.49      0.61     67664
         1.0       0.36      0.79      0.50      4029
         2.0       0.26      0.53      0.35      4164
         3.0       0.74      0.58      0.65     18652
         4.0       0.26      0.55      0.35     18656

    accuracy                           0.53    113165
   macro avg       0.48      0.59      0.49    113165
weighted avg       0.67      0.53      0.56    113165


# Decision Tree
Working well with _Xp_

Parameters:
- criterion: _gini_, entropy
- splitter: best, _random_
- max\_depth: _10_, default=None
- min\_samples\_leaf (to construct leaf): _1_, default=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-5_, default=0
- ccp\_alpha (max allowed cost after pruning): _1E-3_, default=0 ie. nopruning


In [27]:
from sklearn.tree import DecisionTreeClassifier 
criterion = 'gini'
splitter = 'random'
max_depth = 10
min_samples_leaf = 1
min_impurity_decrease = 1E-5 # impurity improvement needed to split
ccp_alpha = 1E-3

seed = 42

dtc = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc.fit(train_X, train_Y)

dtc_p = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, random_state=seed)
dtc_p.fit(train_Xp, train_Y)

print_report([dtc, dtc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(dtc, 'dt_x')
export_model(dtc_p, 'dt_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.79      0.73      0.76     67664
         1.0       0.32      0.80      0.46      4029
         2.0       0.28      0.31      0.30      4164
         3.0       0.72      0.67      0.69     18652
         4.0       0.29      0.28      0.29     18656

    accuracy                           0.64    113165
   macro avg       0.48      0.56      0.50    113165
weighted avg       0.66      0.64      0.64    113165

### Xp ###
               precision    recall  f1-score   support

         0.0       0.79      0.48      0.60     67664
         1.0       0.30      0.38      0.33      4029
         2.0       0.12      0.53      0.20      4164
         3.0       0.74      0.52      0.61     18652
         4.0       0.24      0.46      0.32     18656

    accuracy                           0.48    113165
   macro avg       0.44      0.48      0.41    113165
weighted avg       0.65      0.48      0.53    113165


# Random Forest
Parameters:
- n\_estimators: _100_ def=100
- criterion: _gini_, entropy
- max\_depth: _None_ def=None
- min\_samples\_leaf (to construct leaf): _2_ def=1
- min\_impurity\_decrease (split if the impurity is then decreased by): _1E-5_ default=0
- bootstrap (if bootstraped sample is used): _True_ def=True


In [31]:
from sklearn.ensemble import RandomForestClassifier
n_estimators = 100
criterion = 'gini'
max_depth = None
min_samples_leaf = 2
min_impurity_decrease = 1E-5
bootstrap=True

seed = 42

rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
rfc.fit(train_X, train_Y)

rfc_p = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease, bootstrap=bootstrap, random_state=seed)
rfc_p.fit(train_Xp, train_Y)

print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(rfc, 'rf_x')
export_model(rfc_p, 'rf_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.82      0.44      0.57     67664
         1.0       0.37      0.78      0.50      4029
         2.0       0.28      0.50      0.36      4164
         3.0       0.75      0.64      0.69     18652
         4.0       0.27      0.64      0.38     18656

    accuracy                           0.52    113165
   macro avg       0.50      0.60      0.50    113165
weighted avg       0.68      0.52      0.55    113165

### Xp ###
               precision    recall  f1-score   support

         0.0       0.81      0.44      0.57     67664
         1.0       0.36      0.76      0.49      4029
         2.0       0.27      0.51      0.35      4164
         3.0       0.74      0.61      0.67     18652
         4.0       0.27      0.64      0.37     18656

    accuracy                           0.52    113165
   macro avg       0.49      0.59      0.49    113165
weighted avg       0.68      0.52      0.54    113165


In [29]:

print_report([rfc, rfc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

### X ###
               precision    recall  f1-score   support

         0.0       0.82      0.42      0.56     67664
         1.0       0.37      0.77      0.50      4029
         2.0       0.28      0.51      0.36      4164
         3.0       0.75      0.64      0.69     18652
         4.0       0.26      0.66      0.38     18656

    accuracy                           0.51    113165
   macro avg       0.50      0.60      0.50    113165
weighted avg       0.68      0.51      0.54    113165

### Xp ###
               precision    recall  f1-score   support

         0.0       0.81      0.42      0.55     67664
         1.0       0.36      0.76      0.49      4029
         2.0       0.27      0.51      0.35      4164
         3.0       0.74      0.61      0.67     18652
         4.0       0.26      0.66      0.38     18656

    accuracy                           0.51    113165
   macro avg       0.49      0.59      0.49    113165
weighted avg       0.68      0.51      0.53    113165


# Boosting
Parameters:
- base\_estimator: None
- n\_estimators: 50
- algorithm: samme.r
- learning\_rate: .3


In [35]:
from sklearn.ensemble import AdaBoostClassifier
base_estimator = None #SVC(kernel='linear')
n_estimators = 50
algo = 'SAMME.R'
learning_rate = .3

bc = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc.fit(train_X, train_Y)

bc_p = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, algorithm=algo, learning_rate=learning_rate)
bc_p.fit(train_Xp, train_Y)

print_report([bc, bc_p], [test_X, test_Xp], test_Y, ["X", "Xp"])

export_model(bc, 'boost_x')
export_model(bc_p, 'boost_xp')

### X ###
               precision    recall  f1-score   support

         0.0       0.70      0.20      0.31     67664
         1.0       0.32      0.68      0.43      4029
         2.0       0.28      0.56      0.38      4164
         3.0       0.75      0.60      0.67     18652
         4.0       0.21      0.70      0.32     18656

    accuracy                           0.38    113165
   macro avg       0.45      0.55      0.42    113165
weighted avg       0.60      0.38      0.38    113165

### Xp ###
               precision    recall  f1-score   support

         0.0       0.66      0.68      0.67     67664
         1.0       0.50      0.38      0.43      4029
         2.0       0.28      0.40      0.33      4164
         3.0       0.59      0.28      0.38     18652
         4.0       0.27      0.38      0.32     18656

    accuracy                           0.54    113165
   macro avg       0.46      0.42      0.42    113165
weighted avg       0.57      0.54      0.54    113165
