In [1]:
import warnings
warnings.filterwarnings("ignore")

import networkx as nx
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np
from IPython.display import display

from node2vec import Node2Vec
from graphrole import RecursiveFeatureExtractor

from sklearn.model_selection import StratifiedKFold,cross_validate,train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import itertools 
from tqdm import tqdm
import optuna
import logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Dim. Red. and Feature Selection:
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

# Engine selectors
from feature_engine.selection import (DropConstantFeatures,DropDuplicateFeatures,SmartCorrelatedSelection,
                                      DropCorrelatedFeatures)


# Models:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

**Step: Reading edgelist and labels, creating graph**

In [2]:
new_g = nx.read_edgelist('stdcommnet.edges')

In [3]:
labels = pd.read_csv('stddepartments.txt', sep=' ', index_col='stdid')

**Step: Feature creation**

In [4]:
degree_centrality = nx.degree_centrality(new_g)
betweenness_centrality = nx.betweenness_centrality(new_g)
closeness_centrality = nx.closeness_centrality(new_g)
pagerank = nx.pagerank(new_g)
clustering_coefficient = nx.clustering(new_g)
eigenvector_centrality = nx.eigenvector_centrality(new_g)
triangles = nx.triangles(new_g)

neighbourhood_size = {}
for node in new_g.nodes():
    neighbourhood_size[node] = new_g.degree(node)

mean_external_connections = {}
for node in new_g.nodes():
    external_connections = sum(new_g.degree(neighbor) for neighbor in new_g.neighbors(node) if neighbor != node)
    mean_external_connections[node] = external_connections / neighbourhood_size[node]

node_centrality_difference = {}
for node in new_g.nodes():
    node_centrality_difference[node] = degree_centrality[node] - eigenvector_centrality[node]

In [5]:
new_features = [closeness_centrality,pagerank,clustering_coefficient,eigenvector_centrality,
                neighbourhood_size,mean_external_connections,node_centrality_difference,triangles]

my_features_df = pd.DataFrame.from_dict(betweenness_centrality, orient ='index', columns=["col1"]) 

for i in new_features:
    df1 = pd.DataFrame.from_dict(i, orient ='index') 
    column_name = f'col{len(my_features_df.columns) + 1}'
    my_features_df[column_name] = df1[0]

In [6]:
my_features_df.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,0.004689,0.40162,0.003665,0.471014,0.035426,26,28.576923,0.039502,130
1,0.003313,0.400231,0.003522,0.383399,0.033586,25,30.36,0.03846,97
316,0.001999,0.370331,0.003445,0.495238,0.023328,23,25.086957,0.042954,104
146,0.005588,0.398393,0.003061,0.279412,0.022315,19,27.157895,0.03244,38
221,0.008583,0.431592,0.005145,0.373874,0.056489,37,32.432432,0.050139,249


# Kitchen sink:

**I think the features extracted from Node2Vec,rolx and 9 feature that I created will be highly correlated. However, since I will use the DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures, SmartCorrelatedSelection functions from the feature engine library in the pipeline, I continue with a method similar to the "kitchen sink".**

In [7]:
def preprocess_for_ml(a,b,c,d,e):
    n2v = Node2Vec(new_g, dimensions=a, num_walks=b, walk_length=c, p=d, q=e)
    n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)
    
    n2vrep = dict()
    for node in new_g.nodes():
        n2vrep[node] = n2v_model.wv[str(node)]
    df_n2v = pd.DataFrame(n2vrep).T

    feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
    rolx_feats = feat_ext.extract_features()
    df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

    df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
    df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
    df_data.columns = df_data.columns.astype(str)

    my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
    kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

    unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
    raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

    X = raw_df.drop(columns=['label'])
    y = raw_df.label
    X_unknown=unknowns.drop(columns=['label'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    return n2v_model, X_train, X_test, y_train_encoded, y_test_encoded, X_unknown

In [8]:
estimator = DecisionTreeClassifier(random_state=42)
scoring = "f1_macro"

**For the dimensions, num_walks, walk_length, p and q values, I will try the following values as combinations.**

**For each trial, I will apply 2 different methods as dimensity reduction and feature selection.**

**I will build models with Random Forest, XGB, QDA and MLP. While choosing these models, I paid attention to their algorithms being 4 different approaches.**

In [9]:
dim_list = [8,10]
walk_list = [12,15]
walk_dist = [8,10]
p_list = [0.5, 0.7]
q_list = [2,3]

# Clfs
classifier_list = [RandomForestClassifier(random_state=42),XGBClassifier(),
                   QuadraticDiscriminantAnalysis(),MLPClassifier(random_state=42)]

classifier_names = ["RFC","XGB","Quad","MLP"]

# Selectors
reduction_list = [PCA(n_components=0.95,random_state=42),RFECV(estimator=estimator,scoring=scoring)]

In [21]:
combins = itertools.product(dim_list,walk_list,walk_dist,p_list,q_list)

**Cross validation with 5 stratified splits:**

In [12]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

**For hyperparameter tuning, I will use optuna** 

In [13]:
def objective(trial):
    if i == 0:  # RandomForestClassifier
        params = {'n_estimators': trial.suggest_int('n_estimators', 150, 300),
                  'max_depth': trial.suggest_int('max_depth', 4,7),
                  'min_samples_split': trial.suggest_int('min_samples_split', 6,8),
                  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
                  'max_features': trial.suggest_categorical('max_features', ['sqrt', 'auto'])}
        
    elif i == 1:  # XGBClassifier
        params = {'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
                  'max_depth': trial.suggest_int('max_depth', 9, 10),
                  'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
                  'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.3),
                  'reg_lambda': trial.suggest_float('reg_lambda', 0.8, 1.0)}
        
    elif i == 2:  # QuadraticDiscriminantAnalysis
        params = {'reg_param': trial.suggest_float('reg_param', 0.4, 1.0)}
        
    elif i == 3:  # MLPClassifier
        params = {'hidden_layer_sizes': trial.suggest_int('hidden_layer_sizes', 100, 150),
                  'activation': trial.suggest_categorical('activation', ['relu']),
                  'solver': trial.suggest_categorical('solver', ['adam']),
                  'alpha': trial.suggest_float('alpha', 0.0002, 0.0005),
                  'learning_rate_init': trial.suggest_float('learning_rate_init', 0.005, 0.010)}

    classifier_list[i].fit(X_train, y_train_encoded)
    preds = classifier_list[i].predict(X_test)
    score = f1_score(y_test_encoded, preds, average='macro')

    return score

In [14]:
def get_best_params():
    if i == 0:
        best_model = RandomForestClassifier(**best_params)
    elif i == 1:
        best_model = XGBClassifier(**best_params)
    elif i == 2:
        best_model = QuadraticDiscriminantAnalysis(**best_params)
    elif i == 3:
        best_model = MLPClassifier(**best_params)
    return best_model

**Trials with loops and collecting results in lists**

In [22]:
dim_results = []
walk_results = []
walk_dist_results = []
p_results = []
q_results = []
selector_results = []
classifier_names = []
val_acc_train_results = []
val_acc_test_results = []
acc_test_results = []
val_f1_train_results = []
val_f1_test_results = []
f1_test_results = []
best_params_results = []

for a,b,c,d,e in tqdm(combins):
    n2v_model, X_train, X_test, y_train_encoded, y_test_encoded, X_unknown = preprocess_for_ml(a, b, c, d, e)                  
    for selector in reduction_list:
        for i in range(4):
            pipeline = Pipeline([
                ('preprocessor',  StandardScaler()),
                ('DropConstant',  DropConstantFeatures(tol=0.99)),
                ('DropDuplicate', DropDuplicateFeatures()),
                ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                ('selector',      selector),
                ('classifier',    classifier_list[i])])

            study = optuna.create_study(direction='maximize')
            study.optimize(objective, n_trials=100)

            best_params = study.best_params
            best_model = get_best_params()
            pipeline.steps[-1] = ('classifier', best_model)

            cv_results = cross_validate(pipeline, X_train, y_train_encoded, scoring=("f1_macro","accuracy"), cv=kf, return_train_score=True)

            val_f1_train = cv_results['train_f1_macro'].mean()
            val_f1_test = cv_results['test_f1_macro'].mean()
            val_acc_train = cv_results['train_accuracy'].mean()
            val_acc_test = cv_results['test_accuracy'].mean()

            model = pipeline.fit(X_train, y_train_encoded)
            preds = model.predict(X_test)
            test_acc_score = accuracy_score(y_test_encoded, preds)
            test_f1_score = f1_score(y_test_encoded, preds, average='macro')


            dim_results.append(a)
            walk_results.append(b)
            walk_dist_results.append(c)
            p_results.append(d)
            q_results.append(e)
            selector_results.append(selector)
            classifier_names.append(classifier_list[i].__class__.__name__)
            val_acc_train_results.append(val_acc_train)
            val_acc_test_results.append(val_acc_test)
            val_f1_train_results.append(val_f1_train)
            val_f1_test_results.append(val_f1_test)
            acc_test_results.append(test_acc_score)
            f1_test_results.append(test_f1_score)
            best_params_results.append(best_params)

0it [00:00, ?it/s]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  42%|███████████████████████▎                                | 5/12 [00:00<00:00, 34.32it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 31.67it/s][A
1it [32:06, 1926.77s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 42.75it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 29.04it/s][A
2it [1:01:59, 1848.08s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  42%|███████████████████████▎                                | 5/12 [00:00<00:00, 39.57it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 33.10it/s][A
3it [1:32:14, 1832.71s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  42%|███████████████████████▎                                | 5/12 [00:00<00:00, 44.71it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 34.56it/s][A
4it [1:58:28, 1730.81s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 4/12 [00:00<00:00, 37.59it/s][A
Generating walks (CPU: 1):  67%|█████████████████████████████████████▎                  | 8/12 [00:00<00:00, 33.35it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 29.41it/s][A
5it [2:08:12, 1317.03s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 4/12 [00:00<00:00, 31.45it/s][A
Generating walks (CPU: 1):  67%|█████████████████████████████████████▎                  | 8/12 [00:00<00:00, 26.00it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 24.56it/s][A
6it [2:17:22, 1056.42s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  25%|██████████████                                          | 3/12 [00:00<00:00, 23.75it/s][A
Generating walks (CPU: 1):  58%|████████████████████████████████▋                       | 7/12 [00:00<00:00, 24.91it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 25.44it/s][A
7it [2:30:37, 970.98s/it] 

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 4/12 [00:00<00:00, 33.37it/s][A
Generating walks (CPU: 1):  67%|█████████████████████████████████████▎                  | 8/12 [00:00<00:00, 28.35it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 27.56it/s][A
8it [2:55:39, 1140.09s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  27%|██████████████▉                                         | 4/15 [00:00<00:00, 33.08it/s][A
Generating walks (CPU: 1):  60%|█████████████████████████████████▌                      | 9/15 [00:00<00:00, 39.96it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 36.13it/s][A
9it [3:20:09, 1242.97s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 5/15 [00:00<00:00, 41.46it/s][A
Generating walks (CPU: 1):  67%|████████████████████████████████████▋                  | 10/15 [00:00<00:00, 40.29it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 35.74it/s][A
10it [3:51:51, 1446.47s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 5/15 [00:00<00:00, 34.20it/s][A
Generating walks (CPU: 1):  60%|█████████████████████████████████▌                      | 9/15 [00:00<00:00, 35.72it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 32.81it/s][A
11it [4:12:45, 1387.63s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 60.23it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 51.92it/s][A
12it [4:16:44, 1038.28s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 5/15 [00:00<00:00, 39.97it/s][A
Generating walks (CPU: 1):  67%|████████████████████████████████████▋                  | 10/15 [00:00<00:00, 40.97it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 36.03it/s][A
13it [4:20:46, 796.91s/it] 

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 54.87it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 39.94it/s][A
14it [4:24:34, 625.17s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 64.22it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 45.83it/s][A
15it [4:28:03, 499.65s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 58.06it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 46.36it/s][A
16it [4:32:28, 429.20s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 59.57it/s][A
17it [4:36:35, 374.33s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 61.38it/s][A
18it [11:26:24, 7650.60s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 60.03it/s][A
19it [11:31:25, 5443.15s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 58.73it/s][A
20it [11:36:49, 3906.19s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 51.49it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 44.94it/s][A
21it [11:48:44, 2948.15s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 48.45it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 42.87it/s][A
22it [11:54:03, 2159.19s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 54.52it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 44.35it/s][A
23it [12:14:47, 1884.71s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 55.76it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 44.31it/s][A
24it [12:20:05, 1414.48s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 68.08it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 58.85it/s][A
25it [12:25:30, 1087.69s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 68.18it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 54.84it/s][A
26it [12:29:22, 830.76s/it] 

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 68.83it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 56.65it/s][A
27it [12:33:16, 651.91s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  33%|██████████████████▋                                     | 5/15 [00:00<00:00, 32.73it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 44.27it/s][A
28it [12:37:24, 530.73s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 52.03it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 45.45it/s][A
29it [12:41:15, 440.66s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 54.80it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 45.45it/s][A
30it [12:45:03, 376.88s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 54.75it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 45.04it/s][A
31it [12:49:20, 340.95s/it]

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 53.66it/s][A
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 46.23it/s][A
32it [12:53:28, 1450.25s/it]


**Combining and displaying results**

In [23]:
columns = ['Model Name', 'Selector',"val_acc_train_results","val_acc_test_results","val_f1_train_results",
           "val_f1_test_results","acc_test_results","f1_test_results","dim_results","walk_results","walk_dist_results",
           "p_results","q_results","best_params_results"]

df_results = pd.DataFrame(zip(classifier_names, selector_results,val_acc_train_results,
                              val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,
                              f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,
                              best_params_results), columns=columns)
df_results

Unnamed: 0,Model Name,Selector,val_acc_train_results,val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results
0,RandomForestClassifier,"PCA(n_components=0.95, random_state=42)",0.985368,0.945859,0.982987,0.939272,0.927083,0.920507,8,12,8,0.5,2,"{'n_estimators': 191, 'max_depth': 4, 'min_sam..."
1,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.000000,0.941111,1.000000,0.939135,0.927083,0.918235,8,12,8,0.5,2,"{'learning_rate': 0.1495269979526077, 'max_dep..."
2,QuadraticDiscriminantAnalysis,"PCA(n_components=0.95, random_state=42)",0.974087,0.959394,0.970399,0.950348,0.937500,0.938967,8,12,8,0.5,2,{'reg_param': 0.7869007446994514}
3,MLPClassifier,"PCA(n_components=0.95, random_state=42)",1.000000,0.972929,1.000000,0.966049,0.979167,0.979114,8,12,8,0.5,2,"{'hidden_layer_sizes': 100, 'activation': 'rel..."
4,RandomForestClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,0.972970,0.945657,0.968644,0.942867,0.937500,0.941344,8,12,8,0.5,2,"{'n_estimators': 263, 'max_depth': 4, 'min_sam..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,MLPClassifier,"PCA(n_components=0.95, random_state=42)",1.000000,0.991010,1.000000,0.987708,0.989583,0.992236,10,15,10,0.7,3,"{'hidden_layer_sizes': 135, 'activation': 'rel..."
252,RandomForestClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,0.980854,0.941212,0.977942,0.927240,0.864583,0.864067,10,15,10,0.7,3,"{'n_estimators': 283, 'max_depth': 4, 'min_sam..."
253,XGBClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,1.000000,0.959495,1.000000,0.950939,0.875000,0.869031,10,15,10,0.7,3,"{'learning_rate': 0.18823863639956118, 'max_de..."
254,QuadraticDiscriminantAnalysis,RFECV(estimator=DecisionTreeClassifier(random_...,0.950448,0.936667,0.945033,0.926965,0.729167,0.692796,10,15,10,0.7,3,{'reg_param': 0.8498019533299637}


**Since the data set is small, I apply the generally accepted overfit control ratio 0.05 to 0.1.**

In [24]:
df_results["checking_overfitting"] = np.where((df_results.val_acc_train_results - df_results.val_acc_test_results > 0.1) | (df_results.val_acc_train_results - df_results.acc_test_results > 0.1),"check","pass")

In [25]:
df_results[df_results.checking_overfitting == "check"]

Unnamed: 0,Model Name,Selector,val_acc_train_results,val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
8,RandomForestClassifier,"PCA(n_components=0.95, random_state=42)",0.989869,0.941212,0.988104,0.93358,0.885417,0.870096,8,12,8,0.5,3,"{'n_estimators': 249, 'max_depth': 6, 'min_sam...",check
9,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.937071,1.0,0.931396,0.895833,0.875078,8,12,8,0.5,3,"{'learning_rate': 0.18175582252326514, 'max_de...",check
65,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.941515,1.0,0.93405,0.895833,0.885745,8,15,8,0.5,2,"{'learning_rate': 0.13422670686569266, 'max_de...",check
89,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.950606,1.0,0.945809,0.895833,0.897622,8,15,8,0.7,3,"{'learning_rate': 0.1725101824559117, 'max_dep...",check
161,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.936869,1.0,0.925827,0.895833,0.888284,10,12,10,0.5,2,"{'learning_rate': 0.1714327705114161, 'max_dep...",check
222,QuadraticDiscriminantAnalysis,RFECV(estimator=DecisionTreeClassifier(random_...,0.925747,0.909394,0.916819,0.893574,0.760417,0.758164,10,15,8,0.7,3,{'reg_param': 0.9015138170563634},check
252,RandomForestClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,0.980854,0.941212,0.977942,0.92724,0.864583,0.864067,10,15,10,0.7,3,"{'n_estimators': 283, 'max_depth': 4, 'min_sam...",check
253,XGBClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,1.0,0.959495,1.0,0.950939,0.875,0.869031,10,15,10,0.7,3,"{'learning_rate': 0.18823863639956118, 'max_de...",check
254,QuadraticDiscriminantAnalysis,RFECV(estimator=DecisionTreeClassifier(random_...,0.950448,0.936667,0.945033,0.926965,0.729167,0.692796,10,15,10,0.7,3,{'reg_param': 0.8498019533299637},check


**Getting the best results for each model type**

In [26]:
res = df_results.sort_values(by="f1_test_results", ascending=False)
res = res.drop_duplicates(subset = "Model Name", keep="first")
res

Unnamed: 0,Model Name,Selector,val_acc_train_results,val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
251,MLPClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.99101,1.0,0.987708,0.989583,0.992236,10,15,10,0.7,3,"{'hidden_layer_sizes': 135, 'activation': 'rel...",pass
184,RandomForestClassifier,"PCA(n_components=0.95, random_state=42)",0.99887,0.959495,0.998795,0.949965,0.989583,0.992236,10,12,10,0.7,3,"{'n_estimators': 286, 'max_depth': 6, 'min_sam...",pass
186,QuadraticDiscriminantAnalysis,"PCA(n_components=0.95, random_state=42)",0.981972,0.959495,0.979386,0.951939,0.979167,0.982788,10,12,10,0.7,3,{'reg_param': 0.5243003025640753},pass
133,XGBClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,1.0,0.941313,1.0,0.934589,0.979167,0.982788,10,12,8,0.5,2,"{'learning_rate': 0.14581582540560722, 'max_de...",pass


In [27]:
len(res[res.checking_overfitting == "check"])

0

**Deployment of MLPClassifier model with best params and making predictions**

In [28]:
res.iloc[:1,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
251,10,15,10,0.7,3,"{'hidden_layer_sizes': 135, 'activation': 'rel...",pass


In [29]:
res.iloc[0,-2]

{'hidden_layer_sizes': 135,
 'activation': 'relu',
 'solver': 'adam',
 'alpha': 0.00031159004134736167,
 'learning_rate_init': 0.0056806300928082575}

In [30]:
n2v = Node2Vec(new_g, dimensions=10, num_walks=15, walk_length=10, p=0.7, q=3)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      PCA(n_components=0.95, random_state=42)),
                    ('classifier',    MLPClassifier(hidden_layer_sizes= 135,
                                                    activation= "relu",
                                                    solver= "adam",
                                                    alpha= 0.00031159004134736167,
                                                    learning_rate_init= 0.0056806300928082575))])

MLPClassifiermodel = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 45.45it/s]


In [43]:
mlp_predictions = X_unknown.copy()
mlp_predictions["label"] = le.inverse_transform(MLPClassifiermodel.predict(X_unknown)).tolist()
mlp_predictions = mlp_predictions[["label"]]

In [46]:
mlp_predictions.to_csv('predictions_mlp.txt', index_label="stdid", sep=' ')

**Deployment of QuadraticDiscriminantAnalysis model with best params and making predictions**

In [35]:
res.iloc[2:3,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
186,10,12,10,0.7,3,{'reg_param': 0.5243003025640753},pass


In [32]:
n2v = Node2Vec(new_g, dimensions=10, num_walks=12, walk_length=10, p=0.7, q=3)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      PCA(n_components=0.95, random_state=42)),
                    ('classifier',    QuadraticDiscriminantAnalysis(reg_param= 0.5243003025640753))])

QUADmodel = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 46.33it/s]


In [48]:
QDA_predictions = X_unknown.copy()
QDA_predictions["label"] = le.inverse_transform(QUADmodel.predict(X_unknown)).tolist()
QDA_predictions = QDA_predictions[["label"]]
QDA_predictions.to_csv('predictions_qda.txt', index_label="stdid", sep=' ')

**Deployment of XGB Classifier model with best params and making predictions**

In [36]:
res.iloc[3:4,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
133,10,12,8,0.5,2,"{'learning_rate': 0.14581582540560722, 'max_de...",pass


In [37]:
res.iloc[3,-2]

{'learning_rate': 0.14581582540560722,
 'max_depth': 10,
 'subsample': 0.7541758139901041,
 'colsample_bytree': 0.6915055905882553,
 'reg_alpha': 0.2906391689199846,
 'reg_lambda': 0.9865505524209566}

In [38]:
n2v = Node2Vec(new_g, dimensions=10, num_walks=12, walk_length=8, p=0.5, q=2)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      RFECV(estimator=estimator,scoring=scoring)),
                    ('classifier',    XGBClassifier(learning_rate=0.14581582540560722,
                                                    max_depth=10,
                                                    subsample=0.7541758139901041,
                                                    colsample_bytree=0.6915055905882553,
                                                    reg_alpha=0.2906391689199846,
                                                    reg_lambda=0.9865505524209566))])


XGBmodel = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 60.39it/s]


In [52]:
xgb_predictions = X_unknown.copy()
xgb_predictions["label"] = le.inverse_transform(XGBmodel.predict(X_unknown)).tolist()
xgb_predictions = xgb_predictions[["label"]]
xgb_predictions.to_csv('predictions_xgb.txt', index_label="stdid", sep=' ')

**Deployment of RandomForestClassifier model with best params and making predictions**

In [40]:
res.iloc[1:2,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
184,10,12,10,0.7,3,"{'n_estimators': 286, 'max_depth': 6, 'min_sam...",pass


In [41]:
res.iloc[1,-2]

{'n_estimators': 286,
 'max_depth': 6,
 'min_samples_split': 7,
 'min_samples_leaf': 1,
 'max_features': 'auto'}

In [42]:
n2v = Node2Vec(new_g, dimensions=10, num_walks=12, walk_length=10, p=0.7, q=3)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      RFECV(estimator=estimator,scoring=scoring)),
                    ('classifier',    RandomForestClassifier(n_estimators=286,
                                                             max_depth=6,
                                                             min_samples_split=7,
                                                             min_samples_leaf=1,
                                                             max_features="auto"))])


random_forest_model = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 45.46it/s]


In [56]:
randomforest_predictions = X_unknown.copy()
randomforest_predictions["label"] = le.inverse_transform(random_forest_model.predict(X_unknown)).tolist()
randomforest_predictions = randomforest_predictions[["label"]]
randomforest_predictions.to_csv('predictions_random_forest.txt', index_label="stdid", sep=' ')

**Saving Models:**

In [59]:
import pickle

filename = 'mlp_deployed_model.ncmodel'
pickle.dump(MLPClassifiermodel, open(filename,'wb'))

In [61]:
filename = 'qda_deployed_model.ncmodel'
pickle.dump(QUADmodel, open(filename,'wb'))

filename = 'xgb_deployed_model.ncmodel'
pickle.dump(XGBmodel, open(filename,'wb'))

filename = 'rf_deployed_model.ncmodel'
pickle.dump(random_forest_model, open(filename,'wb'))