In [17]:
import warnings
warnings.filterwarnings("ignore")

import networkx as nx
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np
from IPython.display import display

from node2vec import Node2Vec
from graphrole import RecursiveFeatureExtractor

from sklearn.model_selection import StratifiedKFold,cross_validate,train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
import optuna
import logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Dim. Red. and Feature Selection:
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

# Engine selectors
from feature_engine.selection import (DropConstantFeatures,DropDuplicateFeatures,SmartCorrelatedSelection,
                                      DropCorrelatedFeatures)


# Models:
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

**Step: Reading edgelist and labels, creating graph**

In [2]:
new_g = nx.read_edgelist('stdcommnet.edges')

In [3]:
labels = pd.read_csv('stddepartments.txt', sep=' ', index_col='stdid')

**Step: Feature creation**

In [4]:
degree_centrality = nx.degree_centrality(new_g)
betweenness_centrality = nx.betweenness_centrality(new_g)
closeness_centrality = nx.closeness_centrality(new_g)
pagerank = nx.pagerank(new_g)
clustering_coefficient = nx.clustering(new_g)
eigenvector_centrality = nx.eigenvector_centrality(new_g)
triangles = nx.triangles(new_g)

neighbourhood_size = {}
for node in new_g.nodes():
    neighbourhood_size[node] = new_g.degree(node)

mean_external_connections = {}
for node in new_g.nodes():
    external_connections = sum(new_g.degree(neighbor) for neighbor in new_g.neighbors(node) if neighbor != node)
    mean_external_connections[node] = external_connections / neighbourhood_size[node]

node_centrality_difference = {}
for node in new_g.nodes():
    node_centrality_difference[node] = degree_centrality[node] - eigenvector_centrality[node]

In [5]:
new_features = [closeness_centrality,pagerank,clustering_coefficient,eigenvector_centrality,
                neighbourhood_size,mean_external_connections,node_centrality_difference,triangles]

my_features_df = pd.DataFrame.from_dict(betweenness_centrality, orient ='index', columns=["col1"]) 

for i in new_features:
    df1 = pd.DataFrame.from_dict(i, orient ='index') 
    column_name = f'col{len(my_features_df.columns) + 1}'
    my_features_df[column_name] = df1[0]

In [6]:
my_features_df.head()

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9
0,0.004689,0.40162,0.003665,0.471014,0.035426,26,28.576923,0.039502,130
1,0.003313,0.400231,0.003522,0.383399,0.033586,25,30.36,0.03846,97
316,0.001999,0.370331,0.003445,0.495238,0.023328,23,25.086957,0.042954,104
146,0.005588,0.398393,0.003061,0.279412,0.022315,19,27.157895,0.03244,38
221,0.008583,0.431592,0.005145,0.373874,0.056489,37,32.432432,0.050139,249


# Kitchen sink:

**I think the features extracted from Node2Vec,rolx and 9 feature that I created will be highly correlated. However, since I will use the DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures, SmartCorrelatedSelection functions from the feature engine library in the pipeline, I continue with a method similar to the "kitchen sink".**

In [7]:
def preprocess_for_ml(a,b,c,d,e):
    n2v = Node2Vec(new_g, dimensions=a, num_walks=b, walk_length=c, p=d, q=e)
    n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)
    
    n2vrep = dict()
    for node in new_g.nodes():
        n2vrep[node] = n2v_model.wv[str(node)]
    df_n2v = pd.DataFrame(n2vrep).T

    feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
    rolx_feats = feat_ext.extract_features()
    df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

    df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
    df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
    df_data.columns = df_data.columns.astype(str)

    my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
    kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

    unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
    raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

    X = raw_df.drop(columns=['label'])
    y = raw_df.label
    X_unknown=unknowns.drop(columns=['label'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    return n2v_model, X_train, X_test, y_train_encoded, y_test_encoded, X_unknown

In [8]:
estimator = DecisionTreeClassifier(random_state=42)
scoring = "f1_macro"

**For the dimensions, num_walks, walk_length, p and q values, I will try the following values as combinations.**

**For each trial, I will apply 2 different methods as dimensity reduction and feature selection.**

**I will build models with Random Forest, XGB, QDA and MLP. While choosing these models, I paid attention to their algorithms being 4 different approaches.**

In [9]:
dim_list = [8,10]
walk_list = [12,15]
walk_dist = [8,10]
p_list = [0.5, 0.7]
q_list = [2,3]

# Clfs
classifier_list = [RandomForestClassifier(random_state=42),XGBClassifier(),
                   QuadraticDiscriminantAnalysis(),MLPClassifier(random_state=42)]

classifier_names = ["RFC","XGB","Quad","MLP"]

# Selectors
reduction_list = [PCA(n_components=0.95,random_state=42),RFECV(estimator=estimator,scoring=scoring)]

**Cross validation with 5 stratified splits:**

In [10]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

**For hyperparameter tuning, I will use optuna** 

In [11]:
def objective(trial):
    if i == 0:  # RandomForestClassifier
        params = {'n_estimators': trial.suggest_int('n_estimators', 150, 300),
                  'max_depth': trial.suggest_int('max_depth', 4,7),
                  'min_samples_split': trial.suggest_int('min_samples_split', 6,8),
                  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
                  'max_features': trial.suggest_categorical('max_features', ['sqrt', 'auto'])}
        
    elif i == 1:  # XGBClassifier
        params = {'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.2),
                  'max_depth': trial.suggest_int('max_depth', 9, 10),
                  'subsample': trial.suggest_float('subsample', 0.7, 1.0),
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
                  'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.3),
                  'reg_lambda': trial.suggest_float('reg_lambda', 0.8, 1.0)}
        
    elif i == 2:  # QuadraticDiscriminantAnalysis
        params = {'reg_param': trial.suggest_float('reg_param', 0.4, 1.0)}
        
    elif i == 3:  # MLPClassifier
        params = {'hidden_layer_sizes': trial.suggest_int('hidden_layer_sizes', 100, 150),
                  'activation': trial.suggest_categorical('activation', ['relu']),
                  'solver': trial.suggest_categorical('solver', ['adam']),
                  'alpha': trial.suggest_float('alpha', 0.0002, 0.0005),
                  'learning_rate_init': trial.suggest_float('learning_rate_init', 0.005, 0.010)}

    classifier_list[i].fit(X_train, y_train_encoded)
    preds = classifier_list[i].predict(X_test)
    score = f1_score(y_test_encoded, preds, average='macro')

    return score

In [12]:
def get_best_params():
    if i == 0:
        best_model = RandomForestClassifier(**best_params)
    elif i == 1:
        best_model = XGBClassifier(**best_params)
    elif i == 2:
        best_model = QuadraticDiscriminantAnalysis(**best_params)
    elif i == 3:
        best_model = MLPClassifier(**best_params)
    return best_model

**Trials with loops and collecting results in lists**

In [18]:
dim_results = []
walk_results = []
walk_dist_results = []
p_results = []
q_results = []
selector_results = []
classifier_names = []
val_acc_train_results = []
val_acc_test_results = []
acc_test_results = []
val_f1_train_results = []
val_f1_test_results = []
f1_test_results = []
best_params_results = []


for a in tqdm(dim_list):
    for b in tqdm(walk_list):
        for c in tqdm(walk_dist):
            for d in tqdm(p_list):
                for e in tqdm(q_list):
                    n2v_model, X_train, X_test, y_train_encoded, y_test_encoded, X_unknown = preprocess_for_ml(a, b, c, d, e)                  
                    for selector in reduction_list:
                        for i in range(4):
                            pipeline = Pipeline([
                                ('preprocessor',  StandardScaler()),
                                ('DropConstant',  DropConstantFeatures(tol=0.99)),
                                ('DropDuplicate', DropDuplicateFeatures()),
                                ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                                ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                                ('selector',      selector),
                                ('classifier',    classifier_list[i])])

                            study = optuna.create_study(direction='maximize')
                            study.optimize(objective, n_trials=100)

                            best_params = study.best_params
                            best_model = get_best_params()
                            pipeline.steps[-1] = ('classifier', best_model)

                            cv_results = cross_validate(pipeline, X_train, y_train_encoded, scoring=("f1_macro","accuracy"), cv=kf, return_train_score=True)

                            val_f1_train = cv_results['train_f1_macro'].mean()
                            val_f1_test = cv_results['test_f1_macro'].mean()
                            val_acc_train = cv_results['train_accuracy'].mean()
                            val_acc_test = cv_results['test_accuracy'].mean()

                            model = pipeline.fit(X_train, y_train_encoded)
                            preds = model.predict(X_test)
                            test_acc_score = accuracy_score(y_test_encoded, preds)
                            test_f1_score = f1_score(y_test_encoded, preds, average='macro')


                            dim_results.append(a)
                            walk_results.append(b)
                            walk_dist_results.append(c)
                            p_results.append(d)
                            q_results.append(e)
                            selector_results.append(selector)
                            classifier_names.append(classifier_list[i].__class__.__name__)
                            val_acc_train_results.append(val_acc_train)
                            val_acc_test_results.append(val_acc_test)
                            val_f1_train_results.append(val_f1_train)
                            val_f1_test_results.append(val_f1_test)
                            acc_test_results.append(test_acc_score)
                            f1_test_results.append(test_f1_score)
                            best_params_results.append(best_params)

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A

  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 62.53it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:52<03:52, 232.75s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 49.47it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:57<00:00, 208.96s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [06:57<06:57, 417.92s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 59.20it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:09<03:09, 189.74s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 51.15it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:34<00:00, 197.14s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [13:32<00:00, 406.10s/it][A[A[A


 50%|█████████████████████████████████████████▌                                         | 1/2 [13:32<13:32, 812.21s/it][A[A


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 54.56it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 40.27it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:06<03:06, 186.16s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 54.54it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 47.14it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:32<00:00, 196.02s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [06:32<06:32, 392.04s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 56.63it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 43.30it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:41<03:41, 221.15s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  42%|███████████████████████▎                                | 5/12 [00:00<00:00, 48.53it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 40.18it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:06<00:00, 213.26s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [13:38<00:00, 409.29s/it][A[A[A


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [27:10<00:00, 815.40s/it][A[A

 50%|█████████████████████████████████████████                                         | 1/2 [27:10<27:10, 1630.80s/it][A

  0%|                                                        

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 64.19it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 60.15it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:49<03:49, 229.89s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 52.34it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 35.60it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:15<00:00, 217.94s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [07:15<07:15, 435.88s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 63.53it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 54.52it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:45<03:45, 225.53s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  53%|█████████████████████████████▊                          | 8/15 [00:00<00:00, 68.25it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 57.02it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:28<00:00, 224.30s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [14:44<00:00, 442.25s/it][A[A[A


 50%|█████████████████████████████████████████▌                                         | 1/2 [14:44<14:44, 884.49s/it][A[A


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A



  0%|                                               

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 54.81it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 47.76it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:44<03:44, 224.02s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  33%|██████████████████▋                                     | 5/15 [00:00<00:00, 42.19it/s][A[A[A[A[A




Generating walks (CPU: 1):  67%|████████████████████████████████████▋                  | 10/15 [00:00<00:00, 40.91it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 30.59it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:09<00:00, 214.81s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [07:09<07:09, 429.62s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 54.73it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 43.38it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:20<03:20, 200.68s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 55.71it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 45.22it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:46<00:00, 203.33s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [13:56<00:00, 418.14s/it][A[A[A


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [28:40<00:00, 860.39s/it][A[A

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [55:51<00:00, 1675.80s/it][A
 50%|█████████████████████████████████████████                

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 54.50it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:26<03:26, 206.54s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 64.13it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:40<00:00, 200.49s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [06:40<06:40, 400.98s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 49.69it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:53<03:53, 233.60s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 57.23it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:47<00:00, 203.63s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [13:28<00:00, 404.13s/it][A[A[A


 50%|█████████████████████████████████████████▌                                         | 1/2 [13:28<13:28, 808.26s/it][A[A


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 58.12it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 45.74it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [04:02<04:02, 242.36s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 57.06it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 42.71it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:35<00:00, 227.57s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [07:35<07:35, 455.15s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 45.88it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [04:14<04:14, 254.44s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/12 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  50%|████████████████████████████                            | 6/12 [00:00<00:00, 56.03it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 43.76it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:39<00:00, 229.53s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [15:14<00:00, 457.11s/it][A[A[A


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [28:42<00:00, 861.24s/it][A[A

 50%|█████████████████████████████████████████                                         | 1/2 [28:42<28:42, 1722.48s/it][A

  0%|                                                        

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 61.13it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:40<03:40, 220.27s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  33%|██████████████████▋                                     | 5/15 [00:00<00:00, 30.02it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 41.27it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:28<00:00, 224.32s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [07:28<07:28, 448.66s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 73.04it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:43<03:43, 223.12s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 57.77it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:40<00:00, 230.36s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [15:09<00:00, 454.70s/it][A[A[A


 50%|█████████████████████████████████████████▌                                         | 1/2 [15:09<15:09, 909.40s/it][A[A


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 53.37it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 44.08it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:39<03:39, 219.92s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  47%|██████████████████████████▏                             | 7/15 [00:00<00:00, 54.59it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 44.57it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [07:20<00:00, 220.30s/it][A[A[A[A



 50%|█████████████████████████████████████████▌                                         | 1/2 [07:20<07:20, 440.59s/it][A[A[A



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 54.81it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 43.69it/s][A[A[A[A[A




 50%|█████████████████████████████████████████▌                                         | 1/2 [03:24<03:24, 204.22s/it][A[A[A[A

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]






Generating walks (CPU: 1):   0%|                                                                | 0/15 [00:00<?, ?it/s][A[A[A[A[A




Generating walks (CPU: 1):  40%|██████████████████████▍                                 | 6/15 [00:00<00:00, 56.72it/s][A[A[A[A[A




Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 42.45it/s][A[A[A[A[A




100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:52<00:00, 206.14s/it][A[A[A[A



100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [14:12<00:00, 426.44s/it][A[A[A


100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [29:22<00:00, 881.14s/it][A[A

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [58:04<00:00, 1742.39s/it][A
100%|█████████████████████████████████████████████████████████

**Combining and displaying results**

In [19]:
columns = ['Model Name', 'Selector',"val_acc_train_results","val_acc_test_results","val_f1_train_results",
           "val_f1_test_results","acc_test_results","f1_test_results","dim_results","walk_results","walk_dist_results",
           "p_results","q_results","best_params_results"]

df_results = pd.DataFrame(zip(classifier_names, selector_results,val_acc_train_results,
                              val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,
                              f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,
                              best_params_results), columns=columns)
df_results

Unnamed: 0,Model Name,Selector,val_acc_train_results,val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results
0,RandomForestClassifier,"PCA(n_components=0.95, random_state=42)",0.974094,0.941212,0.969381,0.935952,0.937500,0.933775,8,12,8,0.5,2,"{'n_estimators': 300, 'max_depth': 4, 'min_sam..."
1,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.000000,0.950404,1.000000,0.946391,0.968750,0.965656,8,12,8,0.5,2,"{'learning_rate': 0.1940683966265389, 'max_dep..."
2,QuadraticDiscriminantAnalysis,"PCA(n_components=0.95, random_state=42)",0.984225,0.959495,0.980867,0.950219,0.968750,0.969676,8,12,8,0.5,2,{'reg_param': 0.5485729152035468}
3,MLPClassifier,"PCA(n_components=0.95, random_state=42)",1.000000,0.977576,1.000000,0.972830,0.979167,0.979114,8,12,8,0.5,2,"{'hidden_layer_sizes': 132, 'activation': 'rel..."
4,RandomForestClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,0.994376,0.959394,0.992921,0.947173,0.979167,0.979114,8,12,8,0.5,2,"{'n_estimators': 171, 'max_depth': 5, 'min_sam..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,MLPClassifier,"PCA(n_components=0.95, random_state=42)",1.000000,0.977576,1.000000,0.972729,0.968750,0.969676,10,15,10,0.7,3,"{'hidden_layer_sizes': 120, 'activation': 'rel..."
252,RandomForestClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,0.986485,0.959394,0.985468,0.952242,0.958333,0.961227,10,15,10,0.7,3,"{'n_estimators': 152, 'max_depth': 5, 'min_sam..."
253,XGBClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,1.000000,0.959394,1.000000,0.954008,0.968750,0.968941,10,15,10,0.7,3,"{'learning_rate': 0.17745275529475676, 'max_de..."
254,QuadraticDiscriminantAnalysis,RFECV(estimator=DecisionTreeClassifier(random_...,0.962826,0.950404,0.958823,0.948546,0.937500,0.942899,10,15,10,0.7,3,{'reg_param': 0.6519125842893825}


**Since the data set is small, I apply the generally accepted overfit control ratio 0.05 to 0.1.**

In [26]:
df_results["checking_overfitting"] = np.where((df_results.val_acc_train_results - df_results.val_acc_test_results > 0.1) | (df_results.val_acc_train_results - df_results.acc_test_results > 0.1),"check","pass")

In [27]:
df_results[df_results.checking_overfitting == "check"]

Unnamed: 0,Model Name,Selector,val_acc_train_results,val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
25,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.954949,1.0,0.947284,0.885417,0.880961,8,12,8,0.7,3,"{'learning_rate': 0.10900187323173956, 'max_de...",check
40,RandomForestClassifier,"PCA(n_components=0.95, random_state=42)",0.97636,0.94596,0.972711,0.938622,0.84375,0.826115,8,12,10,0.5,3,"{'n_estimators': 179, 'max_depth': 4, 'min_sam...",check
105,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.959495,1.0,0.949626,0.885417,0.875638,8,15,10,0.5,3,"{'learning_rate': 0.17502076121709725, 'max_de...",check
153,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.972929,1.0,0.971565,0.895833,0.887073,10,12,8,0.7,3,"{'learning_rate': 0.12261745084921966, 'max_de...",check
185,XGBClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.959495,1.0,0.952118,0.895833,0.879629,10,12,10,0.7,3,"{'learning_rate': 0.13264237183439287, 'max_de...",check


**Getting the best results for each model type**

In [28]:
res = df_results.sort_values(by="f1_test_results", ascending=False)
res = res.drop_duplicates(subset = "Model Name", keep="first")
res

Unnamed: 0,Model Name,Selector,val_acc_train_results,val_acc_test_results,val_f1_train_results,val_f1_test_results,acc_test_results,f1_test_results,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
51,MLPClassifier,"PCA(n_components=0.95, random_state=42)",1.0,0.968586,1.0,0.961847,0.989583,0.992236,8,12,10,0.7,2,"{'hidden_layer_sizes': 144, 'activation': 'rel...",pass
82,QuadraticDiscriminantAnalysis,"PCA(n_components=0.95, random_state=42)",0.981978,0.963838,0.97913,0.957636,0.989583,0.992236,8,15,8,0.7,2,{'reg_param': 0.4807542577650866},pass
45,XGBClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,1.0,0.954848,1.0,0.944489,0.979167,0.981441,8,12,10,0.5,3,"{'learning_rate': 0.12052219901072066, 'max_de...",pass
4,RandomForestClassifier,RFECV(estimator=DecisionTreeClassifier(random_...,0.994376,0.959394,0.992921,0.947173,0.979167,0.979114,8,12,8,0.5,2,"{'n_estimators': 171, 'max_depth': 5, 'min_sam...",pass


In [31]:
len(res[res.checking_overfitting == "check"])

0

**Deployment of MLPClassifier model with best params and making predictions**

In [72]:
res.iloc[:1,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
51,8,12,10,0.7,2,"{'hidden_layer_sizes': 144, 'activation': 'rel...",pass


In [34]:
res.iloc[0,-2]

{'hidden_layer_sizes': 144,
 'activation': 'relu',
 'solver': 'adam',
 'alpha': 0.0003345382135550441,
 'learning_rate_init': 0.007146153618921432}

In [38]:
n2v = Node2Vec(new_g, dimensions=8, num_walks=12, walk_length=10, p=0.7, q=2)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      PCA(n_components=0.95, random_state=42)),
                    ('classifier',    MLPClassifier(hidden_layer_sizes= 144,
                                                    activation= "relu",
                                                    solver= "adam",
                                                    alpha= 0.0003345382135550441,
                                                    learning_rate_init= 0.007146153618921432))])

MLPClassifiermodel = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 41.46it/s]


In [43]:
mlp_predictions = X_unknown.copy()
mlp_predictions["label"] = le.inverse_transform(MLPClassifiermodel.predict(X_unknown)).tolist()
mlp_predictions = mlp_predictions[["label"]]

In [46]:
mlp_predictions.to_csv('predictions_mlp.txt', index_label="stdid", sep=' ')

**Deployment of QuadraticDiscriminantAnalysis model with best params and making predictions**

In [73]:
res.iloc[1:2,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
82,8,15,8,0.7,2,{'reg_param': 0.4807542577650866},pass


In [47]:
n2v = Node2Vec(new_g, dimensions=8, num_walks=15, walk_length=8, p=0.7, q=2)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      PCA(n_components=0.95, random_state=42)),
                    ('classifier',    QuadraticDiscriminantAnalysis(reg_param= 0.4807542577650866))])

QUADmodel = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 15/15 [00:00<00:00, 51.30it/s]


In [48]:
QDA_predictions = X_unknown.copy()
QDA_predictions["label"] = le.inverse_transform(QUADmodel.predict(X_unknown)).tolist()
QDA_predictions = QDA_predictions[["label"]]
QDA_predictions.to_csv('predictions_qda.txt', index_label="stdid", sep=' ')

**Deployment of XGB Classifier model with best params and making predictions**

In [76]:
res.iloc[2:3,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
45,8,12,10,0.5,3,"{'learning_rate': 0.12052219901072066, 'max_de...",pass


In [50]:
res.iloc[2,-2]

{'learning_rate': 0.12052219901072066,
 'max_depth': 9,
 'subsample': 0.9360415701197875,
 'colsample_bytree': 0.7867526912279533,
 'reg_alpha': 0.07221562761736805,
 'reg_lambda': 0.8968124921684337}

In [51]:
n2v = Node2Vec(new_g, dimensions=8, num_walks=12, walk_length=10, p=0.5, q=3)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      RFECV(estimator=estimator,scoring=scoring)),
                    ('classifier',    XGBClassifier(learning_rate=0.12052219901072066,
                                                    max_depth=9,
                                                    subsample=0.9360415701197875,
                                                    colsample_bytree=0.7867526912279533,
                                                    reg_alpha=0.07221562761736805,
                                                    reg_lambda=0.8968124921684337))])


XGBmodel = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 48.09it/s]


In [52]:
xgb_predictions = X_unknown.copy()
xgb_predictions["label"] = le.inverse_transform(XGBmodel.predict(X_unknown)).tolist()
xgb_predictions = xgb_predictions[["label"]]
xgb_predictions.to_csv('predictions_xgb.txt', index_label="stdid", sep=' ')

**Deployment of RandomForestClassifier model with best params and making predictions**

In [77]:
res.iloc[3:4,8:]

Unnamed: 0,dim_results,walk_results,walk_dist_results,p_results,q_results,best_params_results,checking_overfitting
4,8,12,8,0.5,2,"{'n_estimators': 171, 'max_depth': 5, 'min_sam...",pass


In [54]:
res.iloc[3,-2]

{'n_estimators': 171,
 'max_depth': 5,
 'min_samples_split': 7,
 'min_samples_leaf': 3,
 'max_features': 'auto'}

In [55]:
n2v = Node2Vec(new_g, dimensions=8, num_walks=12, walk_length=8, p=0.5, q=2)
n2v_model = n2v.fit(window=4, min_count=1, batch_words=4)

n2vrep = dict()
for node in new_g.nodes():
    n2vrep[node] = n2v_model.wv[str(node)]
df_n2v = pd.DataFrame(n2vrep).T

feat_ext = RecursiveFeatureExtractor(new_g, max_generations=5)
rolx_feats = feat_ext.extract_features()
df_feats_all = pd.merge(df_n2v, rolx_feats, left_index=True, right_index=True)

df_feats_all.index = df_feats_all.index.astype(np.int64, copy=True)
df_data = pd.merge(df_feats_all, labels, left_index=True, right_index=True)
df_data.columns = df_data.columns.astype(str)

my_features_df.index = my_features_df.index.astype(np.int64, copy=True)
kitchen_sink_df = pd.merge(df_data, my_features_df, left_index=True, right_index=True)

unknowns = kitchen_sink_df[kitchen_sink_df.label=='unknown']
raw_df = kitchen_sink_df[kitchen_sink_df.label!='unknown']

X = raw_df.drop(columns=['label'])
y = raw_df.label
X_unknown=unknowns.drop(columns=['label'])

le = LabelEncoder()
y_encoded = le.fit_transform(y)

pipeline = Pipeline([
                    ('preprocessor',  StandardScaler()),
                    ('DropConstant',  DropConstantFeatures(tol=0.99)),
                    ('DropDuplicate', DropDuplicateFeatures()),
                    ('DropCorr',      DropCorrelatedFeatures(threshold=0.90)),
                    ('DropSmart',     SmartCorrelatedSelection(threshold=0.80, cv=3)),
                    ('selector',      RFECV(estimator=estimator,scoring=scoring)),
                    ('classifier',    RandomForestClassifier(n_estimators=171,
                                                             max_depth=5,
                                                             min_samples_split=7,
                                                             min_samples_leaf=3,
                                                             max_features="auto"))])


random_forest_model = pipeline.fit(X, y_encoded)

Computing transition probabilities:   0%|          | 0/348 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 12/12 [00:00<00:00, 54.75it/s]


In [56]:
randomforest_predictions = X_unknown.copy()
randomforest_predictions["label"] = le.inverse_transform(random_forest_model.predict(X_unknown)).tolist()
randomforest_predictions = randomforest_predictions[["label"]]
randomforest_predictions.to_csv('predictions_random_forest.txt', index_label="stdid", sep=' ')

**Saving Models:**

In [59]:
import pickle

filename = 'mlp_deployed_model.ncmodel'
pickle.dump(MLPClassifiermodel, open(filename,'wb'))

In [61]:
filename = 'qda_deployed_model.ncmodel'
pickle.dump(QUADmodel, open(filename,'wb'))

filename = 'xgb_deployed_model.ncmodel'
pickle.dump(XGBmodel, open(filename,'wb'))

filename = 'rf_deployed_model.ncmodel'
pickle.dump(random_forest_model, open(filename,'wb'))