In [7]:
!pip install torch-geometric node2vec

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl.metadata (743 bytes)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: networkx, torch-geometric, node2vec
  Attempting uninstall: networkx
    Found existing ins

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostClassifier,Pool, cv
from node2vec import Node2Vec
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
import multiprocessing
from tqdm.auto import tqdm, trange
from gensim.models import Word2Vec
import random
import networkx as nx
import torch

In [9]:
import warnings

# Hide pandas PerformanceWarning for the remainder of the notebook.
# NOTE(review): presumably raised by the column-by-column inserts in make_df — confirm.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [10]:
from torch_geometric.datasets import Planetoid

# Load the CiteSeer dataset through PyTorch Geometric, using the current directory as root.
dataset = Planetoid(root=".", name="CiteSeer")

# One summary line per dataset statistic.
print(
    f'Number of graphs: {len(dataset)}',
    f'Number of nodes: {dataset[0].x.shape[0]}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    f'Has isolated nodes: {dataset[0].has_isolated_nodes()}',
    sep='\n',
)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...


Number of graphs: 1
Number of nodes: 3327
Number of features: 3703
Number of classes: 6
Has isolated nodes: True


Done!


# CatBoost + SVD | DeepWalk

In [11]:
# Validation and test masks come straight from the dataset.
val_mask = dataset.val_mask
test_mask = dataset.test_mask
# Every node that is in neither the validation nor the test split is used for training.
train_mask = ~(test_mask | val_mask)

In [12]:
# Integer node ids of each split: the positions at which the corresponding mask is True.
train_idx = torch.nonzero(train_mask, as_tuple=True)[0]
val_idx = torch.nonzero(val_mask, as_tuple=True)[0]
test_idx = torch.nonzero(test_mask, as_tuple=True)[0]

In [13]:
# Node features and labels of each split as NumPy arrays.
train_feats, train_label = dataset.x[train_mask].numpy(), dataset.y[train_mask].numpy()
val_feats, val_label = dataset.x[val_mask].numpy(), dataset.y[val_mask].numpy()
test_feats, test_label = dataset.x[test_mask].numpy(), dataset.y[test_mask].numpy()

In [14]:
# Truncated SVD settings: project the node features onto 64 components.
svd_params = dict(
    n_components=64,
    algorithm='randomized',
    n_iter=5,
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=56,
)

svd = TruncatedSVD(**svd_params)

# The decomposition is fitted on the training rows only; every other matrix is just projected.
train_svd_feats = svd.fit_transform(train_feats)
val_svd_feats = svd.transform(val_feats)
test_svd_feats = svd.transform(test_feats)
# Projection of every node, indexed by node id (used for the neighbour aggregates).
all_feats = svd.transform(dataset.x.numpy())

In [15]:
# node id -> class label for training nodes, -1 for every other node,
# so validation/test labels are never exposed through this mapping.
train_id2_label = {}

for i, (mask, label) in enumerate(zip(train_mask, dataset.y)):
    train_id2_label[i] = label.item() if mask else -1

In [16]:
# Undirected graph built from the edge list only: nodes that appear in no edge
# are therefore absent from G (they are collected separately as isolated nodes).
G = nx.from_edgelist(dataset.edge_index.T.numpy())

In [17]:
# Node ids that never occur in an edge and are therefore missing from G.
isolated_nodes = [i for i in range(dataset.x.shape[0]) if not G.has_node(i)]

In [18]:
def random_walk(G, node, walk_length,used_nodes=True):
    """Sample one random walk that starts at ``node``.

    Parameters
    ----------
    G : graph-like
        Object exposing ``neighbors(node)`` (e.g. a ``networkx.Graph``).
    node : hashable
        Start node; it is always the first element of the walk.
    walk_length : int
        Maximum number of nodes in the walk, start node included.
    used_nodes : bool
        When True, nodes already on the walk are never revisited.

    Returns
    -------
    list
        The visited nodes in order. The walk is shorter than ``walk_length``
        when it reaches a node with no eligible neighbour.
    """
    walk = [node]

    for _ in range(walk_length - 1):
        neighbors = list(G.neighbors(node))
        if used_nodes:
            # Self-avoiding walk: drop everything that was already visited.
            neighbors = list(set(neighbors) - set(walk))

        # Dead end. The check applies to both modes: previously a node without
        # neighbours crashed random.choice when used_nodes=False.
        if not neighbors:
            break

        node = random.choice(neighbors)
        walk.append(node)

    return walk

def get_random_walks(G, num_iters=20, walk_length=10, used_nodes=True, del_repeats = False):
    """Sample ``num_iters`` random walks from every node of ``G``.

    Each walk is produced by ``random_walk`` and returned as a list of node ids
    converted to strings. With ``del_repeats=True`` a walk whose exact node
    sequence was already collected is skipped.
    """
    collected = []
    seen_keys = set()
    for start in tqdm(G.nodes):
        for _ in range(num_iters):
            path = [
                str(step)
                for step in random_walk(G, start, walk_length=walk_length, used_nodes=used_nodes)
            ]
            if del_repeats:
                # The joined node sequence identifies the walk.
                key = '_'.join(path)
                if key in seen_keys:
                    continue
                seen_keys.add(key)
            collected.append(path)

    return collected


In [19]:
# Seed Python's RNG first: the walks are drawn with random.choice, so without a
# seed the corpus (and every embedding trained on it) changes on each run.
random.seed(56)
random_walks = get_random_walks(G,num_iters=20,walk_length=10,used_nodes=True,del_repeats=False)

  0%|          | 0/3279 [00:00<?, ?it/s]

In [20]:
# Skip-gram (sg=1) with negative sampling (hs=0, negative=20) producing 16-dimensional vectors.
w2v_params = dict(
    vector_size=16,
    alpha=0.025,
    window=4,
    sg=1,
    hs=0,
    negative=20,
    min_alpha=0.0005,
    workers=4,
    seed=56,
)

w2v_model = Word2Vec(**w2v_params)
# The vocabulary is the set of node-id strings that occur in the walks.
w2v_model.build_vocab(random_walks)

In [24]:
# Train the Word2Vec model on the sampled walks for 20 epochs.
# NOTE(review): corpus_count is presumably set by the earlier build_vocab call — confirm.
# NOTE(review): re-running this cell trains the same model further instead of restarting — confirm intended.
w2v_model.train(random_walks,
                total_examples=w2v_model.corpus_count,
                epochs=20,
                report_delay=1)

(7268081, 7307780)

In [29]:
def calc_feature_aggregation(node,G=G,nfeats=64,isolated_nodes=isolated_nodes,all_feats=all_feats,agg=np.mean,null_value=-100):
    """Aggregate the ``all_feats`` rows of the neighbours of ``node``.

    Returns ``[null_value] * nfeats`` when ``node`` is listed in
    ``isolated_nodes``; otherwise ``agg(rows, axis=0)`` over the neighbour rows,
    converted to a plain list.

    The default arguments are bound when this cell is executed, so the cell has
    to be re-run after ``G``, ``isolated_nodes`` or ``all_feats`` are rebuilt.
    """
    if node in isolated_nodes:
        return [null_value] * nfeats
    neighbor_rows = [all_feats[nbr] for nbr in G.neighbors(node)]
    return agg(neighbor_rows, axis=0).tolist()
    

def calc_label_features(node,G=G,isolated_nodes=isolated_nodes,train_id2_label=train_id2_label,num_classes=6):
    """Describe the label distribution among the neighbours of ``node``.

    Returns a tuple of ``num_classes + 2`` values:

    * position 0: share of neighbours whose label is unknown (``-1``),
    * positions ``1 .. num_classes``: share of neighbours per class,
    * last position: number of neighbours.

    Nodes listed in ``isolated_nodes`` (or with no neighbours at all) yield an
    all-zero tuple, so the return type is the same in every case and no
    division by zero can occur.

    The default arguments are bound when this cell is executed, so the cell has
    to be re-run after ``G``, ``isolated_nodes`` or ``train_id2_label`` change.
    """
    # Slot 0 counts the label -1 (unknown); class c is counted in slot c + 1.
    counts = [0] * (num_classes + 1)
    if node not in isolated_nodes:
        for nbr in G.neighbors(node):
            counts[train_id2_label[nbr] + 1] += 1

    total = sum(counts)
    if total == 0:
        return tuple([0] * (num_classes + 2))

    return tuple(count / total for count in counts) + (total,)

def calc_deepwalk_feats(node,w2v_model=w2v_model,isolated_nodes=isolated_nodes,nfeats=16):
    """Return the embedding of ``node`` from ``w2v_model`` as a list.

    Nodes listed in ``isolated_nodes`` get a zero vector of length ``nfeats``;
    every other node is looked up under its string id.
    """
    is_isolated = node in isolated_nodes
    return [0] * nfeats if is_isolated else w2v_model.wv[str(node)].tolist()
        
        

def make_df(feats, idxes, num_classes=6,agg_type='mean'):
    """Assemble the model input frame for one data split.

    Parameters
    ----------
    feats : 2-D array
        SVD features of the split, one row per node in ``idxes``.
    idxes : sequence of node ids
        Graph ids of the rows of ``feats``.
    num_classes : int
        Number of target classes; forwarded to ``calc_label_features``.
    agg_type : str
        Name of the NumPy reduction (``'mean'``, ``'max'``, ...) applied to the
        neighbours' SVD features. It now selects the reduction itself instead of
        only appearing in the column names.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``svd_feature_*``, ``target_*_percent`` plus
        ``count_neighbours``, ``svd_{agg_type}_feature_*``, ``deepwalk_feature_*``.

    Raises
    ------
    ValueError
        If ``agg_type`` is not the name of a callable in NumPy.
    """
    agg = getattr(np, agg_type, None)
    if not callable(agg):
        raise ValueError(f"Unknown agg_type {agg_type!r}: expected the name of a NumPy reduction such as 'mean'")
    nfeats = feats.shape[1]

    df = pd.DataFrame()
    df['id'] = idxes
    for i in range(nfeats):
        df[f'svd_feature_{i}'] = feats[:,i]

    nearest_feature_cols = [f'target_{i}_percent' for i in range(-1,num_classes)] + ['count_neighbours']
    svd_agg_features = [f'svd_{agg_type}_feature_{i}' for i in range(nfeats)]
    # 16 matches the vector_size the Word2Vec model is configured with.
    deepwalk_cols = [f'deepwalk_feature_{i}' for i in range(16)]

    # Forward the settings so the helpers agree with the column names built above.
    df[nearest_feature_cols] = [calc_label_features(x, num_classes=num_classes) for x in df['id']]
    df[svd_agg_features] = [calc_feature_aggregation(x, nfeats=nfeats, agg=agg) for x in df['id']]
    df[deepwalk_cols] = [calc_deepwalk_feats(x) for x in df['id']]
    return df.drop(['id'],axis=1)

# Build the train / validation / test frames with the same recipe.
train_df, val_df, test_df = (
    make_df(split_feats, split_idx)
    for split_feats, split_idx in (
        (train_svd_feats, train_idx),
        (val_svd_feats, val_idx),
        (test_svd_feats, test_idx),
    )
)

In [32]:
# Column groups used to pick model inputs; the names must match those created in make_df.
# Raw SVD components of the node itself.
svd_features = [f'svd_feature_{i}' for i in range(64)]
# Neighbour label shares (index -1 is the "unknown label" share) plus the neighbour count.
nearest_feature_cols = [f'target_{i}_percent' for i in range(-1,6)] + ['count_neighbours']
# Mean of the neighbours' SVD components.
svd_agg_features = [f'svd_mean_feature_{i}' for i in range(64)] 
# DeepWalk embedding dimensions.
deepwalk_cols = [f'deepwalk_feature_{i}' for i in range(16)] 

## Only DeepWalk Embeddings

In [33]:
# CatBoost pools restricted to the DeepWalk embedding columns.
train_pool = Pool(train_df[deepwalk_cols], label=train_label)
eval_pool = Pool(val_df[deepwalk_cols], label=val_label)
test_pool = Pool(test_df[deepwalk_cols], label=test_label)

In [34]:
# Multi-class CatBoost model; accuracy on the validation pool is reported every 100 iterations.
params = dict(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MultiClass',
    max_depth=5,
    eval_metric='Accuracy',
    random_seed=56,
)

cbm = CatBoostClassifier(**params)
cbm.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.3880679	test: 0.3980000	best: 0.3980000 (0)	total: 63.1ms	remaining: 1m 3s
100:	learn: 0.6502463	test: 0.6260000	best: 0.6300000 (99)	total: 714ms	remaining: 6.36s
200:	learn: 0.7077176	test: 0.6340000	best: 0.6380000 (184)	total: 1.35s	remaining: 5.35s
300:	learn: 0.7580733	test: 0.6560000	best: 0.6600000 (288)	total: 1.98s	remaining: 4.59s
400:	learn: 0.7909141	test: 0.6520000	best: 0.6620000 (326)	total: 2.6s	remaining: 3.88s
500:	learn: 0.8139026	test: 0.6620000	best: 0.6620000 (326)	total: 3.25s	remaining: 3.23s
600:	learn: 0.8286809	test: 0.6660000	best: 0.6660000 (588)	total: 3.86s	remaining: 2.56s
700:	learn: 0.8609743	test: 0.6660000	best: 0.6660000 (588)	total: 4.49s	remaining: 1.91s
800:	learn: 0.8773946	test: 0.6760000	best: 0.6820000 (793)	total: 5.1s	remaining: 1.27s
900:	learn: 0.8949097	test: 0.6740000	best: 0.6820000 (793)	total: 5.73s	remaining: 630ms
999:	learn: 0.9080460	test: 0.6720000	best: 0.6820000 (793)	total: 6.35s	remaining: 0us

bestTest = 0.682


<catboost.core.CatBoostClassifier at 0x795223941db0>

In [35]:
# Feature importances of the fitted model as a table.
cbm.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,deepwalk_feature_2,8.247549
1,deepwalk_feature_11,7.959306
2,deepwalk_feature_8,7.331457
3,deepwalk_feature_15,6.854136
4,deepwalk_feature_14,6.505173
5,deepwalk_feature_10,6.499609
6,deepwalk_feature_9,6.458985
7,deepwalk_feature_13,6.246761
8,deepwalk_feature_4,6.201245
9,deepwalk_feature_7,6.038603


In [36]:
# Accuracy of the fitted model on the held-out test split.
y_p = cbm.predict(test_pool)
accuracy_score(test_label,y_p)

0.654

## DeepWalk + Other Features

In [37]:
# CatBoost pools with every feature group: own SVD components, neighbour label
# shares, neighbour SVD means and DeepWalk embeddings.
train_pool = Pool(
    train_df[svd_features + nearest_feature_cols + svd_agg_features + deepwalk_cols],
    label=train_label,
)
eval_pool = Pool(
    val_df[svd_features + nearest_feature_cols + svd_agg_features + deepwalk_cols],
    label=val_label,
)
test_pool = Pool(
    test_df[svd_features + nearest_feature_cols + svd_agg_features + deepwalk_cols],
    label=test_label,
)

In [39]:
# Same CatBoost configuration as the embedding-only run, now fitted on all feature groups.
params = dict(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MultiClass',
    max_depth=5,
    eval_metric='Accuracy',
    random_seed=56,
)

cbm = CatBoostClassifier(**params)
cbm.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.6228790	test: 0.6440000	best: 0.6440000 (0)	total: 86.3ms	remaining: 1m 26s
100:	learn: 0.7509579	test: 0.7460000	best: 0.7460000 (88)	total: 4.72s	remaining: 42s
200:	learn: 0.8056924	test: 0.7560000	best: 0.7560000 (137)	total: 9.29s	remaining: 36.9s
300:	learn: 0.8401752	test: 0.7540000	best: 0.7600000 (288)	total: 13.8s	remaining: 32.1s
400:	learn: 0.8790367	test: 0.7580000	best: 0.7620000 (351)	total: 18.7s	remaining: 27.9s
500:	learn: 0.9025725	test: 0.7620000	best: 0.7620000 (351)	total: 23.2s	remaining: 23.1s
600:	learn: 0.9211823	test: 0.7640000	best: 0.7660000 (540)	total: 27.7s	remaining: 18.4s
700:	learn: 0.9392447	test: 0.7660000	best: 0.7680000 (658)	total: 32.2s	remaining: 13.7s
800:	learn: 0.9512863	test: 0.7680000	best: 0.7720000 (724)	total: 36.7s	remaining: 9.12s
900:	learn: 0.9633279	test: 0.7680000	best: 0.7720000 (724)	total: 41.2s	remaining: 4.52s
999:	learn: 0.9715380	test: 0.7640000	best: 0.7720000 (724)	total: 45.6s	remaining: 0us

bestTest = 0.772

<catboost.core.CatBoostClassifier at 0x795223943250>

In [40]:
# Top 60 rows of the feature-importance table of the fitted model.
cbm.get_feature_importance(prettified=True).head(60)

Unnamed: 0,Feature Id,Importances
0,svd_feature_1,9.270254
1,svd_mean_feature_1,6.271712
2,svd_feature_3,5.831403
3,svd_mean_feature_4,4.963228
4,target_2_percent,4.143901
5,target_5_percent,4.005407
6,svd_feature_4,3.996377
7,svd_mean_feature_7,2.434328
8,target_3_percent,2.327123
9,svd_mean_feature_3,2.162977


In [43]:
# Accuracy of the fitted model on the held-out test split.
y_p = cbm.predict(test_pool)
accuracy_score(test_label,y_p)

0.794

# CatBoost + Node2Vec

In [129]:
# Same split as in the DeepWalk section, recomputed so this section can be run on its own.
val_mask = dataset.val_mask
test_mask = dataset.test_mask
# Every node that is in neither the validation nor the test split is used for training.
train_mask = ~(test_mask | val_mask)

In [130]:
# Integer node ids of each split: the positions at which the corresponding mask is True.
train_idx = torch.nonzero(train_mask, as_tuple=True)[0]
val_idx = torch.nonzero(val_mask, as_tuple=True)[0]
test_idx = torch.nonzero(test_mask, as_tuple=True)[0]

In [131]:
# Node features and labels of each split as NumPy arrays.
train_feats, train_label = dataset.x[train_mask].numpy(), dataset.y[train_mask].numpy()
val_feats, val_label = dataset.x[val_mask].numpy(), dataset.y[val_mask].numpy()
test_feats, test_label = dataset.x[test_mask].numpy(), dataset.y[test_mask].numpy()

In [132]:
# Truncated SVD settings: project the node features onto 64 components.
svd_params = dict(
    n_components=64,
    algorithm='randomized',
    n_iter=5,
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=56,
)

svd = TruncatedSVD(**svd_params)

# The decomposition is fitted on the training rows only; every other matrix is just projected.
train_svd_feats = svd.fit_transform(train_feats)
val_svd_feats = svd.transform(val_feats)
test_svd_feats = svd.transform(test_feats)
# Projection of every node, indexed by node id (used for the neighbour aggregates).
all_feats = svd.transform(dataset.x.numpy())

In [133]:
# node id -> class label for training nodes, -1 for every other node,
# so validation/test labels are never exposed through this mapping.
train_id2_label = {}

for i, (mask, label) in enumerate(zip(train_mask, dataset.y)):
    train_id2_label[i] = label.item() if mask else -1

In [134]:
# Undirected graph built from the edge list only: nodes that appear in no edge
# are therefore absent from G (they are collected separately as isolated nodes).
G = nx.from_edgelist(dataset.edge_index.T.numpy())

In [135]:
# Node ids that never occur in an edge and are therefore missing from G.
isolated_nodes = [i for i in range(dataset.x.shape[0]) if not G.has_node(i)]

In [136]:
# Node2Vec settings: 32-dimensional embeddings from 20 walks of length 20 per node.
# NOTE(review): with p = q = 1.0 the walk bias is presumably neutral (DeepWalk-like) — confirm against the node2vec docs.
# NOTE(review): no seed is passed, so the generated walks presumably differ between runs — confirm.
n2v_params = {'dimensions':32,
              'walk_length':20,
              'num_walks':20,
              'p':1.0,
              'q':1.0,
              'workers':4}

# Constructing the object already computes transition probabilities and generates the walks
# (see the progress output of this cell).
node2vec = Node2Vec(G,**n2v_params)

Computing transition probabilities:   0%|          | 0/3279 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:02<00:00,  1.79it/s]
Generating walks (CPU: 2): 100%|██████████| 5/5 [00:02<00:00,  1.84it/s]
Generating walks (CPU: 3): 100%|██████████| 5/5 [00:02<00:00,  1.74it/s]
Generating walks (CPU: 4): 100%|██████████| 5/5 [00:02<00:00,  1.94it/s]


In [137]:
# Settings forwarded to the embedding fit on the generated walks.
skip_gram_params = dict(
    vector_size=32,
    alpha=0.025,
    min_alpha=0.0005,
    window=4,
    seed=56,
)

node2vec_model = node2vec.fit(**skip_gram_params)

In [139]:
def calc_feature_aggregation(node,G=G,nfeats=64,isolated_nodes=isolated_nodes,all_feats=all_feats,agg=np.mean,null_value=-100):
    """Aggregate the ``all_feats`` rows of the neighbours of ``node``.

    Returns ``[null_value] * nfeats`` when ``node`` is listed in
    ``isolated_nodes``; otherwise ``agg(rows, axis=0)`` over the neighbour rows,
    converted to a plain list.

    The default arguments are bound when this cell is executed, so the cell has
    to be re-run after ``G``, ``isolated_nodes`` or ``all_feats`` are rebuilt.
    """
    if node in isolated_nodes:
        return [null_value] * nfeats
    neighbor_rows = [all_feats[nbr] for nbr in G.neighbors(node)]
    return agg(neighbor_rows, axis=0).tolist()
    

def calc_label_features(node,G=G,isolated_nodes=isolated_nodes,train_id2_label=train_id2_label,num_classes=6):
    """Describe the label distribution among the neighbours of ``node``.

    Returns a tuple of ``num_classes + 2`` values:

    * position 0: share of neighbours whose label is unknown (``-1``),
    * positions ``1 .. num_classes``: share of neighbours per class,
    * last position: number of neighbours.

    Nodes listed in ``isolated_nodes`` (or with no neighbours at all) yield an
    all-zero tuple, so the return type is the same in every case and no
    division by zero can occur.

    The default arguments are bound when this cell is executed, so the cell has
    to be re-run after ``G``, ``isolated_nodes`` or ``train_id2_label`` change.
    """
    # Slot 0 counts the label -1 (unknown); class c is counted in slot c + 1.
    counts = [0] * (num_classes + 1)
    if node not in isolated_nodes:
        for nbr in G.neighbors(node):
            counts[train_id2_label[nbr] + 1] += 1

    total = sum(counts)
    if total == 0:
        return tuple([0] * (num_classes + 2))

    return tuple(count / total for count in counts) + (total,)

def calc_node2vec_cols_feats(node,w2v_model=node2vec_model,isolated_nodes=isolated_nodes,nfeats=32):
    """Return the embedding of ``node`` from ``w2v_model`` as a list.

    Nodes listed in ``isolated_nodes`` get a zero vector of length ``nfeats``;
    every other node is looked up under its string id.
    """
    is_isolated = node in isolated_nodes
    return [0] * nfeats if is_isolated else w2v_model.wv[str(node)].tolist()
        
        

def make_df(feats, idxes, num_classes=6,agg_type='mean'):
    """Assemble the model input frame for one data split.

    Parameters
    ----------
    feats : 2-D array
        SVD features of the split, one row per node in ``idxes``.
    idxes : sequence of node ids
        Graph ids of the rows of ``feats``.
    num_classes : int
        Number of target classes; forwarded to ``calc_label_features``.
    agg_type : str
        Name of the NumPy reduction (``'mean'``, ``'max'``, ...) applied to the
        neighbours' SVD features. It now selects the reduction itself instead of
        only appearing in the column names.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``svd_feature_*``, ``target_*_percent`` plus
        ``count_neighbours``, ``svd_{agg_type}_feature_*``, ``node2vec_feature_*``.

    Raises
    ------
    ValueError
        If ``agg_type`` is not the name of a callable in NumPy.
    """
    agg = getattr(np, agg_type, None)
    if not callable(agg):
        raise ValueError(f"Unknown agg_type {agg_type!r}: expected the name of a NumPy reduction such as 'mean'")
    nfeats = feats.shape[1]

    df = pd.DataFrame()
    df['id'] = idxes
    for i in range(nfeats):
        df[f'svd_feature_{i}'] = feats[:,i]

    nearest_feature_cols = [f'target_{i}_percent' for i in range(-1,num_classes)] + ['count_neighbours']
    svd_agg_features = [f'svd_{agg_type}_feature_{i}' for i in range(nfeats)]
    # 32 matches the embedding size the Node2Vec model is configured with.
    node2vec_cols = [f'node2vec_feature_{i}' for i in range(32)]

    # Forward the settings so the helpers agree with the column names built above.
    df[nearest_feature_cols] = [calc_label_features(x, num_classes=num_classes) for x in df['id']]
    df[svd_agg_features] = [calc_feature_aggregation(x, nfeats=nfeats, agg=agg) for x in df['id']]
    df[node2vec_cols] = [calc_node2vec_cols_feats(x) for x in df['id']]
    return df.drop(['id'],axis=1)

# Build the train / validation / test frames with the same recipe.
train_df, val_df, test_df = (
    make_df(split_feats, split_idx)
    for split_feats, split_idx in (
        (train_svd_feats, train_idx),
        (val_svd_feats, val_idx),
        (test_svd_feats, test_idx),
    )
)

In [140]:
# Column groups used to pick model inputs; the names must match those created in make_df.
# Raw SVD components of the node itself.
svd_features = [f'svd_feature_{i}' for i in range(64)]
# Neighbour label shares (index -1 is the "unknown label" share) plus the neighbour count.
nearest_feature_cols = [f'target_{i}_percent' for i in range(-1,6)] + ['count_neighbours']
# Mean of the neighbours' SVD components.
svd_agg_features = [f'svd_mean_feature_{i}' for i in range(64)] 
# Node2Vec embedding dimensions.
node2vec_cols = [f'node2vec_feature_{i}' for i in range(32)] 

## Only Node2Vec

In [141]:
# CatBoost pools restricted to the Node2Vec embedding columns.
train_pool = Pool(train_df[node2vec_cols], label=train_label)
eval_pool = Pool(val_df[node2vec_cols], label=val_label)
test_pool = Pool(test_df[node2vec_cols], label=test_label)

In [142]:
# Multi-class CatBoost model; accuracy on the validation pool is reported every 100 iterations.
params = dict(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MultiClass',
    max_depth=5,
    eval_metric='Accuracy',
    random_seed=56,
)

cbm = CatBoostClassifier(**params)
cbm.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.3804050	test: 0.3960000	best: 0.3960000 (0)	total: 17.5ms	remaining: 17.5s
100:	learn: 0.7307061	test: 0.6380000	best: 0.6380000 (100)	total: 1.51s	remaining: 13.5s
200:	learn: 0.7892720	test: 0.6560000	best: 0.6620000 (173)	total: 2.75s	remaining: 10.9s
300:	learn: 0.8248495	test: 0.6760000	best: 0.6820000 (295)	total: 3.9s	remaining: 9.05s
400:	learn: 0.8472906	test: 0.6860000	best: 0.6920000 (361)	total: 5.03s	remaining: 7.52s
500:	learn: 0.8648057	test: 0.6880000	best: 0.6920000 (361)	total: 6.16s	remaining: 6.14s
600:	learn: 0.8768473	test: 0.6960000	best: 0.6960000 (600)	total: 7.26s	remaining: 4.82s
700:	learn: 0.8905309	test: 0.7000000	best: 0.7040000 (651)	total: 8.36s	remaining: 3.57s
800:	learn: 0.8954570	test: 0.7100000	best: 0.7120000 (797)	total: 9.47s	remaining: 2.35s
900:	learn: 0.9069513	test: 0.7080000	best: 0.7120000 (797)	total: 10.6s	remaining: 1.16s
999:	learn: 0.9157088	test: 0.7100000	best: 0.7120000 (797)	total: 11.7s	remaining: 0us

bestTest = 0.71

<catboost.core.CatBoostClassifier at 0x7951b5f89600>

In [143]:
# Top 60 rows of the feature-importance table of the fitted model.
cbm.get_feature_importance(prettified=True).head(60)

Unnamed: 0,Feature Id,Importances
0,node2vec_feature_11,5.577847
1,node2vec_feature_8,4.553667
2,node2vec_feature_28,4.444986
3,node2vec_feature_24,4.406313
4,node2vec_feature_31,4.055256
5,node2vec_feature_17,3.941912
6,node2vec_feature_4,3.703332
7,node2vec_feature_19,3.487502
8,node2vec_feature_12,3.48723
9,node2vec_feature_29,3.408939


In [145]:
# Accuracy of the fitted model on the held-out test split.
y_p = cbm.predict(test_pool)
accuracy_score(test_label,y_p)

0.717

## Node2Vec + Other Features

In [125]:
# CatBoost pools with every feature group: own SVD components, neighbour label
# shares, neighbour SVD means and Node2Vec embeddings.
# The columns are selected explicitly (as in the DeepWalk experiment) instead of
# passing the whole frame, so the model input does not silently change if the
# frames gain extra columns or are left over from another section.
train_pool = Pool(data=train_df[svd_features + nearest_feature_cols + svd_agg_features + node2vec_cols],
                  label=train_label)

eval_pool = Pool(data=val_df[svd_features + nearest_feature_cols + svd_agg_features + node2vec_cols],
                  label=val_label)

test_pool = Pool(data=test_df[svd_features + nearest_feature_cols + svd_agg_features + node2vec_cols],
                  label=test_label)

In [126]:
# Same CatBoost configuration as the embedding-only run, now fitted on all feature groups.
params = dict(
    iterations=1000,
    learning_rate=0.05,
    loss_function='MultiClass',
    max_depth=5,
    eval_metric='Accuracy',
    random_seed=56,
)

cbm = CatBoostClassifier(**params)
cbm.fit(train_pool, eval_set=eval_pool, verbose=100)

0:	learn: 0.6097427	test: 0.6500000	best: 0.6500000 (0)	total: 85.1ms	remaining: 1m 24s
100:	learn: 0.7498632	test: 0.7360000	best: 0.7380000 (85)	total: 5.65s	remaining: 50.3s
200:	learn: 0.8035030	test: 0.7500000	best: 0.7580000 (197)	total: 11.1s	remaining: 44.3s
300:	learn: 0.8374384	test: 0.7620000	best: 0.7680000 (276)	total: 16.6s	remaining: 38.5s
400:	learn: 0.8773946	test: 0.7800000	best: 0.7820000 (392)	total: 22s	remaining: 32.8s
500:	learn: 0.8976464	test: 0.7780000	best: 0.7840000 (411)	total: 27.8s	remaining: 27.7s
600:	learn: 0.9173508	test: 0.7740000	best: 0.7840000 (411)	total: 33.2s	remaining: 22s
700:	learn: 0.9315818	test: 0.7740000	best: 0.7840000 (411)	total: 38.6s	remaining: 16.4s
800:	learn: 0.9501916	test: 0.7720000	best: 0.7840000 (411)	total: 44s	remaining: 10.9s
900:	learn: 0.9578544	test: 0.7720000	best: 0.7840000 (411)	total: 49.4s	remaining: 5.43s
999:	learn: 0.9638752	test: 0.7740000	best: 0.7840000 (411)	total: 55.1s	remaining: 0us

bestTest = 0.784
bes

<catboost.core.CatBoostClassifier at 0x7951a3b5ad40>

In [127]:
# Top 15 rows of the feature-importance table of the fitted model.
cbm.get_feature_importance(prettified=True).head(15)

Unnamed: 0,Feature Id,Importances
0,svd_feature_1,12.002587
1,svd_mean_feature_1,6.53744
2,svd_feature_3,6.455113
3,svd_feature_4,5.968317
4,svd_mean_feature_4,5.613895
5,target_2_percent,4.095894
6,target_5_percent,3.817288
7,svd_mean_feature_3,3.455191
8,svd_mean_feature_7,2.834739
9,target_3_percent,2.377998


In [128]:
# Accuracy of the fitted model on the held-out test split.
y_p = cbm.predict(test_pool)
accuracy_score(test_label,y_p)

0.79

# Final Results

In [147]:
# Summary table of the four experiments.
# NOTE(review): 'Experements' is a misspelling of 'Experiments'; kept as-is because it is the rendered column name.
Experements = ['CatBoost + DeepWalk','CatBoost + SVD + DeepWalk',
               'CatBoost + Node2Vec','CatBoost + SVD + Node2Vec']

# The scores are typed in by hand from the cell outputs above (test accuracy and best
# validation accuracy of each run); they are not recomputed and must be updated after a re-run.
TestScores = [0.654,0.794,0.717,0.79]
ValScores = [0.682,0.772,0.712,0.784]

pd.DataFrame({'Experements':Experements,'TestScores':TestScores,'ValScores':ValScores})

Unnamed: 0,Experements,TestScores,ValScores
0,CatBoost + DeepWalk,0.654,0.682
1,CatBoost + SVD + DeepWalk,0.794,0.772
2,CatBoost + Node2Vec,0.717,0.712
3,CatBoost + SVD + Node2Vec,0.79,0.784
