### Libraries

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random, os
import networkx as nx
import pandas as pd
import json

### Fix the seed

In [2]:
def seed_everything(seed: int):    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = True

In [3]:
SEED = 5051
seed_everything(SEED)

### Get the graph, add target and music

In [4]:
# Load the JSON file
with open('lastfm_asia_features.json', 'r') as file:
    json_data = json.load(file)

In [5]:
graph_csv = "lastfm_asia_edges.csv"
target_csv = "lastfm_asia_target.csv"

graph_df = pd.read_csv(graph_csv)
target_df = pd.read_csv(target_csv)

# Create a new graph
graph = nx.Graph()

# Add nodes from the graph file
nodes = set(graph_df['node_1']).union(set(graph_df['node_2']))
graph.add_nodes_from(nodes)

# Add edges from the graph file
edges = graph_df[['node_1', 'node_2']].values
graph.add_edges_from(edges)

# Add target information to graph nodes
target_mapping = dict(target_df[['id', 'target']].values)
nx.set_node_attributes(graph, target_mapping, 'target')

G = graph
del graph

In [6]:
# Iterate over the JSON data
for node_id, attributes in json_data.items():
    if int(node_id) in G.nodes:
        G.nodes[int(node_id)]["music"] = attributes

### Check

In [7]:
assert len(G.edges) == 27806, "Not enough edges"
assert len(G.nodes) == 7624, "Not enough nodes"

### Divide the graph on train/val/test

In [8]:
# Split the graph into train, validation, and test sets based on targets
nodes = list(G.nodes())
targets = [G.nodes[node]['target'] for node in nodes]

nodes_train_val, nodes_test, targets_train_val, targets_test = train_test_split(
    nodes, targets, test_size=0.2, random_state=SEED)
nodes_train, nodes_val, targets_train, targets_val = train_test_split(
    nodes_train_val, targets_train_val, test_size=0.1, random_state=SEED)

In [9]:
# Create separate graphs for train, validation, and test sets
graph_train = G.subgraph(nodes_train)
graph_val = G.subgraph(nodes_val)
graph_test = G.subgraph(nodes_test)

In [10]:
# Check, that we have same splits
assert list(graph_train.nodes)[0:5] == [0, 2, 3, 4, 7], "Something is wrong"

In [12]:
# Check, that you have target and music there
# graph_train.nodes[2]

### node2vec

In [88]:
from node2vec import Node2Vec
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [58]:
g_emb = Node2Vec(G, seed=SEED, workers=4)
mdl = g_emb.fit()
emb_df = (pd.DataFrame([mdl.wv.get_vector(str(n)) for n in G.nodes()], index = G.nodes))

emb_df_train = emb_df.loc[nodes_train].copy()
emb_df_test = emb_df.loc[nodes_test].copy()
emb_df_val = emb_df.loc[nodes_val].copy()

HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=7624.0, style=Pr…




### different boostings

In [91]:
model = LGBMClassifier(random_state=SEED, max_depth=3, n_estimators=1000, learning_rate=0.05)
model.fit(emb_df_train, targets_train, early_stopping_rounds=10, eval_set=[(emb_df_val, targets_val)],
          eval_metric='multi_logloss')
preds_train = model.predict(emb_df_train)
preds_test = model.predict(emb_df_test)
accuracy_score(targets_train, preds_train), accuracy_score(targets_test, preds_test)

[1]	valid_0's multi_logloss: 2.18375
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 2.07778
[3]	valid_0's multi_logloss: 1.99298
[4]	valid_0's multi_logloss: 1.91799
[5]	valid_0's multi_logloss: 1.85268
[6]	valid_0's multi_logloss: 1.79249
[7]	valid_0's multi_logloss: 1.7392
[8]	valid_0's multi_logloss: 1.68772
[9]	valid_0's multi_logloss: 1.642
[10]	valid_0's multi_logloss: 1.59543
[11]	valid_0's multi_logloss: 1.55385
[12]	valid_0's multi_logloss: 1.51497
[13]	valid_0's multi_logloss: 1.47854
[14]	valid_0's multi_logloss: 1.44543
[15]	valid_0's multi_logloss: 1.41282
[16]	valid_0's multi_logloss: 1.38072
[17]	valid_0's multi_logloss: 1.35249
[18]	valid_0's multi_logloss: 1.32371
[19]	valid_0's multi_logloss: 1.29798
[20]	valid_0's multi_logloss: 1.27286
[21]	valid_0's multi_logloss: 1.24953
[22]	valid_0's multi_logloss: 1.22665
[23]	valid_0's multi_logloss: 1.20503
[24]	valid_0's multi_logloss: 1.18398
[25]	valid_0's multi_logloss: 1.16437
[

[210]	valid_0's multi_logloss: 0.594891
[211]	valid_0's multi_logloss: 0.594822
[212]	valid_0's multi_logloss: 0.594638
[213]	valid_0's multi_logloss: 0.59455
[214]	valid_0's multi_logloss: 0.59447
[215]	valid_0's multi_logloss: 0.594313
[216]	valid_0's multi_logloss: 0.594057
[217]	valid_0's multi_logloss: 0.593537
[218]	valid_0's multi_logloss: 0.593462
[219]	valid_0's multi_logloss: 0.593695
[220]	valid_0's multi_logloss: 0.593162
[221]	valid_0's multi_logloss: 0.593088
[222]	valid_0's multi_logloss: 0.592963
[223]	valid_0's multi_logloss: 0.592821
[224]	valid_0's multi_logloss: 0.592684
[225]	valid_0's multi_logloss: 0.592891
[226]	valid_0's multi_logloss: 0.592552
[227]	valid_0's multi_logloss: 0.592414
[228]	valid_0's multi_logloss: 0.592339
[229]	valid_0's multi_logloss: 0.591892
[230]	valid_0's multi_logloss: 0.591625
[231]	valid_0's multi_logloss: 0.591147
[232]	valid_0's multi_logloss: 0.590838
[233]	valid_0's multi_logloss: 0.591016
[234]	valid_0's multi_logloss: 0.591311
[2

(0.9635634906175988, 0.8708196721311475)

In [97]:
model = CatBoostClassifier(random_state=SEED, max_depth=3, n_estimators=1000)
model.fit(emb_df_train, targets_train, early_stopping_rounds=10, eval_set=[(emb_df_val, targets_val)])
preds_train = model.predict(emb_df_train)
preds_test = model.predict(emb_df_test)
accuracy_score(targets_train, preds_train), accuracy_score(targets_test, preds_test)

Learning rate set to 0.111717
0:	learn: 2.5369248	test: 2.5366067	best: 2.5366067 (0)	total: 24.4ms	remaining: 24.4s
1:	learn: 2.3940660	test: 2.3900120	best: 2.3900120 (1)	total: 46.1ms	remaining: 23s
2:	learn: 2.2423963	test: 2.2287882	best: 2.2287882 (2)	total: 71.1ms	remaining: 23.6s
3:	learn: 2.1360130	test: 2.1212845	best: 2.1212845 (3)	total: 93.6ms	remaining: 23.3s
4:	learn: 2.0379577	test: 2.0176105	best: 2.0176105 (4)	total: 117ms	remaining: 23.3s
5:	learn: 1.9410588	test: 1.9240606	best: 1.9240606 (5)	total: 140ms	remaining: 23.3s
6:	learn: 1.8797966	test: 1.8644174	best: 1.8644174 (6)	total: 163ms	remaining: 23.2s
7:	learn: 1.8020812	test: 1.7915366	best: 1.7915366 (7)	total: 188ms	remaining: 23.3s
8:	learn: 1.7344163	test: 1.7226104	best: 1.7226104 (8)	total: 213ms	remaining: 23.5s
9:	learn: 1.6844860	test: 1.6738049	best: 1.6738049 (9)	total: 236ms	remaining: 23.4s
10:	learn: 1.6288022	test: 1.6224828	best: 1.6224828 (10)	total: 260ms	remaining: 23.4s
11:	learn: 1.5857767

98:	learn: 0.6345541	test: 0.7185258	best: 0.7185258 (98)	total: 3.07s	remaining: 27.9s
99:	learn: 0.6329715	test: 0.7163474	best: 0.7163474 (99)	total: 3.11s	remaining: 28s
100:	learn: 0.6306995	test: 0.7142783	best: 0.7142783 (100)	total: 3.16s	remaining: 28.1s
101:	learn: 0.6286292	test: 0.7123863	best: 0.7123863 (101)	total: 3.21s	remaining: 28.2s
102:	learn: 0.6276260	test: 0.7109930	best: 0.7109930 (102)	total: 3.25s	remaining: 28.3s
103:	learn: 0.6255219	test: 0.7092883	best: 0.7092883 (103)	total: 3.29s	remaining: 28.4s
104:	learn: 0.6235053	test: 0.7079815	best: 0.7079815 (104)	total: 3.33s	remaining: 28.4s
105:	learn: 0.6219467	test: 0.7065414	best: 0.7065414 (105)	total: 3.38s	remaining: 28.5s
106:	learn: 0.6209925	test: 0.7056314	best: 0.7056314 (106)	total: 3.42s	remaining: 28.5s
107:	learn: 0.6197018	test: 0.7047017	best: 0.7047017 (107)	total: 3.46s	remaining: 28.6s
108:	learn: 0.6188772	test: 0.7046765	best: 0.7046765 (108)	total: 3.5s	remaining: 28.6s
109:	learn: 0.617

193:	learn: 0.5356264	test: 0.6404486	best: 0.6404486 (193)	total: 7.17s	remaining: 29.8s
194:	learn: 0.5353275	test: 0.6403127	best: 0.6403127 (194)	total: 7.21s	remaining: 29.8s
195:	learn: 0.5346917	test: 0.6397766	best: 0.6397766 (195)	total: 7.25s	remaining: 29.8s
196:	learn: 0.5343355	test: 0.6395043	best: 0.6395043 (196)	total: 7.3s	remaining: 29.7s
197:	learn: 0.5336523	test: 0.6398132	best: 0.6395043 (196)	total: 7.34s	remaining: 29.7s
198:	learn: 0.5323013	test: 0.6390172	best: 0.6390172 (198)	total: 7.38s	remaining: 29.7s
199:	learn: 0.5317774	test: 0.6382714	best: 0.6382714 (199)	total: 7.42s	remaining: 29.7s
200:	learn: 0.5309434	test: 0.6374465	best: 0.6374465 (200)	total: 7.46s	remaining: 29.7s
201:	learn: 0.5300088	test: 0.6371468	best: 0.6371468 (201)	total: 7.5s	remaining: 29.6s
202:	learn: 0.5295857	test: 0.6367551	best: 0.6367551 (202)	total: 7.54s	remaining: 29.6s
203:	learn: 0.5291568	test: 0.6360492	best: 0.6360492 (203)	total: 7.58s	remaining: 29.6s
204:	learn: 

289:	learn: 0.4826615	test: 0.6159289	best: 0.6159289 (289)	total: 11.1s	remaining: 27.2s
290:	learn: 0.4822979	test: 0.6159036	best: 0.6159036 (290)	total: 11.2s	remaining: 27.2s
291:	learn: 0.4818641	test: 0.6157512	best: 0.6157512 (291)	total: 11.2s	remaining: 27.2s
292:	learn: 0.4814616	test: 0.6156894	best: 0.6156894 (292)	total: 11.3s	remaining: 27.2s
293:	learn: 0.4807518	test: 0.6150590	best: 0.6150590 (293)	total: 11.3s	remaining: 27.1s
294:	learn: 0.4802682	test: 0.6150101	best: 0.6150101 (294)	total: 11.3s	remaining: 27.1s
295:	learn: 0.4797985	test: 0.6150106	best: 0.6150101 (294)	total: 11.4s	remaining: 27.1s
296:	learn: 0.4794058	test: 0.6150799	best: 0.6150101 (294)	total: 11.4s	remaining: 27s
297:	learn: 0.4792033	test: 0.6149711	best: 0.6149711 (297)	total: 11.5s	remaining: 27s
298:	learn: 0.4786973	test: 0.6147862	best: 0.6147862 (298)	total: 11.5s	remaining: 27s
299:	learn: 0.4783650	test: 0.6146189	best: 0.6146189 (299)	total: 11.6s	remaining: 27s
300:	learn: 0.4780

(0.8877755511022044, 0.8727868852459016)

In [101]:
model = XGBClassifier(random_state=SEED, max_depth=3, n_estimators=1000, learning_rate=0.05)
model.fit(emb_df_train, targets_train, early_stopping_rounds=10, eval_set=[(emb_df_val, targets_val)])
preds_train = model.predict(emb_df_train)
preds_test = model.predict(emb_df_test)
accuracy_score(targets_train, preds_train), accuracy_score(targets_test, preds_test)



[0]	validation_0-mlogloss:2.68507
[1]	validation_0-mlogloss:2.52421
[2]	validation_0-mlogloss:2.38984
[3]	validation_0-mlogloss:2.27723
[4]	validation_0-mlogloss:2.17558
[5]	validation_0-mlogloss:2.08538
[6]	validation_0-mlogloss:2.00654
[7]	validation_0-mlogloss:1.93680
[8]	validation_0-mlogloss:1.87167
[9]	validation_0-mlogloss:1.81004
[10]	validation_0-mlogloss:1.75401
[11]	validation_0-mlogloss:1.70210
[12]	validation_0-mlogloss:1.65251
[13]	validation_0-mlogloss:1.60828
[14]	validation_0-mlogloss:1.56679
[15]	validation_0-mlogloss:1.52758
[16]	validation_0-mlogloss:1.48980
[17]	validation_0-mlogloss:1.45462
[18]	validation_0-mlogloss:1.42247
[19]	validation_0-mlogloss:1.39293
[20]	validation_0-mlogloss:1.36284
[21]	validation_0-mlogloss:1.33542
[22]	validation_0-mlogloss:1.30859
[23]	validation_0-mlogloss:1.28305
[24]	validation_0-mlogloss:1.25914
[25]	validation_0-mlogloss:1.23567
[26]	validation_0-mlogloss:1.21345
[27]	validation_0-mlogloss:1.19109
[28]	validation_0-mlogloss:1.1

[231]	validation_0-mlogloss:0.58421
[232]	validation_0-mlogloss:0.58389
[233]	validation_0-mlogloss:0.58376
[234]	validation_0-mlogloss:0.58343
[235]	validation_0-mlogloss:0.58339
[236]	validation_0-mlogloss:0.58308
[237]	validation_0-mlogloss:0.58275
[238]	validation_0-mlogloss:0.58232
[239]	validation_0-mlogloss:0.58222
[240]	validation_0-mlogloss:0.58206
[241]	validation_0-mlogloss:0.58180
[242]	validation_0-mlogloss:0.58184
[243]	validation_0-mlogloss:0.58186
[244]	validation_0-mlogloss:0.58183
[245]	validation_0-mlogloss:0.58171
[246]	validation_0-mlogloss:0.58165
[247]	validation_0-mlogloss:0.58099
[248]	validation_0-mlogloss:0.58069
[249]	validation_0-mlogloss:0.58076
[250]	validation_0-mlogloss:0.58093
[251]	validation_0-mlogloss:0.58074
[252]	validation_0-mlogloss:0.58072
[253]	validation_0-mlogloss:0.58044
[254]	validation_0-mlogloss:0.58004
[255]	validation_0-mlogloss:0.57994
[256]	validation_0-mlogloss:0.57971
[257]	validation_0-mlogloss:0.57971
[258]	validation_0-mlogloss:

(0.9734013481508471, 0.8662295081967213)