### Libraries

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random, os
import networkx as nx
import pandas as pd
import json

### Fix the seed

In [2]:
def seed_everything(seed: int):    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = True

In [3]:
SEED = 5051
seed_everything(SEED)

### Get the graph, add target and music

In [4]:
PATH = "/kaggle/input/graphs-social/feather-lastfm-social/"

In [5]:
# Load the JSON file
with open(PATH + 'lastfm_asia_features.json', 'r') as file:
    json_data = json.load(file)

In [6]:
graph_csv = PATH + "lastfm_asia_edges.csv"
target_csv = PATH + "lastfm_asia_target.csv"

graph_df = pd.read_csv(graph_csv)
target_df = pd.read_csv(target_csv)

# Create a new graph
graph = nx.Graph()

# Add nodes from the graph file
nodes = set(graph_df['node_1']).union(set(graph_df['node_2']))
graph.add_nodes_from(nodes)

# Add edges from the graph file
edges = graph_df[['node_1', 'node_2']].values
graph.add_edges_from(edges)

# Add target information to graph nodes
target_mapping = dict(target_df[['id', 'target']].values)
nx.set_node_attributes(graph, target_mapping, 'target')

G = graph
del graph

In [7]:
# Iterate over the JSON data
for node_id, attributes in json_data.items():
    if int(node_id) in G.nodes:
        G.nodes[int(node_id)]["music"] = attributes

### Check

In [8]:
assert len(G.edges) == 27806, "Not enough edges"
assert len(G.nodes) == 7624, "Not enough nodes"

### Divide the graph on train/val/test

In [9]:
# Split the graph into train, validation, and test sets based on targets
nodes = list(G.nodes())
targets = [G.nodes[node]['target'] for node in nodes]

nodes_train_val, nodes_test, targets_train_val, targets_test = train_test_split(
    nodes, targets, test_size=0.2, random_state=SEED)
nodes_train, nodes_val, targets_train, targets_val = train_test_split(
    nodes_train_val, targets_train_val, test_size=0.1, random_state=SEED)

In [10]:
# Create separate graphs for train, validation, and test sets
graph_train = G.subgraph(nodes_train)
graph_val = G.subgraph(nodes_val)
graph_test = G.subgraph(nodes_test)

In [11]:
# Check, that we have same splits
assert list(graph_train.nodes)[0:5] == [0, 2, 3, 4, 7], "Something is wrong"

In [12]:
# Check, that you have target and music there
# graph_train.nodes[2]

### SVD: extract features and targets

In [13]:
# Assuming your graph is stored in variable G and target classes in class_dict
adj_matrix = nx.adjacency_matrix(G)

In [14]:
max_music_length = max(len(G.nodes()[node]['music']) for node in G.nodes())

In [15]:
# Create the music feature matrix
music_feature_matrix = np.zeros((len(G.nodes()), max_music_length))

for node in G.nodes():
    music = G.nodes()[node]['music']
    padding_length = max_music_length - len(music)
    padded_music = music + [0] * padding_length
    
    music_feature_matrix[node] = padded_music

In [16]:
# Convert the adjacency matrix to a dense matrix
adj_dense = adj_matrix.toarray()

# Concatenate the adjacency and music feature matrices
updated_adj_matrix = np.concatenate((adj_dense, music_feature_matrix), axis=1)

In [17]:
updated_adj_matrix.shape

(7624, 8568)

In [18]:
adj_matrix.shape

(7624, 7624)

In [19]:
k = 512  # Number of singular values/vectors to keep
U, S, V = np.linalg.svd(adj_matrix.todense())

In [20]:
train_features = U[sorted(nodes_train)]
val_features = U[sorted(nodes_val)]
test_features = U[sorted(nodes_test)]

In [21]:
train_labels = [graph_train.nodes()[node]["target"] for node in graph_train.nodes()]
val_labels = [graph_val.nodes()[node]["target"] for node in graph_val.nodes()]
test_labels = [graph_test.nodes()[node]["target"] for node in graph_test.nodes()]

### Train the classifier

In [22]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [23]:
model = LGBMClassifier(random_state=SEED, max_depth=3, n_estimators=1000, learning_rate=0.11)
model.fit(train_features, train_labels, early_stopping_rounds=10, eval_set=[(val_features, val_labels)],
          eval_metric='multi_logloss')
preds_train = model.predict(train_features)
preds_test = model.predict(test_features)
accuracy_score(train_labels, preds_train), accuracy_score(test_labels, preds_test)



[1]	valid_0's multi_logloss: 2.51229
[2]	valid_0's multi_logloss: 2.5624
[3]	valid_0's multi_logloss: 2.65556
[4]	valid_0's multi_logloss: 2.73375
[5]	valid_0's multi_logloss: 2.8213
[6]	valid_0's multi_logloss: 2.90443
[7]	valid_0's multi_logloss: 2.98585
[8]	valid_0's multi_logloss: 3.06787
[9]	valid_0's multi_logloss: 3.14725
[10]	valid_0's multi_logloss: 3.22547
[11]	valid_0's multi_logloss: 3.30029


(0.7050464565494625, 0.7016393442622951)

In [24]:
model = CatBoostClassifier(random_state=SEED, max_depth=3, n_estimators=1000)
model.fit(train_features, train_labels, early_stopping_rounds=10, eval_set=[(val_features, val_labels)])
preds_train = model.predict(train_features)
preds_test = model.predict(test_features)
accuracy_score(train_labels, preds_train), accuracy_score(test_labels, preds_test)

Learning rate set to 0.111717
0:	learn: 2.5029202	test: 2.8052966	best: 2.8052966 (0)	total: 4.24s	remaining: 1h 10m 34s
1:	learn: 2.2317356	test: 2.7765585	best: 2.7765585 (1)	total: 7.31s	remaining: 1h 48s
2:	learn: 2.0670276	test: 2.7616996	best: 2.7616996 (2)	total: 9.98s	remaining: 55m 18s
3:	learn: 1.9595320	test: 2.7708173	best: 2.7616996 (2)	total: 12.7s	remaining: 52m 41s
4:	learn: 1.8498059	test: 2.7736628	best: 2.7616996 (2)	total: 15.8s	remaining: 52m 23s
5:	learn: 1.7564330	test: 2.7778520	best: 2.7616996 (2)	total: 18.3s	remaining: 50m 30s
6:	learn: 1.6872724	test: 2.7968542	best: 2.7616996 (2)	total: 21.5s	remaining: 50m 55s
7:	learn: 1.6275255	test: 2.8081847	best: 2.7616996 (2)	total: 23.8s	remaining: 49m 15s
8:	learn: 1.5781561	test: 2.8248163	best: 2.7616996 (2)	total: 26.5s	remaining: 48m 40s
9:	learn: 1.5311209	test: 2.8300796	best: 2.7616996 (2)	total: 28.9s	remaining: 47m 38s
10:	learn: 1.4855632	test: 2.8232657	best: 2.7616996 (2)	total: 31.1s	remaining: 46m 34s

(0.5829841501184186, 0.6163934426229508)

In [25]:
model = XGBClassifier(random_state=SEED, max_depth=3, n_estimators=1000, learning_rate=0.05)
model.fit(train_features, train_labels, early_stopping_rounds=10, eval_set=[(val_features, val_labels)])
preds_train = model.predict(train_features)
preds_test = model.predict(test_features)
accuracy_score(train_labels, preds_train), accuracy_score(test_labels, preds_test)



[0]	validation_0-mlogloss:2.86285
[1]	validation_0-mlogloss:2.85045
[2]	validation_0-mlogloss:2.84372
[3]	validation_0-mlogloss:2.84118
[4]	validation_0-mlogloss:2.83988
[5]	validation_0-mlogloss:2.84097
[6]	validation_0-mlogloss:2.84457
[7]	validation_0-mlogloss:2.84795
[8]	validation_0-mlogloss:2.85271
[9]	validation_0-mlogloss:2.85902
[10]	validation_0-mlogloss:2.86689
[11]	validation_0-mlogloss:2.87438
[12]	validation_0-mlogloss:2.88350
[13]	validation_0-mlogloss:2.89327
[14]	validation_0-mlogloss:2.90406


(0.8019675715066497, 0.7836065573770492)