# Load and prepare the dataset

In [None]:
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.0+cu113.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 4.6 MB/s 
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_sparse-0.6.15-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 43.3 MB/s 
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_cluster-1.6.0-cp37-cp37m-linux_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 30.0 MB/s 
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_spline_conv-1.2.1-cp37-cp37m-linux_x86_64.whl (709 kB)
[K     |████████████████████████████████| 709 kB 35.9 MB/s

In [None]:
!gdown 1OEpM2j-xwKboMcj3bj9QNT-UbYGaDfr8

Downloading...
From: https://drive.google.com/uc?id=1OEpM2j-xwKboMcj3bj9QNT-UbYGaDfr8
To: /content/dataset_filtered_normalizado.parquet
  0% 0.00/7.38M [00:00<?, ?B/s]100% 7.38M/7.38M [00:00<00:00, 210MB/s]


In [None]:
import pandas as pd

# Load the pre-filtered dataset and keep only the columns used below.
graph_filtered = pd.read_parquet("dataset_filtered_normalizado.parquet")
graph_filtered.reset_index(drop=True, inplace=True)
df = graph_filtered[['artist','song','BERT','art_representation','relationships', 'popularity']]

# Popularity cut-off: rows strictly above it get target 1, the others -1.
threshold = 50

# Explicit copies: the label column is written to independent frames rather
# than to slices of `df` (which triggers SettingWithCopyWarning).
df_int = df[df.popularity > threshold].copy()
df_out = df[df.popularity <= threshold].copy()

df_int['target'] = 1
df_out['target'] = -1

# Interest rows first, then the remaining rows; the original index labels are kept.
graph_filtered = pd.concat([df_int,df_out])

graph_filtered.tail()

# Graph Modeling


In [None]:
# Give every distinct "<artist>_<song>" key and every distinct artist a
# consecutive integer id, in order of first appearance.
art_id = {}
song_id = {}

for record in graph_filtered.itertuples(index=False):
  # setdefault only inserts unseen keys, so len(...) is always the next free id
  song_id.setdefault(f"{record.artist}_{record.song}", len(song_id))
  art_id.setdefault(record.artist, len(art_id))

# Totals, kept under the original counter names
song_cnt = len(song_id)
art_cnt = len(art_id)

In [None]:
# Translate every row into integer ids using the dictionaries built above.
songID, artID, relationshipID = [], [], []
for row in graph_filtered.itertuples(index=False):
  songID.append(song_id[f"{row.artist}_{row.song}"])
  artID.append(art_id[row.artist])
  # Related artists that never appear as an artist in this frame have no id;
  # they are dropped explicitly instead of being hidden by a bare `except`.
  relationshipID.append([art_id[rel] for rel in row.relationships if rel in art_id])
graph_filtered['song_id'] = songID
graph_filtered['artist_id'] = artID
graph_filtered['relationship_id'] = relationshipID
graph_filtered.tail()

In [None]:
import random
from random import sample
from tqdm import tqdm
# For every row pick one positive and one negative example, for songs and for artists.
positive_song, negative_song = [], []
positive_art, negative_art = [], []

random.seed(81)

unique_song = list(graph_filtered.song_id.unique())
unique_artist = list(graph_filtered.artist_id.unique())

for row in graph_filtered.itertuples(index=False):
  # Song: the positive is the song itself, the negative is any other song.
  positive_song.append(row.song_id)
  while True:
    neg_song = sample(unique_song, 1)[0]
    if neg_song != row.song_id:
      break
  negative_song.append(neg_song)

  # Artist: the first negative candidate is drawn BEFORE the positive one;
  # that order of random draws is kept so the seeded sequence is unchanged.
  neg_art = sample(unique_artist, 1)[0]
  related = row.relationship_id
  positive_art.append(sample(related, 1)[0] if len(related) > 0 else row.artist_id)
  # Redraw until the negative is neither the artist itself nor a related artist.
  while neg_art == row.artist_id or neg_art in related:
    neg_art = sample(unique_artist, 1)[0]
  negative_art.append(neg_art)

graph_filtered["positive_song"] = positive_song
graph_filtered["negative_song"] = negative_song
graph_filtered["positive_artist"] = positive_art
graph_filtered["negative_artist"] = negative_art

graph_filtered.tail(5)

In [None]:
import torch

"""def load_node_df(df, index_col, feature_col):
  mapping = {index: i for i, index in enumerate(df[index_col].unique())}
  x = torch.tensor(df[feature_col], dtype=torch.float32)
  return x, mapping"""


def load_node_df(df, index_col, feature_col):
  """Build the node feature tensor and the id mapping for one node type.

  Args:
    df: dataframe holding `index_col` and `feature_col`.
    index_col: "artist_id" or "song_id".
    feature_col: column whose cells are equal-length numeric sequences.

  Returns:
    (x, mapping): `x` is a float32 tensor with one row per dataframe row;
    `mapping` maps each unique id to itself for "artist_id", or to its
    order of first appearance for "song_id".

  Raises:
    ValueError: if `index_col` is neither "artist_id" nor "song_id".
  """
  if index_col not in ("artist_id", "song_id"):
    raise ValueError(f"index_col must be 'artist_id' or 'song_id', got {index_col!r}")

  unique_ids = df[index_col].unique()
  if index_col=="artist_id":
    mapping = {index: index for index in unique_ids}
  else:
    mapping = {index: i for i, index in enumerate(unique_ids)}
  # .tolist() turns the Series of sequences into a nested list that
  # torch.tensor can convert into a 2-D tensor.
  x = torch.tensor(df[feature_col].tolist(), dtype=torch.float32)
  return x, mapping

In [None]:
# Song node features (from the "BERT" column) and the song_id -> consecutive index mapping.
song_x, song_mapping = load_node_df(df=graph_filtered, index_col="song_id", feature_col="BERT")

In [None]:
# Artist node features (from "art_representation"); for artist ids the mapping is the identity.
art_x, art_mapping = load_node_df(df=graph_filtered, index_col="artist_id", feature_col="art_representation")

In [None]:
from torch_geometric.data import HeteroData

# Heterogeneous graph with two node types: 'song' and 'artist'.
graph_data = HeteroData()

# NOTE(review): num_nodes counts unique ids, while the feature tensors built by
# load_node_df hold one row per dataframe row -- confirm this is intended.
graph_data['song'].num_nodes = len(song_mapping) 
graph_data['song'].x = song_x
graph_data['artist'].num_nodes = len(art_mapping) 
graph_data['artist'].x = art_x 

print(graph_data)

In [None]:
def load_edge_graph(df, src_index_col, src_mapping, dst_index_col, dst_mapping):
  """Translate two id columns of `df` into a 2 x E edge-index tensor.

  Row 0 holds the mapped source ids, row 1 the mapped destination ids, in
  dataframe row order. No edge attributes are produced, so the second
  return value is always None.
  """
  edge_index = torch.tensor([
      [src_mapping[source] for source in df[src_index_col]],
      [dst_mapping[target] for target in df[dst_index_col]],
  ])
  return edge_index, None

In [None]:
# artist -> song edges: one edge per dataframe row (the second value is always None).
art_song_edge_index, art_song_edge_label = load_edge_graph(
    graph_filtered,
    src_index_col='artist_id',
    src_mapping=art_mapping,
    dst_index_col='song_id',
    dst_mapping=song_mapping,    
)

In [None]:
# Expand each row's related-artist ids into one (artist_id, related_artist) pair per relation.
graph_filtered_tmp = graph_filtered[['artist_id', 'relationship_id']].copy()

list_art_id, list_relationship_art_id = [], []
for artist, related_ids in zip(graph_filtered_tmp.artist_id, graph_filtered_tmp.relationship_id):
  list_art_id.extend([artist] * len(related_ids))
  list_relationship_art_id.extend(related_ids)
graph_filtered_art_related = pd.DataFrame({"artist_id":list_art_id, "related_artist":list_relationship_art_id})
graph_filtered_art_related

In [None]:
# artist -> related artist edges; both endpoints use the artist mapping.
art_art_edge_index, art_art_edge_label = load_edge_graph(
    graph_filtered_art_related,
    src_index_col='artist_id',
    src_mapping=art_mapping,
    dst_index_col='related_artist',
    dst_mapping=art_mapping,    
)

In [None]:
# Register both edge types on the graph. The edge_label values are None here
# because load_edge_graph returns no edge attributes.
graph_data["artist", "has", "song"].edge_index = art_song_edge_index
graph_data["artist", "has", "song"].edge_label = art_song_edge_label

graph_data["artist", "connect", "artist"].edge_index = art_art_edge_index
graph_data["artist", "connect", "artist"].edge_label = art_art_edge_label

print(graph_data)

# Unsupervised GNN

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
import random
import numpy as np
import torch

class GNN(torch.nn.Module):
    """Stack of SAGEConv layers with tanh activations and a linear-output SAGEConv head.

    `hidden_channels` is a sequence of hidden sizes. One hidden layer is always
    built from its first entry; further hidden layers (conv2..conv4) are built
    only when the sequence has exactly 2, 3 or 4 entries.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        depth = len(hidden_channels)
        self.conv1 = SAGEConv((-1, -1), hidden_channels[0])
        self.out = SAGEConv((-1, -1), out_channels)
        self.n_layers = depth

        # Additional hidden layers exist only for depths 2, 3 and 4.
        if depth in (2, 3, 4):
            self.conv2 = SAGEConv((-1, -1), hidden_channels[1])
            if depth >= 3:
                self.conv3 = SAGEConv((-1, -1), hidden_channels[2])
            if depth == 4:
                self.conv4 = SAGEConv((-1, -1), hidden_channels[3])

    def forward(self, x, edge_index):
        """Apply every hidden layer followed by tanh, then the output layer (no activation)."""
        h = self.conv1(x, edge_index).tanh()

        if self.n_layers in (2, 3, 4):
            h = self.conv2(h, edge_index).tanh()
            if self.n_layers >= 3:
                h = self.conv3(h, edge_index).tanh()
            if self.n_layers == 4:
                h = self.conv4(h, edge_index).tanh()

        return self.out(h, edge_index)

def create_GNN(hidden_channels, out_channels, graph_data):
  """Seed every RNG with 81 and return the heterogeneous version of GNN.

  The homogeneous GNN is converted with `to_hetero`, using the metadata of
  `graph_data` and 'max' aggregation.
  """
  seed = 81
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

  # Request deterministic cuDNN behaviour.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  homogeneous = GNN(hidden_channels=hidden_channels, out_channels=out_channels)
  return to_hetero(homogeneous, graph_data.metadata(), aggr='max')

In [None]:
def train_gnn(epochs, lr, hidden_channels, out_channels, graph_data, inductive, test_graph_data, out_graph_data, train_df):
  """Train a fresh heterogeneous GNN and return its losses and embeddings.

  Args:
    epochs: number of full-graph optimisation steps (must be >= 1).
    lr: Adam learning rate.
    hidden_channels, out_channels: forwarded to `create_GNN`.
    graph_data: training graph.
    inductive: when true, the trained model is also applied to
      `test_graph_data` and `out_graph_data`.
    test_graph_data, out_graph_data: extra graphs, used only when `inductive`.
    train_df: dataframe with the positive/negative index columns read by `train`.

  Returns:
    (losses, emd_song, emd_art) when `inductive` is false, otherwise
    (losses, emd_song, emd_art, test_embeddings, out_embeddings).
    `emd_song`/`emd_art` come from the forward pass of the last epoch, i.e.
    they were computed before that epoch's optimizer step.

  Raises:
    ValueError: if `epochs` is smaller than 1 (no embeddings would exist).
  """
  if epochs < 1:
    raise ValueError("epochs must be >= 1 so that training embeddings exist")

  model = create_GNN(hidden_channels, out_channels, graph_data)

  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  list_loss = []
  for _ in range(epochs):
    loss, emd_song, emd_art = train(0, model, optimizer, graph_data, train_df)
    list_loss.append(loss)

  if not inductive:
    return list_loss, emd_song, emd_art

  # Inference only: switch to eval mode and skip building autograd graphs.
  model.eval()
  with torch.no_grad():
    test_embeddings = model(test_graph_data.x_dict, test_graph_data.edge_index_dict)
    out_embeddings = model(out_graph_data.x_dict, out_graph_data.edge_index_dict)
  return list_loss, emd_song, emd_art, test_embeddings, out_embeddings

  
def train(total_loss, model, optimizer, graph_data, train_df): 
    """Run one full-graph optimisation step of the unsupervised objective.

    For songs and for artists separately, every node embedding is pulled
    towards its positive example and pushed away from its negative example
    (log-sigmoid of the dot products). The indices come from the
    positive_*/negative_* columns of `train_df`.

    Returns:
        (scaled_loss, song_embeddings, artist_embeddings), where scaled_loss is
        (total_loss + loss * (n_song_rows + n_artist_rows)) / graph_data.num_nodes
        and the embeddings are the ones computed before the optimizer step.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model(graph_data.x_dict, graph_data.edge_index_dict)
    song_emb = embeddings['song']
    artist_emb = embeddings['artist']

    def contrastive_term(node_emb, positive_ids, negative_ids):
        # dot product of each embedding with its positive / negative partner
        pos_score = (node_emb * node_emb[positive_ids]).sum(-1)
        neg_score = (node_emb * node_emb[negative_ids]).sum(-1)
        return -F.logsigmoid(pos_score).mean() - F.logsigmoid(-neg_score).mean()

    loss = contrastive_term(
        song_emb, train_df.positive_song.tolist(), train_df.negative_song.tolist()
    )
    loss = loss + contrastive_term(
        artist_emb, train_df.positive_artist.tolist(), train_df.negative_artist.tolist()
    )

    loss.backward()
    optimizer.step()

    weighted = float(loss) * (song_emb.size(0) + artist_emb.size(0))
    return (total_loss + weighted) / graph_data.num_nodes, song_emb, artist_emb

## Transductive


In [None]:
import os

# Hidden layer sizes, embedding size and optimisation settings of this run.
arq = [128, 64]

output_len = 32

lr = 0.0001

epoch = 250

# Trailing separator matters: the file name is appended to this string.
path = "./results/"
os.makedirs(path, exist_ok=True)

_, emd_song, emd_art = train_gnn(epoch, lr, arq, output_len, graph_data, False, None, None, graph_filtered)

# Convert the embedding tensors into plain nested lists so they can be stored in a dataframe.
song = [row.cpu().detach().numpy().tolist() for row in emd_song]
art = [row.cpu().detach().numpy().tolist() for row in emd_art]

# Row-wise concatenation of the song and artist embeddings.
concat = [s + a for s, a in zip(song, art)]

# `n` and `out` were never defined in this notebook; the values used for the
# run are `arq` and `output_len`.
dic = {'n_neurons': arq, 'output_len': output_len, 'lr': lr, 'epoch': epoch, 'emd_song': song, 'emd_art': art, 'concat': concat}

# DataFrame.append was removed in pandas 2.0; build the single-row frame directly.
df_representations = pd.DataFrame(
    [dic],
    columns=['n_neurons','output_len','lr','epoch','emd_song','emd_art', 'concat'],
)

df_representations.to_pickle(path + 'df_representations.pkl')

## Inductive

In [None]:
!gdown 1OEpM2j-xwKboMcj3bj9QNT-UbYGaDfr8

Downloading...
From: https://drive.google.com/uc?id=1OEpM2j-xwKboMcj3bj9QNT-UbYGaDfr8
To: /content/dataset_filtered_normalizado.parquet
  0% 0.00/7.38M [00:00<?, ?B/s]100% 7.38M/7.38M [00:00<00:00, 92.9MB/s]


In [None]:
import pandas as pd

def set_label(row, threshold=50):
  """Return -1 when the popularity value `row` is <= `threshold`, otherwise 1.

  `threshold` defaults to 50, the cut-off used throughout this notebook, so
  `Series.apply(set_label)` behaves exactly as before.
  """
  return -1 if row <= threshold else 1

# Reload the dataset, label every row from its popularity and keep the modelling columns.
graph_filtered = pd.read_parquet("dataset_filtered_normalizado.parquet").reset_index(drop=True)
graph_filtered = graph_filtered.assign(
    target=graph_filtered.popularity.apply(set_label)
)[['artist','song','BERT','art_representation','relationships', 'popularity', "target"]]
graph_filtered.tail()

In [None]:
from sklearn.model_selection import train_test_split
from torch_geometric.data import HeteroData
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.svm import OneClassSVM as OCSVM
from sklearn.metrics import classification_report
from pathlib import Path
import numpy as np
import random
from random import sample
from tqdm import tqdm
import pandas as pd
from gc import collect

def create_mapping(df):
  """Return a copy of `df` with integer id columns added.

  Adds:
    song_id: consecutive id per distinct "<artist>_<song>" key (first-appearance order).
    artist_id: consecutive id per distinct artist (first-appearance order).
    relationship_id: ids of the row's related artists; related artists that
      do not occur as an artist in `df` are dropped.

  The input frame is not modified: callers pass slices of a larger frame, and
  writing columns onto a slice is unreliable.
  """
  df = df.copy()

  song_keys = [f"{artist}_{song}" for artist, song in zip(df['artist'], df['song'])]

  art_id = {}
  song_id = {}
  for artist, key in zip(df['artist'], song_keys):
    # setdefault only inserts unseen keys, so len(...) is the next free id
    song_id.setdefault(key, len(song_id))
    art_id.setdefault(artist, len(art_id))

  df['song_id'] = [song_id[key] for key in song_keys]
  df['artist_id'] = [art_id[artist] for artist in df['artist']]
  # Explicit membership test instead of a bare `except: pass`.
  df['relationship_id'] = [
      [art_id[rel] for rel in relationships if rel in art_id]
      for relationships in df['relationships']
  ]
  return df

def generate_positive_negative(df):
  """Add positive/negative song and artist example columns to `df` and return it.

  The `random` module is re-seeded with 81 on every call. For each row:
    positive_song: the row's own song_id.
    negative_song: a randomly drawn song_id different from the row's.
    positive_artist: a randomly drawn related artist id, or the row's own
      artist_id when it has no related artists.
    negative_artist: a randomly drawn artist id that is neither the row's
      artist nor one of its related artists.
  The columns are written onto `df` itself.
  """
  random.seed(81)

  unique_song = list(df.song_id.unique())
  unique_artist = list(df.artist_id.unique())

  positive_song, negative_song = [], []
  positive_art, negative_art = [], []

  for row in df.itertuples(index=False):
    positive_song.append(row.song_id)
    while True:
      neg_song = sample(unique_song, 1)[0]
      if neg_song != row.song_id:
        break
    negative_song.append(neg_song)

    # The first negative-artist candidate is drawn before the positive artist;
    # the order of random draws is part of the seeded behaviour.
    neg_art = sample(unique_artist, 1)[0]
    related = row.relationship_id
    positive_art.append(sample(related, 1)[0] if len(related) > 0 else row.artist_id)
    while neg_art == row.artist_id or neg_art in related:
      neg_art = sample(unique_artist, 1)[0]
    negative_art.append(neg_art)

  df["positive_song"] = positive_song
  df["negative_song"] = negative_song
  df["positive_artist"] = positive_art
  df["negative_artist"] = negative_art

  return df

def foldValidation(folds):
    """Return a shuffled KFold splitter with `folds` splits and random_state 42."""
    return KFold(n_splits=folds, shuffle=True, random_state=42)

def train_test_split_one_class(kf, df_int):
    """Return one (train_frame, test_frame) tuple per split produced by `kf.split(df_int)`."""
    return [
        (df_int.iloc[train_rows], df_int.iloc[test_rows])
        for train_rows, test_rows in kf.split(df_int)
    ]

def train_test_pragh_OCL_music(df):
  """Split `df` for one-class learning.

  Rows with target == 1 are divided into 5 shuffled folds; rows with
  target == -1 are returned separately as outliers.

  Returns:
    (train_test, df_out): the list of (train, test) frame pairs and the outlier frame.
  """
  interest_rows = df[df.target == 1]
  outlier_rows = df[df.target == -1]

  splitter = foldValidation(5)
  return train_test_split_one_class(splitter, interest_rows), outlier_rows

def load_node_df(df, index_col, feature_col):
  """Build the node feature tensor and the id mapping for one node type.

  Args:
    df: dataframe holding `index_col` and `feature_col`.
    index_col: "artist_id" or "song_id".
    feature_col: column whose cells are equal-length numeric sequences.

  Returns:
    (x, mapping): `x` is a float32 tensor with one row per dataframe row;
    `mapping` maps each unique id to itself for "artist_id", or to its
    order of first appearance for "song_id".

  Raises:
    ValueError: if `index_col` is neither "artist_id" nor "song_id"
      (previously this surfaced as an UnboundLocalError).
  """
  if index_col not in ("artist_id", "song_id"):
    raise ValueError(f"index_col must be 'artist_id' or 'song_id', got {index_col!r}")

  unique_ids = df[index_col].unique()
  if index_col=="artist_id":
    mapping = {index: index for index in unique_ids}
  else:
    mapping = {index: i for i, index in enumerate(unique_ids)}
  x = torch.tensor(df[feature_col].tolist(), dtype=torch.float32)
  return x, mapping


"""def load_node_df(df, index_col, feature_col):
    mapping = {index: i for i, index in enumerate(df[index_col].unique())}  
    x = torch.tensor(df[feature_col].tolist(), dtype=torch.float32)
    return x, mapping"""

def get_graph(song_mapping, art_mapping, song_x, art_x):
    """Create a HeteroData object with 'song' and 'artist' node stores.

    For each node type, num_nodes is the size of its mapping and x is the
    given feature tensor. No edges are added here.
    """
    graph_data = HeteroData()

    node_specs = (
        ('song', song_mapping, song_x),
        ('artist', art_mapping, art_x),
    )
    for node_type, mapping, features in node_specs:
        graph_data[node_type].num_nodes = len(mapping)
        graph_data[node_type].x = features

    return graph_data

def load_edge_graph(df, src_index_col, src_mapping, dst_index_col, dst_mapping):
    """Translate two id columns of `df` into a 2 x E edge-index tensor.

    Row 0 holds the mapped source ids, row 1 the mapped destination ids, in
    dataframe row order. No edge attributes are produced, so the second
    return value is always None.
    """
    edge_index = torch.tensor([
        [src_mapping[source] for source in df[src_index_col]],
        [dst_mapping[target] for target in df[dst_index_col]],
    ])
    return edge_index, None

def get_graph_filtered_art_related(graph_filtered):
    """Return one (artist_id, related_artist) row per relation inside the frame.

    A relation is kept only when the related artist id also occurs in the
    frame's own `artist_id` column.

    Args:
        graph_filtered: dataframe with `artist_id` and `relationship_id`
            (an iterable of artist ids per row) columns.

    Returns:
        DataFrame with columns "artist_id" and "related_artist".
    """
    # A set gives constant-time membership tests inside the nested loop.
    known_artists = set(graph_filtered['artist_id'].unique())

    list_art_id, list_relationship_art_id = [], []
    for artist, related_ids in zip(graph_filtered['artist_id'], graph_filtered['relationship_id']):
        for related in related_ids:
            if related in known_artists:
                list_art_id.append(artist)
                list_relationship_art_id.append(related)

    return pd.DataFrame({"artist_id":list_art_id, "related_artist":list_relationship_art_id})

def set_learned_features(emd_song, emd_art):
  """Convert song and artist embedding tensors into plain Python lists.

  Returns:
    (song_rows, artist_rows, concatenated_rows): the third list joins the
    i-th song row with the i-th artist row; it stops at the shorter input.
  """
  def _rows_as_lists(embeddings):
    return [row.cpu().detach().numpy().tolist() for row in embeddings]

  tmp_song = _rows_as_lists(emd_song)
  tmp_art = _rows_as_lists(emd_art)
  tmp_concat = [song_row + art_row for song_row, art_row in zip(tmp_song, tmp_art)]

  return tmp_song, tmp_art, tmp_concat

def _build_hetero_graph(df):
  """Build one HeteroData graph (both node types and both edge types) from a prepared frame."""
  song_x, song_mapping = load_node_df(df=df, index_col="song_id", feature_col="BERT")
  art_x, art_mapping = load_node_df(df=df, index_col="artist_id", feature_col="art_representation")

  graph = get_graph(song_mapping, art_mapping, song_x, art_x)

  # artist -> song: one edge per dataframe row
  has_edge_index, has_edge_label = load_edge_graph(
      df,
      src_index_col='artist_id',
      src_mapping=art_mapping,
      dst_index_col='song_id',
      dst_mapping=song_mapping,
  )

  # artist -> related artist, restricted to artists present in this frame
  art_related = get_graph_filtered_art_related(df)
  connect_edge_index, connect_edge_label = load_edge_graph(
      art_related,
      src_index_col='artist_id',
      src_mapping=art_mapping,
      dst_index_col='related_artist',
      dst_mapping=art_mapping,
  )

  graph["artist", "has", "song"].edge_index = has_edge_index
  graph["artist", "has", "song"].edge_label = has_edge_label

  graph["artist", "connect", "artist"].edge_index = connect_edge_index
  graph["artist", "connect", "artist"].edge_label = connect_edge_label

  return graph


def prepare_for_GNN(train_graph_filtered, test_graph_filtered, out_graph_filtered):
  """Build the train, test and outlier graphs from their prepared dataframes.

  Each frame must already carry the song_id, artist_id and relationship_id
  columns together with the BERT and art_representation features. The three
  graphs are built independently with the same procedure.

  Returns:
    [train_graph_data, test_graph_data, out_graph_data]
  """
  return [
      _build_hetero_graph(frame)
      for frame in (train_graph_filtered, test_graph_filtered, out_graph_filtered)
  ]

def evaluation_one_class(preds_interest, preds_outliers):
    """Return the classification report (as a dict) for one-class predictions.

    Every entry of `preds_interest` has true label 1 and every entry of
    `preds_outliers` has true label -1.
    """
    n_interest, n_outliers = len(preds_interest), len(preds_outliers)
    y_true = [1] * n_interest + [-1] * n_outliers
    y_pred = [*preds_interest, *preds_outliers]
    return classification_report(y_true, y_pred, output_dict=True)

def evaluate_model(X_train, X_test, X_outlier, model):
    """Fit `model` on X_train and report its predictions on the test and outlier sets."""
    fitted = model.fit(X_train)
    return evaluation_one_class(fitted.predict(X_test), fitted.predict(X_outlier))

def evaluate_models(models, representations, file_name, line_parameters, path):
    """Evaluate every model on every fold and append one result line per model.

    `models` maps a model name to an estimator; `representations` holds one
    [train, test, outlier] triple per fold. Results go to
    "<file_name>_<first '_'-separated token of the model name>.csv" under
    `path`, with the line labelled "<model name>_<line_parameters>".
    """
    for model_name, estimator in models.items():
        result_file = file_name + '_' + model_name.split('_')[0] + '.csv'
        fold_reports = [
            evaluate_model(fold[0], fold[1], fold[2], estimator)
            for fold in representations
        ]
        write_results(fold_reports, result_file, model_name + '_' + line_parameters, path)

def write_results(l_values, file_name, line_parameters, path):
    """Append one ';'-separated result line to `path + file_name`.

    When the file does not exist yet it is created with the header
    "Parameters;fold1;...;fold5" (the notebook always evaluates 5 folds).
    The directory part of the target is created when missing, and files are
    opened with context managers so the handles are closed even on error.

    Args:
        l_values: per-fold results; each one is written with str().
        file_name: name of the CSV file.
        line_parameters: label written in the first column.
        path: string prefix joined directly to `file_name`.
    """
    target = Path(path + file_name)
    target.parent.mkdir(parents=True, exist_ok=True)

    if not target.is_file():
        header = 'Parameters' + ''.join(';fold' + str(i) for i in range(1, 6)) + '\n'
        with open(target, 'w') as file_:
            file_.write(header)

    line = line_parameters + ''.join(';' + str(values) for values in l_values) + '\n'
    with open(target, 'a') as file_:
        file_.write(line)

def foldValidation(folds):
    """Return a shuffled KFold splitter with `folds` splits and random_state 42.

    This repeats the definition given earlier in this cell.
    """
    return KFold(n_splits=folds, shuffle=True, random_state=42)

def train_test_split_one_class(kf, df_int):
    """Return one (train_frame, test_frame) tuple per split produced by `kf.split(df_int)`.

    This repeats the definition given earlier in this cell.
    """
    return [
        (df_int.iloc[train_rows], df_int.iloc[test_rows])
        for train_rows, test_rows in kf.split(df_int)
    ]

def dfs(df):
    """Split `df` into 5 folds of interest rows plus the outlier rows.

    Rows with target == 1 are divided into shuffled (train, test) pairs;
    rows with target == -1 are returned unchanged as the outlier frame.

    Returns:
        (train_test, df_out)
    """
    interest_rows = df[df['target'] == 1]
    outlier_rows = df[df['target'] == -1]

    splitter = foldValidation(5)
    return train_test_split_one_class(splitter, interest_rows), outlier_rows


In [None]:
from sklearn.svm import OneClassSVM as OCSVM

# nu values of the grid, kept as strings so the dictionary keys are spelled
# exactly as before ("OCSVM_RBF_<nu>_<gamma>").
NU_LABELS = (
    '0.0001', '0.001', '0.005', '0.01', '0.05', '0.1', '0.2',
    '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9',
)

# One RBF One-Class SVM per (gamma, nu) pair: all 'scale' models first, then
# all 'auto' models. Building the grid programmatically keeps each key in sync
# with its parameters; the hand-written table built the 'OCSVM_RBF_0.0001_auto'
# entry with nu=0.001.
models = {
    f'OCSVM_RBF_{nu_label}_{gamma}': OCSVM(kernel='rbf', nu=float(nu_label), gamma=gamma)
    for gamma in ('scale', 'auto')
    for nu_label in NU_LABELS
}

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Hidden layer sizes, embedding size and optimisation settings of this run.
arq = [128, 64]

output_len = 32

lr = 0.0001

epoch = 250

path = './results/'

# Outlier rows are prepared once; every interest fold gets its own ids and samples.
train_test, df_out  = train_test_pragh_OCL_music(graph_filtered)
df_out = create_mapping(df_out)
df_out = generate_positive_negative(df_out)
train_test_out_graph =[]
for df_train, df_test in train_test:
  df_train = create_mapping(df_train)
  df_test = create_mapping(df_test)
  df_train = generate_positive_negative(df_train)
  df_test = generate_positive_negative(df_test)

  fold_graphs = prepare_for_GNN(df_train, df_test, df_out)
  fold_graphs.append(df_train)
  train_test_out_graph.append(fold_graphs)

representations_song = []
representations_song_art = []
for train_graph_data, test_graph_data, out_graph_data, df_train in train_test_out_graph:

  # `n` and `out` were never defined in this notebook; the architecture and
  # embedding size of this run are `arq` and `output_len`.
  _, emd_song, emd_art, test_embeddings, out_embeddings = train_gnn(epoch, lr, arq, output_len, train_graph_data, True, test_graph_data, out_graph_data, df_train)

  tmp_song_train, tmp_art_train, tmp_concat_train = set_learned_features(emd_song, emd_art)

  tmp_song_test, tmp_art_test, tmp_concat_test = set_learned_features(test_embeddings["song"], test_embeddings['artist'])

  tmp_song_out, tmp_art_out, tmp_concat_out = set_learned_features(out_embeddings["song"], out_embeddings['artist'])

  representations_song.append([tmp_song_train,tmp_song_test,tmp_song_out])
  representations_song_art.append([tmp_concat_train,tmp_concat_test,tmp_concat_out])

line_parameters = str(arq) + '_' + str(output_len) + '_' + str(lr) + '_' + str(epoch)
evaluate_models(models, representations_song, 'GNN-song-Inductive', line_parameters, path)
evaluate_models(models, representations_song_art, 'GNN-song-art-Inductive', line_parameters, path)

# OCSVM

In [None]:
!gdown 1GmFv6-6JIYaUAYXKykpjDTAg9DjmOZvH

import pandas as pd

# NOTE(review): unpickling can execute arbitrary code; this file is downloaded
# from a remote location, so only load it when the source is trusted.
df_rep = pd.read_pickle('df_representations.pkl')

Downloading...
From: https://drive.google.com/uc?id=1GmFv6-6JIYaUAYXKykpjDTAg9DjmOZvH
To: /content/df_representations.pkl
100% 272M/272M [00:02<00:00, 122MB/s]


In [None]:
path = './results/'

def run(df, graph_filtered, column_1, column_2, file_name, line_parameters):
    """Evaluate every model in the global `models` on the features in `column_1`.

    `graph_filtered` is split with `dfs`; for each fold the `column_1` values
    of the train, test and outlier frames are collected, and the results are
    written under the global `path`. `df` and `column_2` are accepted but not
    used by this function.
    """
    train_test, df_out = dfs(graph_filtered)

    representations = [
        [
            fold_train[column_1].to_list(),
            fold_test[column_1].to_list(),
            df_out[column_1].to_list(),
        ]
        for fold_train, fold_test in train_test
    ]

    evaluate_models(models, representations, file_name, line_parameters, path)

## Transductive GNN

In [None]:
import warnings
warnings.filterwarnings('ignore')

from tqdm.notebook import tqdm

# Evaluate the concatenated song+artist embeddings of every stored run.
column_1 = 'emb_song_art'
column_2 = 'concat'
file_name = 'GNN-song-art'

pbar = tqdm(range(0, len(df_rep)))

for i in pbar:
  run_record = df_rep.iloc[i]
  # NOTE(review): the stored embeddings are assigned by position, so they must
  # be in the same row order as `graph_filtered` -- confirm.
  graph_filtered[column_1] = run_record[column_2]
  line_parameters = str(run_record[['n_neurons','output_len','lr','epoch']].to_list())
  run(df_rep, graph_filtered, column_1,column_2, file_name, line_parameters)

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Evaluate the song-only embeddings of every stored run.
column_1 = 'emb_song'
column_2 = 'emd_song'
file_name = 'GNN-song'

pbar = tqdm(range(0, len(df_rep)))

for i in pbar:
  run_record = df_rep.iloc[i]
  # NOTE(review): the stored embeddings are assigned by position, so they must
  # be in the same row order as `graph_filtered` -- confirm.
  graph_filtered[column_1] = run_record[column_2]
  line_parameters = str(run_record[['n_neurons','output_len','lr','epoch']].to_list())
  run(df_rep, graph_filtered, column_1,column_2, file_name, line_parameters)

## BERT

In [None]:
# Baseline: evaluate the one-class models directly on the "BERT" column.
# `run` ignores its first argument and `column_2`, so both are placeholders here.
column_1 = 'BERT'
column_2 = ''
file_name = 'BERT'
line_parameters = ''

run(df, graph_filtered, column_1,column_2, file_name, line_parameters)

## BERT + 12 features artist

In [None]:
# Per row, join the BERT vector with the artist representation into one feature vector.
BERT_artF = [
  np.concatenate([bert_vec, artist_vec])
  for bert_vec, artist_vec in zip(graph_filtered['BERT'], graph_filtered['art_representation'])
]

graph_filtered['BERT+ART_F'] = BERT_artF 

In [None]:
# Baseline: evaluate the one-class models on the concatenated "BERT+ART_F" features.
# `run` ignores its first argument and `column_2`, so both are placeholders here.
column_1 = 'BERT+ART_F'
column_2 = ''
file_name = 'BERT-ARTF'
line_parameters = ''
run(df, graph_filtered, column_1,column_2, file_name, line_parameters)

# Results Analysis

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2

def show_results(df_result):
  """Add mean/std columns of the weighted-average metrics and return the best mean F1.

  Every row of `df_result` must hold, in columns fold1..fold5, the string form
  of a classification-report dict. For f1-score, recall and precision the
  'weighted avg' values of the five folds are averaged; the columns
  '<metric>-mean' and '<metric>-std' are written onto `df_result` itself.

  Returns:
    The largest value of the 'f1-score-mean' column.
  """
  metrics = ['f1-score', 'recall', 'precision']
  fold_columns = ['fold' + str(i) for i in range(1, 6)]

  means = {metric: [] for metric in metrics}
  stds = {metric: [] for metric in metrics}

  for _, row in df_result.iterrows():
    reports = []
    for fold in fold_columns:
      # NOTE(review): eval() executes whatever text is stored in the results
      # file; only use this on files produced by this notebook.
      reports.append(eval(row[fold]))

    for metric in metrics:
      fold_values = [report['weighted avg'][metric] for report in reports]
      means[metric].append(np.mean(fold_values))
      stds[metric].append(np.std(fold_values))

  for metric in metrics:
    df_result[metric + '-mean'] = means[metric]
    df_result[metric + '-std'] = stds[metric]

  return max(df_result['f1-score-mean'])

def regression_results(ocsvm, df_train, df_test, df_out, column):
  """Score popularity predictions derived from OCSVM decision values (dataframe inputs).

  Thin wrapper: the feature lists are taken from `column` and the true
  popularity from the 'popularity' column of the test and outlier frames;
  the computation itself is shared with `regression_results_inductive`.

  Returns:
    (mse, mae, r2) over the test rows followed by the outlier rows.
  """
  return regression_results_inductive(
      ocsvm,
      df_train[column].to_list(),
      df_test[column].to_list(),
      df_out[column].to_list(),
      df_test['popularity'],
      df_out['popularity'],
  )

def metrics_regression(train_test, df_out, column, ocsvm):
  """Fit `ocsvm` on each training fold and aggregate the regression metrics.

  `regression_results` returns (mse, mae, r2); the values are unpacked under
  those names here (they used to be bound to swapped names). The order of the
  returned tuples is unchanged.

  Returns:
    ((mean_mse, mean_mae, mean_r2), (std_mse, std_mae, std_r2)) over the folds.
  """
  l_mse = []
  l_mae = []
  l_r2 = []

  for df_train, df_test in train_test:
    ocsvm.fit(df_train[column].to_list())
    mse_, mae_, r2_ = regression_results(ocsvm, df_train, df_test, df_out, column)

    l_mse.append(mse_)
    l_mae.append(mae_)
    l_r2.append(r2_)

  return (np.mean(l_mse), np.mean(l_mae), np.mean(l_r2)), (np.std(l_mse), np.std(l_mae), np.std(l_r2))


def regression_results_inductive(ocsvm, x_train, x_test, x_out, y_test, y_out):
  """Map OCSVM decision values to a 0-100 popularity estimate and score it.

  Decision values of train, test and outlier samples are pooled. Negative
  values are min-max scaled into [0, 0.5] and non-negative values into
  [0.5, 0.89], each range fitted on the pooled values of that sign; the result
  is multiplied by 100. Only the test and outlier samples are scored.

  A scaler is fitted only when the scored samples contain values of that
  sign, so a run in which every decision value has the same sign no longer
  fails while fitting on an empty array.

  Args:
    ocsvm: fitted estimator exposing `decision_function`.
    x_train, x_test, x_out: feature collections for the three sample groups.
    y_test, y_out: pandas Series with the true popularity of the test and
      outlier samples.

  Returns:
    (mse, mae, r2) between the true and the estimated popularity.
  """
  distances_train = ocsvm.decision_function(x_train)
  distances_int = ocsvm.decision_function(x_test)
  distances_out = ocsvm.decision_function(x_out)

  total = np.concatenate([distances_train, distances_int, distances_out])
  scored = np.concatenate([distances_int, distances_out])

  predicted_popularity = np.empty(len(scored), dtype=float)
  is_negative = scored < 0

  # (samples to transform, pooled values used to fit the scaler, target range)
  partitions = (
      (is_negative, total[total < 0], (0, 0.5)),
      (~is_negative, total[total >= 0], (0.5, 0.89)),
  )
  for selector, pooled_values, feature_range in partitions:
    if not selector.any():
      continue
    scaler = MinMaxScaler(feature_range=feature_range).fit(pooled_values.reshape(-1, 1))
    predicted_popularity[selector] = scaler.transform(scored[selector].reshape(-1, 1)).ravel() * 100

  real_popularity = pd.concat([y_test, y_out]).to_list()
  m1 = mse(real_popularity, predicted_popularity)
  m2 = mae(real_popularity, predicted_popularity)
  m3 = r2(real_popularity, predicted_popularity)

  return m1,m2,m3

def metrics_regression_inductive(reps, ocsvm, popularities):
  """Fit `ocsvm` per fold on precomputed representations and aggregate metrics.

  Args:
    reps: sequence of (x_train, x_test, x_out) feature collections, one per fold.
    ocsvm: estimator exposing `fit`; it is re-fitted on each fold's `x_train`.
    popularities: sequence aligned with `reps`; for fold `k`,
      `popularities[k][1]` is used as the test target and `popularities[k][2]`
      as the out target (index 0 is not read here).

  Returns:
    ((mean_mse, mean_mae, mean_r2), (std_mse, std_mae, std_r2)) over the folds.
    MSE comes first because `regression_results_inductive` returns
    (mse, mae, r2).
  """
  fold_mse = []
  fold_mae = []
  fold_r2 = []

  for fold, (x_train, x_test, x_out) in enumerate(reps):
    y_test = popularities[fold][1]
    y_out = popularities[fold][2]
    ocsvm.fit(x_train)

    # Unpack in the order the helper returns its values (mse, mae, r2) so the
    # first slot is not misread as MAE.
    mse_, mae_, r2_ = regression_results_inductive(ocsvm, x_train, x_test, x_out, y_test, y_out)

    fold_mse.append(mse_)
    fold_mae.append(mae_)
    fold_r2.append(r2_)

  return (np.mean(fold_mse), np.mean(fold_mae), np.mean(fold_r2)), (np.std(fold_mse), np.std(fold_mae), np.std(fold_r2))

## BERT

In [None]:
!gdown 1-HHur2wgx1Ae889EkkZEfv20HdnCC-Rm

# Load the saved OCSVM results table for the BERT features (semicolon-separated CSV).
df_bert = pd.read_csv('BERT_OCSVM.csv', sep=';')

# show_results is defined outside this section; its return value is compared
# against 'f1-score-mean' below (presumably the best mean F1 -- TODO confirm).
f1_max = show_results(df_bert)

# Display the row(s) whose mean F1 equals that value.
df_bert[df_bert['f1-score-mean'] == f1_max]

Downloading...
From: https://drive.google.com/uc?id=1-HHur2wgx1Ae889EkkZEfv20HdnCC-Rm
To: /content/BERT_OCSVM.csv
  0% 0.00/73.7k [00:00<?, ?B/s]100% 73.7k/73.7k [00:00<00:00, 64.3MB/s]


Unnamed: 0,Parameters,fold1,fold2,fold3,fold4,fold5,f1-score-mean,f1-score-std,recall-mean,recall-std,precision-mean,precision-std
23,OCSVM_RBF_0.5_auto_,"{'-1': {'precision': 0.4943820224719101, 'reca...","{'-1': {'precision': 0.49624060150375937, 'rec...","{'-1': {'precision': 0.4835164835164835, 'reca...","{'-1': {'precision': 0.49624060150375937, 'rec...","{'-1': {'precision': 0.4870848708487085, 'reca...",0.49757,0.005395,0.497568,0.005375,0.497672,0.005311


In [None]:
# Hyper-parameters match the 'OCSVM_RBF_0.5_auto_' row selected in the previous cell.
ocsvm = OCSVM(kernel='rbf', nu=0.5, gamma='auto')

# dfs is defined outside this section; it provides the (train, test) fold pairs and df_out.
train_test, df_out = dfs(graph_filtered)

column = 'BERT'

# Displayed result: ((mean MSE, mean MAE, mean R2), (std MSE, std MAE, std R2)).
metrics_regression(train_test, df_out, column, ocsvm)

((1012.1945039867596, 25.41068699413478, -0.3171302669360222),
 (18.89998789626124, 0.27364696322175097, 0.03190108261905054))

## BERT + art features


In [None]:
!gdown 1-I9iXy6QjYU4rN_FYPO11GZbyNwnDLGk

# Load the saved OCSVM results table for the BERT + artist-feature representation.
df_bert_song = pd.read_csv('BERT-ARTF_OCSVM.csv', sep=';')

# show_results is defined outside this section; its return value is compared
# against 'f1-score-mean' below (presumably the best mean F1 -- TODO confirm).
f1_max = show_results(df_bert_song)

# Display the row(s) whose mean F1 equals that value.
df_bert_song[df_bert_song['f1-score-mean'] == f1_max]

Downloading...
From: https://drive.google.com/uc?id=1-I9iXy6QjYU4rN_FYPO11GZbyNwnDLGk
To: /content/BERT-ARTF_OCSVM.csv
  0% 0.00/73.7k [00:00<?, ?B/s]100% 73.7k/73.7k [00:00<00:00, 66.1MB/s]


Unnamed: 0,Parameters,fold1,fold2,fold3,fold4,fold5,f1-score-mean,f1-score-std,recall-mean,recall-std,precision-mean,precision-std
23,OCSVM_RBF_0.5_auto_,"{'-1': {'precision': 0.4943820224719101, 'reca...","{'-1': {'precision': 0.4962686567164179, 'reca...","{'-1': {'precision': 0.4855072463768116, 'reca...","{'-1': {'precision': 0.4981132075471698, 'reca...","{'-1': {'precision': 0.4889705882352941, 'reca...",0.498674,0.004871,0.498691,0.004827,0.498819,0.004743


In [None]:
# Hyper-parameters match the 'OCSVM_RBF_0.5_auto_' row selected in the previous cell.
ocsvm = OCSVM(kernel='rbf', nu=0.5, gamma='auto')

# dfs is defined outside this section; it provides the (train, test) fold pairs and df_out.
train_test, df_out = dfs(graph_filtered)

column = 'BERT+ART_F'

# Displayed result: ((mean MSE, mean MAE, mean R2), (std MSE, std MAE, std R2)).
metrics_regression(train_test, df_out, column, ocsvm)

((1010.8974573187563, 25.389208896168153, -0.315437630068791),
 (19.354805629610645, 0.2774460994166107, 0.03214523491429133))

## GNN song

In [None]:
!gdown 1-83x5UjlnR3O39yp1ZBThxt-eG5Rn7q-

# Load the saved OCSVM results table for the GNN song embeddings.
df_gnn_song = pd.read_csv('GNN-song_OCSVM.csv', sep=';')

# show_results is defined outside this section; its return value is compared
# against 'f1-score-mean' below (presumably the best mean F1 -- TODO confirm).
f1_max = show_results(df_gnn_song)

# Display the row(s) whose mean F1 equals that value.
df_gnn_song[df_gnn_song['f1-score-mean'] == f1_max]

Downloading...
From: https://drive.google.com/uc?id=1-83x5UjlnR3O39yp1ZBThxt-eG5Rn7q-
To: /content/GNN-song_OCSVM.csv
  0% 0.00/14.3M [00:00<?, ?B/s]100% 14.3M/14.3M [00:00<00:00, 162MB/s]


Unnamed: 0,Parameters,fold1,fold2,fold3,fold4,fold5,f1-score-mean,f1-score-std,recall-mean,recall-std,precision-mean,precision-std
3410,"OCSVM_RBF_0.4_auto_[[256, 128, 64], 16, 0.01, 50]","{'-1': {'precision': 0.6006825938566553, 'reca...","{'-1': {'precision': 0.6227106227106227, 'reca...","{'-1': {'precision': 0.6245353159851301, 'reca...","{'-1': {'precision': 0.6099290780141844, 'reca...","{'-1': {'precision': 0.6236559139784946, 'reca...",0.627121,0.007166,0.627386,0.006894,0.628162,0.006371


In [None]:
# Hyper-parameters match the 'OCSVM_RBF_0.4_auto_[[256, 128, 64], 16, 0.01, 50]'
# row selected in the previous cell.
ocsvm = OCSVM(kernel='rbf', nu=0.4, gamma='auto')

column = 'emb_song'

# Take the first df_rep row with n_neurons=[256, 128, 64], output_len=16, lr=0.01,
# epoch=50 and store its 'emd_song' value as the 'emb_song' column.
# NOTE(review): this adds a column to graph_filtered in place; the source column
# is spelled 'emd_song' while the target is 'emb_song'.
graph_filtered['emb_song'] = df_rep[(df_rep.n_neurons.apply(lambda x: x==[256, 128, 64])) & (df_rep.output_len == 16) & (df_rep.lr == 0.01) & (df_rep.epoch == 50)]['emd_song'].iloc[0]

# Folds are rebuilt after the assignment so they are derived from the updated frame.
train_test, df_out = dfs(graph_filtered)

# Displayed result: ((mean MSE, mean MAE, mean R2), (std MSE, std MAE, std R2)).
metrics_regression(train_test, df_out, column, ocsvm)

((891.7325717064148, 22.791888231888954, -0.16030990034128534),
 (10.381181773833712, 0.2099806069243001, 0.018564117449104824))

## GNN song art


In [None]:
!gdown 1-2VWFKuULi3ZhKTBJQicMCCnW6Ch23m0
# Load the saved OCSVM results table for the GNN song + artist embeddings.
df_gnn_song_art = pd.read_csv('GNN-song-art_OCSVM.csv', sep=';')

# show_results is defined outside this section; its return value is compared
# against 'f1-score-mean' below (presumably the best mean F1 -- TODO confirm).
f1_max = show_results(df_gnn_song_art)

# Only the Parameters string of the first matching row is displayed here.
df_gnn_song_art[df_gnn_song_art['f1-score-mean'] == f1_max].Parameters.iloc[0]

Downloading...
From: https://drive.google.com/uc?id=1-2VWFKuULi3ZhKTBJQicMCCnW6Ch23m0
To: /content/GNN-song-art_OCSVM.csv
  0% 0.00/14.3M [00:00<?, ?B/s]100% 14.3M/14.3M [00:00<00:00, 227MB/s]


'OCSVM_RBF_0.0001_scale_[[256, 128, 64], 32, 0.01, 400]'

In [None]:
# Hyper-parameters match the 'OCSVM_RBF_0.0001_scale_[[256, 128, 64], 32, 0.01, 400]'
# string displayed by the previous cell.
ocsvm = OCSVM(kernel='rbf', nu=0.0001, gamma='scale')

column = 'emb_song_art'

# Take the first df_rep row with n_neurons=[256, 128, 64], output_len=32, lr=0.01,
# epoch=400 and store its 'concat' value as the 'emb_song_art' column.
# NOTE(review): this adds a column to graph_filtered in place.
graph_filtered['emb_song_art'] = df_rep[(df_rep.n_neurons.apply(lambda x: x==[256, 128, 64])) & (df_rep.output_len == 32) & (df_rep.lr == 0.01) & (df_rep.epoch == 400)]['concat'].iloc[0]

# Folds are rebuilt after the assignment so they are derived from the updated frame.
train_test, df_out = dfs(graph_filtered)

# Displayed result: ((mean MSE, mean MAE, mean R2), (std MSE, std MAE, std R2)).
metrics_regression(train_test, df_out, column, ocsvm)

((625.2384392288848, 18.236797914442242, 0.18659308102379912),
 (31.021454426983123, 0.3055824696834845, 0.03848729966830608))

## GNN inductive song art


In [None]:
!gdown 1-5LShCco2QRWS6DpflROOFjUMwbCoJ15

# Load the saved OCSVM results table for the inductive GNN song + artist embeddings.
df_gnn_song_art_ind = pd.read_csv('GNN-song-art-Inductive_OCSVM.csv', sep=';')

# show_results is defined outside this section; its return value is compared
# against 'f1-score-mean' below (presumably the best mean F1 -- TODO confirm).
f1_max = show_results(df_gnn_song_art_ind)

# Display the row(s) whose mean F1 equals that value.
df_gnn_song_art_ind[df_gnn_song_art_ind['f1-score-mean'] == f1_max]

Downloading...
From: https://drive.google.com/uc?id=1-5LShCco2QRWS6DpflROOFjUMwbCoJ15
To: /content/GNN-song-art-Inductive_OCSVM.csv
100% 11.9M/11.9M [00:00<00:00, 32.9MB/s]


Unnamed: 0,Parameters,fold1,fold2,fold3,fold4,fold5,f1-score-mean,f1-score-std,recall-mean,recall-std,precision-mean,precision-std
2850,"OCSVM_RBF_0.4_auto_[256, 128, 64]_32_0.01_500","{'-1': {'precision': 0.5620437956204379, 'reca...","{'-1': {'precision': 0.5663082437275986, 'reca...","{'-1': {'precision': 0.5403726708074534, 'reca...","{'-1': {'precision': 0.5734265734265734, 'reca...","{'-1': {'precision': 0.5677655677655677, 'reca...",0.570939,0.011129,0.572022,0.009598,0.57324,0.008518


In [None]:
# Build the per-fold inputs for the inductive GNN. The helpers used here
# (train_test_pragh_OCL_music, create_mapping, generate_positive_negative,
# prepare_for_GNN) are defined outside this section.
train_test, df_out  = train_test_pragh_OCL_music(graph_filtered)
df_out = create_mapping(df_out)
df_out = generate_positive_negative(df_out)
train_test_out_graph =[]
for df_train, df_test in train_test:
  # Only the loop variables are rebound; the entries stored in train_test are not replaced.
  df_train = create_mapping(df_train)
  df_test = create_mapping(df_test)
  df_train = generate_positive_negative(df_train)
  df_test = generate_positive_negative(df_test)

  # The processed training frame is appended to prepare_for_GNN's result, so each
  # entry is later unpacked as (train graph, test graph, out graph, df_train).
  l = prepare_for_GNN(df_train, df_test, df_out)
  l.append(df_train)
  train_test_out_graph.append(l)

In [None]:
# GNN settings matching the '[256, 128, 64]_32_0.01_500' suffix of the Parameters
# string in the results row shown above.
epoch = 500
lr = 0.01
n = [256, 128, 64]
out = 32

# One [train, test, out] triple of concatenated representations per fold.
representations_song_art = []
for train_graph_data, test_graph_data, out_graph_data, df_train in train_test_out_graph:
  # train_gnn and set_learned_features are defined outside this section; the first
  # value returned by train_gnn is discarded.
  _, emd_song, emd_art, test_embeddings, out_embeddings = train_gnn(epoch, lr, n, out, train_graph_data, True, test_graph_data, out_graph_data, df_train)

  tmp_song_train, tmp_art_train, tmp_concat_train = set_learned_features(emd_song, emd_art)

  tmp_song_test, tmp_art_test, tmp_concat_test = set_learned_features(test_embeddings["song"], test_embeddings['artist'])

  tmp_song_out, tmp_art_out, tmp_concat_out = set_learned_features(out_embeddings["song"], out_embeddings['artist'])

  # Only the third value from set_learned_features (the tmp_concat_* variables) is kept.
  representations_song_art.append([tmp_concat_train, tmp_concat_test,tmp_concat_out])

In [None]:
# Per-fold targets in the layout metrics_regression_inductive indexes:
# [train popularity, test popularity, out popularity].
# NOTE(review): the Series come from train_test / df_out as currently bound; this
# assumes their row order matches the representations built above -- confirm.
popularities = []
for df_train, df_test in train_test:
  popularities.append([df_train['popularity'], df_test['popularity'], df_out['popularity']])

# OCSVM settings matching the 'OCSVM_RBF_0.4_auto_' prefix of the selected row.
ocsvm = OCSVM(kernel='rbf', nu=0.4, gamma='auto')

# Displayed result: ((mean MSE, mean MAE, mean R2), (std MSE, std MAE, std R2)).
metrics_regression_inductive(representations_song_art, ocsvm, popularities)

((954.6839443516025, 24.126530155713848, -0.24227241675679476),
 (27.568451723825707, 0.290163876458278, 0.040006661064793295))

## GNN inductive song

In [None]:
!gdown 1EmWXY5MPcdjpaCeOdJB80OsPLNbvGof9

# Load the saved OCSVM results table for the inductive GNN song embeddings.
df_gnn_song_ind = pd.read_csv('GNN-song-Inductive_OCSVM.csv', sep=';')

# show_results is defined outside this section; its return value is compared
# against 'f1-score-mean' below (presumably the best mean F1 -- TODO confirm).
f1_max = show_results(df_gnn_song_ind)

# Display the row(s) whose mean F1 equals that value.
df_gnn_song_ind[df_gnn_song_ind['f1-score-mean'] == f1_max]

Downloading...
From: https://drive.google.com/uc?id=1EmWXY5MPcdjpaCeOdJB80OsPLNbvGof9
To: /content/GNN-song-Inductive_OCSVM.csv
  0% 0.00/11.8M [00:00<?, ?B/s]100% 11.8M/11.8M [00:00<00:00, 193MB/s]


Unnamed: 0,Parameters,fold1,fold2,fold3,fold4,fold5,f1-score-mean,f1-score-std,recall-mean,recall-std,precision-mean,precision-std
2752,"OCSVM_RBF_0.4_scale_[256, 128, 64]_32_0.01_100","{'-1': {'precision': 0.5841584158415841, 'reca...","{'-1': {'precision': 0.5714285714285714, 'reca...","{'-1': {'precision': 0.5158227848101266, 'reca...","{'-1': {'precision': 0.581081081081081, 'recal...","{'-1': {'precision': 0.6181818181818182, 'reca...",0.58567,0.035085,0.587373,0.033868,0.589371,0.03349


In [None]:
# Build the per-fold inputs for the inductive GNN. The helpers used here
# (train_test_pragh_OCL_music, create_mapping, generate_positive_negative,
# prepare_for_GNN) are defined outside this section.
# NOTE(review): this cell repeats the fold preparation of the previous section
# statement for statement; a shared helper function would avoid the duplication.
train_test, df_out  = train_test_pragh_OCL_music(graph_filtered)
df_out = create_mapping(df_out)
df_out = generate_positive_negative(df_out)
train_test_out_graph =[]
for df_train, df_test in train_test:
  # Only the loop variables are rebound; the entries stored in train_test are not replaced.
  df_train = create_mapping(df_train)
  df_test = create_mapping(df_test)
  df_train = generate_positive_negative(df_train)
  df_test = generate_positive_negative(df_test)

  # The processed training frame is appended to prepare_for_GNN's result, so each
  # entry is later unpacked as (train graph, test graph, out graph, df_train).
  l = prepare_for_GNN(df_train, df_test, df_out)
  l.append(df_train)
  train_test_out_graph.append(l)

In [None]:
# GNN settings matching the '[256, 128, 64]_32_0.01_100' suffix of the Parameters
# string in the results row shown above.
epoch = 100
lr = 0.01
n = [256, 128, 64]
out = 32

# One [train, test, out] triple of song representations per fold.
representations_song = []

for train_graph_data, test_graph_data, out_graph_data, df_train in train_test_out_graph:
  # train_gnn and set_learned_features are defined outside this section; the first
  # value returned by train_gnn is discarded.
  _, emd_song, emd_art, test_embeddings, out_embeddings = train_gnn(epoch, lr, n, out, train_graph_data, True, test_graph_data, out_graph_data, df_train)

  tmp_song_train, tmp_art_train, tmp_concat_train = set_learned_features(emd_song, emd_art)

  tmp_song_test, tmp_art_test, tmp_concat_test = set_learned_features(test_embeddings["song"], test_embeddings['artist'])

  tmp_song_out, tmp_art_out, tmp_concat_out = set_learned_features(out_embeddings["song"], out_embeddings['artist'])

  # Only the first value from set_learned_features (the tmp_song_* variables) is kept.
  representations_song.append([tmp_song_train,tmp_song_test,tmp_song_out])

In [None]:
# Per-fold targets in the layout metrics_regression_inductive indexes:
# [train popularity, test popularity, out popularity].
# NOTE(review): the Series come from train_test / df_out as currently bound; this
# assumes their row order matches the representations built above -- confirm.
popularities = []
for df_train, df_test in train_test:
  popularities.append([df_train['popularity'], df_test['popularity'], df_out['popularity']])

# OCSVM settings matching the 'OCSVM_RBF_0.4_scale_' prefix of the selected row.
ocsvm = OCSVM(kernel='rbf', nu=0.4, gamma='scale')

# Displayed result: ((mean MSE, mean MAE, mean R2), (std MSE, std MAE, std R2)).
metrics_regression_inductive(representations_song, ocsvm, popularities)

((980.8785065086342, 24.98490566566649, -0.2759758769178193),
 (31.620299518077157, 0.5568877846302609, 0.03213786229123085))

## SVR

In [None]:
from sklearn.svm import SVR

def metrics_regression_svr(train_test, df_out, column, C=50, epsilon=0.2):
  """Train an SVR per fold on `column` and aggregate its regression metrics.

  For every fold a fresh `SVR(C=C, epsilon=epsilon)` is fitted on the training
  features against the training 'popularity', then evaluated on the fold's
  test rows followed by all rows of `df_out`.

  Args:
    train_test: iterable of (df_train, df_test) DataFrame pairs, one per fold.
    df_out: DataFrame whose rows are appended to every fold's evaluation set.
    column: name of the DataFrame column that holds the feature vectors.
    C: SVR regularisation parameter (default 50, the previously fixed value).
    epsilon: SVR epsilon-tube width (default 0.2, the previously fixed value).

  Returns:
    Flat tuple (mean_mse, mean_mae, mean_r2, std_mse, std_mae, std_r2) over
    the folds.
  """
  # df_out does not change between folds, so extract its features and targets once.
  x_out = df_out[column].to_list()
  y_out = df_out['popularity']

  fold_mse = []
  fold_mae = []
  fold_r2 = []

  for df_train, df_test in train_test:
    svr = SVR(C=C, epsilon=epsilon)
    svr.fit(df_train[column].to_list(), df_train['popularity'].to_list())

    y_pred_test = svr.predict(df_test[column].to_list())
    y_pred_out = svr.predict(x_out)

    # Same ordering on both sides: test rows first, then out rows.
    real_popularity = pd.concat([df_test['popularity'], y_out]).to_list()
    predicted_popularity = np.concatenate([y_pred_test, y_pred_out])

    # Each list is named after the metric it actually stores.
    fold_mse.append(mse(real_popularity, predicted_popularity))
    fold_mae.append(mae(real_popularity, predicted_popularity))
    fold_r2.append(r2(real_popularity, predicted_popularity))

  return np.mean(fold_mse), np.mean(fold_mae), np.mean(fold_r2), np.std(fold_mse), np.std(fold_mae), np.std(fold_r2)

In [None]:
# Rebuild the folds with dfs (defined outside this section); the SVR cells that
# follow reuse train_test and df_out from this cell.
train_test, df_out = dfs(graph_filtered)

column = 'BERT'

# Displayed result is a flat tuple:
# (mean MSE, mean MAE, mean R2, std MSE, std MAE, std R2).
metrics_regression_svr(train_test, df_out, column)

(1255.0443720328378,
 25.224657763506297,
 -0.6332318464664995,
 11.28455371901567,
 0.11569832095889726,
 0.033909747026962846)

In [None]:
# Reuses train_test and df_out from the earlier SVR cell; only the feature column changes.
column = 'BERT+ART_F'

# Displayed result: (mean MSE, mean MAE, mean R2, std MSE, std MAE, std R2).
metrics_regression_svr(train_test, df_out, column)

(1256.6388992811383,
 25.229131216607946,
 -0.635299214267615,
 10.935430647542585,
 0.11388676979349352,
 0.03337651883386881)

In [None]:
# Reuses train_test and df_out from the earlier SVR cell; only the feature column changes.
column = 'emb_song'

# Displayed result: (mean MSE, mean MAE, mean R2, std MSE, std MAE, std R2).
metrics_regression_svr(train_test, df_out, column)

(1255.069578315764,
 25.581796310295818,
 -0.6332380627797158,
 8.618644208435542,
 0.0969082971687331,
 0.031186705499168313)

In [None]:
# Reuses train_test and df_out from the earlier SVR cell; only the feature column changes.
column = 'emb_song_art'

# Displayed result: (mean MSE, mean MAE, mean R2, std MSE, std MAE, std R2).
metrics_regression_svr(train_test, df_out, column)

(1260.0312016428918,
 25.440309699559716,
 -0.6396637332139317,
 7.8672704081390465,
 0.16647156588750092,
 0.02927322622073044)