# Read dataset base
- [PMEmo dataset](https://github.com/HuiZhangDB/PMEmo).

In [None]:
import ast

import pandas as pd

df = pd.read_csv("dataset_base.tsv", sep='\t')
# The vector columns are stored in the TSV as stringified Python lists.
# ast.literal_eval parses literals only, so a tampered data file cannot run
# arbitrary code the way a bare eval() would.
df.acoustic = df.acoustic.apply(ast.literal_eval)
df.bert = df.bert.apply(ast.literal_eval)
df.tail(3)

Unnamed: 0,musicId,acoustic,lyrics,arousal,valence,bert
602,985,"[7.818191000000001, 0.4787813, 0.0, 3.404724, ...",I live my day as if it was the last Live...,positive,positive,"[-0.0325904675, 0.0452854037, -0.0056039067, 0..."
603,993,"[8.378411999999999, 0.6805897, 0.0, 2.728126, ...",Waiting for the time to pass you by Hope the w...,positive,positive,"[0.0326712728, -0.00807604566, -0.0398680158, ..."
604,996,"[7.961765, 0.9235694, 0.0, 3.891528, 5.317545,...",I'm facing the battle Through all of my probl...,positive,neutral,"[-0.0557158515, 0.0489445478, -0.0287081152, -..."


# Read data splits

In [None]:
import ast

import pandas as pd

data_splits = pd.read_csv("data_splits.tsv", sep='\t')
# Index lists are stored as stringified Python lists; parse them with
# ast.literal_eval (literals only) rather than eval, which would execute any
# expression found in the file.
data_splits.TRAIN_INDEX = data_splits.TRAIN_INDEX.apply(ast.literal_eval)
data_splits.TEST_INDEX = data_splits.TEST_INDEX.apply(ast.literal_eval)
data_splits.tail(3)

Unnamed: 0,TRAIN_INDEX,TEST_INDEX
97,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[34, 45, 47, 54, 58, 59, 67, 68, 72, 80, 88, 1..."
98,"[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 23, 40, 46, 48, 55, 70, 85, 108, 113, 120,..."
99,"[0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15...","[7, 10, 29, 32, 35, 42, 49, 50, 51, 56, 57, 60..."


# Create features by clustering the audio and textual modalities

In [None]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
import numpy as np

def get_audio_cluster(df_train, df_test, k):
  """Add a K-Means cluster id of the acoustic vectors as ``audio_features``.

  The model is fitted on the training rows only; test rows are labelled with
  the fitted model's ``predict``.

  Args:
    df_train: frame with an ``acoustic`` column of equal-length vectors.
    df_test: frame with the same ``acoustic`` column.
    k: number of clusters.

  Returns:
    ``(df_train, df_test)`` as new frames carrying the extra
    ``audio_features`` column. The frames passed in are left untouched.
  """
  kmeans = KMeans(n_clusters=k, random_state=13, init='random').fit(df_train.acoustic.to_list())
  # assign() builds new frames instead of writing into the arguments, which
  # are typically slices of a larger frame (SettingWithCopyWarning otherwise).
  df_train = df_train.assign(audio_features=kmeans.labels_)
  df_test = df_test.assign(audio_features=kmeans.predict(df_test.acoustic.to_list()))
  return df_train, df_test

def get_bert_cluster(df_train, df_test, k):
  """Add a K-Means cluster id of the BERT vectors as ``text_features``.

  The model is fitted on the training rows only; test rows are labelled with
  the fitted model's ``predict``.

  Args:
    df_train: frame with a ``bert`` column of equal-length vectors.
    df_test: frame with the same ``bert`` column.
    k: number of clusters.

  Returns:
    ``(df_train, df_test)`` as new frames carrying the extra
    ``text_features`` column. The frames passed in are left untouched.
  """
  kmeans = KMeans(n_clusters=k, random_state=13, init='random').fit(df_train.bert.to_list())
  # assign() builds new frames instead of writing into the arguments, which
  # are typically slices of a larger frame (SettingWithCopyWarning otherwise).
  df_train = df_train.assign(text_features=kmeans.labels_)
  df_test = df_test.assign(text_features=kmeans.predict(df_test.bert.to_list()))
  return df_train, df_test


In [None]:
# Use the first row of data_splits as the train/test partition.
first_split = data_splits.iloc[0]
train_index = first_split.TRAIN_INDEX
test_index = first_split.TEST_INDEX

# Select rows whose index label appears in each partition, then attach the
# audio and text cluster ids (3 clusters each).
in_train = df.index.isin(train_index)
in_test = df.index.isin(test_index)
df_train, df_test = get_audio_cluster(df[in_train], df[in_test], 3)
df_train, df_test = get_bert_cluster(df_train, df_test, 3)

In [None]:
# Preview the first training rows, including the two cluster-id columns.
df_train.head(n=3)

Unnamed: 0,musicId,acoustic,lyrics,arousal,valence,bert,audio_features,text_features
1,5,"[8.152512, 0.3680324, 0.0, 1.404577, 1.969597,...","{Hook} Young Savage, why you trappin' so hard?...",negative,negative,"[-0.00151691, 0.04835999, -0.00339831, -0.0051...",2,2
2,6,"[8.527122, 0.2817285, 0.0, 2.106767, 3.345879,...","Yeah, Yeah, Yeah I pull up roll the window dow...",neutral,negative,"[0.0164210964, 0.0812988877, -0.026285233, 0.0...",2,2
3,9,"[8.879118, 0.5097954, 0.0, 2.919309, 4.152384,...","Bitch, I'm really timeless I cannot waste no...",negative,negative,"[-0.015698934, 0.103245117, 0.0547486469, -0.0...",1,2


# Create heterogeneous graph with music features

In [None]:
import networkx as nx

def create_graph(G, df, label, split):
  """Add one music node per row of ``df`` and link it to its cluster nodes.

  Each row produces a ``'<musicId>:music'`` node connected to
  ``'<audio_features>:audio'`` and ``'<text_features>:text'``. The music node
  stores the row's ``label`` value, its ``acoustic`` and ``bert`` vectors and
  the given ``split`` tag.

  Args:
    G: graph to extend (modified in place).
    df: frame with ``musicId``, ``audio_features``, ``text_features``,
      ``acoustic``, ``bert`` and ``label`` columns.
    label: name of the column copied onto each music node.
    split: value stored under the ``'split'`` attribute.

  Returns:
    The same graph ``G``.
  """
  for _, song in df.iterrows():
    music_node = f"{song['musicId']}:music"
    cluster_nodes = (
        f"{song['audio_features']}:audio",
        f"{song['text_features']}:text",
    )
    for cluster_node in cluster_nodes:
      G.add_edge(music_node, cluster_node)
    G.nodes[music_node].update({
        label: song[label],
        'acoustic': song['acoustic'],
        'bert': song['bert'],
        'split': split,
    })
  return G

# Build one graph holding both partitions; each music node is tagged with the
# partition it came from.
G = nx.Graph()
for split_name, split_frame in (('train', df_train), ('test', df_test)):
  G = create_graph(G=G, df=split_frame, label="arousal", split=split_name)


# Graph regularization

In [None]:
import numpy as np
from tqdm.notebook import tqdm
import random


def regularization(G, dim, embedding_feature: str = 'embedding', iterations=3):
    """Propagate node vectors over ``G`` by repeated neighbour averaging.

    Nodes that carry ``embedding_feature`` act as anchors: their vector ``'f'``
    is reset to that attribute on every visit. Every other node's ``'f'``
    becomes the unweighted mean of its neighbours' current ``'f'`` vectors.
    Nodes are visited in a freshly shuffled order on each iteration and are
    updated immediately, so later nodes in a sweep see earlier updates.

    Args:
        G: graph exposing ``nodes()``, ``nodes[n]`` attribute dicts and
            ``neighbors(n)``. It is modified in place.
        dim: length of the zero vector given to nodes without
            ``embedding_feature``.
        embedding_feature: name of the node attribute holding the fixed vectors.
        iterations: number of sweeps over all nodes.

    Returns:
        The same graph ``G``; every node has ``'f'`` and
        ``'f_' + embedding_feature`` set to its final vector.
    """
    nodes = list(G.nodes())

    for node in nodes:
        attrs = G.nodes[node]
        if embedding_feature in attrs:
            attrs['f'] = np.array(attrs[embedding_feature]) * 1.0
        else:
            attrs['f'] = np.zeros(dim)

    pbar = tqdm(range(0, iterations))

    for iteration in pbar:
        random.shuffle(nodes)
        energy = 0.0

        for node in nodes:
            attrs = G.nodes[node]
            f_old = attrs['f']

            if embedding_feature in attrs:
                # Anchored node: its vector is fixed, so skip the averaging.
                f_new = np.array(attrs[embedding_feature])
            else:
                f_new = np.zeros(dim)
                degree = 0
                for neighbor in G.neighbors(node):
                    f_new += G.nodes[neighbor]['f']
                    degree += 1
                if degree:
                    f_new /= degree
                else:
                    # No neighbours to average: keep the current vector rather
                    # than dividing by zero and filling the node with NaN.
                    f_new = f_old

            attrs['f'] = f_new
            attrs['f_' + embedding_feature] = f_new
            # Measure the change actually applied to the node, so anchored
            # nodes contribute nothing and the energy can fall towards zero.
            energy += np.linalg.norm(f_new - f_old)

        message = 'Iteration '+str(iteration + 1)+' | Energy = '+str(energy)
        pbar.set_description(message)

    return G

In [None]:
# dim=6373 is presumably the length of each 'acoustic' vector - confirm
# against the dataset.
regularization(G, dim=6373, embedding_feature='acoustic', iterations=30)

  0%|          | 0/30 [00:00<?, ?it/s]

<networkx.classes.graph.Graph at 0x7fc8713ec250>

In [None]:
# dim=512 is presumably the length of each 'bert' vector - confirm against
# the dataset.
regularization(G, dim=512, embedding_feature='bert', iterations=30)

  0%|          | 0/30 [00:00<?, ?it/s]

<networkx.classes.graph.Graph at 0x7fc8713ec250>

# Save and read graph

In [None]:
import pickle

# nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump, so the file written here uses the same format.
with open('graph.nx', 'wb') as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.load. NOTE: unpickling can execute arbitrary code, so only load
# graph files you created yourself.
with open('graph.nx', 'rb') as graph_file:
    G = pickle.load(graph_file)
G

<networkx.classes.graph.Graph at 0x7fc86d55b650>