In [None]:
#!pip install datasets
#!pip install SPARQLWrapper

In [1]:
from datasets import load_dataset
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import os
from rdflib import Graph, Namespace, RDF
from pyvis.network import Network
from transformers import pipeline

### Extração do dataset

In [None]:

# extrair o dataset
df = pd.read_parquet("hf://datasets/manoh2f2/tsterbak-lyrics-dataset-with-emotions/data/train-00000-of-00001.parquet")
print(df.shape)

# shuffle dos dados e reseta também o indice
df = df.sample(frac=1).reset_index(drop=True)

# redução do dataset
df = df.iloc[0:1000, :]

print(df.shape)

#tratamento do problema das letras ter este _x000D_ caracter especial
df['seq'] = df['seq'].str.replace("_x000D_", "", regex=False)


# print(df)

#x = df.iloc[2]['seq']
#print(x)


(36897, 6)
(1000, 6)


### Query do SPARQL

In [26]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
cache_file = "genre_cache.json"
# cache_str_keys é do tipo { "song||artist": genre }
if os.path.exists(cache_file):
    with open(cache_file, "r", encoding="utf-8") as f:
        cache_str_keys = json.load(f)
        print(len(cache_str_keys))
else:
    cache_str_keys = {}
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(cache_str_keys, f, ensure_ascii=False, indent=2)


def count_unique_artists(cache_file):
    with open(cache_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    # extrai o artista da chave "song||artist"
    artists = [key.split("||")[1] for key in data.keys()]
    unique_artists = set(artists)
    
    print(f"Total de artistas únicos: {len(unique_artists)}")
    return unique_artists


def get_genre2(artist_name):
    query = f"""
    SELECT ?genreLabel WHERE {{
      ?artist rdfs:label "{artist_name}"@en;
              wdt:P136 ?genre.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    if results["results"]["bindings"]:
        return results["results"]["bindings"][0]["genreLabel"]["value"]

    else:
        return "Unknown"


def get_genre(song_name,artist_name):
    
    song_name = song_name.replace('"', '').replace("'", '')
    artist_name = artist_name.replace('"', '').replace("'", '')
    query = f"""
    SELECT ?genreLabel WHERE {{
      ?artist rdfs:label "{song_name}"@en;
            wdt:P31 wd:Q7366;          # garante que é uma canção
            wdt:P136 ?genre.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    if results["results"]["bindings"]:
        return results["results"]["bindings"][0]["genreLabel"]["value"]

    else:
        return get_genre2(artist_name) # Caso não encontre género para a música em especifico, vai buscar o género do artista



def get_genre_cached(song_name, artist_name):
    key = f"{song_name}||{artist_name}"  # chave como string
    if key in cache_str_keys:
        return cache_str_keys[key]
    
    genre = get_genre(song_name, artist_name)  # faz SPARQL se não estiver
    cache_str_keys[key] = genre

    # salva a cache imediatamente no JSON
    with open("genre_cache.json", "w", encoding="utf-8") as f:
        json.dump(cache_str_keys, f, ensure_ascii=False, indent=2)

    return genre



# artist = df.iloc[2]["artist"]
# song = df.iloc[2]["song"]
# genre = get_genre(song,artist)

# print(f"Artista: {artist}")
# print(f"Género: {genre}")

#df["genre"] = df["artist"].apply(get_genre)


count_unique_artists(cache_file)
df["genre"] = [get_genre_cached(title, artist) for title, artist in zip(df["song"], df["artist"])]


df.to_csv("lyrics_with_genre.csv", index=False)



1000
Total de artistas únicos: 642


In [None]:
def emotionSelection(i, lista):
        lista_sent = list(lista[i].keys())
        first_key = lista_sent[0]
        first_key_score = lista[i][first_key]
        print(first_key)
        for j in lista_sent:
            score = lista[i][j]
            print(score)
            if(score < first_key_score/2):
                del emotion_list[i][j]                
        return
    


classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k = 3,truncation=True) #truncation faz com que corte quando for muito longo

data = pd.read_csv("lyrics_with_genre.csv")
data['seq'] = data['seq'].str.replace("_x000D_", "", regex=False)

#dataHead = data.head()
#print(dataHead)

#x = dataHead.iloc[0]['seq']

#print(classifier(x))
#print(dataHead.iloc[0]['emotions'])
#print(x)

emotion_list = []

for i,row in data.iterrows():
    temp = row['seq']

    #print(classifier(temp))

    prediction = classifier(temp)
    inner = prediction[0]   # devolve a lista interna
    d = {item['label']: item['score'] for item in inner}

    emotion_list.append(d)
    emotionSelection(i,emotion_list)

#print(emotion_list[0].keys())
#first_key = list(emotion_list[0].keys())[0]
#print(first_key)
#del emotion_list[0]['fear']
#print(emotion_list[0].keys())

data['predicted_emotions'] = emotion_list


data.to_csv("final.csv", index=False)


Device set to use cpu


sadness
0.8350851535797119
0.10378336906433105
0.03163175284862518
fear
0.6831692457199097
0.17267188429832458
0.07465752214193344
fear
0.4248541295528412
0.21741630136966705
0.1181325763463974
fear
0.5999545454978943
0.32163679599761963
0.039006754755973816
anger
0.47958657145500183
0.2513619363307953
0.15195214748382568


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataHead['predicted_emotions'] = emotion_list


### Gráfico de conhecimento

In [None]:
data = {
  "I Was Born About Ten Thousand Years Ago||Elvis Presley": "rock and roll",
  "Citadel||The Damned": "drama film",
  "Down the Drain||Down by Law": "drama film",
  "Hymn||Patti Smith": "Unknown",
  "Candy||LL Cool J": "pop music",
  "Little Birds||Dead to Fall": "indie rock",
  "Hannah||Sheila Nicholls": "Unknown",
  "Mental Slavery||Kreator": "thrash metal",
  "Playin' Dominoes and Shootin' Dice||Willie Nelson": "blues"
}

# Criar o grafo
g = Graph()

EX = Namespace("http://example.org/")
g.bind("", EX)

# Criar classes
g.add((EX.Music, RDF.type, EX.Class))
g.add((EX.Artist, RDF.type, EX.Class))
g.add((EX.Genre, RDF.type, EX.Class))
#TODO: criar classe sentimento

# Criar propriedades
g.add((EX.hasArtist, RDF.type, EX.Property))
g.add((EX.hasGenre, RDF.type, EX.Property))
#TODO: criar propriedade tem sentimento

# --------- ADICIONAR INDIVÍDUOS ---------
for key, genre in data.items():
    title, artist = key.split("||")

    # Criar URIs "limpos"
    title_uri = EX[title.replace(" ", "_").replace("/", "_").replace("\"", "").replace("'", "_")]
    artist_uri = EX[artist.replace(" ", "_").replace("/", "_").replace("\"", "")]
    genre_uri = EX[genre.replace(" ", "_").replace("/", "_")]

    # Música é indivíduo da classe Music
    g.add((title_uri, RDF.type, EX.Music))

    # Artista é indivíduo da classe Artist
    g.add((artist_uri, RDF.type, EX.Artist))

    # Género é indivíduo da classe Genre
    g.add((genre_uri, RDF.type, EX.Genre))

    # Relacionar a música com artista e género
    g.add((title_uri, EX.hasArtist, artist_uri))
    g.add((title_uri, EX.hasGenre, genre_uri))

g.serialize("musicas.ttl", format="turtle")

# Carregar o teu grafo RDFLib
g = Graph()
g.parse("musicas.ttl", format="turtle")

# --- preparar visualização PyVis com processamento mais limpo ---
net = Network(height="750px", width="100%", directed=True)
net.barnes_hut()  # layout melhor para grafos maiores

# iremos recolher tipos (Music / Artist / Genre) e depois construir nós/arestas sem triples rdf:type visíveis
class_uris = {EX.Music, EX.Artist, EX.Genre}
node_type = {}   # mapa: URI -> 'Music'|'Artist'|'Genre'

# primeiro passar pelos triples para identificar rdf:type de instâncias
for s, p, o in g:
    if p == RDF.type and o in class_uris:
        # marca o tipo da instância (s é a instância, o é a classe)
        if o == EX.Music:
            node_type[s] = "Music"
        elif o == EX.Artist:
            node_type[s] = "Artist"
        elif o == EX.Genre:
            node_type[s] = "Genre"

# depois criar nós e arestas (ignorando triples que apenas declaram as classes em si)
seen_nodes = set()

def pretty_label(uri):
    """Gera label legível: tenta qname, fallback para o último segmento do URIRef"""
    try:
        return g.qname(uri)
    except Exception:
        s = str(uri)
        return s.split("/")[-1].split("#")[-1]

# cores/grupos para PyVis (o "group" facilita legenda/estética)
group_map = {
    "Music": "music",
    "Artist": "artist",
    "Genre": "genre"
}

# Adicionar nós e arestas: para cada triple, se for rdf:type (instância->classe) já processado -> ignorar visualmente.
for s, p, o in g:
    # Só mostrar indivíduos, não classes
    if o in (EX.Music, EX.Artist, EX.Genre):
        continue
    # ignorar declarações do próprio esquema (ex.: EX.Music rdf:type EX.Class) e rdf:type ligações já processadas
    if s in {EX.Music, EX.Artist, EX.Genre}:
        continue
    if p == RDF.type and o in class_uris:
        # Não criar aresta rdf:type visível — apenas asseguramos node_type acima
        continue

    # garantir nós s e o com labels legíveis
    if s not in seen_nodes:
        lbl = pretty_label(s)
        grp = group_map.get(node_type.get(s, "Other"), "other")
        net.add_node(str(s), label=lbl, title=str(s), group=grp)
        seen_nodes.add(s)
    if o not in seen_nodes:
        lbl = pretty_label(o)
        grp = group_map.get(node_type.get(o, "Other"), "other")
        net.add_node(str(o), label=lbl, title=str(o), group=grp)
        seen_nodes.add(o)

    # adicionar aresta com rótulo do predicado (localname)
    pred_label = pretty_label(p)
    net.add_edge(str(s), str(o), label=pred_label, title=pred_label)


# PyVis aplica cores automaticamente por group.
net.set_options("""
var options = {
  "nodes": {
    "font": {"size": 14}
  },
  "edges": {
    "arrows": {"to": {"enabled": true}},
    "font": {"align": "top"}
  },
  "physics": {
    "stabilization": { "enabled": true }
  }
}
""")

net.write_html("grafico_interativo_limpo.html")