In [None]:
#!pip install datasets
#!pip install SPARQLWrapper

In [13]:
from datasets import load_dataset
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import os


### Extração do dataset

In [None]:

# Load the lyrics-with-emotions dataset straight from the Hugging Face hub.
df = pd.read_parquet("hf://datasets/manoh2f2/tsterbak-lyrics-dataset-with-emotions/data/train-00000-of-00001.parquet")
print(df.shape)

# Fixed seed so that Restart & Run All always selects the same rows; this also
# lets the on-disk genre cache built further down be reused between runs.
SEED = 42
# Number of rows kept after shuffling.
N_SAMPLES = 1000

# Shuffle the rows and reset the index.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Reduce the dataset. `.copy()` detaches the slice from the shuffled frame so the
# column assignment below does not raise SettingWithCopyWarning.
df = df.iloc[0:N_SAMPLES, :].copy()

print(df.shape)

# The lyrics contain the literal "_x000D_" marker; strip it from the text.
df['seq'] = df['seq'].str.replace("_x000D_", "", regex=False)


# print(df)

#x = df.iloc[2]['seq']
#print(x)


(36897, 6)
(1000, 6)


### Query do SPARQL

In [26]:
# Shared client for the public Wikidata SPARQL endpoint.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

# On-disk genre cache; cache_str_keys has the shape { "song||artist": genre }.
cache_file = "genre_cache.json"

if not os.path.exists(cache_file):
    # First run: start from an empty cache and create the file right away.
    cache_str_keys = {}
    with open(cache_file, "w", encoding="utf-8") as cache_handle:
        json.dump(cache_str_keys, cache_handle, ensure_ascii=False, indent=2)
else:
    # Later runs: reload what was already resolved and report how many entries exist.
    with open(cache_file, "r", encoding="utf-8") as cache_handle:
        cache_str_keys = json.load(cache_handle)
    print(len(cache_str_keys))


def count_unique_artists(cache_file):
    """Print and return the distinct artists present in a genre-cache JSON file.

    Args:
        cache_file: Path to a JSON object whose keys have the form
            "song||artist".

    Returns:
        set: The second "||"-separated field of every key.

    Raises:
        IndexError: If a key contains no "||" separator.
    """
    with open(cache_file, "r", encoding="utf-8") as handle:
        cached = json.load(handle)

    # The artist is the field right after the first "||"; the set de-duplicates.
    unique_artists = {entry.split("||")[1] for entry in cached.keys()}

    print(f"Total de artistas únicos: {len(unique_artists)}")
    return unique_artists


def _sparql_literal(text):
    """Escape ``text`` so it can sit inside a double-quoted SPARQL string literal."""
    return (
        text.replace("\\", "\\\\")  # backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


def _first_genre_label(query):
    """Run ``query`` on the shared ``sparql`` client; return the first ?genreLabel or None."""
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()["results"]["bindings"]
    if bindings:
        return bindings[0]["genreLabel"]["value"]
    return None


def get_genre2(artist_name):
    """Look up a genre (P136) for the item whose English label equals ``artist_name``.

    Args:
        artist_name: Label to match exactly against ``rdfs:label ...@en``.

    Returns:
        The English label of the first genre returned, or "Unknown" when the
        query yields no rows.
    """
    # The name is escaped rather than interpolated raw, so a quote or backslash
    # in it cannot terminate the literal and break or alter the query.
    query = f"""
    SELECT ?genreLabel WHERE {{
      ?artist rdfs:label "{_sparql_literal(artist_name)}"@en;
              wdt:P136 ?genre.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    genre = _first_genre_label(query)
    return genre if genre is not None else "Unknown"


def get_genre(song_name, artist_name):
    """Look up the genre of a song, falling back to the genre of its artist.

    Args:
        song_name: Song title, matched exactly against ``rdfs:label ...@en``
            on items that are an instance of (P31) Q7366.
        artist_name: Artist label used for the fallback lookup.

    Returns:
        The first genre label found for the song; otherwise whatever
        ``get_genre2(artist_name)`` returns ("Unknown" when nothing matches).
    """
    # Quotes and apostrophes used to be stripped from both names, which changed
    # the label being searched (e.g. "Don't Stop" became "Dont Stop") and made
    # the exact-label match fail. Escaping keeps the label intact and safe.
    #
    # NOTE(review): the song query matches on title only (the variable is named
    # ?artist but binds the song item); `artist_name` is not used to
    # disambiguate same-titled songs. Confirm whether that is intended.
    query = f"""
    SELECT ?genreLabel WHERE {{
      ?artist rdfs:label "{_sparql_literal(song_name)}"@en;
            wdt:P31 wd:Q7366;          # garante que é uma canção
            wdt:P136 ?genre.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    genre = _first_genre_label(query)
    if genre is not None:
        return genre

    # No genre found for this specific song: fall back to the artist's genre.
    return get_genre2(artist_name)



def get_genre_cached(song_name, artist_name):
    """Return the genre for a (song, artist) pair, using the JSON cache first.

    Args:
        song_name: Song title; first half of the cache key.
        artist_name: Artist name; second half of the cache key.

    Returns:
        The cached genre when the "song||artist" key is already in
        ``cache_str_keys``; otherwise the result of ``get_genre``, which is
        stored in the cache and written to ``cache_file`` before returning.
    """
    key = f"{song_name}||{artist_name}"  # string key, as required by JSON
    if key in cache_str_keys:
        return cache_str_keys[key]

    genre = get_genre(song_name, artist_name)  # runs SPARQL only on a cache miss
    cache_str_keys[key] = genre

    # Persist immediately so an interrupted run keeps what was already resolved.
    # Write to the same `cache_file` the cache was loaded from (not a hard-coded
    # name), via a temp file + os.replace so a crash mid-write cannot leave a
    # truncated JSON file behind.
    tmp_path = f"{cache_file}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache_str_keys, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, cache_file)

    return genre



# artist = df.iloc[2]["artist"]
# song = df.iloc[2]["song"]
# genre = get_genre(song,artist)

# print(f"Artista: {artist}")
# print(f"Género: {genre}")

#df["genre"] = df["artist"].apply(get_genre)


# Report how many distinct artists the cache file currently holds.
count_unique_artists(cache_file)

# Resolve one genre per row, pairing each song title with its artist.
df["genre"] = list(map(get_genre_cached, df["song"], df["artist"]))

# Export the annotated sample.
df.to_csv("lyrics_with_genre.csv", index=False)



1000
Total de artistas únicos: 642
