# Prepare

In [7]:
import re
import string
import pandas as pd
import numpy as np
import swifter
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer


## stop-words

In [8]:
# Characters treated as punctuation: ASCII punctuation plus Spanish-specific marks.
spanish_punct = string.punctuation + "¡¿«»"
# Count threshold used below to select which nouns become stop words.
min_cnt = 15
# Counter table; the code below reads its 'noun' and 'count' columns.
df_stop = pd.read_csv('c:/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_lemmaNouns_counters.csv')
# Keep only the rows whose count is at most min_cnt, i.e. the low-count entries.
# NOTE(review): this makes the *rare* nouns the stop words, not the frequent ones — confirm this is intended.
df_stop = df_stop[df_stop['count'] <= min_cnt]

# Plain Python list of the selected nouns (turned into a set later for fast lookup).
stop_words = df_stop['noun'].values.tolist()
display(f"Stop words loaded: {len(stop_words)} words")

'Stop words loaded: 172549 words'

In [9]:
def remove_stuff(text):
    """Strip punctuation from ``text`` while keeping intra-word hyphens.

    Every character of the module-level ``spanish_punct`` other than ``-`` is
    deleted. A hyphen is deleted unless it has a word character on both sides,
    so a hyphen joining two word characters survives while leading, trailing
    or free-standing hyphens are removed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the matched characters removed.
    """
    # The hyphen is handled by the look-around alternatives, so keep it out of the class.
    punct_without_hyphen = spanish_punct.replace("-", "")
    pattern = r"(?<!\w)-|-(?!\w)|[{}]".format(re.escape(punct_without_hyphen))
    return re.sub(pattern, "", text)


def clean_lyrics(letra):
    """Normalise one song's raw lyrics into a single lowercase line of tokens.

    Processing, line by line:
      * the first line is skipped (presumably a title/header line — TODO confirm);
      * empty lines are skipped;
      * anything between square brackets is replaced by a space;
      * parentheses and double quotes are dropped but their content is kept;
      * whitespace is collapsed and the line is lowercased.
    The lines are then joined with single spaces and every token is passed
    through ``remove_stuff`` to strip punctuation.

    Args:
        letra: Raw lyrics text with lines separated by ``\\n``.

    Returns:
        The cleaned lyrics as one space-separated string (possibly empty), or
        ``None`` when ``letra`` is not a string (e.g. a NaN cell), so that a
        following ``dropna`` can discard the row instead of the whole apply
        failing with ``AttributeError``.
    """
    if not isinstance(letra, str):
        return None

    cleaned_lines = []
    for text in letra.split('\n')[1:]:   # remove the first line
        if text == '':
            continue
        text = re.sub(r'\[.+?\]', ' ', text) # remove whatever is between brackets
        text = re.sub(r'\((.+?)\)', r'\1', text) # keep what is between parentheses
        text = re.sub(r'\"(.+?)\"', r'\1', text) # keep what is between quotes
        text = re.sub(r'[\s]+', ' ', text) # spaces
        cleaned_lines.append(text.strip().lower())

    # split() with no argument also collapses any remaining runs of whitespace
    tokens = (remove_stuff(tok) for tok in ' '.join(cleaned_lines).split())
    # A token made only of punctuation is now empty; dropping it avoids
    # re-introducing double spaces in the joined result.
    return ' '.join(tok for tok in tokens if tok)

# Load the raw lyrics dataset; the code below reads its 'lyrics' column.
df = pd.read_csv('c:/Users/rmessina/Eli/data/song_lyrics_es_1950_2020.csv', encoding='utf8')  # load the raw dataset

display(f"Loaded {len(df)} rows from the dataset")
display("Cleaning lyrics...")
# Run clean_lyrics on every row through swifter's apply (the captured output shows its progress bar)
df['cleaned_lyrics'] = df['lyrics'].swifter.apply(clean_lyrics)

# Drop rows whose 'cleaned_lyrics' value is NaN and report how many were removed
initial_count = len(df)
df = df.dropna(subset=['cleaned_lyrics'])
final_count = len(df)
removed_count = initial_count - final_count
display(f"Removed {removed_count} rows with NaN values from 'cleaned_lyrics'.")

# Persist the cleaned frame (without the index) for reuse
df.to_csv('c:/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_cleaned2.csv', encoding='utf8', index=False)

# Cleaned lyrics as a plain list, plus the name of the column they came from.
# Both names are reassigned by the later embedding cell.
documents = df['cleaned_lyrics'].tolist()
process_col = 'cleaned_lyrics'



'Loaded 275059 rows from the dataset'

'Cleaning lyrics...'

Pandas Apply:   0%|          | 0/275059 [00:00<?, ?it/s]

"Removed 0 rows with NaN values from 'cleaned_lyrics'."

# Stop-word removal: for BERT we should NOT need this... but the results are horrible without it!?

In [10]:
# Membership tests against a set are much cheaper than against the original list
stopwords_set = set(stop_words)

def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stopwords_set``.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces, in their original order.
    """
    kept_tokens = filter(lambda token: token not in stopwords_set, text.split())
    return ' '.join(kept_tokens)

# Build a stop-word-free version of every cleaned lyric (swifter apply, shows a progress bar)
df['no_stopwords'] = df['cleaned_lyrics'].swifter.apply(remove_stopwords)

# Drop rows whose 'no_stopwords' value is NaN and report how many were removed
initial_count = len(df)
df = df.dropna(subset=['no_stopwords'])
final_count = len(df)
removed_count = initial_count - final_count
display(f"Removed {removed_count} rows with NaN values from 'no_stopwords'.")

# Persist the frame, now including the 'no_stopwords' column (index not written)
df.to_csv('c:/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_cleaned_stop.csv', encoding='utf8', index=False)



Pandas Apply:   0%|          | 0/275059 [00:00<?, ?it/s]

"Removed 0 rows with NaN values from 'no_stopwords'."

In [12]:
import os
import time

if __name__ == "__main__":
    # Load a Spanish-compatible (multilingual) sentence embedding model on the GPU
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device='cuda')

    # Documents to model: the lyrics column with stop words removed
    process_col = 'no_stopwords'
    documents = df[process_col].tolist()

    # Embeddings are computed once and cached on disk; later runs load the cache.
    # NOTE(review): the cache is keyed only by file name — delete the file if
    # the documents change, otherwise stale embeddings are reused.
    np_emb = 'c:/Users/rmessina/Eli/data/lyrics_es_embeddings_nostop.npy'
    if not os.path.isfile(np_emb):
        display(f"Saving embeddings to {np_emb}")
        embeddings = embedding_model.encode(documents, show_progress_bar=True)
        with open(np_emb, "wb") as f:
            np.save(f, embeddings)
    else:
        display(f"Loading embeddings from {np_emb}")
        with open(np_emb, "rb") as f:
            embeddings = np.load(f)

    # Sweep HDBSCAN's min_cluster_size; each setting fits, saves and reports its own model
    try:
        for size in  [20, 30, 40, 50, 100]:  # [5, 10, 15, 25, 50, 100]:
            display(f"Doing {size}" + "^" * 37)
            hdbscan_model = HDBSCAN(min_cluster_size=size, min_samples=5, prediction_data=True)
            start = time.time()
            topic_model = BERTopic(
                embedding_model=embedding_model,
                hdbscan_model=hdbscan_model,          # Still used for clustering
                language="spanish",
                verbose=True,
                calculate_probabilities=False
            )

            # Pass the precomputed embeddings so the documents are not re-encoded
            topics, _ = topic_model.fit_transform(documents, embeddings)

            # Topic -1 is reported separately in the topic table, so it is not counted here
            display(f"min_cluster_size={size} → {len(set(topics)) - (1 if -1 in topics else 0)} topics")
            topic_model.save(f"c:/Users/rmessina/Eli/models/bertopic_spanish_lyrics_lemma_umap_hdbscan{size}", save_embedding_model=True)

            display(f"Took {(time.time()-start)/60:.3f} minutes to process")

            # The 'topic' column is overwritten on every iteration; each setting gets its own CSV
            df['topic'] = topics
            df.to_csv(f'c:/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_bertopic_lemma_umap_hdbscan{size}.csv', encoding='utf8', index=False)

            # Print the topics
            display(topic_model.get_topic_info())

            display("*" * 42)
    except Exception as exc:
        # Still best-effort (the cell does not crash), but the failure is no
        # longer hidden: report which setting aborted the sweep and why.
        display(f"Topic modelling aborted at min_cluster_size={size}: {exc!r}")


'Saving embeddings to c:/Users/rmessina/Eli/data/lyrics_es_embeddings_nostop.npy'

Batches:   0%|          | 0/8596 [00:00<?, ?it/s]

'Doing 20^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

2025-05-06 19:56:14,458 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-06 19:57:44,998 - BERTopic - Dimensionality - Completed ✓
2025-05-06 19:57:45,007 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-06 19:58:12,741 - BERTopic - Cluster - Completed ✓
2025-05-06 19:58:12,776 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 19:58:33,509 - BERTopic - Representation - Completed ✓


'min_cluster_size=20 → 815 topics'



'Took 2.719 minutes to process'

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,216994,-1_no_que_me_el,"[no, que, me, el, en, la, mi, lo, te, es]",[you yo qué primos yo royal eh soulsonic studi...
1,0,4781,0_you_the_it_my,"[you, the, it, my, im, know, like, up, on, and]",[tú me quieres que tú lo sientes malu ah uh ju...
2,1,2765,1_baby_bebé_mami_yeah,"[baby, bebé, mami, yeah, dime, tú, oh, cama, u...",[qué tú quieres baby qué tú quieres baby qué t...
3,2,2479,2_jesús_gloria_dios_señor,"[jesús, gloria, dios, señor, santo, cristo, al...",[y dolido estás y cansado peso cristo llama al...
4,3,1458,3_pueblo_gobierno_democracia_país,"[pueblo, gobierno, democracia, país, violencia...",[yo no creo en la democracia son que generan l...
...,...,...,...,...,...
811,810,20,810_lará_ge_vuela_aguita,"[lará, ge, vuela, aguita, clavelito, rio, mm, ...",[aguita rio rosario flores – balada mira pasa ...
812,811,20,811_despues_duele_lloraba_rios,"[despues, duele, lloraba, rios, dolor, humedad...",[despues todo el dolor que causaste en mi cora...
813,812,20,812_volveremos_date_juntarnos_remedio,"[volveremos, date, juntarnos, remedio, pacient...",[cuesta solos buscamos mil maneras la estupide...
814,813,20,813_sacatao_autismo_desacatao_hood,"[sacatao, autismo, desacatao, hood, imágenes, ...",[caminan con estilo sobre marvel moncloa y pol...


'******************************************'

'Doing 30^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

2025-05-06 19:59:10,169 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-06 20:00:38,970 - BERTopic - Dimensionality - Completed ✓
2025-05-06 20:00:38,978 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-06 20:01:02,889 - BERTopic - Cluster - Completed ✓
2025-05-06 20:01:02,922 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 20:01:23,587 - BERTopic - Representation - Completed ✓


'min_cluster_size=30 → 456 topics'



'Took 2.598 minutes to process'

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,222208,-1_que_no_me_la,"[que, no, me, la, te, en, el, mi, lo, yo]",[siento frío en el corazón siento que yo me mu...
1,0,5705,0_you_the_it_my,"[you, the, it, my, yeah, up, to, like, know, on]",[me dieron la ayer eso el plan te emocionaba q...
2,1,2499,1_pueblo_gobierno_violencia_país,"[pueblo, gobierno, violencia, país, revolución...",[con todo el dinero invertido en rescates a la...
3,2,2467,2_jesús_dios_gloria_señor,"[jesús, dios, gloria, señor, santo, cristo, al...",[la creación se al voz tú el yo transforma en ...
4,3,1331,3_bailar_baila_bailando_baile,"[bailar, baila, bailando, baile, ritmo, cumbia...",[es la moda y a todos gusta bailar yo se que a...
...,...,...,...,...,...
452,451,30,451_vacío_recarga_desaparecido_razon,"[vacío, recarga, desaparecido, razon, perdido,...",[no queda amnesia en no alguien llego al momen...
453,452,30,452_friki_game_juego_honeys,"[friki, game, juego, honeys, atienda, teen, en...",[game para todo el que se me ha virao game par...
454,453,30,453_volveré_pum_doblas_lograr,"[volveré, pum, doblas, lograr, soñó, vertical,...",[volveré a irme y volveré a mi casa y volveré ...
455,454,30,454_caiga_piquete_ta_condón,"[caiga, piquete, ta, condón, negrito, notón, i...",[ah volví a la mala ah ah ah te digo todo lo q...


'******************************************'

'Doing 40^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

2025-05-06 20:01:58,780 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-06 20:03:41,511 - BERTopic - Dimensionality - Completed ✓
2025-05-06 20:03:41,520 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-06 20:04:34,011 - BERTopic - Cluster - Completed ✓
2025-05-06 20:04:34,043 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 20:04:53,711 - BERTopic - Representation - Completed ✓


'min_cluster_size=40 → 319 topics'



'Took 3.310 minutes to process'

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,224539,-1_que_no_me_la,"[que, no, me, la, el, en, te, mi, lo, yo]",[descarteinédito el álbum intrabellum eh 2-0-2...
1,0,6040,0_you_the_my_it,"[you, the, my, it, yeah, to, like, on, and, im]",[no cuidao va ser dispare no lucha por ti no p...
2,1,2379,1_pueblo_violencia_gobierno_libertad,"[pueblo, violencia, gobierno, libertad, gente,...",[y miro la noticia el pueblo gente la milicia ...
3,2,2302,2_baby_bebé_tú_oh,"[baby, bebé, tú, oh, mami, yeah, dime, te, yo,...",[da la sensación que me a ver en follón que yo...
4,3,1864,3_gloria_jesús_dios_señor,"[gloria, jesús, dios, señor, santo, aleluya, r...",[cómo todas cosas que has hecho por cosas inme...
...,...,...,...,...,...
315,314,40,314_dímelo_tudime_callada_mmm,"[dímelo, tudime, callada, mmm, toques, hables,...",[que llena igual aquel lugar que yo para ti di...
316,315,40,315_california_class_mario_carmen,"[california, class, mario, carmen, despuntan, ...",[california bienvenidos a california x2 bienve...
317,316,40,316_siguiendo_imposible_pausa_serás,"[siguiendo, imposible, pausa, serás, das, noct...",[tú me das el aire que respiro tú serás lo que...
318,317,40,317_arcángeles_cielo_lindo_dirás,"[arcángeles, cielo, lindo, dirás, algiva, mar,...",[creo que el sol me y el pasto es verde que to...


'******************************************'

'Doing 50^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

2025-05-06 20:05:43,953 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-06 20:07:44,838 - BERTopic - Dimensionality - Completed ✓
2025-05-06 20:07:44,852 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-06 20:08:37,425 - BERTopic - Cluster - Completed ✓
2025-05-06 20:08:37,455 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 20:08:58,139 - BERTopic - Representation - Completed ✓


'min_cluster_size=50 → 8 topics'



'Took 3.683 minutes to process'

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,647,-1_el_la_que_en,"[el, la, que, en, macarena, te, con, cuerpo, b...",[no el primero en vida que virginidad intimida...
1,0,272664,0_que_no_la_el,"[que, no, la, el, me, en, te, lo, mi, yo]",[siento que vuelvo te veo caer pero no hay tie...
2,1,878,1_que_la_te_yo,"[que, la, te, yo, me, no, el, tú, lo, pa]",[que estás estrésica tú necesitas this is the ...
3,2,525,2_despacito_pasito_poquito_woah,"[despacito, pasito, poquito, woah, quiero, peg...",[so thankful for that its such a blessin yeah ...
4,3,76,3_dado_tú_ha_importas,"[dado, tú, ha, importas, gracias, me, canto, q...",[me dio que abro perfecto distingo lo negro bl...
5,4,73,4_panda_la_que_el,"[panda, la, que, el, no, te, en, me, yo, se]",[ey panda panda panda todo el mundo con el fuc...
6,5,72,5_bésame_perderte_mucho_besame,"[bésame, perderte, mucho, besame, miedo, últim...",[bésame mucho fuera noche la última bésame bés...
7,6,66,6_hacerte_bellos_amo_amor,"[hacerte, bellos, amo, amor, extensos, propone...",[ser aliado largas horas y el tiempo para pode...
8,7,58,7_hacerte_bellos_amo_extensos,"[hacerte, bellos, amo, extensos, proponerte, a...",[ser aliado largas horas y el tiempo para pode...


'******************************************'

'Doing 100^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'

2025-05-06 20:10:00,635 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-06 20:11:49,076 - BERTopic - Dimensionality - Completed ✓
2025-05-06 20:11:49,087 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-06 20:12:13,066 - BERTopic - Cluster - Completed ✓
2025-05-06 20:12:13,105 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-06 20:12:34,189 - BERTopic - Representation - Completed ✓


'min_cluster_size=100 → 3 topics'



'Took 3.030 minutes to process'

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,828,-1_que_el_la_me,"[que, el, la, me, en, te, amor, cuerpo, amo, no]",[ser aliado largas horas y el tiempo para pode...
1,0,272783,0_que_no_la_el,"[que, no, la, el, me, en, te, lo, mi, yo]",[con el tiempo todo pasa el dolor se tú yo que...
2,1,942,1_que_la_te_no,"[que, la, te, no, me, yo, el, lo, en, tú]",[this is the remix te llevo alto rafa pabön on...
3,2,506,2_konde_273_cris_estribillo,"[konde, 273, cris, estribillo, , , , , , ]","[, , 273]"


'******************************************'

In [25]:
from sklearn.cluster import KMeans


# Number of KMeans clusters, i.e. the number of topics BERTopic will produce
num_topics = 20  # You can tune this based on your data

# KMeans clusterer with a fixed seed
kmeans = KMeans(n_clusters=num_topics, random_state=42)

# BERTopic takes the clusterer through its `hdbscan_model` parameter, even when it is KMeans
topic_model = BERTopic(embedding_model=embedding_model, 
                       hdbscan_model=kmeans,
                       language="spanish")  # Set language if needed


# Fit on the documents and precomputed embeddings prepared in the earlier embedding cell
topics, probabilities = topic_model.fit_transform(documents, embeddings)

# Save the fitted model; the cluster count is part of the file name
topic_model.save(f"c:/Users/rmessina/Eli/models/bertopic_kmeans_model_{num_topics}_topics")

# Store each document's topic in the frame (overwrites any earlier 'topic' column) and export it
df['topic'] = topics
df.to_csv(f'c:/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_bertopic_kmeans_{num_topics}_topics.csv', encoding='utf8', index=False)




In [26]:
import plotly.express as px
from umap import UMAP


# Step 2: Project the document embeddings to 2-D with UMAP (fixed seed)
umap_model = UMAP(n_components=2, random_state=42)
reduced = umap_model.fit_transform(embeddings)

# Step 3: One row per document: 2-D coordinates, assigned topic and decade
viz_df = pd.DataFrame(reduced, columns=['x', 'y'])
viz_df['topic'] = topics
# Assign by position, not by index label: viz_df has a fresh 0..n-1 index,
# while df keeps its original labels after any dropna. Label alignment would
# shift decades or fill them with NaN as soon as a row had been removed.
viz_df['decade'] = df['decade'].values

# Step 4: Interactive scatter, coloured by topic (as a category) with one marker symbol per decade
fig = px.scatter(
    viz_df, x='x', y='y',
    color=viz_df['topic'].astype(str),
    symbol='decade',
    title="Topics visualized via UMAP",
    labels={'color': 'Topic', 'symbol': 'Decade'},
    opacity=0.7
)
fig.show()

# Swap the embedding model: redo this on the cleaned lyrics with the stop words kept (not removed)!

In [None]:
from sentence_transformers import SentenceTransformer

# Swap in a Spanish sentence-similarity model for the embeddings
embedding_model = SentenceTransformer("hiiamsid/sentence_similarity_spanish_es")

# Explicit dimensionality reduction and clustering, both seeded.
# (The earlier throw-away BERTopic instance was removed: it was overwritten
# below before ever being used.)
umap_model = UMAP(n_components=5, random_state=42)
kmeans_model = KMeans(n_clusters=50, random_state=42)

topic_model = BERTopic(
    embedding_model=embedding_model,
    language="spanish",
    umap_model=umap_model,
    hdbscan_model=kmeans_model  # note: param is still called `hdbscan_model`
)

# This run uses the cleaned lyrics with stop words still present; no
# precomputed embeddings are passed, so the documents are encoded with the
# new model during fitting.
docs = df['cleaned_lyrics'].tolist()
topics, probs = topic_model.fit_transform(docs)

topic_model.visualize_topics()



modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [28]:
# Name the file after the cluster count of the model actually being saved.
# `num_topics` belongs to the earlier 20-cluster KMeans experiment and would
# mislabel this model, which was fit with `kmeans_model`.
topic_model.save(f"c:/Users/rmessina/Eli/models/bertopic_kmeans_model_hiiamsid_{kmeans_model.n_clusters}_topics")

