# Explore here

In [1]:
import pandas as pd
import re

import spacy
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("lyrics,label.txt")


In [None]:
def pre_clean(text):
    text = text.lower()
    text = re.sub(r"[',]", "", text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub('bis', ' ', text)
    text = re.sub('[()]', ' ', text)
    text = re.sub('"', ' ', text)
    text = re.sub('\s+', ' ', text).strip()
    return text

In [4]:
df["label"].value_counts()

label
Rock        135
Pop         130
Cumbia      125
Bachata     125
Flamenco    125
Trap        125
Salsa       124
Blues       124
en            1
Name: count, dtype: int64

In [5]:
df = df.dropna(subset=["label"])
df['label'] = df['label'].astype(str).str.strip()
df = df[df['label'].str.lower() != 'nan']

In [6]:
valid_labels = {"Rock", "Pop", "Cumbia", "Bachata",
    "Flamenco", "Trap", "Salsa", "Blues"}

df= df[df["label"].isin(valid_labels)]

In [7]:
df["label"].value_counts()

label
Rock        135
Pop         130
Cumbia      125
Bachata     125
Flamenco    125
Trap        125
Salsa       124
Blues       124
Name: count, dtype: int64

In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-z√°√©√≠√≥√∫√º√±\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [9]:
pre_clean(df.loc[0, 'lyrics'])

'volvi√≥ de estudiar en columbia a la isla sin nada que hacer el a√±o se le hizo largo estudi√≥ y cumplir su deber en llamada a su amiga le dice que este verano es pa beber solo quiere salir y de nadie depender hasta que me conoci√≥ ella no se lo esperaba la vi entrando en la disco me devolvi√≥ la mirada sonrisita nerviosa entre besos se enfadaba se le escap√≥ un te quiero a la que no quer√≠a nada hasta que me conoci√≥ ella no se lo esperaba la vi entrando en la disco me devolvi√≥ la mirada sonrisita nerviosa entre besos se enfadaba se le escap√≥ un te quiero a la que no quer√≠a nada beb√© los dos sabemos que es verano y que tal vez cuando termine agosto no nos volvemos a ver pero quiero tener algo pa cuando no est√©s as√≠ que acumulemos recuerdos cada vez que el labio te muerdo me empujas y me pegas a la pared mir√°ndonos fijamente como en una √∫ltima vez s√© que estudia en la usa y me usa a su merced beb√© s√© que le tienes mucho miedo al compromiso y yo tambi√©n quieres olvidarme mm

In [10]:
import re
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
!python -m spacy download es_core_news_sm > null


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
expresion_stopwords = {"oh","eh","uh","ay","yeh","yeah","oh oh", "yo","ey",
                       "ah","ah ah", "oh oh"}

nlp = spacy.load("es_core_news_sm")

def clean_text(text):
    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and token.lemma_ not in expresion_stopwords
        and len(token.lemma_)>2]
    return " ".join(tokens)

In [13]:
clean_text(pre_clean(df.loc[0, 'lyrics']))

'volver estudiar columbia isla a√±o estudiar cumplir deber llamado amiga verano beber salir depender conocer esperar entrar disco devolver mirada sonrisita nervioso beso enfadar escapar querer querer conocer esperar entrar disco devolver mirada sonrisita nervioso beso enfadar escapar querer querer beb√© verano terminar agosto volver querer est√©s acumuler recuerdo labio muerdo empujas pegar pared mir√°ndono fijamente estudiar merced beb√© tener miedo compromiso querer olvidar yo mmm √≥diame ayudar entrar duda re√≠r chinguir cuarto luz echar copa dalir cu√≠dese salud sentimental actitud conocer esperar entrar disco devolver mirada sonrisita nervioso beso enfadar escapar querer querer reviveir hacer verano playa mano beso amar haci√©ndolo levantarno cantaba tema tocar piano isla peque√±o mirar escuchar reggaet√≥n 180 tramo esperar vano sano conocer esperar entrar disco devolver mirada sonrisita nervioso beso enfadar escapar querer querer volver estudiar columbia isla a√±o estudiar cumpli

In [14]:
df['lyrics_clean'] = df['lyrics'].apply(lambda x: clean_text(pre_clean(x)))
df[['lyrics', 'lyrics_clean']].head()

Unnamed: 0,lyrics,lyrics_clean
0,"Volvi√≥ de estudiar en Columbia\r\nA la isla, s...",volver estudiar columbia isla a√±o estudiar cum...
1,"Lo que quiero, lo tengo\r\nSin perd√≥n y sin pe...",querer perd√≥n permiso beb√© tar cuidao estar li...
2,Quiero bailar perreando toda la noche\r\nCon l...,querer bailar perrear noche babi querer brinda...
3,T√∫ y yo frente al mar\r\n¬øTe acuerdas de m√≠? ¬ø...,frente mar acuerdas estar querer verte convenc...
4,T√∫ est√°s en otro lugar\r\nYo estoy por Medallo...,estar lugar medallo empezar recordar gana ciud...


In [15]:
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(str)

In [16]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["label"])

class_names =label_encoder.classes_
print(class_names)

['Bachata' 'Blues' 'Cumbia' 'Flamenco' 'Pop' 'Rock' 'Salsa' 'Trap']


In [17]:
vectorizerCount = CountVectorizer()

X = vectorizerCount.fit_transform(df["lyrics_clean"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [19]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test,y_pred,target_names=class_names))

Accuracy: 0.42105263157894735
              precision    recall  f1-score   support

     Bachata       0.43      0.36      0.39        42
       Blues       0.40      0.08      0.13        26
      Cumbia       0.41      0.29      0.34        45
    Flamenco       0.61      0.69      0.65        45
         Pop       0.20      0.02      0.04        41
        Rock       0.39      0.74      0.51        34
       Salsa       0.31      0.51      0.39        39
        Trap       0.44      0.66      0.53        32

    accuracy                           0.42       304
   macro avg       0.40      0.42      0.37       304
weighted avg       0.40      0.42      0.38       304



In [20]:
new_song = """
Cuando llegan las horas de la tarde
Que me encuentro tan solo y muy lejos de ti
"""

new_song_clean = clean_text(pre_clean(new_song))
new_song_vec = vectorizerCount.transform([new_song_clean])

pred = model.predict(new_song_vec)
pred_genre = label_encoder.inverse_transform(pred)

print("G√©nero predicho:", pred_genre[0])

G√©nero predicho: Salsa


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

vectorizerTF = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X = vectorizerTF.fit_transform(df["lyrics_clean"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(
    y_test,
    y_pred,
    target_names=class_names
))

with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizerTF, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)


Accuracy: 0.3684210526315789
              precision    recall  f1-score   support

     Bachata       0.38      0.31      0.34        42
       Blues       0.40      0.15      0.22        26
      Cumbia       0.38      0.07      0.11        45
    Flamenco       0.55      0.62      0.58        45
         Pop       0.25      0.02      0.04        41
        Rock       0.40      0.76      0.53        34
       Salsa       0.31      0.26      0.28        39
        Trap       0.27      0.84      0.41        32

    accuracy                           0.37       304
   macro avg       0.37      0.38      0.32       304
weighted avg       0.37      0.37      0.31       304



In [22]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_pred, y_test))
print(classification_report(
    y_test,
    y_pred,
    target_names=class_names))

Accuracy: 0.3684210526315789
              precision    recall  f1-score   support

     Bachata       0.38      0.31      0.34        42
       Blues       0.40      0.15      0.22        26
      Cumbia       0.38      0.07      0.11        45
    Flamenco       0.55      0.62      0.58        45
         Pop       0.25      0.02      0.04        41
        Rock       0.40      0.76      0.53        34
       Salsa       0.31      0.26      0.28        39
        Trap       0.27      0.84      0.41        32

    accuracy                           0.37       304
   macro avg       0.37      0.38      0.32       304
weighted avg       0.37      0.37      0.31       304



In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
model = MultinomialNB()
model.fit(X_train, y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [25]:
print("Unique labels in y:", set(y))
print("Unique labels in y_pred:", set(y_pred))

Unique labels in y: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)}
Unique labels in y_pred: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)}


In [26]:
labels_present = list(set(y_pred))
print("Class names:", class_names)
print("Labels present in predictions:", labels_present)

Class names: ['Bachata' 'Blues' 'Cumbia' 'Flamenco' 'Pop' 'Rock' 'Salsa' 'Trap']
Labels present in predictions: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7)]


In [27]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(
    y_test,
    y_pred,
    target_names=class_names))

Accuracy: 0.3684210526315789
              precision    recall  f1-score   support

     Bachata       0.38      0.31      0.34        42
       Blues       0.40      0.15      0.22        26
      Cumbia       0.38      0.07      0.11        45
    Flamenco       0.55      0.62      0.58        45
         Pop       0.25      0.02      0.04        41
        Rock       0.40      0.76      0.53        34
       Salsa       0.31      0.26      0.28        39
        Trap       0.27      0.84      0.41        32

    accuracy                           0.37       304
   macro avg       0.37      0.38      0.32       304
weighted avg       0.37      0.37      0.31       304



In [28]:
feature_names = vectorizerTF.get_feature_names_out()

for i, genre in enumerate(label_encoder.classes_):
    top_features = model.feature_log_prob_[i].argsort()[-10:]
    print(f"\nüîπ G√©nero: {genre}")
    for idx in top_features:
        print(feature_names[idx])


üîπ G√©nero: Bachata
ver
you
yo
vida
coraz√≥n
beso
noche
amar
querer
amor

üîπ G√©nero: Blues
but
vida
querer
that
amor
love
your
and
the
you

üîπ G√©nero: Cumbia
vida
pensar
ah
√©l
ver
pasar
baby
mami
amor
querer

üîπ G√©nero: Flamenco
noche
sentir
yo
esperar
agua
morir
vida
coraz√≥n
amor
querer

üîπ G√©nero: Pop
that
all
√©l
uh
but
oh
querer
and
the
you

üîπ G√©nero: Rock
were
like
dont
down
your
know
that
and
you
the

üîπ G√©nero: Salsa
ver
alma
yo
mujer
decir
vivir
√©l
vida
querer
amor

üîπ G√©nero: Trap
sentir
erir
estar
oh
baby
mami
ah ah
ver
querer
ah


In [29]:
#funcion de prueba
def predict_genre(text):
    text_clean = clean_text(pre_clean(text))
    vec = vectorizerTF.transform([text_clean])
    pred = model.predict(vec)
    return label_encoder.inverse_transform(pred)[0]

In [30]:
predict_genre("""
coraz√≥n no me abandones
que sin tu amor no s√© vivir""")

'Bachata'

In [31]:
#comparar con regresion logistica para ver si con ese modelo las metricas de entrenamiento mejoran

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

model_log = LogisticRegression()
model_log.fit(X_train,y_train)
preds = model_log.predict(X_test)

In [32]:
print(classification_report(preds, y_test))

              precision    recall  f1-score   support

           0       0.26      0.30      0.28        37
           1       0.31      0.35      0.33        23
           2       0.16      0.41      0.23        17
           3       0.60      0.57      0.59        47
           4       0.15      0.46      0.22        13
           5       0.62      0.42      0.50        50
           6       0.38      0.32      0.35        47
           7       0.78      0.36      0.49        70

    accuracy                           0.39       304
   macro avg       0.41      0.40      0.37       304
weighted avg       0.50      0.39      0.42       304



In [33]:
predict_genre("bailar fiesta sabroso ritmo palma")

'Rock'

In [34]:
predict_genre("noche guitarra calle libertad gritar romper")

'Flamenco'

In [35]:
predict_genre("amor amor amor coraz√≥n coraz√≥n sufrir sufrir")

'Salsa'

In [36]:
predict_genre("""
AYYYYY!!! coraz√≥nnnnn,,,√±!
no me abandones!!!!!
xq sin tu amor... no s√© vivir :(
""")

'Salsa'

In [37]:
def predict_song_genre(text):
    """
    Predicts the musical genre of a song based on its lyrics.
    """
    text_clean = clean_text(pre_clean(text))
    vec = vectorizerTF.transform([text_clean])
    pred = model.predict(vec)
    return label_encoder.inverse_transform(pred)[0]

In [38]:
predict_song_genre("""
Cuando llegan las horas de la tarde
y me encuentro tan solo sin tu amor
""")

'Salsa'

In [39]:
def top_words_for_genre(genre, n=10):
    idx = list(label_encoder.classes_).index(genre)
    top_features = model.feature_log_prob_[idx].argsort()[-n:]
    return [feature_names[i] for i in top_features]

In [40]:
top_words_for_genre("Bachata")

['ver',
 'you',
 'yo',
 'vida',
 'coraz√≥n',
 'beso',
 'noche',
 'amar',
 'querer',
 'amor']