# Tokenizzazione

In [29]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from transformers import AutoTokenizer
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import kagglehub

# Fetch the Kaggle lyrics dataset; `path` is whatever kagglehub returns and is
# used below as the directory prefix of train.csv.
DATASET_HANDLE = "mateibejan/multilingual-lyrics-for-genre-classification"
path = kagglehub.dataset_download(DATASET_HANDLE)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [30]:

import os

# Build the CSV location from the download directory. os.path.join works for
# any drive letter and any OS, unlike rewriting the "C:\" prefix by hand.
modified_path = os.path.join(path, "train.csv")
df = pd.read_csv(modified_path)

# Keep English lyrics only; the language column is redundant afterwards.
df = df[df['Language'] == 'en']
columns_to_drop = ['Language']
df = df.drop(columns=columns_to_drop)

# Collapse duplicate (artist, song) rows into a single row per song.
df = df.groupby(['Artist', 'Song']).agg({
    'Lyrics': 'first',  # Take the first instance of the lyrics for each group
    # Aggregate all distinct genres into a list. Sorting makes the order
    # reproducible between runs (plain set iteration order is not); missing
    # genre values are dropped so that sorting cannot fail on mixed types.
    'Genre': lambda x: sorted(set(x.dropna()))
}).reset_index()
print(len(df))
multi_genre_songs = df[df['Genre'].apply(len) > 1]

# Display these songs
print(multi_genre_songs.head())

# Artist and song title were only needed as grouping keys.
columns_to_drop = ['Artist', "Song"]
df = df.drop(columns=columns_to_drop)

210010
       Artist          Song  \
122  2 chainz    100 joints   
169  2 chainz         kesha   
713   50 cent  all his love   
714   50 cent     all of me   
726   50 cent      bad news   

                                                Lyrics           Genre  
122  [Hook]\nNo matter what they say I smoke 100 jo...  [Pop, Hip-Hop]  
169  I am in love, with what we are, not what we sh...  [Pop, Hip-Hop]  
713  [Chorus: singing]\nHe gon' give you all his lo...  [Pop, Hip-Hop]  
714  Mary~\nnow if i give ya all of me what ya gon ...  [Pop, Hip-Hop]  
726  Lloyd Banks in the house, bad news\nTony Yayo ...  [Pop, Hip-Hop]  


In [52]:
# Handling negations

def handle_negations(text):
    """Prefix every token inside a negation scope with ``NOT_``.

    A scope opens at a negation word (the word itself is prefixed too) and
    closes after the first token that contains a punctuation mark or one of
    the punctuation placeholder words, or at the next line break.

    Negation words are matched case-insensitively after stripping trailing
    ``. ? ! ; ,`` characters. Any contraction ending in ``n't`` (can't,
    won't, isn't, ...) also opens a scope.

    Args:
        text: Raw text; tokens are separated by whitespace.

    Returns:
        The tokens joined by single spaces, with ``NOT_`` prepended to every
        token that falls inside a negation scope. Line breaks are not
        preserved in the output.
    """
    # A token containing any of these closes the current negation scope.
    # Line breaks are handled separately by iterating line by line, because
    # whitespace splitting would otherwise discard them.
    scope_enders = (',', '.', '?', '!', ';', 'exclamations', 'questionMarks', 'ellipsis')
    negation_words = {"n't", "not", "no", "never", "dont", "don't", "cannot", "wont"}

    neg_tokens = []
    for line in text.split('\n'):
        neg = False  # a line break always ends the negation scope
        for token in line.split():
            # Normalise only for the lookup; the emitted token keeps its
            # original spelling. Typographic apostrophes are mapped to "'".
            cleaned_token = re.sub(r"[.?!;,]+$", '', token).lower().replace('\u2019', "'")

            if cleaned_token in negation_words or cleaned_token.endswith("n't"):
                neg = True

            neg_tokens.append(f"NOT_{token}" if neg else token)

            # Reset after the token has been emitted, so the token carrying
            # the punctuation is still inside the scope.
            if any(p in token for p in scope_enders):
                neg = False

    return ' '.join(neg_tokens)

# Preview the sample lyric at row label 1 three ways: raw, split into
# sentences, and with negation scopes tagged.
text = df.loc[1, 'Lyrics']
for render in (lambda raw: raw, sent_tokenize, handle_negations):
    print(render(text))


Hazy Days With Lazy Ways You get less done but more Out of your days How can you work yet avoid Getting out of your lazy ways You'll never get up if you don't get up You'll always stay down if you sit around You'll never get up if you don't get up You'll always stay down if you sit around Hazy Days Lazy Ways We got less done but more Out of our days How can we ever recapture the feeling Of lazy ways You'll never get up if you don't get up You'll always stay down if you sit around Where nobody cares and nobody tries 'Cos a daydreams resting on the back of your eyes On the back of your eyes Taking five Bring love to me With your body Let me hold you Endless, endlessly You'll never get up if you don't get up You'll always stay down if you sit around Where nobody cares and nobody tries 'Cos a daydreams resting on the back of your eyes On the back of your eyes Hazy days Lazy ways You get less done but more Out of your days Crazy days Lazy ways
["Hazy Days With Lazy Ways You get less done bu

In [54]:

def split_into_sentences(lyrics):
    """Return `lyrics` with its sentences separated by a newline and a space.

    Sentence boundaries are whatever `sent_tokenize` reports.
    """
    separator = '\n '
    return separator.join(sent_tokenize(lyrics))

# Rewrite every lyric so each detected sentence starts on its own line,
# then show the sample at row label 1.
df['Lyrics'] = df['Lyrics'].map(split_into_sentences)
print(df.loc[1, 'Lyrics'])
# To reduce the vocabulary size, let's remove more unnecessary symbols and sections.
# Define a cleaning function
def clean_lyrics(lyrics):
    """Normalise one lyric string ahead of tokenisation.

    Steps, in order:
      1. Drop bracketed section tags such as ``[Intro]`` or ``[Verse 1]``.
      2. Drop everything from the first ``---`` onwards.
      3. Remove every character that is not a word character, whitespace,
         or one of ``. ! ?`` (this also removes apostrophes and commas).
      4. Lower-case the text.
      5. Replace runs of ``!!``, ``??`` and ``...`` with the placeholder words
         ``exclamations``, ``questionMarks`` and ``ellipsis``.
      6. Collapse repeated spaces/tabs, remove blank lines, and trim both ends.

    Whitespace is tidied last because the placeholder substitutions insert
    padded tokens; tidying earlier would leave doubled and trailing spaces.

    Args:
        lyrics: The raw lyric text.

    Returns:
        The cleaned, lower-cased text (placeholder words keep their casing).
    """
    # Remove section tags like [Intro], [Verse 1], etc.
    cleaned = re.sub(r'\[.*?\]', '', lyrics)

    # Remove credits or text after "---"
    cleaned = re.split(r'---', cleaned)[0]

    # Remove symbols except for line breaks, alphanumeric characters, and key punctuation
    cleaned = re.sub(r"[^\w\s\n.!?]", '', cleaned)

    cleaned = cleaned.lower()

    # Repeated punctuation sign normalization
    cleaned = re.sub(r'(\!{2,})', ' exclamations ', cleaned)
    cleaned = re.sub(r'(\?{2,})', ' questionMarks ', cleaned)
    cleaned = re.sub(r'(\.{3,})', ' ellipsis ', cleaned)

    # Collapse runs of horizontal whitespace (never newlines) to one space.
    cleaned = re.sub(r'[^\S\n]+', ' ', cleaned)

    # Remove blank lines, then trim leading/trailing whitespace.
    cleaned = re.sub(r'\n\s*\n', '\n', cleaned)

    return cleaned.strip()

# Apply the cleaning function
df['Lyrics'] = df['Lyrics'].apply(clean_lyrics)

print(df['Lyrics'][1])

# Step 2: Initialize the tokenizer with an OOV token
# NOTE(review): `Tokenizer` is imported twice at the top of the notebook (from
# `tokenizers` and from Keras); this call relies on the Keras import coming last.
tokenizer = Tokenizer(oov_token="<UNK>")

# Fit the tokenizer on the cleaned lyrics
tokenizer.fit_on_texts(df['Lyrics'])

# Step 3: Convert lyrics to sequences
sequences = tokenizer.texts_to_sequences(df['Lyrics'])

# Use the 95th percentile of sequence lengths as the target length, so a few
# very long lyrics do not dictate the padded width.
sequence_lengths = np.array([len(seq) for seq in sequences])
max_sequence_length = int(np.percentile(sequence_lengths, 95))
print(f"Dynamic max_sequence_length: {max_sequence_length}")

# Step 4: Pad or truncate sequences to match the dynamic length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Print the top 10 most frequent words in the tokenizer's word index.
# The OOV placeholder occupies the first slot of word_index without being a
# counted word, so it is skipped to report ten real words.
top_words = [
    (word, index)
    for word, index in tokenizer.word_index.items()
    if word != tokenizer.oov_token
][:10]
print(f"Top 10 most frequent words in tokenizer vocabulary: {top_words}")

df.head()

Hazy Days With Lazy Ways You get less done but more Out of your days How can you work yet avoid Getting out of your lazy ways You'll never get up if you don't get up You'll always stay down if you sit around You'll never get up if you don't get up You'll always stay down if you sit around Hazy Days Lazy Ways We got less done but more Out of our days How can we ever recapture the feeling Of lazy ways You'll never get up if you don't get up You'll always stay down if you sit around Where nobody cares and nobody tries 'Cos a daydreams resting on the back of your eyes On the back of your eyes Taking five Bring love to me With your body Let me hold you Endless, endlessly You'll never get up if you don't get up You'll always stay down if you sit around Where nobody cares and nobody tries 'Cos a daydreams resting on the back of your eyes On the back of your eyes Hazy days Lazy ways You get less done but more Out of your days Crazy days Lazy ways
hazy days with lazy ways you get less done but 

Unnamed: 0,Lyrics,Genre
0,dont call me horse.\n im a unicorn.\n mystical...,[Electronic]
1,hazy days with lazy ways you get less done but...,[Rock]
2,the summers were longer they went on forever o...,[Rock]
3,you promised me a life you promised me everyth...,[Rock]
4,i looked in the mirror what did you see lookin...,[Rock]


# Embedding

In [None]:
from sentence_transformers import SentenceTransformer

# Load pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Small & fast model

# Encode all lyrics in a single batched call instead of one call per row;
# per-row calls cannot batch inputs and are far slower on a large frame.
embeddings = model.encode(df['Lyrics'].tolist(), show_progress_bar=True)

# Store one vector per row, aligned on the frame's index.
df['embedding'] = pd.Series(list(embeddings), index=df.index)


In [None]:
#Save and Load

# The column is created as 'embedding' (singular); 'embeddings' raised KeyError.
# Stack the per-row vectors into one 2-D numeric array so the .npy file holds
# plain numbers and can be read back with a plain np.load.
np.save("lyrics_embeddings.npy", np.stack(df['embedding'].tolist()))
#loaded_embeddings = np.load("lyrics_embeddings.npy")

# Random Forest

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
#As factor transform

# Create LabelEncoder
label_encoder = LabelEncoder()


def _genre_key(genres):
    """Turn one Genre cell into a single hashable class name.

    Each cell holds a list of genres after the groupby aggregation, and
    LabelEncoder cannot hash lists. A multi-genre song becomes one combined
    class such as "Hip-Hop|Pop"; sorting makes the name order-independent.
    """
    if isinstance(genres, (list, tuple, set)):
        return '|'.join(sorted(map(str, genres)))
    return str(genres)


# Transform genres into numeric labels
df['Genre'] = label_encoder.fit_transform(df['Genre'].apply(_genre_key))

# Train and test

# The classifier needs numeric features: use the sentence embeddings
# (one vector per row, stacked into a 2-D array), not the raw lyric strings.
X = np.stack(df['embedding'].tolist())
X_train, X_test, y_train, y_test = train_test_split(X, df['Genre'], test_size=0.2, random_state=42)

# Report split sizes rather than dumping the arrays themselves.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Model training
forest_params = {"n_estimators": 100, "random_state": 42}  # 100 trees in the forest
rf = RandomForestClassifier(**forest_params)

# Fitting
rf.fit(X_train, y_train)

In [None]:
# Model testing
# Predict labels for the held-out split
y_pred = rf.predict(X_test)

# Metrics: compare the predictions with the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuratezza del modello: {accuracy:.2f}")
