# Prepare notebook

In [None]:
!pip install ipywidgets

In [1]:
import pandas as pd
import numpy as np
import spacy
import swifter
from IPython.display import Image, display
import ipywidgets as widgets
import re

# Load the Spanish language model (spaCy pipeline "es_core_news_sm").
# NOTE(review): none of the lyric-cleaning code that follows uses `nlp`;
# the same model is loaded again right before the noun-extraction functions.
nlp = spacy.load("es_core_news_sm")

In [None]:

def clean_lyrics(letra):
    """
    Normalize raw song lyrics into one lowercase, single-spaced string.

    The first line of `letra` is always discarded (presumably a title or
    header line -- TODO confirm against the dataset). On every remaining
    line, bracketed spans such as "[Chorus]" are replaced by a space, and
    parentheses / double quotes are removed while their content is kept.
    The lines are then lowercased, joined with spaces, and all whitespace
    runs are collapsed to a single space.

    Parameters
    ----------
    letra : str
        Raw lyrics with newline-separated lines. Any non-string value
        (e.g. the float NaN pandas uses for a missing cell) is treated as
        "no lyrics".

    Returns
    -------
    str
        The cleaned lyrics; '' when nothing is left or the input is not a
        string.
    """
    # Missing cells arrive as float NaN (or None); they have no .split().
    if not isinstance(letra, str):
        return ''

    kept = []
    for line in letra.split('\n')[1:]:  # [1:] drops the first line
        # Patterns are applied per line so a match never spans two lines.
        line = re.sub(r'\[.+?\]', ' ', line)      # drop whatever is between brackets
        line = re.sub(r'\((.+?)\)', r'\1', line)  # keep what is between parentheses
        line = re.sub(r'"(.+?)"', r'\1', line)    # keep what is between quotes
        kept.append(line.lower())

    # One final pass collapses every whitespace run (including the gaps left
    # by blank or emptied lines), so no per-line whitespace handling is needed.
    # Raw-string pattern: the previous non-raw '[\s]+' is an invalid escape
    # sequence in a normal string literal.
    return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

def load_data(
        csv_path: str,
        languages: list = None,
        chunk_size: int = 20000
        ) -> pd.DataFrame:
    """
    Read a CSV in chunks and keep only the rows in the requested languages.

    Parameters
    ----------
    csv_path : str
        Path of the UTF-8 encoded CSV file; it must have a 'language' column.
    languages : list, optional
        Values of the 'language' column to keep. Defaults to ['es'] (Spanish).
    chunk_size : int
        Number of rows read per chunk.

    Returns
    -------
    pd.DataFrame
        The matching rows of all chunks concatenated, with the row index
        assigned while reading left untouched.
    """
    wanted = ['es'] if languages is None else languages
    reader = pd.read_csv(csv_path, chunksize=chunk_size, encoding='utf-8')
    # Filter each chunk as it is read so only matching rows are retained.
    matching_parts = [part[part['language'].isin(wanted)] for part in reader]
    return pd.concat(matching_parts)

# Path of the raw lyrics CSV.
# NOTE(review): `data_soure` looks like a typo for `data_source`; the name is
# left unchanged because other cells may reference it.
data_soure = '/mnt/c/Users/rmessina/Eli/data/song_lyrics_es_1950_2020.csv'

# Keep only the rows whose 'language' is 'es', then derive a normalized copy
# of every song's lyrics in a new 'cleaned_lyrics' column. The apply goes
# through the `swifter` accessor (presumably to speed it up -- TODO confirm).
df = load_data(data_soure, languages=['es'])
df['cleaned_lyrics'] = df['lyrics'].swifter.apply(clean_lyrics)

# Persist the result; this file is read back by the noun-extraction step.
df.to_csv('/mnt/c/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_cleaned.csv', index=False, encoding='utf-8')

# Extract the nouns only

In [3]:
from concurrent.futures import ProcessPoolExecutor

# Load the Spanish language model (spaCy pipeline "es_core_news_sm").
# `nlp` is a module-level global read by nominize_text().
nlp = spacy.load("es_core_news_sm")

# Function to process a single text
def nominize_text(text):
    """
    Return the lemmas of the nouns found in `text`, joined by single spaces.

    The text is parsed with the module-level spaCy pipeline `nlp`. A token
    contributes its lemma only when it is tagged "NOUN" and is neither a
    stopword nor punctuation. The result is '' when no token qualifies.
    """
    noun_lemmas = []
    for tok in nlp(text):
        # Skip anything that is not a content noun.
        if tok.pos_ != "NOUN" or tok.is_stop or tok.is_punct:
            continue
        noun_lemmas.append(tok.lemma_)
    return " ".join(noun_lemmas)

# Function to process a DataFrame chunk
def process_chunk(chunk):
    """
    Add a 'lemmatized_lyrics' column to a DataFrame chunk and return it.

    The new column holds nominize_text() applied to every value of the
    chunk's 'cleaned_lyrics' column. The chunk is modified in place and the
    same object is returned.
    """
    noun_column = chunk['cleaned_lyrics'].apply(nominize_text)
    chunk['lemmatized_lyrics'] = noun_column
    return chunk

# Parallel processing function
def parallel_process(df, func, num_cores=20):
    """
    Apply `func` to row-wise chunks of `df` in separate worker processes.

    `df` is cut into `num_cores` consecutive row slices whose sizes differ
    by at most one row (the leading slices get the extra rows, matching
    what `np.array_split` produces). Each slice is sent to a
    ProcessPoolExecutor worker and the returned frames are concatenated in
    the original row order.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to process.
    func : callable
        Takes one DataFrame chunk and returns a DataFrame. It is sent to
        the worker processes, so it must be picklable (e.g. a module-level
        function).
    num_cores : int
        Number of chunks and of worker processes.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-chunk results.

    Raises
    ------
    ValueError
        If `num_cores` is smaller than 1.
    """
    if num_cores < 1:
        raise ValueError('number sections must be larger than 0.')

    # Slice with iloc instead of np.array_split(df, ...): on a DataFrame the
    # NumPy helper goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate (FutureWarning).
    base, extra = divmod(len(df), num_cores)
    df_split = []
    start = 0
    for i in range(num_cores):
        stop = start + base + (1 if i < extra else 0)
        df_split.append(df.iloc[start:stop])
        start = stop

    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        # Collect inside the `with` so a worker error is raised while the
        # pool is still open rather than later during concatenation.
        results = list(executor.map(func, df_split))
    return pd.concat(results)  # Combine results back into a single DataFrame

# Script entry point: read the cleaned lyrics, extract noun lemmas in
# parallel, and write the result to a new CSV.
# The __main__ guard presumably keeps worker processes from re-running this
# pipeline if they re-import the module -- TODO confirm for the start method
# in use.
if __name__ == "__main__":
    display('Reading notebook')
    df = pd.read_csv('/mnt/c/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_cleaned.csv')
    
    # Remove rows with NaN in the 'cleaned_lyrics' column and report how many
    # were dropped. (Likely lyrics that cleaned to an empty string, which
    # pandas reads back from CSV as NaN -- TODO confirm.)
    initial_row_count = len(df)
    df = df.dropna(subset=['cleaned_lyrics'])
    removed_row_count = initial_row_count - len(df)
    print(f"Removed {removed_row_count} rows with NaN values in 'cleaned_lyrics'.")

    display('Processing...')
    # Run process_chunk over the DataFrame in worker processes.
    # NOTE: nothing here reports progress; parallel_process only returns
    # once every chunk is done.
    df = parallel_process(df, process_chunk)

    display('Saving...')
    # Save the processed DataFrame (now including 'lemmatized_lyrics')
    df.to_csv('/mnt/c/Users/rmessina/Eli/data/song_lyrics_es_1950_2020_lemmaNouns.csv', index=False, encoding='utf-8')

'Reading notebook'

Removed 499 rows with NaN values in 'cleaned_lyrics'.


'Processing...'

'Saving...'