In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The data paths used further down in this notebook live under
# /content/drive/MyDrive, so this cell has to succeed before they are read.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import re
import pandas as pd

def load_songs_to_dataframe(filepath, label=None):
    """
    Parse a numbered song listing from a text file into a DataFrame.

    A song starts on a header line of the form
    ``<number>. "<title>" (Estilo: <style>)``. Every following non-blank line,
    up to the next header, is collected as that song's lyrics. Blank lines and
    any lines that appear before the first header are ignored.

    Args:
        filepath (str): Path to the text file (read as UTF-8) with the songs.
        label (str, optional): Tag stored in the 'label' column of every row
            and embedded in each id. If left as None, the text "None" ends up
            in the id because the value is formatted into the string as-is.

    Returns:
        pandas.DataFrame: One row per song with the columns 'id', 'lyrics'
            and 'label', in that order. 'id' is ``GS_<label>_<title>`` and
            'lyrics' holds the stripped lyric lines joined with newlines. The
            style captured from the header is not stored. A frame with the
            same three columns and no rows is returned when no song is found
            or when the file cannot be read (a message is printed in the
            latter case).
    """
    # Single schema shared by every return path so callers can always rely on
    # the same columns, even for empty or unreadable files.
    columns = ['id', 'lyrics', 'label']

    # Regex to identify the start of a new song entry.
    # Matches: optional spaces, digits, dot, optional spaces, "Title",
    #          optional spaces, (Estilo: Style)
    # Captures: title (group 1), style (group 2, currently unused)
    song_start_pattern = re.compile(r'^\s*\d+\.\s*"(.*?)"\s*\(Estilo:\s*(.*?)\)\s*$')

    # Each entry is (song_id, lyric_lines); lyric_lines keeps growing until the
    # next header is seen, so no separate "flush the last song" step is needed.
    parsed_songs = []
    current_lines = None  # None until the first song header has been found

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line_stripped = line.strip()
                match = song_start_pattern.match(line_stripped)

                if match:
                    # New song: open a fresh lyrics list and register it.
                    current_lines = []
                    parsed_songs.append((f"GS_{label}_{match.group(1)}", current_lines))
                elif current_lines is not None and line_stripped:
                    # Non-blank line belonging to the song currently open.
                    current_lines.append(line_stripped)
                # Anything else (blank lines, text before the first header)
                # is skipped.

    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return pd.DataFrame(columns=columns)  # Return empty DataFrame
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty frame
        # rather than a partially parsed one.
        print(f"An error occurred: {e}")
        return pd.DataFrame(columns=columns)  # Return empty DataFrame

    rows = [
        {'id': song_id, 'lyrics': "\n".join(lyric_lines).strip(), 'label': label}
        for song_id, lyric_lines in parsed_songs
    ]

    # Passing `columns` fixes the column order and keeps the schema intact
    # when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:


# --- Usage ---
# Source files for the two song categories, stored on the mounted Drive.
filepathH = '/content/drive/MyDrive/MiSongGiny/data/Task2/songs_H.txt'
filepathV = '/content/drive/MyDrive/MiSongGiny/data/Task2/songs_V.txt'

# Load both files and stack them in a single concat (H rows first, then V)
# instead of growing an empty seed frame step by step. The reindex pins the
# column order to id / lyrics / label regardless of what the loader returns.
songs_df = pd.concat(
    [
        load_songs_to_dataframe(filepathH, label='H'),
        load_songs_to_dataframe(filepathV, label='V'),
    ],
    ignore_index=True,
).reindex(columns=['id', 'lyrics', 'label'])

songs_df.head()

In [None]:
# Count how many songs carry each label to check the balance between classes.
songs_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
H,195
V,195


In [None]:
# Persist the combined songs table to Drive as CSV, leaving out the row index.
generated_songs_csv = '/content/drive/MyDrive/MiSongGiny/data/Task2/generated_songs.csv'
songs_df.to_csv(generated_songs_csv, index=False)