In [5]:
import pandas as pd
import numpy
import re

In [6]:
# Mount Google Drive at /content/drive (Colab only); the dataset path used later
# points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Location of the source CSV inside the mounted Drive. The path is relative, so it
# resolves under the '/content/drive' mount only while the working directory is '/content'.
csv_file = 'drive/MyDrive/Colab Notebooks/w266/Project/data/ds2.csv'

In [8]:
# Read the whole CSV into memory; every column is loaded (no usecols/dtype hints).
df = pd.read_csv(csv_file)

In [9]:
# Show the raw dataset's columns, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5913411 entries, 0 to 5913410
Data columns (total 8 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   title     object
 1   tag       object
 2   artist    object
 3   year      int64 
 4   views     int64 
 5   features  object
 6   lyrics    object
 7   id        int64 
dtypes: int64(3), object(5)
memory usage: 360.9+ MB


Hip hop as a stand-alone category accounted for almost half of the total songs in the dataset.

In [10]:
  # Case-insensitive match on the tag column; rows with a missing tag compare False.
  is_rap = df['tag'].str.lower().eq('rap')
  # .copy() gives df_rap its own data instead of a slice of df.
  df_rap = df.loc[is_rap].copy()

In [11]:
# Hand-picked artists used to narrow the rap subset.
# NOTE(review): the filter below matches names exactly (case- and punctuation-
# sensitive), so each spelling must equal the dataset's `artist` value character
# for character — confirm entries such as 'N.W.A.' and 'ICE-T' against the data.
mainstream_hiphop_artists = [
    '2 Chainz', '50 Cent', 'Busta Rhymes', 'Chief Keef', 'DMX', 'Drake',
    'Eminem', 'Fat Joe', 'French Montana', 'Future', 'G-Eazy', 'Gucci Mane',
    'G-Unit', 'ICE-T', 'JAY-Z', 'Joey Bada$$', 'Kanye West', 'Lil Durk', 'Lil Wayne',
    'Ludacris', 'Mary J. Blige', 'Meek Mill', 'Method Man', 'Missy Elliott',
    'Nas', 'Nelly', 'Nicki Minaj', 'N.W.A.', 'Polo G', 'Pusha T', 'Rick Ross',
    'Snoop Dogg', 'Tech N9ne', 'The Game', 'Tyga', 'Wiz Khalifa', 'YG', 'Young Thug'
]
# Keep only songs whose `artist` is on the list; this rebinds df_rap to the
# narrower subset.
df_rap = df_rap[df_rap['artist'].isin(mainstream_hiphop_artists)]

In [14]:
def split_lyrics_to_line_pairs(df):
    """Turn each song's lyrics into (line, next line) pairs.

    For every row with non-null lyrics, bracketed annotations such as
    ``[Chorus]`` are removed, the text is split on newlines, blank lines are
    dropped, and each remaining line is paired with the line that follows it.
    Songs with fewer than two usable lines contribute no pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``lyrics``, ``title``, ``artist``, ``year``,
        ``views``, ``features`` and ``id``.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``line``, ``actual_line``, ``title``,
        ``artist``, ``year``, ``views``, ``features``, ``id`` (in that order).
        The columns are present even when no pairs are produced.
    """
    meta_cols = ['title', 'artist', 'year', 'views', 'features', 'id']
    out_cols = ['line', 'actual_line'] + meta_cols
    records = []

    # Walk the needed columns in lockstep rather than using iterrows(), which
    # builds a full Series object for every row.
    for lyrics, *meta in zip(df['lyrics'], *(df[col] for col in meta_cols)):
        if pd.isnull(lyrics):
            continue

        # Drop bracketed tags. `.` does not match newlines, so only tags that
        # open and close on the same line are removed.
        text = re.sub(r'\[.*?\]', '', str(lyrics))

        # Split on line breaks and discard lines that are empty after stripping.
        lines = [s for s in (raw.strip() for raw in text.split('\n')) if s]

        song_meta = dict(zip(meta_cols, meta))

        # Pairing a list with itself shifted by one yields nothing when the
        # song has fewer than two lines, so no explicit length check is needed.
        for current_line, next_line in zip(lines, lines[1:]):
            records.append({
                'line': current_line,
                'actual_line': next_line,
                **song_meta,
            })

    # Passing `columns` keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=out_cols)

# Build the line-pair table from the artist-filtered rap subset (df_rap).
new_df = split_lyrics_to_line_pairs(df_rap)


In [15]:
# Check the pair table's row count, non-null counts and dtypes.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939745 entries, 0 to 939744
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   line         939745 non-null  object
 1   actual_line  939745 non-null  object
 2   title        939745 non-null  object
 3   artist       939745 non-null  object
 4   year         939745 non-null  int64 
 5   views        939745 non-null  int64 
 6   features     939745 non-null  object
 7   id           939745 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 57.4+ MB


In [16]:
# Preview the first few (line, actual_line) pairs.
new_df.head()

Unnamed: 0,line,actual_line,title,artist,year,views,features,id
0,"Yeah, hah, yeah, Roc-A-Fella","We invite you to somethin' epic, you know?",Can I Live,JAY-Z,1996,468624,{},3
1,"We invite you to somethin' epic, you know?","Well, we hustle out of a sense of hopelessness",Can I Live,JAY-Z,1996,468624,{},3
2,"Well, we hustle out of a sense of hopelessness",Sort of a desperation,Can I Live,JAY-Z,1996,468624,{},3
3,Sort of a desperation,"Through that desperation, we become addicted",Can I Live,JAY-Z,1996,468624,{},3
4,"Through that desperation, we become addicted",Sort of like the fiends we accustomed to servin',Can I Live,JAY-Z,1996,468624,{},3


In [17]:
# Row-level shuffle of every line pair; the fixed seed makes the order reproducible.
# NOTE(review): pairs from the same song (same `id`) can land in different splits, and
# consecutive pairs share a line, so a val/test target may also appear as a train input.
new_df_shuffled = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split sizes. Only the first 90k shuffled rows are used; the rest are left out.
n_train, n_val, n_test = 60000, 15000, 15000
val_start = n_train
test_start = val_start + n_val
test_stop = test_start + n_test

train_df = new_df_shuffled.iloc[:val_start]            # rows 0 .. 59,999
val_df = new_df_shuffled.iloc[val_start:test_start]    # rows 60,000 .. 74,999
test_df = new_df_shuffled.iloc[test_start:test_stop]   # rows 75,000 .. 89,999

# Report the resulting split sizes
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 60000, Val: 15000, Test: 15000


In [None]:
# Destination workbook for each split, written in train / val / test order.
split_outputs = {
    'drive/MyDrive/Colab Notebooks/w266/Project/data/3_line_train_dataset.xlsx': train_df,
    'drive/MyDrive/Colab Notebooks/w266/Project/data/3_line_val_dataset.xlsx': val_df,
    'drive/MyDrive/Colab Notebooks/w266/Project/data/3_line_test_dataset.xlsx': test_df,
}
for out_path, split_df in split_outputs.items():
    # index=False keeps the positional index out of the spreadsheet.
    split_df.to_excel(out_path, index=False)