In [27]:
import pandas as pd
import re
import random

# Load the lyrics dataset from disk and report how many rows it starts with.
df = pd.read_csv("spotify_dataset_cleaned.csv")
initial_row_count = df.shape[0]
print("Total rows before cleaning:", initial_row_count)


Total rows before cleaning: 497473


In [28]:
def clean_text(text):
    """Normalise one lyrics value into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase, drop bracketed section tags, shorten runs of
    three or more identical characters to two, replace every character that
    is not a-z or whitespace with a space, then collapse whitespace.

    Args:
        text: Raw lyrics. Non-string values are converted with ``str()``;
            missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as empty.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing or has no
        a-z letters left after cleaning.
    """
    # A missing cell must not become the literal word "nan" through str();
    # returning "" lets callers filter the row out as empty lyrics.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r"\[.*?\]", " ", text)          # remove [verse], [chorus], [intro], etc.
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)    # soooo -> soo
    text = re.sub(r"[^a-z\s]", " ", text)         # remove punctuation/symbols (and any non a-z letter)
    text = re.sub(r"\s+", " ", text).strip()      # remove extra spaces
    return text

# Run the cleaner over every lyric and store the result back in the same column.
cleaned_lyrics = df["lyrics"].apply(clean_text)
df["lyrics"] = cleaned_lyrics


In [29]:
# Keep one row per (lyrics, mood) pair, then discard rows whose lyrics are
# empty once surrounding whitespace is stripped.
df = (
    df.drop_duplicates(subset=["lyrics", "mood"])
      .loc[lambda frame: frame["lyrics"].str.strip().ne("")]
)
print("Rows after cleaning:", len(df))


Rows after cleaning: 495971


In [30]:
# ----- 5. Reduce class imbalance by capping max rows per class -----
# Moods with fewer rows than the cap are kept whole, so the result is capped
# rather than perfectly balanced.
MAX_PER_CLASS = 40000  # You can change this to 30k or 50k based on RAM

# Sample each mood group explicitly instead of through groupby(...).apply(...).
# apply() on a frame that still contains the grouping column is deprecated in
# recent pandas and raises a warning; once grouping columns are excluded, the
# following reset_index(drop=True) would throw the "mood" column away.
# Iterating the groups keeps "mood" in every sampled piece and selects the
# same rows for the same random_state.
capped_groups = [
    mood_rows.sample(n=min(len(mood_rows), MAX_PER_CLASS), random_state=42)
    for _, mood_rows in df.groupby("mood")
]

# pd.concat rejects an empty list, so fall back to an empty frame that keeps
# the original columns when there are no groups at all.
if capped_groups:
    df = pd.concat(capped_groups, ignore_index=True)
else:
    df = df.iloc[0:0].reset_index(drop=True)

print(df['mood'].value_counts())
print("Total rows after balancing:", len(df))


mood
anger       40000
joy         40000
sadness     40000
fear        25881
love        25219
surprise     4952
Name: count, dtype: int64
Total rows after balancing: 176052


  df = df.groupby("mood").apply(


In [31]:
# Write the capped dataset to disk without the positional index column.
output_csv = "spotify_dataset_final_cleaned.csv"
df.to_csv(output_csv, index=False)
print("Saved as spotify_dataset_final_cleaned.csv")


Saved as spotify_dataset_final_cleaned.csv
