In [1]:
!pip install git+https://github.com/openai/whisper.git -q
!pip install yt-dlp
!pip install pytube
!pip install pandas



In [2]:
!pip install ffmpeg-python



In [3]:
from pytube import Playlist
import yt_dlp
import os
import pandas as pd
from tqdm import tqdm

In [4]:
# Playlists to scrape: genre label -> YouTube playlist URL.
playlist_dict = dict(
    rock="https://www.youtube.com/playlist?list=PLw-VjHDlEOgvIaXEz6xErW8uo2gqg5xmx",
    pop="https://www.youtube.com/playlist?list=PLr7xQC-cXWL9EZ3dqpu8E_Xf_4nhS6xEJ",
    rap="https://www.youtube.com/playlist?list=PL-cqjqtOUe_M-qMzg2rQ_s-Z8eSEDsmMh",
    country="https://www.youtube.com/playlist?list=PLNLQuN_16YRiejNNdwmdTJuFygMp25zUM",
)

In [5]:
import whisper


# Load Whisper's "base" model via whisper.load_model.
# NOTE(review): `model` is not referenced by any other cell visible here — confirm it is still needed.
model = whisper.load_model("base")

In [6]:
def get_video_data_from_playlist(playlist_url):
    """Return the title and URL of every usable entry in a playlist.

    The playlist is listed through yt-dlp with ``extract_flat`` enabled and
    ``download=False``, so no media is downloaded.

    Args:
        playlist_url: URL of the playlist to list.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts, one per non-empty
        entry. An empty list is returned when extraction raises or when the
        result carries no entries.
    """
    ydl_opts = {
        'extract_flat': True,
        'quiet': True,
        'cookiefile': "www.youtube.com_cookies.txt",  # cookie file handed to yt-dlp (relative path)
        'forcejson': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info_dict = ydl.extract_info(playlist_url, download=False)
            # Guard against a None result, a missing/None "entries" value and
            # None placeholders inside the list: any of these would otherwise
            # raise below and throw away the whole playlist.
            # NOTE(review): presumably yt-dlp yields None for unavailable videos — confirm.
            entries = (info_dict or {}).get("entries") or []
            return [
                {"title": entry.get("title"), "url": entry.get("url")}
                for entry in entries
                if entry
            ]
        except Exception as e:
            # Best-effort scrape: report the failure and let the caller carry on.
            print(f"⚠️ Error extracting playlist: {e}")
            return []

In [7]:
import os
import pandas as pd

csv_path = "music_dataset_simple.csv"
MAX_SONGS_PER_GENRE = 500  # cap on the number of entries taken from each playlist
YOUTUBE_WATCH_PREFIX = "https://www.youtube.com/watch?v="

if os.path.exists(csv_path):
    # Reuse the cached scrape instead of hitting YouTube again.
    print("📄 CSV already exists. Loading existing dataset...")
    df = pd.read_csv(csv_path)
else:
    df_data = []

    for genre, playlist_url in playlist_dict.items():
        print(f"\n🎵 Scraping {genre.upper()} playlist...")
        video_data = get_video_data_from_playlist(playlist_url)[:MAX_SONGS_PER_GENRE]

        for entry in tqdm(video_data, desc=f"Collecting {genre} songs"):
            raw_url = entry["url"]
            if not raw_url:
                # Without a link the row would end up as ".../watch?v=None"; skip it.
                continue
            # The entry may hold a bare video id or an already complete watch
            # URL; keep only what follows "watch?v=" so the prefix is added once.
            video_id = str(raw_url).split("watch?v=")[-1]
            df_data.append({
                "title": entry["title"],
                "genre": genre,
                "youtube_url": f"{YOUTUBE_WATCH_PREFIX}{video_id}"
            })

    # Explicit columns keep the frame usable downstream even when nothing was scraped.
    df = pd.DataFrame(df_data, columns=["title", "genre", "youtube_url"])
    if df.empty:
        # Do not cache an empty scrape: later runs would load the empty file
        # and never retry.
        print("⚠️ No songs were scraped; nothing was saved to CSV.")
    else:
        df.to_csv(csv_path, index=False)
        print(f"✅ Scraping complete. {len(df)} songs saved to CSV.")

📄 CSV already exists. Loading existing dataset...


In [8]:
# Collapse any repeated link prefix: keep what follows the last "watch?v="
# and put the canonical prefix back exactly once.
df["youtube_url"] = "https://www.youtube.com/watch?v=" + df["youtube_url"].map(
    lambda link: link.split("watch?v=")[-1]
)

# Derive a search-friendly title for the lyric lookup.
df["title_clean"] = (
    df["title"]
    .str.replace(r"\(.*?\)", "", regex=True)         # drop parenthesised text
    .str.replace(r"[^a-zA-Z0-9' ]", "", regex=True)  # keep ASCII letters, digits, apostrophes, spaces
    .str.replace(r"\s+", " ", regex=True)            # collapse runs of whitespace
    .str.strip()
)

df.head(15)

Unnamed: 0,title,genre,youtube_url,title_clean
0,Guns N' Roses - November Rain,rock,https://www.youtube.com/watch?v=8SbUC-UaAxE,Guns N' Roses November Rain
1,Nirvana - Smells Like Teen Spirit (Official Mu...,rock,https://www.youtube.com/watch?v=hTWKbfoikeg,Nirvana Smells Like Teen Spirit
2,4 Non Blondes - What's Up (Official Music Video),rock,https://www.youtube.com/watch?v=6NXnxTNIWkc,4 Non Blondes What's Up
3,Queen – Bohemian Rhapsody (Official Video Rema...,rock,https://www.youtube.com/watch?v=fJ9rUzIMcZQ,Queen Bohemian Rhapsody
4,Guns N' Roses - Sweet Child O' Mine (Official ...,rock,https://www.youtube.com/watch?v=1w7OgIMMRc4,Guns N' Roses Sweet Child O' Mine
5,The Cranberries - Zombie (Official Music Video),rock,https://www.youtube.com/watch?v=6Ejga4kJUts,The Cranberries Zombie
6,AC/DC - Thunderstruck (Official Video),rock,https://www.youtube.com/watch?v=v2AC41dglnM,ACDC Thunderstruck
7,Metallica - Nothing Else Matters (Official Mus...,rock,https://www.youtube.com/watch?v=HyrWd_gfQNQ,Metallica Nothing Else Matters
8,The Police - Every Breath You Take (Official M...,rock,https://www.youtube.com/watch?v=OMOGaugKpzs,The Police Every Breath You Take
9,R.E.M. - Losing My Religion (Official HD Music...,rock,https://www.youtube.com/watch?v=xwtdhWltSIg,REM Losing My Religion


In [9]:
!pip install lyricsgenius



In [10]:

import lyricsgenius

# Read the Genius API token from the environment rather than embedding it in
# the notebook.
# NOTE(review): the token that used to be hardcoded here has been committed in
# plain text and should be treated as leaked — revoke it and issue a new one.
_genius_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not _genius_token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token."
    )
genius = lyricsgenius.Genius(_genius_token, skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"])

def get_lyrics(title,cnt=None):
    """Look up a song on Genius and return its lyrics.

    Args:
        title: Search string passed to ``genius.search_song``.
        cnt: Unused; kept so existing callers that pass it keep working.

    Returns:
        The ``lyrics`` attribute of the matched song, or ``None`` when no
        song is found or the lookup raises.
    """
    try:
        song = genius.search_song(title)
        return song.lyrics if song else None
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, and let
        # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
        print(f"Error fetching lyrics for {title}: {e}")
        return None
    
csv_path="music_lyrics.csv"
if not os.path.exists(csv_path):
    # No cached copy yet: look up lyrics for every cleaned title, then save.
    df["lyrics"] = df["title_clean"].apply(get_lyrics)
    df.to_csv(csv_path,index=False)
else:
    # Cached copy found: load it instead of querying again.
    print("📄 CSV already exists. Loading existing dataset...")
    df = pd.read_csv(csv_path)


📄 CSV already exists. Loading existing dataset...


In [11]:
import re
import pandas as pd
import lyricsgenius

# Read the Genius API token from the environment rather than embedding it in
# the notebook.
# NOTE(review): the token that used to be hardcoded here has been committed in
# plain text and should be treated as leaked — revoke it and issue a new one.
_genius_token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not _genius_token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token."
    )
genius = lyricsgenius.Genius(_genius_token,
                             skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"])

def clean_title_for_genius_search(title):
    """Reduce a song title to a plain search string.

    Parenthesised text is removed, hyphens (with any surrounding whitespace)
    become a single space, and runs of whitespace are collapsed.

    Args:
        title: Title to clean. Non-string values (``None``, NaN from a CSV
            round-trip, numbers) are accepted.

    Returns:
        The cleaned, stripped title; an empty string for non-string input.
    """
    if not isinstance(title, str):
        # re.sub would raise TypeError on NaN/None; treat those as "no title".
        return ""
    title = re.sub(r'\([^)]*\)', '', title)
    title = re.sub(r'\s*-\s*', ' ', title)
    title = re.sub(r'\s+', ' ', title)
    return title.strip()

def get_lyrics(title):
    """Search Genius for ``title`` and return the lyrics of the match.

    Returns ``None`` when nothing is found, or when the lookup raises (the
    error is printed).
    """
    try:
        hit = genius.search_song(title)
        if not hit:
            return None
        return hit.lyrics
    except Exception as e:
        print(f"Error fetching lyrics for {title}: {e}")
        return None

def lyrics_empty(text):
    """Tell whether ``text`` holds no usable lyrics.

    Args:
        text: Value from the lyrics column.

    Returns:
        ``True`` for missing values (``pd.isna``), for strings that are blank
        after stripping, and for values that cannot be stripped at all;
        ``False`` otherwise.
    """
    if pd.isna(text):
        return True
    try:
        return len(text.strip()) == 0
    except Exception:
        # Values without a usable .strip() (e.g. numbers) count as empty.
        # Only ordinary errors are absorbed, so interrupts still propagate.
        return True

def update_lyrics(row):
    """Return the lyrics value to store for one row.

    A Genius lookup is made only for rows whose genre is "rap" or "country"
    (case/whitespace-insensitive), whose current lyrics are empty, and which
    have a non-blank string in ``title_clean``. Every other row keeps the
    value it already has.

    Args:
        row: Mapping-like row (supports ``row["genre"]`` and ``row.get``).

    Returns:
        The fetched lyrics (possibly ``None``) or the row's existing lyrics.
    """
    genre = str(row["genre"]).strip().lower()
    existing_lyrics = row.get("lyrics", None)
    raw_title = row.get("title_clean", "")
    # A blank title read back from CSV arrives as NaN; searching with it would
    # crash the title cleaner or issue a meaningless query.
    has_title = isinstance(raw_title, str) and bool(raw_title.strip())

    if genre in ["rap", "country"] and lyrics_empty(existing_lyrics) and has_title:
        cleaned_title = clean_title_for_genius_search(raw_title)
        print(f"Fetching lyrics for: {cleaned_title}")
        return get_lyrics(cleaned_title)
    else:
        print(f"Skipping: {row.get('title_clean', '')} | Genre: {row.get('genre', '')} | Lyrics exists: {not lyrics_empty(row.get('lyrics', None))}")
        return existing_lyrics

csv_path = "lyrics_please.csv"
if not os.path.exists(csv_path):
    # No cached copy yet: fill in lyrics row by row, then save the result.
    df["lyrics"] = df.apply(update_lyrics, axis=1)
    df.to_csv(csv_path,index=False)
else:
    # Cached copy found: load it instead of querying again.
    print("📄 CSV already exists. Loading existing dataset...")
    df = pd.read_csv(csv_path)


📄 CSV already exists. Loading existing dataset...


In [12]:
# Display the DataFrame left in `df` by the previous cell.
df

Unnamed: 0,title,genre,youtube_url,title_clean,lyrics
0,Guns N' Roses - November Rain,rock,https://www.youtube.com/watch?v=8SbUC-UaAxE,Guns N' Roses November Rain,88 ContributorsTranslationsBahasa IndonesiaNov...
1,Nirvana - Smells Like Teen Spirit (Official Mu...,rock,https://www.youtube.com/watch?v=hTWKbfoikeg,Nirvana Smells Like Teen Spirit,
2,4 Non Blondes - What's Up (Official Music Video),rock,https://www.youtube.com/watch?v=6NXnxTNIWkc,4 Non Blondes What's Up,97 ContributorsTranslationsDeutschWhat’s Up? L...
3,Queen – Bohemian Rhapsody (Official Video Rema...,rock,https://www.youtube.com/watch?v=fJ9rUzIMcZQ,Queen Bohemian Rhapsody,
4,Guns N' Roses - Sweet Child O' Mine (Official ...,rock,https://www.youtube.com/watch?v=1w7OgIMMRc4,Guns N' Roses Sweet Child O' Mine,125 ContributorsTranslationsBahasa IndonesiaSw...
...,...,...,...,...,...
1995,[Deleted video],country,https://www.youtube.com/watch?v=9-B6anffx10,Deleted video,2 ContributorsDELETED VIDEO 12/20/2021 Lyrics[...
1996,Jon Wolfe - Smile on Mine (Official Audio Track),country,https://www.youtube.com/watch?v=jjmpd_kejZw,Jon Wolfe Smile on Mine,1 ContributorSmile on Mine LyricsDo I pick you...
1997,"Post Monroe: ""Red Hot American Summer"" (Music ...",country,https://www.youtube.com/watch?v=LUu93Oz9gPk,Post Monroe Red Hot American Summer,
1998,Jon Wolfe - Don't It Feel Good (Official Audio...,country,https://www.youtube.com/watch?v=1ggVcOATu40,Jon Wolfe Don't It Feel Good,1 ContributorDon’t it Feel Good LyricsThere's ...


In [13]:
import re
import unicodedata
import pandas as pd

# Mis-encoded character sequences and their ASCII replacements, written as
# escapes so the exact code points are unambiguous. Order matters: the bare
# two-character prefix must come after the longer sequences that start with it.
_MOJIBAKE_FIXES = (
    ("\u00e2\u20ac\u2122", "'"),    # "â€™"
    ("\u00e2\u20ac\u2013", "-"),    # "â€“"
    ("\u00e2\u20ac\u0153", '"'),    # "â€œ"
    ("\u00e2\u20ac\u00a6", "..."),  # "â€¦"
    ("\u00e2\u20ac", '"'),          # "â€"
    ("\u00c2", ""),                 # "Â"
)

# Typographic punctuation mapped to ASCII so it survives the non-ASCII strip
# below (NFKC leaves curly quotes and dashes unchanged).
_SMART_PUNCTUATION = str.maketrans({
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
    "\u2013": "-",
    "\u2014": "-",
    "\u2026": "...",
})


def clean_lyrics(text):
    """Strip a raw lyrics string down to plain ASCII lyric text.

    Steps, in order: repair mis-encoded sequences, convert typographic
    punctuation to ASCII, apply NFKC normalisation, drop everything up to and
    including the first verse/chorus/intro/pre-chorus tag, remove the
    remaining bracketed tags, delete any non-ASCII characters that are left,
    and collapse whitespace.

    Args:
        text: Raw lyrics. Non-string values (``None``, NaN, ...) are accepted.

    Returns:
        The cleaned lyrics, or ``None`` when ``text`` is not a string or
        contains no verse/chorus/intro/pre-chorus tag.
    """
    if not isinstance(text, str):
        # Covers None and NaN as well as any other non-text value.
        return None

    # Repair mis-encoded sequences first: NFKC would rewrite the trademark
    # sign inside them to "TM" and the patterns would no longer match.
    for broken, fixed in _MOJIBAKE_FIXES:
        text = text.replace(broken, fixed)

    # Turn curly quotes, dashes and ellipses into ASCII before anything
    # non-ASCII is deleted, so "Don’t" stays "Don't" instead of "Dont".
    text = text.translate(_SMART_PUNCTUATION)

    # Fold remaining compatibility characters (e.g. non-breaking spaces).
    text = unicodedata.normalize("NFKC", text)

    # Look for the first real lyric section and start AFTER its tag.
    match = re.search(r'\[(verse|chorus|intro|pre[- ]?chorus)[^\]]*\]', text, flags=re.IGNORECASE)
    if not match:
        return None  # No recognisable section tag: treat as unusable.
    text = text[match.end():]

    # Remove all other tags like [Bridge], [Solo], etc.
    text = re.sub(r'\[.*?\]', '', text)

    # Remove whatever non-ASCII is still left.
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Collapse whitespace.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


In [14]:
csv_path = "final_csv_file.csv"
if os.path.exists(csv_path):
    # Reuse the cached final dataset.
    print("📄 CSV already exists. Loading existing dataset...")
    df = pd.read_csv(csv_path)
else:
    # Take an explicit copy of the filtered rows so the column assignment
    # below does not write into a slice of the previous frame.
    df = df[df["lyrics"].notna()].copy()
    df["lyrics"] = df["lyrics"].apply(clean_lyrics)
    df = df[df["lyrics"].notna()]  # Only keep songs that had a valid match

    # Keep decently long lyrics
    df = df[df["lyrics"].str.len() > 100]

    # Drop deleted videos
    df = df[~df["title_clean"].str.lower().str.strip().eq("deleted video")]

    # Drop exact duplicate YouTube videos
    df = df.drop_duplicates(subset=["youtube_url"])

    # Drop duplicate songs based on title_clean (keep first occurrence)
    df = df.drop_duplicates(subset=["title_clean"], keep="first")

    # Reset index after cleaning
    df = df.reset_index(drop=True)
    df.to_csv(csv_path, index=False)

    # Report only after de-duplication and saving, so the count matches the file.
    print(f"✅ Final dataset: {len(df)} songs cleaned and saved.")



📄 CSV already exists. Loading existing dataset...


In [15]:
# Display the DataFrame left in `df` by the previous cell.
df

Unnamed: 0,title,genre,youtube_url,title_clean,lyrics
0,Guns N' Roses - November Rain,rock,https://www.youtube.com/watch?v=8SbUC-UaAxE,Guns N' Roses November Rain,When I look into your eyes I can see a love re...
1,4 Non Blondes - What's Up (Official Music Video),rock,https://www.youtube.com/watch?v=6NXnxTNIWkc,4 Non Blondes What's Up,25 years and my life is still Tryin' to get up...
2,Guns N' Roses - Sweet Child O' Mine (Official ...,rock,https://www.youtube.com/watch?v=1w7OgIMMRc4,Guns N' Roses Sweet Child O' Mine,She's got a smile that it seems to me Reminds ...
3,The Cranberries - Zombie (Official Music Video),rock,https://www.youtube.com/watch?v=6Ejga4kJUts,The Cranberries Zombie,Another head hangs lowly Child is slowly taken...
4,AC/DC - Thunderstruck (Official Video),rock,https://www.youtube.com/watch?v=v2AC41dglnM,ACDC Thunderstruck,"Ah-ah, ah-ah-ah Ah-ah, ah-ah-ah Ah-ah, ah-ah-a..."
...,...,...,...,...,...
1223,"Dexys Midnight Runners ""Because of You""",country,https://www.youtube.com/watch?v=0_Pqb7PAFvc,Dexys Midnight Runners Because of You,Because of you These things I do Because of yo...
1224,Kacey Musgraves - My House,country,https://www.youtube.com/watch?v=RQL1Ls3WfME,Kacey Musgraves My House,"Ah-one, two, three, four Who needs a house up ..."
1225,Guilty in here,country,https://www.youtube.com/watch?v=NO32hWm1Eac,Guilty in here,God knows I tried everything I could to stay i...
1226,Believe,country,https://www.youtube.com/watch?v=bnWv1aKHC6g,Believe,"First things first, I'ma say all the words ins..."
