In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import time
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import lyricsgenius

In [2]:
# --- Determine Project Root and .env Path ---
# This logic assumes your project root is where the .git folder would be,
# and your .env file is directly in that root.
# For your structure: c:\Users\balde\SongRecommendation\

# Get the directory of the current notebook.
# If __file__ is defined (e.g. running as script), use its directory.
# Otherwise (interactive notebook), use current working directory.
notebook_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()

# Define project_root based on common structures.
# If notebook is in 'c:\Users\balde\SongRecommendation\SongRecommendation.ipynb'
# or 'c:\Users\balde\SongRecommendation\some_subfolder\SongRecommendation.ipynb'
# we want project_root to be 'c:\Users\balde\SongRecommendation\'
# A simple way is to assume the .env file is in the parent of the notebook's directory,
# or the current directory if the notebook is already in the root.

# Try to locate the .env file by checking common project root locations relative to the notebook
# This assumes your .env file is at c:\Users\balde\SongRecommendation\.env
project_root_candidate = os.path.abspath("c:/Users/balde/SongRecommendation") # Explicitly set your known project root

dotenv_path = os.path.join(project_root_candidate, '.env')

if not os.path.exists(dotenv_path):
    # Fallback if the explicit path is wrong or notebook is run from an unexpected CWD
    # Try current working directory
    dotenv_path_cwd = os.path.join(os.getcwd(), '.env')
    if os.path.exists(dotenv_path_cwd):
        dotenv_path = dotenv_path_cwd
        project_root_candidate = os.getcwd() # Update project_root if .env found in CWD
    else:
        # Try one level up from CWD (if notebook is in a subfolder of the project root)
        dotenv_path_parent = os.path.join(os.path.abspath(os.path.join(os.getcwd(), '..')), '.env')
        if os.path.exists(dotenv_path_parent):
            dotenv_path = dotenv_path_parent
            project_root_candidate = os.path.abspath(os.path.join(os.getcwd(), '..')) # Update project_root
        else:
            print(f"Warning: .env file not found at explicit path '{dotenv_path}', nor in CWD or parent of CWD.")
            # As a last resort, if the notebook is in a subfolder of the known root:
            if notebook_dir.startswith(project_root_candidate) and notebook_dir != project_root_candidate:
                 dotenv_path = os.path.join(project_root_candidate, '.env') # Re-assert known root
            # else: project_root_candidate remains the explicit path for saving files later

# Use the determined project_root_candidate for saving files later
project_root = project_root_candidate
print(f"Project root identified as: {project_root}")
print(f"Attempting to load .env file from: {dotenv_path}")

if os.path.exists(dotenv_path) and load_dotenv(dotenv_path=dotenv_path):
    print(f"Successfully loaded .env file from: {dotenv_path}")
else:
    print(f"Failed to load .env file from {dotenv_path}. Trying default load_dotenv().")
    if load_dotenv(): # Tries to find .env in CWD or parent if path not specified
        print("Successfully loaded .env file using default search.")
        # If default search worked, .env might be in CWD. Update project_root if needed for consistency.
        if os.path.exists(os.path.join(os.getcwd(), '.env')):
            project_root = os.getcwd()
            print(f"Updated project root to current working directory: {project_root}")
    else:
        print("Failed to load .env file using default search as well.")
        print(f"Please ensure '.env' exists at '{os.path.join(project_root, '.env')}' or in your current working directory, and contains your credentials.")


CLIENT_ID = os.getenv('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.getenv('SPOTIPY_CLIENT_SECRET')

sp = None # Initialize sp
if CLIENT_ID and CLIENT_SECRET:
    try:
        client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
        sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
        print("Successfully authenticated with Spotify API!")
    except Exception as e:
        print(f"Error during authentication: {e}")
else:
    print("Error: SPOTIPY_CLIENT_ID or SPOTIPY_CLIENT_SECRET not found in environment variables.")
    print("Please check your .env file loading and its content.")

Project root identified as: c:\Users\balde\SongRecommendation
Attempting to load .env file from: c:\Users\balde\SongRecommendation\.env
Successfully loaded .env file from: c:\Users\balde\SongRecommendation\.env
Successfully authenticated with Spotify API!


In [3]:
if sp:
    artist_names = [
        # --- English-speaking artists ---
        # Pop / Contemporary R&B
        "The WeMusic. BCCI crashes. I gotta get close to you, baby. And you? We met each other, and we'll never be serving care about the fan. I miss you so much. Put you on the Vicky Bry. I can get you what you like. It will be so much data. Damn. eknd", "Taylor Swift", "Drake", "Billie Eilish", "Ed Sheeran",
        "Dua Lipa", "Post Malone", "Ariana Grande", "Justin Bieber", "Harry Styles", 
        "Olivia Rodrigo", "Adele", "Bruno Mars", "Lady Gaga", "Katy Perry",
        "Rihanna", "Beyoncé", "SZA", "Doja Cat", "Lizzo", "Miley Cyrus",
        "Sam Smith", "Shawn Mendes", "Camila Cabello", "Halsey", "Lana Del Rey",
        "The 1975", "Lorde", "Frank Ocean", "Khalid", "Giveon", "Summer Walker",
        "Kehlani", "Normani", "Charlie Puth", "Selena Gomez", "Demi Lovato",
        "Nick Jonas", "Zayn", "Ellie Goulding", "Jessie J", "Tate McRae",
        "Conan Gray", "Sabrina Carpenter", "Ava Max", "Bebe Rexha",

        # Hip Hop / Rap
        "Kendrick Lamar", "Eminem", "Kanye West", "J. Cole", "Travis Scott",
        "Future", "Lil Wayne", "Nicki Minaj", "Cardi B", "Megan Thee Stallion",
        "Lil Baby", "DaBaby", "Roddy Ricch", "Polo G", "Lil Uzi Vert",
        "21 Savage", "A$AP Rocky", "Tyler, The Creator", "Mac Miller", "Chance The Rapper",
        "Logic", "Big Sean", "Pusha T", "Nas", "Jay-Z", "Snoop Dogg", "Dr. Dre",
        "50 Cent", "The Notorious B.I.G.", "Tupac Shakur", "Ice Cube",

        # Rock / Alternative / Indie
        "Coldplay", "Imagine Dragons", "Maroon 5", "Foo Fighters", "Red Hot Chili Peppers",
        "Linkin Park", "Metallica", "Queen", "The Beatles", "Led Zeppelin",
        "Pink Floyd", "AC/DC", "Guns N' Roses", "U2", "Radiohead", "Nirvana",
        "Oasis", "Green Day", "Florence + The Machine", "Arctic Monkeys", "Tame Impala",
        "Muse", "Kings of Leon", "The Killers", "Paramore", "My Chemical Romance",
        "Fall Out Boy", "Panic! At The Disco", "Twenty One Pilots", "Bon Jovi",
        "Aerosmith", "Journey", "Fleetwood Mac", "The Rolling Stones", "The Who",
        "David Bowie", "Elton John", "Billy Joel", "Bruce Springsteen", "Tom Petty",
        "Pearl Jam", "Soundgarden", "Alice In Chains", "System Of A Down", "Korn",
        "Slipknot", "Avenged Sevenfold", "Bring Me The Horizon", "Gorillaz",
        "Vampire Weekend", "The Strokes", "Yeah Yeah Yeahs", "MGMT", "Phoenix",
        "Glass Animals", "Hozier", "Bon Iver", "The Lumineers", "Mumford & Sons",

        # Electronic / Dance
        "Calvin Harris", "David Guetta", "Marshmello", "The Chainsmokers", "Kygo",
        "Avicii", "Zedd", "Martin Garrix", "DJ Snake", "Diplo", "Skrillex",
        "Daft Punk", "Swedish House Mafia", "Deadmau5", "Disclosure", "Flume",

        # Soul / Classic R&B / Funk
        "Michael Jackson", "Madonna", "Stevie Wonder", "Prince", "Aretha Franklin",
        "Marvin Gaye", "James Brown", "Whitney Houston", "Janet Jackson", "Mariah Carey",
        "Earth, Wind & Fire", "Sade", "Luther Vandross", "Barry White",

        # --- Spanish-speaking artists ---
        # Reggaeton / Latin Trap / Urbano
        "Bad Bunny", "J Balvin", "Karol G", "Daddy Yankee", "Maluma", "Ozuna",
        "Natti Natasha", "Becky G", "Rauw Alejandro", "Anuel AA", "Myke Towers",
        "Feid", "Eladio Carrión", "Sech", "Justin Quiles", "Manuel Turizo",
        "Wisin & Yandel", "Don Omar", "Tego Calderón", "Arcángel", "De La Ghetto",
        "Bryant Myers", "Darell", "Nicky Jam", "Farruko",

        # Latin Pop / Rock / Alternative
        "Rosalía", "Shakira", "Enrique Iglesias", "Ricky Martin", "Juanes",
        "Mon Laferte", "Camilo", "Luis Miguel", "Maná", "Soda Stereo",
        "Julieta Venegas", "Alejandro Sanz", "Chayanne", "Thalía", "Paulina Rubio",
        "Carlos Vives", "Sebastian Yatra", "Danna Paola", "Morat", "Reik",
        "Jesse & Joy", "Sin Bandera", "La Oreja de Van Gogh", "Café Tacvba", "Molotov",
        "Zoé", "Fito Páez", "Andrés Calamaro", "Gustavo Cerati", "Aterciopelados",

        # Regional Mexican
        "Peso Pluma", "Fuerza Regida", "Tito Double P", "Natanael Cano", "Eslabon Armado",
        "Junior H", "Grupo Frontera", "Carin Leon", "Christian Nodal", "Angela Aguilar",
        "Banda MS de Sergio Lizárraga", "Calibre 50", "Los Tigres del Norte", "Vicente Fernández",
        "Alejandro Fernández", "Pepe Aguilar", "Gerardo Ortiz", "Alfredo Olivas",

        # Salsa / Bachata / Merengue / Tropical
        "Marc Anthony", "Romeo Santos", "Prince Royce", "Juan Luis Guerra", "Celia Cruz",
        "Hector Lavoe", "Willie Colón", "Rubén Blades", "Gilberto Santa Rosa", "Olga Tañón",

        # --- K-Pop artists ---
        "BTS", "BLACKPINK", "TWICE", "Stray Kids", "NewJeans", "SEVENTEEN",
        "EXO", "Red Velvet", "NCT 127", "NCT DREAM", "ATEEZ", "TOMORROW X TOGETHER",
        "ENHYPEN", "LE SSERAFIM", "IVE", "(G)I-DLE", "aespa", "ITZY", "BIGBANG",
        "Girls' Generation", "Super Junior", "SHINee", "IU", "Taeyeon", "Sunmi",
        "Chung Ha", "Zico", "Jay Park", "PSY", "Monsta X", "GOT7", "Mamamoo",

        # --- Other international artists ---
        # French
        "Stromae", "Angèle", "Aya Nakamura", "Indila", "Christine and the Queens",
        "Daft Punk", "Phoenix", "Gims", "Dadju", "Louane", "Mylène Farmer", "Serge Gainsbourg",
        # Japanese (J-Pop / J-Rock)
        "YOASOBI", "Official HIGE DANdism", "Kenshi Yonezu", "ONE OK ROCK", "RADWIMPS",
        "Utada Hikaru", "LiSA", "Ado", "King Gnu", "Perfume", "BABYMETAL",
        # Afrobeats / African Pop
        "Burna Boy", "Wizkid", "Davido", "Tiwa Savage", "Yemi Alade", "Mr Eazi",
        "Rema", "Fireboy DML", "Omah Lay", "Tems", "CKay",
        # Bollywood / Indian Pop
        "AR Rahman", "Atif Aslam", "Arijit Singh", "Shreya Ghoshal", "Neha Kakkar",
        "Badshah", "Diljit Dosanjh", "Lata Mangeshkar", "Kishore Kumar",
        # German
        "Rammstein", "Kraftwerk", "Scorpions", "Tokio Hotel", "Cro", "Apache 207",
        # Italian
        "Luciano Pavarotti", "Andrea Bocelli", "Eros Ramazzotti", "Laura Pausini", "Måneskin",
        # Brazilian
        "Anitta", "Seu Jorge", "Marisa Monte", "Caetano Veloso", "Gilberto Gil", "Roberto Carlos",
        # Arabic Pop
        "Amr Diab", "Nancy Ajram", "Elissa", "Fairuz", "Kadim Al Sahir", "Saad Lamjarred",
        # Scandinavian Pop
        "ABBA", "Kygo", "Avicii", "Zara Larsson", "Sigrid", "MØ", "Roxette",
        # Australian (some overlap with English-speaking, but distinct scene)
        "Sia", "Tones and I", "Gotye", "INXS", "Midnight Oil", "Kylie Minogue", "5 Seconds of Summer",
        # Canadian (some overlap with English-speaking, but distinct scene)
        "Drake", "The Weeknd", "Justin Bieber", "Shawn Mendes", "Céline Dion", "Shania Twain", "Alanis Morissette", "Arcade Fire", "Nickelback", "Bryan Adams", "Michael Bublé",
        # Irish
        "U2", "Enya", "The Cranberries", "Hozier", "Sinéad O'Connor", "Van Morrison", "Thin Lizzy"
    ]

    all_tracks_data = []
    tracks_to_fetch_features_for = []

    print(f"Starting data collection for {len(artist_names)} artists...")

    for artist_name in artist_names:
        print(f"\nFetching tracks for {artist_name}...")
        try:
            results = sp.search(q='artist:' + artist_name, type='artist', limit=1)
            if not results['artists']['items']:
                print(f"Could not find artist: {artist_name}")
                continue

            artist_uri = results['artists']['items'][0]['uri']
            top_tracks_results = sp.artist_top_tracks(artist_uri) # Fetches top 10 tracks by default

            for track in top_tracks_results['tracks']:
                if track and track['id']:
                    track_info = {
                        'track_id': track['id'],
                        'track_name': track['name'],
                        'artist_name': track['artists'][0]['name'] if track['artists'] else artist_name,
                        'album_name': track['album']['name'] if track['album'] else 'N/A',
                        'popularity': track['popularity'],
                        'track_uri': track['uri']
                    }
                    all_tracks_data.append(track_info)
                    tracks_to_fetch_features_for.append(track['uri'])
                    print(f"  Added: {track['name']} (ID: {track['id']})")
            
            time.sleep(0.6) # Increased sleep time slightly

        except Exception as e:
            print(f"Error fetching tracks for {artist_name}: {e}")
            if hasattr(e, 'http_status') and e.http_status == 429:
                print("Rate limit likely hit. Waiting for 75 seconds...") # Increased wait
                time.sleep(75)
            elif "Failed to resolve" in str(e) or "Max retries exceeded" in str(e) or "Temporary failure in name resolution" in str(e):
                print("Network issue or API unresponsive. Waiting for 45 seconds...") # Increased wait
                time.sleep(45)
            continue
    
    tracks_to_fetch_features_for = list(set(tracks_to_fetch_features_for)) # Unique URIs
    
    tracks_df = pd.DataFrame(all_tracks_data)
    if not tracks_df.empty:
        tracks_df.drop_duplicates(subset=['track_id'], keep='first', inplace=True)
    
    print(f"\nCollected basic info for {len(tracks_df)} unique tracks.")
    if not tracks_df.empty:
        print("\nBasic Track Info DataFrame (first 5 rows):")
        print(tracks_df.head())
    else:
        print("No track information was collected.")
else:
    print("Spotify API (sp) not authenticated. Cannot proceed with data collection.")

Starting data collection for 379 artists...

Fetching tracks for The WeMusic. BCCI crashes. I gotta get close to you, baby. And you? We met each other, and we'll never be serving care about the fan. I miss you so much. Put you on the Vicky Bry. I can get you what you like. It will be so much data. Damn. eknd...


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "artist:The WeMusic. BCCI crashes. I gotta get close to you, baby. And you? We met each other, and we'll never be serving care about the fan. I miss you so much. Put you on the Vicky Bry. I can get you what you like. It will be so much data. Damn. eknd", 'limit': 1, 'offset': 0, 'type': 'artist', 'market': None} returned 400 due to Query exceeds maximum length of 250 characters


Error fetching tracks for The WeMusic. BCCI crashes. I gotta get close to you, baby. And you? We met each other, and we'll never be serving care about the fan. I miss you so much. Put you on the Vicky Bry. I can get you what you like. It will be so much data. Damn. eknd: http status: 400, code: -1 - https://api.spotify.com/v1/search?q=artist%3AThe+WeMusic.+BCCI+crashes.+I+gotta+get+close+to+you%2C+baby.+And+you%3F+We+met+each+other%2C+and+we%27ll+never+be+serving+care+about+the+fan.+I+miss+you+so+much.+Put+you+on+the+Vicky+Bry.+I+can+get+you+what+you+like.+It+will+be+so+much+data.+Damn.+eknd&limit=1&offset=0&type=artist:
 Query exceeds maximum length of 250 characters, reason: None

Fetching tracks for Taylor Swift...
  Added: Cruel Summer (ID: 1BxfuPKGuaTgP7aM0Bbdwr)
  Added: Fortnight (feat. Post Malone) (ID: 2OzhQlSqBEmt7hmkYxfT6m)
  Added: cardigan (ID: 4R2kfaDFhslZEMJqAFNpdd)
  Added: august (ID: 3hUxzQpSfdDqwM3ZTFQY0K)
  Added: Lover (ID: 1dGr1c8CrMLDpV6mPbImSI)
  Added: Blank Sp

In [4]:
if 'sp' in locals() and sp and ('tracks_df' in locals() and not tracks_df.empty):
    print("\nProceeding to save the collected basic track data (without audio features).")
    
    # The DataFrame to save is tracks_df, which was created in Cell 2
    dataset_to_save_df = tracks_df 
    
    print(f"\nDataset to be saved contains {len(dataset_to_save_df)} tracks.")
    print("Basic Track Info DataFrame (first 5 rows):")
    print(dataset_to_save_df.head())

    # Ensure project_root is available from Cell 1
    # project_root should have been defined in Cell 1 (ID: b48dd0e4)
    if 'project_root' not in locals():
        print("Error: project_root not defined. Cannot determine save path.")
        print("Please ensure Cell 1 (the setup cell) ran correctly and defined 'project_root'.")
        # As a fallback, attempt to use the current working directory for project_root
        # This might not be the intended project root if Cell 1 had issues.
        project_root = os.getcwd() 
        print(f"Attempting to use current working directory as project_root for saving: {project_root}")

    # Define the output CSV filename and path
    output_csv_filename = 'spotify_track_info_dataset.csv' # Filename reflects content
    output_csv_path = os.path.join(project_root, output_csv_filename)
    
    try:
        dataset_to_save_df.to_csv(output_csv_path, index=False)
        print(f"\nDataset (basic track info) saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving dataset to {output_csv_path}: {e}.")
        print("Attempting to save to current working directory as a fallback.")
        try:
            # Fallback to saving in the current working directory
            fallback_path = output_csv_filename 
            dataset_to_save_df.to_csv(fallback_path, index=False)
            print(f"Dataset saved to fallback path (current working directory): {fallback_path}")
        except Exception as e_fallback:
            print(f"Error saving dataset to fallback path: {e_fallback}")

elif not ('tracks_df' in locals() and not tracks_df.empty):
    print("\nNo track data available (tracks_df is empty or not defined). Cannot save data.")
    print("Please ensure Cell 2 (data collection cell) ran successfully and populated 'tracks_df'.")
elif not ('sp' in locals() and sp):
    print("\nSpotify API (sp) not authenticated. Cannot save data.")
    print("Please ensure Cell 1 (authentication cell) ran successfully.")
else:
    # This case should ideally not be reached if the above conditions are comprehensive
    print("\nCould not save data due to an undetermined state. Please check previous cell outputs.")



Proceeding to save the collected basic track data (without audio features).

Dataset to be saved contains 3384 tracks.
Basic Track Info DataFrame (first 5 rows):
                 track_id                     track_name   artist_name  \
0  1BxfuPKGuaTgP7aM0Bbdwr                   Cruel Summer  Taylor Swift   
1  2OzhQlSqBEmt7hmkYxfT6m  Fortnight (feat. Post Malone)  Taylor Swift   
2  4R2kfaDFhslZEMJqAFNpdd                       cardigan  Taylor Swift   
3  3hUxzQpSfdDqwM3ZTFQY0K                         august  Taylor Swift   
4  1dGr1c8CrMLDpV6mPbImSI                          Lover  Taylor Swift   

                      album_name  popularity  \
0                          Lover          91   
1  THE TORTURED POETS DEPARTMENT          85   
2                       folklore          86   
3                       folklore          86   
4                          Lover          86   

                              track_uri  
0  spotify:track:1BxfuPKGuaTgP7aM0Bbdwr  
1  spotify:track:2O

In [None]:
# --- Load Genius API Token ---
GENIUS_TOKEN = os.getenv('GENIUS_ACCESS_TOKEN')

if not GENIUS_TOKEN:
    print("Error: GENIUS_ACCESS_TOKEN not found in environment variables.")
    print("Please add it to your .env file and restart the kernel.")
    genius = None
else:
    try:
        genius = lyricsgenius.Genius(GENIUS_TOKEN, 
                                     skip_non_songs=True, 
                                     excluded_terms=["(Remix)", "(Live)"], 
                                     remove_section_headers=True,
                                     timeout=15, # Increased timeout
                                     retries=3) # Number of retries
        print("Successfully initialized LyricsGenius.")
    except Exception as e:
        print(f"Error initializing LyricsGenius: {e}")
        genius = None

# --- Load your existing track dataset ---
# Assuming 'project_root' is defined from your earlier cells
track_info_csv_filename = 'spotify_track_info_dataset.csv' 
track_info_csv_path = os.path.join(project_root, track_info_csv_filename)

lyrics_enriched_csv_filename = 'spotify_lyrics_enriched_dataset.csv'
lyrics_enriched_csv_path = os.path.join(project_root, lyrics_enriched_csv_filename)

all_tracks_with_lyrics = []

if os.path.exists(track_info_csv_path) and genius:
    print(f"Loading track info from: {track_info_csv_path}")
    try:
        tracks_df_for_lyrics = pd.read_csv(track_info_csv_path)
        print(f"Successfully loaded {len(tracks_df_for_lyrics)} tracks for lyric fetching.")

        # Check if a partially completed lyrics file exists
        if os.path.exists(lyrics_enriched_csv_path):
            print(f"Found existing lyrics dataset: {lyrics_enriched_csv_path}. Loading it to resume.")
            lyrics_df_existing = pd.read_csv(lyrics_enriched_csv_path)
            # Create a set of already processed track_ids for faster lookup
            processed_track_ids = set(lyrics_df_existing['track_id'].astype(str))
            all_tracks_with_lyrics = lyrics_df_existing.to_dict('records')
            print(f"Resuming. Already processed {len(processed_track_ids)} tracks.")
        else:
            processed_track_ids = set()
            print("No existing lyrics dataset found. Starting fresh.")

        # Iterate through tracks and fetch lyrics
        for index, row in tracks_df_for_lyrics.iterrows():
            track_id = str(row['track_id']) # Ensure track_id is string for comparison
            
            if track_id in processed_track_ids:
                print(f"Skipping already processed: {row['track_name']} by {row['artist_name']}")
                continue

            track_name = row['track_name']
            artist_name = row['artist_name']
            print(f"\nFetching lyrics for: {track_name} by {artist_name} (ID: {track_id})")
            
            try:
                # Clean track name slightly for better matching (optional, can be more sophisticated)
                cleaned_track_name = track_name.split(' (')[0].split(' - ')[0] 
                
                song = genius.search_song(cleaned_track_name, artist_name)
                if song and song.lyrics:
                    lyrics = song.lyrics
                    # Basic cleaning: remove first line if it's like "Track Name Lyrics" and last line if "Embed"
                    lines = lyrics.split('\n')
                    if len(lines) > 1 and lines[0].lower().endswith("lyrics"):
                        lines = lines[1:]
                    if len(lines) > 1 and lines[-1].lower().strip().endswith("embed"): # Check last non-empty line
                        # Find the last actual lyric line before potential "Embed" or similar metadata
                        last_lyric_line_index = len(lines) -1
                        for i in range(len(lines) -1, -1, -1):
                            if lines[i].strip() == "": # skip empty lines
                                continue
                            if "embed" in lines[i].lower() or "you might also like" in lines[i].lower():
                                last_lyric_line_index = i -1
                            else:
                                break
                        lines = lines[:last_lyric_line_index+1]

                    lyrics = "\n".join(lines).strip()
                    print(f"  Lyrics found (first 100 chars): {lyrics[:100].replace(chr(10), ' ')}...")
                    track_data_with_lyrics = row.to_dict()
                    track_data_with_lyrics['lyrics'] = lyrics
                    all_tracks_with_lyrics.append(track_data_with_lyrics)
                else:
                    print(f"  Lyrics not found for {track_name} by {artist_name}")
                    track_data_with_lyrics = row.to_dict()
                    track_data_with_lyrics['lyrics'] = None # Or pd.NA or an empty string
                    all_tracks_with_lyrics.append(track_data_with_lyrics)

            except Exception as e:
                print(f"  Error fetching lyrics for {track_name} by {artist_name}: {e}")
                track_data_with_lyrics = row.to_dict()
                track_data_with_lyrics['lyrics'] = None
                all_tracks_with_lyrics.append(track_data_with_lyrics)
            
            # Be respectful to the API
            time.sleep(1.5) # Increased sleep time

            # Save progress periodically (e.g., every 20 songs)
            if (index + 1) % 20 == 0 and all_tracks_with_lyrics:
                temp_lyrics_df = pd.DataFrame(all_tracks_with_lyrics)
                temp_lyrics_df.to_csv(lyrics_enriched_csv_path, index=False)
                print(f"\n--- Saved progress to {lyrics_enriched_csv_path} ({len(temp_lyrics_df)} tracks) ---\n")
                processed_track_ids.update(temp_lyrics_df['track_id'].astype(str).tolist())


        # Final save
        if all_tracks_with_lyrics:
            lyrics_df_final = pd.DataFrame(all_tracks_with_lyrics)
            # Drop duplicates just in case, keeping the one that might have lyrics
            lyrics_df_final.sort_values(by='lyrics', na_position='last', inplace=True)
            lyrics_df_final.drop_duplicates(subset=['track_id'], keep='first', inplace=True)
            lyrics_df_final.to_csv(lyrics_enriched_csv_path, index=False)
            print(f"\nSuccessfully fetched lyrics and saved to: {lyrics_enriched_csv_path}")
            print(f"Total tracks in new dataset: {len(lyrics_df_final)}")
            print("\nLyrics Enriched DataFrame (first 5 rows with lyrics):")
            print(lyrics_df_final[lyrics_df_final['lyrics'].notna()].head())
        else:
            print("No lyrics were fetched or added.")

    except FileNotFoundError:
        print(f"Error: The file {track_info_csv_path} was not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file {track_info_csv_path} is empty.")
    except Exception as e:
        print(f"An unexpected error occurred while processing the CSV: {e}")

elif not genius:
    print("LyricsGenius not initialized. Cannot fetch lyrics.")
else:
    print(f"Track info CSV not found at {track_info_csv_path}. Cannot fetch lyrics.")

Successfully initialized LyricsGenius.
Loading track info from: c:\Users\balde\SongRecommendation\spotify_track_info_dataset.csv
Successfully loaded 3384 tracks for lyric fetching.
Found existing lyrics dataset: c:\Users\balde\SongRecommendation\spotify_lyrics_enriched_dataset.csv. Loading it to resume.
Resuming. Already processed 100 tracks.
Skipping already processed: Cruel Summer by Taylor Swift
Skipping already processed: Fortnight (feat. Post Malone) by Taylor Swift
Skipping already processed: cardigan by Taylor Swift
Skipping already processed: august by Taylor Swift
Skipping already processed: Lover by Taylor Swift
Skipping already processed: Blank Space by Taylor Swift
Skipping already processed: I Can Do It With a Broken Heart by Taylor Swift
Skipping already processed: I Don’t Wanna Live Forever (Fifty Shades Darker) by ZAYN
Skipping already processed: Don’t Blame Me by Taylor Swift
Skipping already processed: Anti-Hero by Taylor Swift
Skipping already processed: NOKIA by Dra