# Genius Lyrics Metadata Scraper

Script untuk mengambil metadata (views, release date, song_id) dari halaman Genius berdasarkan URL yang ada di CSV.

## Features:
- Multi-fallback extraction untuk song_id
- Proxy support
- Retry mechanism
- Batch processing
- Output ke CSV

## 1. Import Libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import os
import json

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


## 2. Configuration

In [3]:
# --- CONFIGURATION ---
INPUT_FILE = "lyrics_final_4.csv"   # CSV that holds the genius_url column
BATCH_INDEX = 11                    # zero-based batch number to process
SAMPLE_SIZE = 1000                  # rows per batch
WAIT_BETWEEN_REQUESTS = 0.8         # seconds slept between two songs
MAX_RETRIES = 3                     # fetch attempts per URL

OUTPUT_DIR = "output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

FINAL_OUTPUT_FILE = f"{OUTPUT_DIR}/lyrics_final_metadata.csv"

# SECURITY: proxy credentials must never be committed with the notebook.
# The proxy URL (including user:password) is read from the GENIUS_PROXY_URL
# environment variable instead, e.g.
#     export GENIUS_PROXY_URL="https://USER:PASSWORD@proxy-host:9999"
# Credentials that were previously stored in this cell should be rotated.
_PROXY_URL = os.environ.get("GENIUS_PROXY_URL", "").strip()

# Proxies are only enabled when a proxy URL has actually been configured.
USE_PROXIES = bool(_PROXY_URL)
PROXIES = [{"http": _PROXY_URL, "https": _PROXY_URL}] if _PROXY_URL else []

print(f"📁 Input file: {INPUT_FILE}")
print(f"📊 Batch {BATCH_INDEX}, size: {SAMPLE_SIZE}")
print(f"💾 Output directory: {OUTPUT_DIR}")
print(f"🌐 Using proxies: {USE_PROXIES}")

📁 Input file: lyrics_final_4.csv
📊 Batch 11, size: 1000
💾 Output directory: output
🌐 Using proxies: True


## 3. Utility Functions

In [4]:
# --- UTIL: Ekstraksi song_id dari HTML (multi-fallback) ---
# Regexes tried in order against the raw HTML; group 1 is the numeric song id.
_SONG_ID_PATTERNS = (
    # trackingData: {"key":"Song ID","value":12345}
    re.compile(r'"key"\s*:\s*"Song ID"\s*,\s*"value"\s*:\s*(\d+)', re.I),
    # embedContent: data-song-id='12345' (the quotes may be backslash-escaped)
    re.compile(r"data-song-id\s*=\s*\\?['\"](\d+)\\?['\"]", re.I),
    # pusherChannel: "song-12345"
    re.compile(r'"pusherChannel"\s*:\s*"song-(\d+)"', re.I),
    # dfpKv: "name":"song_id","values":["12345"]
    re.compile(r'"name"\s*:\s*"song_id"\s*,\s*"values"\s*:\s*\[\s*"(\d+)"\s*\]', re.I),
)

# The state variable is accepted with one or two underscores on each side, so
# both `_PRELOADED_STATE_` and `__PRELOADED_STATE__` spellings are recognised.
_STATE_NAME = r"window\._{1,2}PRELOADED_STATE_{1,2}"
_STATE_JSON_PARSE_RE = re.compile(
    _STATE_NAME + r"\s*=\s*JSON\.parse\(\s*'(.+?)'\s*\)\s*;", re.S
)
_STATE_OBJECT_RE = re.compile(_STATE_NAME + r"\s*=\s*(?=\{)")

# Characters that may legally follow a backslash inside a JSON string.
_JSON_ESCAPE_CHARS = frozenset('"\\/bfnrtu')
_BACKSLASH_PAIR_RE = re.compile(r"\\(.)", re.S)


def _keep_json_escape(match):
    """Keep escapes JSON understands; drop the backslash of JS-only ones (e.g. \\')."""
    return match.group(0) if match.group(1) in _JSON_ESCAPE_CHARS else match.group(1)


def _as_song_id(value):
    """Return *value* as an int if it is an int or a string of decimal digits, else None."""
    if not isinstance(value, (int, str)):
        return None
    text = str(value)
    return int(text) if text.isdecimal() else None


def _preloaded_states(html):
    """Yield every preloaded-state object that can be decoded from *html*."""
    # Form A: window.<STATE> = JSON.parse('...');  -> decode the JS string
    # literal first, then parse the JSON text it contains.
    wrapped = _STATE_JSON_PARSE_RE.search(html)
    if wrapped:
        literal = '"' + _BACKSLASH_PAIR_RE.sub(_keep_json_escape, wrapped.group(1)) + '"'
        try:
            state = json.loads(json.loads(literal))
        except ValueError:
            state = None
        if state is not None:
            yield state

    # Form B: window.<STATE> = { ... };  -> raw_decode stops at the end of the
    # object by itself, so braces or semicolons inside string values are harmless.
    direct = _STATE_OBJECT_RE.search(html)
    if direct:
        try:
            state, _ = json.JSONDecoder().raw_decode(html[direct.end():])
        except ValueError:
            state = None
        if state is not None:
            yield state


def _song_id_from_state(state):
    """Look up the song id inside a decoded preloaded-state dict; None if absent."""
    if not isinstance(state, dict):
        return None

    song_page = state.get("songPage")
    if isinstance(song_page, dict):
        song_id = _as_song_id(song_page.get("song"))
        if song_id is not None:
            return song_id
        tracking = song_page.get("trackingData")
        if isinstance(tracking, (list, tuple)):
            for entry in tracking:
                if isinstance(entry, dict) and entry.get("key") == "Song ID":
                    song_id = _as_song_id(entry.get("value"))
                    if song_id is not None:
                        return song_id

    entities = state.get("entities")
    songs = entities.get("songs") if isinstance(entities, dict) else None
    if isinstance(songs, dict):
        # Last resort: the first song entity that carries a numeric id.
        for song in songs.values():
            if isinstance(song, dict):
                song_id = _as_song_id(song.get("id"))
                if song_id is not None:
                    return song_id
    return None


def extract_song_id(html: str):
    """
    Extract the numeric song id from raw page HTML using several fallbacks.

    The cheap regex patterns are tried first, in order: trackingData,
    data-song-id attribute, pusherChannel, dfpKv. If none matches, the
    embedded preloaded-state object is decoded, in both its
    ``JSON.parse('...')`` and its plain object-literal form, and searched.

    Args:
        html: Raw HTML text of the song page. Empty or None yields None.

    Returns:
        The song id as an int, or None when no strategy finds one.
    """
    if not html:
        return None

    for pattern in _SONG_ID_PATTERNS:
        found = pattern.search(html)
        if found:
            return int(found.group(1))

    for state in _preloaded_states(html):
        song_id = _song_id_from_state(state)
        if song_id is not None:
            return song_id

    return None

print("✅ extract_song_id function defined")

✅ extract_song_id function defined


## 4. Main Scraper Function

In [5]:
# Display dates as they appear on the page, e.g. "January 1, 2020" or "Nov. 20, 2020".
_DISPLAY_DATE = r"[A-Za-z]{3,}\.?\s\d{1,2},\s\d{4}"
_DISPLAY_DATE_RE = re.compile(_DISPLAY_DATE)
_RELEASED_ON_RE = re.compile(r"Released\s+(" + _DISPLAY_DATE + r")", re.I)
_VIEWS_RE = re.compile(r"([\d.,]+)\s+views")

# Looser date shapes used only by the last-resort full-text scan.
_LOOSE_DATE_RES = (
    re.compile(r"([A-Za-z]{3,}\.?\s\d{1,2},\s\d{4})"),  # January 1, 2020
    re.compile(r"(\d{1,2}\s[A-Za-z]{3,}\.?\s\d{4})"),   # 1 January 2020
    re.compile(r"(\d{4}-\d{2}-\d{2})"),                 # 2020-01-01
)
_EARLIEST_PLAUSIBLE_YEAR = 1950

# Accept the state variable with one or two underscores on each side.
_STATE_ASSIGN_RE = re.compile(r"window\._{1,2}PRELOADED_STATE_{1,2}\s*=\s*(?=\{)")


def _extract_views(soup):
    """Return the view count from the first span whose title reads '<number> views'."""
    for span in soup.select("span[title*='views']"):
        found = _VIEWS_RE.search(span.get("title", "") or "")
        if not found:
            continue
        # Drop thousands separators; a title made of separators only gives no digits.
        digits = re.sub(r"\D", "", found.group(1))
        return int(digits) if digits else None
    return None


def _release_date_from_state(html):
    """Return a display-formatted release date from the embedded state object, if any."""
    assignment = _STATE_ASSIGN_RE.search(html)
    if not assignment:
        return None
    try:
        # raw_decode stops at the end of the object on its own, so a "};"
        # inside a string value cannot truncate the JSON.
        state, _ = json.JSONDecoder().raw_decode(html[assignment.end():])
    except ValueError:
        return None

    entities = state.get("entities") if isinstance(state, dict) else None
    songs = entities.get("songs") if isinstance(entities, dict) else None
    if not isinstance(songs, dict):
        return None
    for song in songs.values():
        if not isinstance(song, dict):
            continue
        info = song.get("release_date_for_display") or song.get("releaseDate")
        if info and _DISPLAY_DATE_RE.match(str(info)):
            return str(info)
    return None


def _extract_release_date(soup, html):
    """Return the release date string using progressively looser heuristics, or None."""
    # Method 1: the dedicated metadata label (class names are build-specific).
    labels = soup.select(
        "div.MetadataStats_Container-sc-8a5f771a-0 span.LabelWithIcon_Label-sc-a1922d73-1"
    )
    if labels:
        text = labels[0].get_text(strip=True)
        if _DISPLAY_DATE_RE.match(text):
            return text

    # Method 2: any span that starts with a date or says "Released <date>".
    for span in soup.find_all("span"):
        text = span.get_text(strip=True)
        if _DISPLAY_DATE_RE.match(text):
            return text
        found = _RELEASED_ON_RE.search(text)
        if found:
            return found.group(1)

    # Method 3: divs whose class hints at metadata / stats / info.
    for div in soup.find_all("div", class_=re.compile(r"metadata|stats|info", re.I)):
        found = _DISPLAY_DATE_RE.search(div.get_text(strip=True))
        if found:
            return found.group(0)

    # Method 4: parents of text nodes that mention "release".
    for node in soup.find_all(string=re.compile(r"release", re.I)):
        parent = node.parent
        if parent is not None:
            found = _DISPLAY_DATE_RE.search(parent.get_text(strip=True))
            if found:
                return found.group(0)

    # Method 5: the embedded preloaded-state JSON.
    from_state = _release_date_from_state(html)
    if from_state:
        return from_state

    # Method 6: first plausible-looking date anywhere in the page text.
    # The upper bound follows the current year instead of a fixed constant,
    # so dates from newer releases are not rejected as time passes.
    latest_year = time.localtime().tm_year + 1
    full_text = soup.get_text()
    for pattern in _LOOSE_DATE_RES:
        for candidate in pattern.findall(full_text):
            year = re.search(r"\d{4}", candidate)
            if year and _EARLIEST_PLAUSIBLE_YEAR <= int(year.group()) <= latest_year:
                return candidate
    return None


def get_metadata_from_genius(url):
    """
    Fetch a Genius song page and extract its views, release date and song id.

    The page is requested up to MAX_RETRIES times (through the first configured
    proxy when USE_PROXIES is set and PROXIES is non-empty). Non-200 responses
    and any exception raised while fetching or parsing count as a failed attempt.

    Args:
        url: URL of the song page.

    Returns:
        A dict with the keys "views" (int or None), "release_date" (str or
        None) and "song_id" (int or None). All three are None when every
        attempt failed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/114.0.0.0 Safari/537.36"
    }
    # Guard against USE_PROXIES being set while the proxy list is empty.
    proxies = PROXIES[0] if USE_PROXIES and PROXIES else None

    for _ in range(MAX_RETRIES):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code != 200:
                time.sleep(1)
                continue

            # Pause after a successful fetch (presumably to throttle the crawl).
            time.sleep(3)

            html = response.text
            soup = BeautifulSoup(html, "html.parser")

            return {
                "views": _extract_views(soup),
                "release_date": _extract_release_date(soup, html),
                # The song id is searched in the raw HTML, not in the parsed soup.
                "song_id": extract_song_id(html),
            }

        except Exception:
            # Best effort by design: network and parsing errors both mean
            # "try again" rather than aborting the whole batch.
            time.sleep(1)

        time.sleep(1)

    return {
        "views": None,
        "release_date": None,
        "song_id": None
    }


## 5. Load and Prepare Data

In [6]:
# --- LOAD DATA ---
# Stop right away with a clear message when the input CSV is missing.
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"File tidak ditemukan: {INPUT_FILE}")

df = pd.read_csv(INPUT_FILE)

# Batch N covers the half-open row range [N * SAMPLE_SIZE, (N + 1) * SAMPLE_SIZE).
start_idx = BATCH_INDEX * SAMPLE_SIZE
end_idx = start_idx + SAMPLE_SIZE
df_batch = df.iloc[slice(start_idx, end_idx)].copy()

# Add whichever of the scraper's target columns the CSV does not provide yet.
df_batch = df_batch.assign(
    **{
        column: None
        for column in ("views", "release_date", "song_id")
        if column not in df_batch.columns
    }
)

print(f"📊 Dataset loaded: {len(df)} total rows")
print(f"📋 Batch info: rows {start_idx}–{end_idx} ({len(df_batch)} rows)")
print(f"📋 Columns: {list(df_batch.columns)}")
print(f"\n📁 Sample data:")
print(df_batch.head())

📊 Dataset loaded: 20632 total rows
📋 Batch info: rows 11000–12000 (1000 rows)
📋 Columns: ['Unnamed: 0', 'song_id', 'song_name', 'artist_id', 'artist_name', 'album_name', 'genre_name', 'musixmatch_url', 'is_music', 'lyrics_clean', 'genius_url', 'lyrics_len', 'views', 'release_date']

📁 Sample data:
       Unnamed: 0    song_id          song_name  artist_id        artist_name  \
11000       11082  123795258   a drawn out exit   10771225  dark tranquillity   
11001       11083  120629831              intro   10012426              mocca   
11002       11084  128974287       dying inside   11619313            twinnie   
11003       11085  111493693  new york new york      17131      frank sinatra   
11004       11086  112995089               1994     203046       jason aldean   

                          album_name       genre_name  \
11000                         Moment    Barat - Metal   
11001                        Colours      Indie - Pop   
11002                   Dying Inside  Barat

## 6. Run Scraping Process

In [None]:
print(f"\n🚀 Mulai scraping batch {BATCH_INDEX} (baris {start_idx}–{end_idx})\n")

# Visit every row of the batch, scrape its Genius page and store the result
# back into df_batch, logging what was found along the way.
for idx, row in df_batch.iterrows():
    url, song, artist = (
        row.get(column, "") for column in ("genius_url", "song_name", "artist_name")
    )

    print(f"🔍 [{idx}] {artist} – {song}")
    print(f"     🌐 URL: {url}")

    # A missing (NaN) or blank URL cannot be scraped.
    if pd.isna(url) or not url.strip():
        print("     ⚠️  URL kosong → dilewati.\n")
        continue

    metadata = get_metadata_from_genius(url)

    # NOTE(review): the column list printed by the load step already contains
    # `song_id`, so this write replaces the input's values - confirm intended.
    for field in ("views", "release_date", "song_id"):
        df_batch.at[idx, field] = metadata.get(field)

    nothing_found = (
        not (metadata.get("views") or metadata.get("release_date"))
        and metadata.get("song_id") is None
    )
    if nothing_found:
        print(f"     ❌ Gagal mengambil metadata.\n")
    else:
        print(
            f"     ✅ Views        : {metadata.get('views')}"
            if metadata.get("views")
            else f"     ✅ Views        : Not found"
        )
        print(
            f"     📅 Release date : {metadata.get('release_date')}"
            if metadata.get("release_date")
            else f"     📅 Release date : Not found"
        )
        print(f"     🆔 Song ID      : {metadata.get('song_id') if metadata.get('song_id') is not None else 'Not found'}\n")

    # Pause between songs.
    time.sleep(WAIT_BETWEEN_REQUESTS)


🚀 Mulai scraping batch 11 (baris 11000–12000)

🔍 [11000] dark tranquillity – a drawn out exit
     🌐 URL: https://genius.com/dark-tranquillity-a-drawn-out-exit-lyrics
     📅 Release date : Nov. 20, 2020
     🆔 Song ID      : 5947087

🔍 [11001] mocca – intro
     🌐 URL: https://genius.com/mocca-intro-lyrics
     📅 Release date : Not found
     🆔 Song ID      : 1273694

🔍 [11002] twinnie – dying inside
     🌐 URL: https://genius.com/twinnie-dying-inside-lyrics
     📅 Release date : Apr. 22, 2022
     🆔 Song ID      : 7933123

🔍 [11003] frank sinatra – new york new york
     🌐 URL: https://genius.com/frank-sinatra-new-york-new-york-lyrics
     ✅ Views        : 607360
     📅 Release date : Mar. 26, 1980
     🆔 Song ID      : 4260

🔍 [11004] jason aldean – 1994
     🌐 URL: https://genius.com/jason-aldean-1994-lyrics
     📅 Release date : Oct. 16, 2012
     🆔 Song ID      : 156359

🔍 [11005] shaun – dream
     🌐 URL: https://genius.com/shaun-dream-lyrics
     📅 Release date : Dec. 25, 2017


## 7. Save Results

In [None]:
# --- SAVE TO THE FINAL FILE ---
# First run: the batch itself becomes the output. Later runs: append the batch
# to what is already on disk and keep the newest row of every duplicate.
if not os.path.exists(FINAL_OUTPUT_FILE):
    df_combined = df_batch
    print(f"📋 New file created with {len(df_combined)} rows")
else:
    df_existing = pd.read_csv(FINAL_OUTPUT_FILE)
    df_combined = pd.concat([df_existing, df_batch], ignore_index=True).drop_duplicates(
        subset=["song_name", "artist_name", "genius_url"],
        keep="last",
    )
    print(f"📋 Combined with existing data: {len(df_existing)} + {len(df_batch)} = {len(df_combined)} rows")

df_combined.to_csv(FINAL_OUTPUT_FILE, index=False)
print(f"\n✅ Metadata batch {BATCH_INDEX} disimpan ke: {FINAL_OUTPUT_FILE}")

In [None]:
import matplotlib.pyplot as plt
# --- FUNGSI ANALISIS DATA ---
def analyze_scraped_data(df: pd.DataFrame):
    """
    Plot and print a basic analysis of scraped metadata.

    Draws a 2x2 figure (views histogram, release-year histogram, extraction
    success rates, top 10 artists by average views) and prints summary
    statistics.

    Args:
        df: Frame with the columns 'views', 'release_date', 'song_id' and
            'artist_name'.

    Returns:
        The input frame, unchanged.
    """
    # Coerce once: a 'views' column holding None next to ints has object dtype,
    # which breaks numeric aggregation; non-numeric entries become NaN.
    views_numeric = pd.to_numeric(df['views'], errors='coerce')
    views_data = views_numeric.dropna()

    # astype(str) keeps the .str accessor usable when the column has no
    # strings at all; the second dropna() removes dates without a 4-digit
    # year, which would otherwise make astype(int) fail on NaN.
    release_years = (
        df['release_date'].dropna().astype(str)
        .str.extract(r'(\d{4})')[0]
        .dropna()
        .astype(int)
    )

    plt.figure(figsize=(15, 10))

    # Views distribution
    plt.subplot(2, 2, 1)
    if len(views_data) > 0:
        plt.hist(views_data, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
        plt.xlabel('Views')
        plt.ylabel('Frequency')
        plt.title('Distribution of Song Views')
        plt.yscale('log')
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No views data available', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Distribution of Song Views - No Data')

    # Release years
    plt.subplot(2, 2, 2)
    if len(release_years) > 0:
        plt.hist(release_years, bins=20, edgecolor='black', alpha=0.7, color='lightgreen')
        plt.xlabel('Release Year')
        plt.ylabel('Frequency')
        plt.title('Distribution of Release Years')
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No release date data available', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Distribution of Release Years - No Data')

    # Success rates: share of rows with a non-missing value per field
    plt.subplot(2, 2, 3)
    success_rates = [
        df['views'].notna().mean() * 100,
        df['release_date'].notna().mean() * 100,
        df['song_id'].notna().mean() * 100
    ]
    labels = ['Views', 'Release Date', 'Song ID']
    bars = plt.bar(labels, success_rates, color=['skyblue', 'lightgreen', 'salmon'], alpha=0.8)
    plt.ylabel('Success Rate (%)')
    plt.title('Metadata Extraction Success Rates')
    plt.ylim(0, 100)
    plt.grid(True, alpha=0.3)

    # Write each rate just above its bar
    for bar, rate in zip(bars, success_rates):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 1,
                f'{rate:.1f}%', ha='center', va='bottom')

    # Top artists by average views (artists without any views drop out)
    plt.subplot(2, 2, 4)
    artist_views = (
        views_numeric.groupby(df['artist_name']).mean()
        .dropna()
        .sort_values(ascending=False)
        .head(10)
    )
    if len(artist_views) > 0:
        plt.barh(range(len(artist_views)), artist_views.values, color='coral', alpha=0.8)
        plt.yticks(range(len(artist_views)), artist_views.index)
        plt.xlabel('Average Views')
        plt.title('Top 10 Artists by Average Views')
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3)

        # Scientific notation keeps large view counts readable on the x-axis
        ax = plt.gca()
        ax.ticklabel_format(style='scientific', axis='x', scilimits=(0,0))
    else:
        plt.text(0.5, 0.5, 'No artist views data available', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Top 10 Artists by Average Views - No Data')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\n📊 DATA ANALYSIS SUMMARY")
    print("=" * 60)

    total_songs = len(df)
    print(f"📈 Total songs processed: {total_songs}")

    if len(views_data) > 0:
        print(f"📊 Views statistics:")
        print(f"   • Mean: {views_data.mean():,.0f}")
        print(f"   • Median: {views_data.median():,.0f}")
        print(f"   • Max: {views_data.max():,.0f}")
        print(f"   • Min: {views_data.min():,.0f}")

    if len(release_years) > 0:
        year_modes = release_years.mode()
        print(f"📅 Release year range: {release_years.min()} - {release_years.max()}")
        print(f"📅 Most common release year: {year_modes.iloc[0] if not year_modes.empty else 'N/A'}")

    # Top artists info
    if len(artist_views) > 0:
        print(f"🎤 Top artist by avg views: {artist_views.index[0]} ({artist_views.iloc[0]:,.0f} views)")

    print(f"\n✅ Success rates:")
    print(f"   • Views: {success_rates[0]:.1f}%")
    print(f"   • Release dates: {success_rates[1]:.1f}%") 
    print(f"   • Song IDs: {success_rates[2]:.1f}%")

    return df

## 8. Results Summary

In [None]:
# --- SUMMARY ---
print("\n📊 SCRAPING SUMMARY")
print("=" * 50)

# Number of rows with a non-missing value for each scraped field.
views_count = df_batch['views'].notna().sum()
date_count = df_batch['release_date'].notna().sum()
id_count = df_batch['song_id'].notna().sum()

# Guard the percentage denominator so an empty batch reports 0.0% instead of nan%.
batch_total = len(df_batch)
pct_base = batch_total or 1

print(f"📈 Views extracted: {views_count}/{batch_total} ({views_count/pct_base*100:.1f}%)")
print(f"📅 Release dates extracted: {date_count}/{batch_total} ({date_count/pct_base*100:.1f}%)")
print(f"🆔 Song IDs extracted: {id_count}/{batch_total} ({id_count/pct_base*100:.1f}%)")

# === EXTRA: DATA ANALYSIS ===
print(f"\n🔍 Menjalankan analisis data...")
try:
    # Analyse the batch that has just been scraped
    print("\n📊 ANALISIS BATCH SAAT INI:")
    analyze_scraped_data(df_batch)

    # Analyse everything saved so far (earlier batches included), if present
    if os.path.exists(FINAL_OUTPUT_FILE):
        df_all = pd.read_csv(FINAL_OUTPUT_FILE)
        print("\n📊 ANALISIS DATA KESELURUHAN:")
        analyze_scraped_data(df_all)

except Exception as e:
    print(f"⚠️  Error dalam analisis: {e}")
    # Only suggest installing matplotlib when the failure points at a missing
    # import: if the cell that imports it failed, analyze_scraped_data is
    # undefined and a NameError is raised here. seaborn is not used.
    if isinstance(e, (ImportError, NameError)):
        print("💡 Pastikan matplotlib sudah terinstall: pip install matplotlib")


In [None]:
# # --- SUMMARY ---
# print("\n📊 SCRAPING SUMMARY")
# print("=" * 50)

# # Count successful extractions
# views_count = df_batch['views'].notna().sum()
# date_count = df_batch['release_date'].notna().sum()
# id_count = df_batch['song_id'].notna().sum()

# print(f"📈 Views extracted: {views_count}/{len(df_batch)} ({views_count/len(df_batch)*100:.1f}%)")
# print(f"📅 Release dates extracted: {date_count}/{len(df_batch)} ({date_count/len(df_batch)*100:.1f}%)")
# print(f"🆔 Song IDs extracted: {id_count}/{len(df_batch)} ({id_count/len(df_batch)*100:.1f}%)")

# print(f"\n📁 Final output file: {FINAL_OUTPUT_FILE}")
# print(f"📊 Total rows in final file: {len(df_combined)}")

# # Show sample of results
# print(f"\n📋 Sample results:")
# sample_data = df_batch[['song_name', 'artist_name', 'views', 'release_date', 'song_id']].head()
# print(sample_data.to_string(index=False))