# Import Libraries

In [2]:
import csv
import os
import time
from datetime import datetime

import pandas as pd
import requests

# Scrape Data

## Fetch the 25 top-rated songs from VocaDB

In [5]:
def _parse_publish_date(raw):
    """Parse a ``publishDate`` string; return ``None`` when it cannot be parsed."""
    if not isinstance(raw, str) or not raw:
        return None
    try:
        # The format this notebook has always expected, e.g. "2025-01-31T00:00:00Z".
        return datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        pass
    try:
        # Fallback for other ISO-8601 spellings (no trailing "Z", fractional
        # seconds, explicit UTC offset) instead of crashing the whole fetch.
        return datetime.fromisoformat(raw[:-1] if raw.endswith("Z") else raw)
    except ValueError:
        return None


def fetch_top_rated_2025(filename="top_rated_2025.csv"):
    """Fetch top-rated Vocaloid songs published January-May 2025 and save the top 25.

    Requests up to 100 songs from the VocaDB ``/api/songs/top-rated`` endpoint,
    keeps those whose ``publishDate`` falls in January-May 2025, ranks them by
    ``ratingScore`` (highest first) and writes at most 25 of them to ``filename``.
    Songs with a missing or unparseable ``publishDate`` are skipped; a missing or
    null ``ratingScore`` is ranked as 0.

    Args:
        filename: Path of the CSV file to create or overwrite.

    Returns:
        list[dict]: The selected song objects exactly as returned by the API,
        best-rated first.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://vocadb.net/api/songs/top-rated"
    params = {
        "durationHours": 720 * 5,  # 5 months (720 hours each)
        "filterBy": "Popularity",
        "vocalist": "Vocaloid",
        "maxResults": 100  # fetch more than needed; the list is filtered manually below
    }

    # A timeout keeps the cell from hanging forever; raise_for_status surfaces
    # HTTP errors directly instead of failing later on an unexpected payload.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    songs_2025 = []
    for song in data:
        pub_date = _parse_publish_date(song.get("publishDate"))
        if pub_date is not None and pub_date.year == 2025 and 1 <= pub_date.month <= 5:
            songs_2025.append(song)

    # Keep only the 25 best by ratingScore; "or 0" also covers an explicit null score.
    songs_2025 = sorted(songs_2025, key=lambda s: s.get("ratingScore") or 0, reverse=True)[:25]

    # Save to CSV
    with open(filename, mode='w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["id", "name", "artistString", "vocalist", "publishDate", "ratingScore", "pvServices"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for s in songs_2025:
            writer.writerow({
                "id": s.get("id"),
                "name": s.get("name"),
                "artistString": s.get("artistString"),
                # NOTE(review): "vocalist" is a request filter above; confirm the
                # response objects actually carry this key, otherwise the column stays empty.
                "vocalist": s.get("vocalist"),
                "publishDate": s.get("publishDate"),
                "ratingScore": s.get("ratingScore"),
                "pvServices": s.get("pvServices"),
            })

    print(f"Selesai simpan {len(songs_2025)} lagu ke {filename}")
    return songs_2025

## Fetch YouTube URLs for the 25 top-rated songs

In [None]:
def get_youtube_urls_from_songs(songs, output_file="songs_with_youtube.csv"):
    """Look up the active YouTube PV URLs of each song and save them to CSV.

    For every song, requests ``https://vocadb.net/api/songs/<id>?fields=PVs`` and
    keeps the URL of each PV whose ``service`` is YouTube (case-insensitive) and
    that is not flagged ``disabled``. One output row is produced per URL.
    Failures for a single song are printed and that song is skipped, so one bad
    request does not abort the whole run.

    Args:
        songs: Iterable of dicts. Each must have an ``"id"`` key; the song title
            is taken from ``"defaultName"`` or, if that is missing or empty,
            from ``"name"``.
        output_file: Path of the CSV file to create or overwrite.

    Returns:
        list[dict]: Rows with keys ``song_id``, ``song_name`` and ``youtube_url``.

    Raises:
        KeyError: If a song dict has no ``"id"`` key.
    """
    results = []

    for song in songs:
        song_id = song["id"]
        # API song objects carry "defaultName", while rows read back from the
        # top-rated CSV only have "name"; accept either so the title is not lost.
        song_name = song.get("defaultName") or song.get("name") or ""
        try:
            url = f"https://vocadb.net/api/songs/{song_id}"
            params = {
                "fields": "PVs"
            }
            # Timeout so one stalled connection cannot hang the notebook.
            response = requests.get(url, params=params, timeout=30)
            if response.status_code != 200:
                print(f"[GAGAL] Fetch detail lagu ID {song_id}")
                continue

            data = response.json()
            pvs = data.get("pvs") or []

            # Collect every active YouTube URL. "or ''" guards against a null
            # service value, which would otherwise raise and drop the whole song.
            youtube_urls = [
                pv.get("url") for pv in pvs
                if (pv.get("service") or "").lower() == "youtube"
                and not pv.get("disabled", False)
                and pv.get("url")
            ]

            if youtube_urls:
                for yt_url in youtube_urls:
                    results.append({
                        "song_id": song_id,
                        "song_name": song_name,
                        "youtube_url": yt_url
                    })
                print(f"[OK] Lagu ID {song_id} → {len(youtube_urls)} YouTube URL ditemukan.")
            else:
                print(f"[NO YT] Lagu ID {song_id} tidak punya YouTube URL")

        except Exception as e:
            # Deliberately best-effort: report the problem and move on to the next song.
            print(f"[ERROR] Exception saat ambil ID {song_id}: {e}")
        finally:
            # Throttle after every request, including failed ones.
            time.sleep(0.5)

    # Save to CSV
    with open(output_file, mode="w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["song_id", "song_name", "youtube_url"])
        writer.writeheader()
        writer.writerows(results)

    print(f"Selesai simpan {len(results)} URL YouTube ke {output_file}")
    return results

In [None]:
# Load the saved top-rated songs from CSV into a list of dicts
top_songs = []
with open("top_rated_2025.csv", "r", encoding="utf-8", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        top_songs.append({
            "id": int(row["id"]),
            # top_rated_2025.csv is written with a "name" column and no
            # "defaultName" column, so fall back to "name" to keep the title.
            "defaultName": row.get("defaultName") or row.get("name") or "",
        })

# Look up the YouTube URLs for those songs
youtube_urls = get_youtube_urls_from_songs(top_songs, output_file="top_25_youtube_urls.csv")


[OK] Lagu ID 734953 → 2 YouTube URL ditemukan.
[OK] Lagu ID 793121 → 2 YouTube URL ditemukan.
[OK] Lagu ID 752940 → 2 YouTube URL ditemukan.
[OK] Lagu ID 749433 → 1 YouTube URL ditemukan.
[OK] Lagu ID 748682 → 2 YouTube URL ditemukan.
[OK] Lagu ID 750840 → 2 YouTube URL ditemukan.
[OK] Lagu ID 747700 → 2 YouTube URL ditemukan.
[OK] Lagu ID 761077 → 2 YouTube URL ditemukan.
[OK] Lagu ID 762777 → 2 YouTube URL ditemukan.
[OK] Lagu ID 776144 → 2 YouTube URL ditemukan.
[OK] Lagu ID 786030 → 1 YouTube URL ditemukan.
[OK] Lagu ID 768497 → 2 YouTube URL ditemukan.
[OK] Lagu ID 749801 → 1 YouTube URL ditemukan.
[OK] Lagu ID 771172 → 2 YouTube URL ditemukan.
[OK] Lagu ID 747458 → 2 YouTube URL ditemukan.
[OK] Lagu ID 788902 → 2 YouTube URL ditemukan.
[OK] Lagu ID 729113 → 2 YouTube URL ditemukan.
[OK] Lagu ID 732509 → 2 YouTube URL ditemukan.
[OK] Lagu ID 753297 → 2 YouTube URL ditemukan.
[OK] Lagu ID 747843 → 1 YouTube URL ditemukan.
[OK] Lagu ID 743632 → 2 YouTube URL ditemukan.
[OK] Lagu ID 

In [None]:
from googleapiclient.discovery import build
from urllib.parse import urlparse, parse_qs
import re
# YouTube Data API key. Read from the YOUTUBE_API_KEY environment variable so the
# real key never has to be saved inside the notebook. If the variable is unset,
# the placeholder string is used and must be replaced with a real key.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "YOUTUBE_API_KEY")

# Shared YouTube Data API v3 client used by fetch_youtube_comments().
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

def fetch_youtube_comments(video_id, max_comments=500):
    """Collect top-level comments of a single YouTube video.

    Pages through ``commentThreads().list`` on the module-level ``youtube``
    client, asking for 100 threads per request, until ``max_comments`` comments
    have been gathered or the response carries no ``nextPageToken``. If any
    exception occurs, it is printed and whatever was gathered so far is returned.

    Args:
        video_id: ID of the video whose comments are requested.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        list[dict]: One dict per comment with the keys ``author``, ``message``,
        ``published_at`` and ``like_count``.
    """
    collected = []
    page_token = None

    try:
        while len(collected) < max_comments:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,  # largest page size used per request
                pageToken=page_token,
                textFormat="plainText",
            )
            page = request.execute()

            for thread in page.get("items", []):
                top_level = thread["snippet"]["topLevelComment"]["snippet"]
                collected.append(dict(
                    author=top_level.get("authorDisplayName"),
                    message=top_level.get("textDisplay"),
                    published_at=top_level.get("publishedAt"),
                    like_count=top_level.get("likeCount"),
                ))
                # Stop in the middle of a page once the cap is reached.
                if len(collected) >= max_comments:
                    break

            page_token = page.get("nextPageToken")
            if not page_token:
                # No further pages for this video.
                break

    except Exception as e:
        print(f"⚠️ Gagal ambil komentar dari video ID {video_id}: {e}")

    return collected

def extract_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Recognised forms:

    * ``youtu.be/<id>`` short links; any query string or fragment
      (for example ``?t=30``) is ignored.
    * URLs with a ``v`` query parameter, such as ``watch?v=<id>``.
    * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths on
      ``youtube.com`` / ``youtube-nocookie.com`` hosts.

    The host is compared exactly rather than by substring, so a URL such as
    ``watch?v=<id>&feature=youtu.be`` is not mistaken for a short link.

    Args:
        youtube_url: URL text. A missing scheme is tolerated.

    Returns:
        str | None: The video ID, or ``None`` when the input is not a string or
        no ID can be found.
    """
    if not isinstance(youtube_url, str):
        # Covers None and the float NaN that pandas yields for empty CSV cells.
        return None

    candidate = youtube_url.strip()
    if not candidate:
        return None
    if "://" not in candidate:
        # urlparse only fills in the host when a scheme separator is present.
        candidate = "https://" + candidate.lstrip("/")

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        return None

    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be" or host.endswith(".youtu.be"):
        return segments[0] if segments else None

    ids = parse_qs(parsed.query).get("v")
    if ids:
        return ids[0]

    on_youtube = any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtube.com", "youtube-nocookie.com")
    )
    if on_youtube and len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    return None

def fetch_youtube_comments_to_csv(song_youtube_data, output_file="yt_comments.csv", max_comments_per_video=500):
    """Download comments for every listed YouTube URL and write them to one CSV.

    For each entry the video ID is derived with ``extract_video_id``; entries
    without a usable ID are reported and skipped. Comments are fetched with
    ``fetch_youtube_comments`` and tagged with the song and video they belong
    to. The function pauses one second after each processed video.

    Args:
        song_youtube_data: Iterable of dicts with the keys ``song_id``,
            ``song_name`` and ``youtube_url``.
        output_file: Path of the CSV file to create or overwrite.
        max_comments_per_video: Cap passed on to ``fetch_youtube_comments``.

    Returns:
        None. The result is the CSV file plus progress messages on stdout.
    """
    fieldnames = ["song_id", "song_name", "video_id", "author", "message", "published_at", "like_count"]
    all_comments = []

    for entry in song_youtube_data:
        song_id, song_name, youtube_url = entry["song_id"], entry["song_name"], entry["youtube_url"]
        video_id = extract_video_id(youtube_url)

        if video_id:
            print(f"[YT] Ambil komentar untuk '{song_name}' (ID: {song_id})")
            comments = fetch_youtube_comments(video_id, max_comments=max_comments_per_video)

            # Tag each comment with the song and video it came from.
            all_comments.extend(
                {
                    "song_id": song_id,
                    "song_name": song_name,
                    "video_id": video_id,
                    "author": comment["author"],
                    "message": comment["message"],
                    "published_at": comment["published_at"],
                    "like_count": comment["like_count"],
                }
                for comment in comments
            )

            print(f"[OK] {len(comments)} komentar diambil.")
            time.sleep(1)  # pause between videos
        else:
            print(f"[SKIP] Gagal ekstrak video ID dari URL: {youtube_url}")

    with open(output_file, "w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_comments)

    print(f"[DONE] Total {len(all_comments)} komentar disimpan ke '{output_file}'")

In [34]:
# Load the saved top-rated songs from CSV into a list of dicts
top_songs = []
with open("top_rated_2025.csv", "r", encoding="utf-8", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        top_songs.append({
            "id": int(row["id"]),
            # top_rated_2025.csv is written with a "name" column and no
            # "defaultName" column, so fall back to "name" to keep the title.
            "defaultName": row.get("defaultName") or row.get("name") or "",
        })

# Step 1: fetch the YouTube URLs for the listed songs
songs_with_yt = get_youtube_urls_from_songs(top_songs, output_file="songs_with_youtube.csv")

# Step 2: reload the saved URL list and download the comments for every video
data = pd.read_csv("songs_with_youtube.csv").to_dict(orient="records")
fetch_youtube_comments_to_csv(data, max_comments_per_video=500)


[OK] Lagu ID 734953 → 2 YouTube URL ditemukan.
[OK] Lagu ID 793121 → 2 YouTube URL ditemukan.
[OK] Lagu ID 752940 → 2 YouTube URL ditemukan.
[OK] Lagu ID 749433 → 1 YouTube URL ditemukan.
[OK] Lagu ID 748682 → 2 YouTube URL ditemukan.
[OK] Lagu ID 750840 → 2 YouTube URL ditemukan.
[OK] Lagu ID 747700 → 2 YouTube URL ditemukan.
[OK] Lagu ID 761077 → 2 YouTube URL ditemukan.
[OK] Lagu ID 762777 → 2 YouTube URL ditemukan.
[OK] Lagu ID 776144 → 2 YouTube URL ditemukan.
[OK] Lagu ID 786030 → 1 YouTube URL ditemukan.
[OK] Lagu ID 768497 → 2 YouTube URL ditemukan.
[OK] Lagu ID 749801 → 1 YouTube URL ditemukan.
[OK] Lagu ID 771172 → 2 YouTube URL ditemukan.
[OK] Lagu ID 747458 → 2 YouTube URL ditemukan.
[OK] Lagu ID 788902 → 2 YouTube URL ditemukan.
[OK] Lagu ID 729113 → 2 YouTube URL ditemukan.
[OK] Lagu ID 732509 → 2 YouTube URL ditemukan.
[OK] Lagu ID 753297 → 2 YouTube URL ditemukan.
[OK] Lagu ID 747843 → 1 YouTube URL ditemukan.
[OK] Lagu ID 743632 → 2 YouTube URL ditemukan.
[OK] Lagu ID 