In [1]:
!pip install requests beautifulsoup4



In [9]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time

def get_date_list(start_date_str, end_date_str):
    """Return every calendar day between two dates, inclusive, as strings.

    Args:
        start_date_str: First day of the range, formatted "YYYY-MM-DD".
        end_date_str: Last day of the range, formatted "YYYY-MM-DD".

    Returns:
        A list of "YYYY-MM-DD" strings in ascending order. The list is empty
        when the start date is later than the end date.

    Raises:
        ValueError: If either argument does not match "%Y-%m-%d".
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date_str, day_format)
    last_day = datetime.strptime(end_date_str, day_format)

    # Both values are parsed at midnight, so the difference is a whole number
    # of days; a negative span yields an empty range below.
    span_in_days = (last_day - first_day).days

    # The range advances one day at a time.
    # NOTE(review): the original author observed that Billboard lets you look
    # up any single day but appears to refresh the chart only weekly, so
    # consecutive days may return the same chart — confirm before relying on it.
    return [
        (first_day + timedelta(days=offset)).strftime(day_format)
        for offset in range(span_in_days + 1)
    ]

def scrape_billboard(date, max_items=12, timeout=10):
    """Fetch the Billboard Hot 100 page for one date and parse its chart rows.

    Args:
        date: Chart date as a "YYYY-MM-DD" string; inserted into the URL as-is.
        max_items: Maximum number of chart rows to parse. Defaults to 12, the
            limit that used to be hard-coded here; pass 100 to attempt the
            whole chart.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole crawl.

    Returns:
        A list of dicts with the keys "date", "ranking", "title" and "singer"
        (all strings), one per successfully parsed row, in page order.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    url = f"https://www.billboard.com/charts/hot-100/{date}"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors instead of parsing an error page and silently
    # reporting "no songs".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    chart_items = soup.select("div.o-chart-results-list-row-container")

    daily_results = []

    for item in chart_items[:max_items]:
        try:
            # select_one() returns None when a selector does not match, so a
            # missing element shows up as AttributeError on `.text`.
            ranking = item.select_one("span.c-label.a-font-primary-bold-l").text.strip()
            title = item.select_one("h3.c-title").text.strip()
            # NOTE(review): "a-no-trucate" is spelled exactly as in the
            # original selector — check the page markup before "correcting" it.
            singer = item.select_one("span.c-label.a-no-trucate.a-font-primary-s").text.strip()
        except AttributeError:
            continue  # Skip rows that are missing any of the expected fields.

        daily_results.append({
            "date": date,
            "ranking": ranking,
            "title": title,
            "singer": singer
        })

    # Console preview of at most the first ten parsed rows.
    print(f"\n📅 [{date}] 상위 10곡")
    for r in daily_results[:10]:
        print(f"  #{r['ranking']} | {r['title']} - {r['singer']}")

    return daily_results

def crawl_billboard_range(start_date, end_date, delay_sec=3):
    """Scrape the chart for every day in a date range and merge the results.

    Args:
        start_date: First day to crawl, formatted "YYYY-MM-DD".
        end_date: Last day to crawl (inclusive), formatted "YYYY-MM-DD".
        delay_sec: Seconds to pause after each day's attempt, whether it
            succeeded or failed.

    Returns:
        One flat list containing the entries returned by scrape_billboard()
        for every day that did not raise, in date order.
    """
    collected = []
    summary = []  # (date, status label, number of songs) per attempted day
    days = get_date_list(start_date, end_date)

    print(f"크롤링 날짜 리스트: {days}")
    print(f"총 {len(days)}개의 일자를 크롤링합니다...\n")

    for day in days:
        print(f"{day} → 크롤링 시작...")
        try:
            songs = scrape_billboard(day)
            song_count = len(songs)

            if song_count <= 0:
                # Nothing parsed: either the fetch produced no rows or the
                # chart does not exist for this day.
                print(f"{day}: 곡 수집 실패 또는 차트 없음")
                summary.append((day, "실패", 0))
            else:
                print(f"{day}: {song_count}곡 수집 성공")
                summary.append((day, "성공", song_count))

            collected.extend(songs)

        except Exception as err:
            # Best-effort crawl: record the failure and move on to the next day.
            print(f"{day}: 에러 발생 → {err}")
            summary.append((day, "에러", 0))

        # Pause between requests so the site is not hit back-to-back.
        time.sleep(delay_sec)

    print("\n 날짜별 크롤링 요약:")
    for day, outcome, song_count in summary:
        print(f" - {day} | {outcome} | {song_count}곡")

    return collected


# Usage example: crawl one week of charts. This runs as soon as the cell
# executes and leaves the merged entries in `data`.
start_date, end_date = "2025-05-21", "2025-05-27"

data = crawl_billboard_range(start_date=start_date, end_date=end_date)



크롤링 날짜 리스트: ['2025-05-21', '2025-05-22', '2025-05-23', '2025-05-24', '2025-05-25', '2025-05-26', '2025-05-27']
총 7개의 일자를 크롤링합니다...

2025-05-21 → 크롤링 시작...
2025-05-21: 에러 발생 → HTTPSConnectionPool(host='www.billboard.com', port=443): Max retries exceeded with url: /charts/hot-100/2025-05-21 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000185DC548A50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
2025-05-22 → 크롤링 시작...
2025-05-22: 에러 발생 → HTTPSConnectionPool(host='www.billboard.com', port=443): Max retries exceeded with url: /charts/hot-100/2025-05-22 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000185DC548D50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
2025-05-23 → 크롤링 시작...

📅 [2025-05-23] 상위 10곡
  #1 | Luther - Kendrick Lamar & SZA
  #2 | Ordinary - Alex Warren
  #3 | Die With A Smile - Lady Gaga & Bruno Mars
  #4 | A Bar Song (Tipsy) - Shaboozey
  #5 | N