## 노래 정보에서 스크래핑 대상
- 곡ID, 앨범ID, 가수, 제목, 장르, 좋아요, 댓글, 작사가, 작곡가, 편곡자
- 창작자정보(작곡가, 작사가, 편곡자) : 창작ID, 창작자명

In [1]:
import os
import pandas as pd

# 저장했던 CSV 파일 경로
csv_dir = os.path.join(os.getcwd(), 'csv')
csv_file_path = os.path.join(csv_dir, 'melon_yearly_top100.csv')

try:
    df = pd.read_csv(csv_file_path, encoding='utf-8')
    
    # 데이터 확인
    print("\n--- DataFrame 상위 5개 행 ---")
    print(df.head())
    
    print("\n--- DataFrame 정보 (컬럼, 데이터 타입) ---")
    df.info()

except FileNotFoundError:
    print(f"❌ 오류: '{csv_file_path}' 파일을 찾을 수 없습니다. 경로를 확인해 주세요.")
except Exception as e:
    print(f"❌ CSV 파일을 읽어오는 중 오류가 발생했습니다: {e}")


--- DataFrame 상위 5개 행 ---
   year  rank   song_id  album_id source
0  2024     1  37140709  11391902  melon
1  2024     2  37138469  11402655  melon
2  2024     3  37524037  11487023  melon
3  2024     4  37145732  11404142  melon
4  2024     5  31927275  10307346  melon

--- DataFrame 정보 (컬럼, 데이터 타입) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      500 non-null    int64 
 1   rank      500 non-null    int64 
 2   song_id   500 non-null    int64 
 3   album_id  500 non-null    int64 
 4   source    500 non-null    object
dtypes: int64(4), object(1)
memory usage: 19.7+ KB


In [2]:
song_ids = df['song_id'].values.tolist()
album_ids = df['album_id'].values.tolist()

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
from tqdm import tqdm

In [4]:
options = Options()
options.add_argument(
    "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
)
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))

song_meta_data = []
unique_creators = set()

wait = WebDriverWait(driver, 10)

for song_id, album_id in tqdm(zip(song_ids, album_ids), total=len(song_ids), desc="Processing songs"):
    try:
        url = f'https://www.melon.com/song/detail.htm?songId={song_id}'
        driver.get(url)

        # 제목
        title = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'song_name'))
        ).text

        # 가수
        singer = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'artist'))
        ).text

        # 좋아요 수
        like_num = wait.until(
            EC.presence_of_element_located((By.ID, 'd_like_count'))
        ).text.replace(",", "")

        # 뎃글 수 
        comment_num = wait.until(
            EC.presence_of_element_located((By.ID, 'revCnt'))
        ).text.replace(",", "").replace("개", "")
        
        # 장르
        meta_data = driver.find_element(By.CLASS_NAME, 'list')
        genre = meta_data.find_elements(By.TAG_NAME, 'dd')[2].text

        # 작사/작곡/편곡자
        lyricists, composers, arrangers = [], [], []

        creators = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'list_person'))
        )
        li_list = creators.find_elements(By.TAG_NAME, 'li')

        for li in li_list:
            try:
                creator_name = li.find_element(By.CLASS_NAME, 'ellipsis').text
                creator_type = li.find_element(By.CLASS_NAME, 'meta').text
                href = li.find_element(By.TAG_NAME, 'a').get_attribute('href')

                match = re.search(r'goArtistDetail\((\d+)\)', href)
                if match:
                    creator_id = match.group(1)
                    unique_creators.add((creator_id, creator_name))

                    if creator_type == '작사':
                        lyricists.append(creator_id)
                    elif creator_type == '작곡':
                        composers.append(creator_id)
                    elif creator_type == '편곡':
                        arrangers.append(creator_id)
            except Exception as e:
                print(f"[WARN] song_id {song_id} 파싱 실패: {e}")

        data = {
            'song_id': song_id,
            'album_id': album_id,
            'singer': singer,
            'title': title,
            "genre": genre,
            "like_num": like_num,
            "comment_num": comment_num,
            'lyricists': "|".join(lyricists),
            'composers': "|".join(composers),
            'arrangers': "|".join(arrangers)
        }
        song_meta_data.append(data)

    except Exception as e:
        print(f"[ERROR] song_id {song_id} 처리 중 에러: {e}")
        continue

driver.quit()

Processing songs:  69%|█████████████████████████████████████████████████████████████████████▍                              | 347/500 [06:15<02:18,  1.11it/s]

[WARN] song_id 33346446 파싱 실패: Message: no such element: Unable to locate element: {"method":"css selector","selector":".ellipsis"}
  (Session info: chrome=140.0.7339.213); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
0   chromedriver                        0x00000001034bf274 cxxbridge1$str$ptr + 2882596
1   chromedriver                        0x00000001034b71a0 cxxbridge1$str$ptr + 2849616
2   chromedriver                        0x0000000102fe1180 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 73420
3   chromedriver                        0x00000001030284c4 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 365072
4   chromedriver                        0x000000010301dd68 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 322228
5   chromedriver                        0x00000001030696c8 _RNvCslE4myv9Sbr3_7___rustc35___rust_

Processing songs:  74%|█████████████████████████████████████████████████████████████████████████▊                          | 369/500 [06:39<03:34,  1.64s/it]

[WARN] song_id 1500196 파싱 실패: Message: no such element: Unable to locate element: {"method":"css selector","selector":".ellipsis"}
  (Session info: chrome=140.0.7339.213); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
0   chromedriver                        0x00000001034bf274 cxxbridge1$str$ptr + 2882596
1   chromedriver                        0x00000001034b71a0 cxxbridge1$str$ptr + 2849616
2   chromedriver                        0x0000000102fe1180 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 73420
3   chromedriver                        0x00000001030284c4 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 365072
4   chromedriver                        0x000000010301dd68 _RNvCslE4myv9Sbr3_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 322228
5   chromedriver                        0x00000001030696c8 _RNvCslE4myv9Sbr3_7___rustc35___rust_n

Processing songs: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [08:47<00:00,  1.06s/it]


In [5]:
len(song_meta_data)

500

In [6]:
song_meta_data[-1]

{'song_id': 32323639,
 'album_id': 10377241,
 'singer': '김재환',
 'title': '어떤 날엔',
 'genre': '발라드, 국내드라마',
 'like_num': '65962',
 'comment_num': '1171',
 'lyricists': '608334',
 'composers': '720541',
 'arrangers': '720541'}

In [7]:
# csv 저장
import csv
from datetime import datetime

# 저장할 파일명
csv_dir = os.path.join(os.getcwd(), 'csv')
csv_file_path = os.path.join(csv_dir, 'melon_song_info.csv')

# 딕셔너리의 키를 필드 이름(헤더)으로 사용합니다.
fieldnames = song_meta_data[0].keys()

try:
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        # DictWriter 객체 생성
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # 헤더(필드 이름) 쓰기
        writer.writeheader()
        
        # 데이터를 한 줄씩 쓰기
        writer.writerows(song_meta_data)
       
    print(f"✅ 성공적으로 저장")

except Exception as e:
    print(f"❌ 파일 저장 중 오류 발생: {e}")

✅ 성공적으로 저장


In [8]:
# 저장한 파일 확인
df = pd.read_csv(csv_file_path, encoding='utf-8')
df.head()

Unnamed: 0,song_id,album_id,singer,title,genre,like_num,comment_num,lyricists,composers,arrangers
0,37140709,11391902,TWS (투어스),첫 만남은 계획대로 되지 않아,댄스,165128,2019,3679765|446175|2242475|2854631|2926813,3679765|2242475|2746431|890446|3062390|2854631...,3679765|2242475|890446|2854631|2746431
1,37138469,11402655,i-dle (아이들),나는 아픈 건 딱 질색이니까,댄스,172873,2284,1050107,1050107|778848|2971650|2940838,778848|2971650|2940838|1050107
2,37524037,11487023,aespa,Supernova,댄스,185521,6208,605098,605098|3804145|883258,883258
3,37145732,11404142,아이유,Love wins all,발라드,215503,2203,261143,1759925,1759925
4,31927275,10307346,DAY6 (데이식스),한 페이지가 될 수 있게,록/메탈,360394,3073,895389,895387|895386|895389|895390|622071,622071


In [9]:
# 아티스트 정보 dict_list로 변환
creator_list = [
    {'creator_id': id, 'creator_name': name} 
    for id, name in unique_creators
]

In [10]:
# 데이터 확인
creator_list[0]

{'creator_id': '2560319', 'creator_name': 'Rockitman'}

In [11]:
len(creator_list)

918

In [12]:
# 저장
df_creators = pd.DataFrame(creator_list)
csv_file_path = os.path.join(csv_dir, 'melon_creator_info.csv')
df_creators.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
print("✅ 창작자 정보 데이터가 저장되었습니다.")

✅ 창작자 정보 데이터가 저장되었습니다.


In [13]:
df = pd.read_csv(csv_file_path, encoding='utf-8')
df.head()

Unnamed: 0,creator_id,creator_name
0,2560319,Rockitman
1,722679,EL CAPITXN
2,2748606,Vince
3,2950352,Steve Mac
4,2940838,Likey
