# 데이터 수집

- 유튜브 플레이리스트 제목 : 조회수 100만 이상의 즉문즉설 베스트
- 유튜브 플레이리스트 ID : `PLGiaCgd9PatcGBfZ7xTGdTAsHoNPRQ_AP`

In [None]:
# 로컬 저장
# ! yt-dlp --write-auto-sub --sub-lang ko --skip-download -o "data/%(title)s.%(ext)s" "https://www.youtube.com/playlist?list=PLGiaCgd9PatcGBfZ7xTGdTAsHoNPRQ_AP"

In [4]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import os

# Authenticate against Google Drive and build the client used by the cells below.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()  # Browser-based authentication on first run
drive = GoogleDrive(gauth)  # Drive client bound to the authenticated session

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=1071344378062-kv36e6mgl1vnr89evq958br79d063h8p.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [45]:
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv), then read the Drive folder IDs.
# NOTE: "dirve" in the names below is a misspelling of "drive"; it is kept
# because the upload and processing cells reference these exact names.
# os.getenv returns None when a variable is not set.
load_dotenv()
google_dirve_vtt_id = os.getenv("GOOGLE_DRIVE_VTT_ID")  # parent folder for raw subtitle files
google_dirve_txt_id = os.getenv("GOOGLE_DRIVE_TXT_ID")  # parent folder for cleaned text files

In [None]:
import yt_dlp
import requests
from io import BytesIO

def get_playlist_entries(playlist_url):
    """Return the ``entries`` list of a YouTube playlist.

    The playlist is queried with ``extract_flat`` enabled and
    ``download=False``, so no media is downloaded.

    Args:
        playlist_url: URL of the playlist to inspect.

    Returns:
        The value stored under ``'entries'`` in the extracted playlist info.
    """
    flat_quiet_options = dict(quiet=True, extract_flat=True)
    with yt_dlp.YoutubeDL(flat_quiet_options) as downloader:
        playlist_info = downloader.extract_info(playlist_url, download=False)
        return playlist_info['entries']
    
def get_subtitle_text(video_url, lang='ko'):
    """Fetch the subtitle track of a YouTube video as text.

    Manually provided subtitles are preferred; automatic captions are used
    when no manual track is listed for ``lang``.

    Args:
        video_url: URL of a single YouTube video.
        lang: Subtitle language code to look up.

    Returns:
        ``(title, text)`` where ``text`` is the body served at the subtitle
        URL, decoded as UTF-8. ``(None, None)`` when no track for ``lang``
        is listed or the subtitle download does not succeed.
    """
    ydl_opts = {
        'writesubtitles': True,
        'skip_download': True,
        'subtitleslangs': [lang],
        'writeautomaticsub': True,
        'quiet': True,
        'outtmpl': '-',  # nothing is actually written to disk
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    # Check each source on its own. Falling back with `a or b` skipped the
    # automatic captions whenever manual subtitles existed in some other
    # language, which reported a usable auto-caption track as missing.
    formats = None
    for source in ('subtitles', 'automatic_captions'):
        formats = (info_dict.get(source) or {}).get(lang)
        if formats:
            break
    if not formats:
        return None, None

    # The first listed format is used as-is.
    # NOTE(review): the processing cell parses this text with json.loads, so
    # it relies on the first format being JSON -- confirm for new sources.
    subtitle_url = formats[0]['url']

    # A timeout keeps one stalled request from hanging the whole playlist run.
    response = requests.get(subtitle_url, timeout=30)
    if not response.ok:
        # Do not hand an HTTP error page to the caller as subtitle text.
        return None, None
    response.encoding = 'utf-8'
    return info_dict['title'], response.text
    
def upload_to_drive(title, text, lang='ko'):
    """Store subtitle text as a new file in the Google Drive subtitle folder.

    Args:
        title: Video title; used as the base of the file name.
        text: Subtitle content to store.
        lang: Language code inserted into the file name.
    """
    metadata = {
        'title': f"{title}.{lang}.vtt",
        'parents': [{'id': google_dirve_vtt_id}],
    }
    drive_file = drive.CreateFile(metadata)
    drive_file.SetContentString(text)
    drive_file.Upload()

In [None]:
# Walk the playlist and upload every subtitle track that could be fetched.
playlist_url = "https://www.youtube.com/playlist?list=PLGiaCgd9PatcGBfZ7xTGdTAsHoNPRQ_AP"
entries = get_playlist_entries(playlist_url)

for entry in entries:
    video_url = f"https://www.youtube.com/watch?v={entry['id']}"
    title, text = get_subtitle_text(video_url, lang='ko')

    # Report videos without a usable subtitle and move on to the next one.
    if not (title and text):
        print(f"❌ 자막 없음: {video_url}")
        continue

    upload_to_drive(title, text)

# 데이터 가공

In [40]:
def clean_content(content: dict) -> str:
    """Flatten parsed subtitle data into one plain-text string.

    Args:
        content: Parsed subtitle document with an ``'events'`` list. Events
            without a ``'segs'`` key are skipped.

    Returns:
        The ``'utf8'`` value of every segment, concatenated in order, with
        leading and trailing whitespace stripped.
    """
    pieces = (
        segment["utf8"]
        for event in content['events']
        if "segs" in event
        for segment in event["segs"]
    )
    return ''.join(pieces).strip()

In [None]:
import json
import re

# Look up only the .vtt files stored in the subtitle folder
file_list = drive.ListFile({
    'q': f"'{google_dirve_vtt_id}' in parents and title contains '.vtt' and trashed=false"
}).GetList()

# Load each subtitle file, clean it, and keep the result keyed by output name
contents = {}
for file in file_list:
    # Escape the dot and anchor at the end so only the trailing ".vtt"
    # extension is swapped. The unescaped, unanchored pattern rewrote any
    # character followed by "vtt" anywhere in the title.
    title = re.sub(r"\.vtt$", ".txt", file['title'])
    content = json.loads(file.GetContentString())
    content = clean_content(content)  # reduce the subtitle data to plain text
    contents[title] = content

    # Store the cleaned text in the Drive text folder. A separate name is
    # used so the loop variable `file` is not rebound inside its own loop.
    txt_file = drive.CreateFile({
        'title': title,
        'parents': [{'id': google_dirve_txt_id}]
    })
    txt_file.SetContentString(content)
    txt_file.Upload()

    # Short preview of what was written
    print(title)
    print('---'*10)
    print(content[:10])
    print('==='*10)