# In [37]:
import csv
import os
import re
from dotenv import load_dotenv

from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi


def authenticate_youtube(api_key):
    """Return a YouTube Data API v3 client built with ``api_key`` as the developer key."""
    return build('youtube', 'v3', developerKey=api_key)

def get_channel_id(youtube, channel_name):
    """Look up the id of the channel whose title equals ``channel_name``.

    Runs a channel search (at most 5 results) for ``channel_name`` and
    compares each result's title case-insensitively.

    Returns:
        The ``channelId`` of the first result with a matching title, or
        ``None`` when none of the returned results match.
    """
    search_results = (
        youtube.search()
        .list(part="snippet", maxResults=5, q=channel_name, type="channel")
        .execute()
    )

    # Lazily scan the results; the first exact (case-insensitive) title wins.
    matching_ids = (
        entry["snippet"]["channelId"]
        for entry in search_results["items"]
        if entry["snippet"]["channelTitle"].lower() == channel_name.lower()
    )
    return next(matching_ids, None)

def get_playlist_id(youtube, channel_id, playlist_name):
    """Find the id of the playlist titled ``playlist_name`` on a channel.

    Walks every page of the channel's playlists (50 per request), following
    ``nextPageToken`` until a match is found or the pages run out, so
    playlists beyond the first page are found as well.

    Args:
        youtube: YouTube Data API client exposing ``playlists().list(...)``.
        channel_id: Id of the channel whose playlists are searched.
        playlist_name: Playlist title to look for (compared case-insensitively).

    Returns:
        The id of the first playlist with a matching title, or ``None`` if
        no playlist on the channel has that title.
    """
    wanted_title = playlist_name.lower()
    page_token = None

    while True:
        # pageToken is None on the first request, mirroring get_video_list.
        response = youtube.playlists().list(
            part="snippet",
            channelId=channel_id,
            maxResults=50,
            pageToken=page_token,
        ).execute()

        for item in response.get("items", []):
            if item["snippet"]["title"].lower() == wanted_title:
                return item["id"]

        page_token = response.get("nextPageToken")
        if not page_token:
            # Last page reached without a match.
            return None

def get_video_list(youtube, playlist_id):
    """Collect id, title and description for every item of a playlist.

    Requests the playlist items 50 at a time and keeps following
    ``nextPageToken`` until the response no longer carries one.

    Returns:
        A list of dicts with the keys ``video_id``, ``video_title`` and
        ``description``, in the order the API returned them.
    """
    collected = []
    next_token = None
    has_more = True

    while has_more:
        page = youtube.playlistItems().list(
            part="snippet",
            maxResults=50,
            playlistId=playlist_id,
            pageToken=next_token,
        ).execute()

        collected.extend(
            {
                "video_id": snippet["resourceId"]["videoId"],
                "video_title": snippet["title"],
                "description": snippet["description"],
            }
            for snippet in (entry["snippet"] for entry in page["items"])
        )

        # A missing or empty token means this was the final page.
        next_token = page.get("nextPageToken")
        has_more = bool(next_token)

    return collected


def parse_segments(description):
    """Extract ``(start_time, name)`` pairs from a video description.

    A segment is any occurrence of ``H:MM:SS - name`` or ``MM:SS - name``;
    the name runs to the end of the line.

    Returns:
        A list of ``(start_time, name)`` string tuples in order of appearance
        (empty when the description contains no timestamps).
    """
    timestamp_line = re.compile(r'(\d+:\d+:\d+|\d+:\d+) - (.+)')
    return [hit.groups() for hit in timestamp_line.finditer(description)]


def save_segments(video, segments, csv_writer):
    """Write one transcript file and one CSV metadata row per segment.

    The video's transcript is fetched once and split at the segment start
    times: each segment receives the transcript entries whose ``start`` lies
    in ``[segment start, next segment start)``; the last segment runs to the
    end of the transcript. Files are written to the ``transcripts/``
    directory, which is created if it does not exist.

    This is best effort: if the transcript cannot be fetched the video is
    skipped, and if a segment file cannot be written that segment is skipped.
    In both cases a message is printed instead of failing silently. A CSV row
    is only emitted for segments whose file was actually written.

    Args:
        video: Dict with at least ``video_id`` and ``video_title``.
        segments: Sequence of ``(start_time, name)`` tuples, where
            ``start_time`` is ``H:MM:SS`` or ``MM:SS``.
        csv_writer: Object with a ``writerow`` method; receives
            ``[video_title, segment_name, video_id, segment_url]``.

    Raises:
        OSError: If the ``transcripts`` directory cannot be created.
    """
    if not segments:
        # Nothing to split on, so skip the transcript fetch entirely.
        return

    video_id = video["video_id"]
    title = video["video_title"]

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as exc:
        # Fetching is the expected failure point; report it and move on to
        # the next video rather than aborting the whole run.
        print(f"Skipping video {video_id}: could not fetch transcript ({exc})")
        return

    os.makedirs("transcripts", exist_ok=True)

    # Convert every start time once; the entry at idx + 1 doubles as the end
    # of segment idx.
    starts = [time_to_seconds(start_time) for start_time, _ in segments]

    for idx, (start_time, name) in enumerate(segments):
        has_next = idx + 1 < len(segments)
        segment_start = starts[idx]
        segment_end = starts[idx + 1] if has_next else float('inf')
        next_label = segments[idx + 1][0] if has_next else ''

        segment_text = " ".join(
            entry["text"]
            for entry in transcript
            if segment_start <= entry['start'] < segment_end
        )

        filename = sanitize_filename(
            f"Episode - {title} Segment - {name} ({start_time}-{next_label}).txt"
        )
        segment_url = f"https://www.youtube.com/watch?v={video_id}&t={segment_start}s"

        try:
            with open(os.path.join("transcripts", filename), "w", encoding="utf-8") as file:
                file.write(segment_text)
        except OSError as exc:
            # e.g. an over-long file name; keep going with the other segments.
            print(f"Skipping segment {name!r} of video {video_id}: {exc}")
            continue

        # Record metadata only once the transcript file exists on disk.
        csv_writer.writerow([title, name, video_id, segment_url])

def time_to_seconds(time_str):
    """Convert an ``H:MM:SS`` or ``MM:SS`` timestamp to a number of seconds.

    Returns 0 when the string does not have exactly two or three
    colon-separated integer fields. Non-integer fields raise ``ValueError``.
    """
    fields = [int(piece) for piece in time_str.split(":")]
    if len(fields) not in (2, 3):
        return 0

    # Fold left to right in base 60: ((h * 60) + m) * 60 + s.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total

def sanitize_filename(filename):
    """Return ``filename`` with each of ``< > : " / \\ | ? *`` replaced by ``_``."""
    forbidden = '<>:"/\\|?*'
    replacement_table = str.maketrans(forbidden, '_' * len(forbidden))
    return filename.translate(replacement_table)

# In [42]:
# Driver: resolve the channel and playlist, then export one transcript file
# and one CSV metadata row for every timestamped segment of every video.
CSV_HEADER = ['video_name', 'segment_name', 'video_id', 'segment_url']
CHANNEL_NAME = "Lex Fridman"
PLAYLIST_NAME = "Lex Fridman Podcast"

# The API key is read from the environment (optionally populated from .env).
load_dotenv()
YT_API_KEY = os.environ["YT_API_KEY"]

youtube = authenticate_youtube(YT_API_KEY)

# Both lookups return None on a miss; stop with a clear message instead of
# passing None on to the next API call.
channel_id = get_channel_id(youtube, CHANNEL_NAME)
if channel_id is None:
    raise RuntimeError(f"Channel not found: {CHANNEL_NAME!r}")

playlist_id = get_playlist_id(youtube, channel_id, PLAYLIST_NAME)
if playlist_id is None:
    raise RuntimeError(f"Playlist not found on channel {CHANNEL_NAME!r}: {PLAYLIST_NAME!r}")

videos = get_video_list(youtube, playlist_id)

# save_segments writes into this directory; make sure it exists up front.
os.makedirs("transcripts", exist_ok=True)

# The context manager guarantees the CSV is flushed and closed even if
# processing a video raises.
with open("segments_metadata.csv", "w", newline='', encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(CSV_HEADER)  # Write the header

    for video in videos:
        print(video)
        segments = parse_segments(video["description"])
        save_segments(video, segments, csv_writer)