In [None]:
!pip install youtube-transcript_api

Collecting youtube-transcript_api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Downloading youtube_transcript_api-0.6.3-py3-none-any.whl (622 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/622.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/622.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m614.4/622.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.3/622.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript_api
Successfully installed youtube-transcript_api-0.6.3


In [21]:
## Import Libraries

import os
import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi,  TranscriptsDisabled, NoTranscriptFound
from google.colab import userdata, drive
import time
import pandas as pd


In [None]:
# Load the YouTube Data API key from Colab's `userdata` store so that the key
# itself never appears in the notebook.
YOUTUBE_APIKEY = userdata.get("YOUTUBE_APIKEY")

In [None]:
# Initialize the YouTube API client.
# `build` creates a client for version v3 of the 'youtube' service, using the
# developer key in YOUTUBE_APIKEY; every later request goes through it.
youtube = build(serviceName='youtube', version='v3', developerKey=YOUTUBE_APIKEY)

# **Get Channel ID and List of Videos + Metadata**

In [None]:
# Look up the NutritionFactsOrg channel by username.
# channels.list is asked for the "id" part only.

username = 'NutritionFactsOrg'
request = youtube.channels().list(forUsername=username, part="id")
response = request.execute()
response


{'kind': 'youtube#channelListResponse',
 'etag': 'y2wbGNYxO1mQLSW8NgLvKwMoFvQ',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'QrBFR4pa4iZJUXj0kXL8bnskofE',
   'id': 'UCddn8dUxYdgJz3Qr5mjADtA'}]}

In [None]:
# Get channel statistics: the same channels.list lookup by username, this time
# requesting the "statistics" part.

request = youtube.channels().list(forUsername=username, part="statistics")
response = request.execute()
response

{'kind': 'youtube#channelListResponse',
 'etag': 'JZ5_Dd2SGBzFxt_q4ynZuuG6uH0',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'dFkMH2B16zrc390gXNNfSzIiHOc',
   'id': 'UCddn8dUxYdgJz3Qr5mjADtA',
   'statistics': {'viewCount': '260903296',
    'subscriberCount': '1210000',
    'hiddenSubscriberCount': False,
    'videoCount': '2753'}}]}

In [None]:
# Get the channel ID
# A lookup that matches no channel may come back without any 'items', so fail
# with a clear message instead of an opaque KeyError/IndexError.
channel_items = response.get('items') or []
if not channel_items:
    raise ValueError("channels.list returned no items; check the channel username")
channel_id = channel_items[0]['id']
channel_id

'UCddn8dUxYdgJz3Qr5mjADtA'

In [None]:
# Find the channel's "uploads" playlist: the playlist ID stored under
# contentDetails.relatedPlaylists.uploads in the channel resource.

response = (
    youtube.channels()
    .list(id=channel_id, part='contentDetails')
    .execute()
)
playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']


In [None]:
# Get video metadata
## Collects videoId, title, description and publication date for every item in the uploads playlist.
## Each request asks for at most 50 items, so pages are requested one after another
## until a response no longer carries a nextPageToken.

videos = []
next_page_token = None
while True:
    # One request path for every page; pageToken is only sent for follow-up pages.
    page_params = dict(part='snippet', playlistId=playlist_id, maxResults=50)
    if next_page_token is not None:
        page_params['pageToken'] = next_page_token
    response = youtube.playlistItems().list(**page_params).execute()

    # Keep just the fields needed later, one dict per video.
    videos.extend(
        {
            'videoId': snippet['resourceId']['videoId'],
            'title': snippet['title'],
            'description': snippet['description'],
            'publishedAt': snippet['publishedAt'],
        }
        for snippet in (item['snippet'] for item in response['items'])
    )

    next_page_token = response.get('nextPageToken')
    if next_page_token is None:
        break


In [None]:
# Build a DataFrame with one row per collected video; the columns come from
# the keys of the dicts in `videos`.

df = pd.DataFrame(data=videos)
df.head(5)

Unnamed: 0,videoId,title,description,publishedAt
0,DApo30IxaCQ,The Best Way to Remove Ear Wax,"Irrigation (also called ear syringing), which ...",2024-12-02T12:59:46Z
1,nMSUcC1d03M,Friday Favorites: The Benefits of Fasting for ...,Where did the idea of therapeutic fasting come...,2024-11-29T12:59:46Z
2,JtMVSBhfh0s,Podcast: The Anti-Aging Pathways of AMPK (Part 2),Does the diabetes drug metformin have an anti-...,2024-11-28T12:59:49Z
3,xUaiH7VqPiY,What benefits do greens have for our eye healt...,New subscribers to our e-newsletter always rec...,2024-11-28T12:01:16Z
4,sGVC5X5kw7A,Are There Any Benefits to Ear Candling?,Do ear candles work? Photographs are taken ins...,2024-11-27T12:59:47Z


# **Get The Transcript**

In [14]:
# Function to check and get transcripts
def check_and_get_transcript(video_id, languages=('en',)):
    """
    Return the transcript of a video as one string, or None if unavailable.

    Args:
        video_id: YouTube video ID to look up.
        languages: Language code(s) handed to ``find_transcript``. A single
            string is treated as one code. Defaults to English only ('en'),
            which matches the previously hard-coded behaviour.

    Returns:
        The text of all transcript segments joined with single spaces, with
        newlines inside a segment replaced by spaces. Returns None when
        transcripts are disabled, no transcript is found for the requested
        languages, or any other error occurs. Failures are printed rather
        than raised so that a batch run can continue.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(languages, str):
        languages = [languages]
    try:
        # Check available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        # Fetch the transcript in the requested language(s)
        transcript = transcript_list.find_transcript(list(languages))
        return ' '.join(
            segment['text'].replace('\n', ' ')
            for segment in transcript.fetch()
        )
    except (TranscriptsDisabled, NoTranscriptFound):
        # Log and skip videos without transcripts
        print(f"No transcript available for video {video_id}")
        return None
    except Exception as e:
        # Deliberate best-effort: report any other failure and move on.
        print(f"Error for video {video_id}: {e}")
        return None



In [15]:
# Function to retrieve transcripts for all videos in the existing DataFrame
def get_transcripts_with_check(df, delay=2):
    """
    Fetch a transcript for every video listed in ``df``.

    Args:
        df: DataFrame with a 'videoId' column.
        delay: Seconds to pause after each video (default 2, the previously
            hard-coded value). Pass 0 to skip the pause.

    Returns:
        A list with one entry per row, in row order: the value returned by
        ``check_and_get_transcript`` for that video ID.
    """
    transcripts = []
    # Only the ID column is read, so iterate it directly rather than building
    # a full row object for every video with iterrows().
    for video_id in df['videoId']:
        print(f"Processing video ID: {video_id}")
        # Check and get transcript
        transcripts.append(check_and_get_transcript(video_id))
        if delay > 0:
            time.sleep(delay)  # Delay to prevent rate limiting
    return transcripts

# Works on the DataFrame "df" built in the previous steps; it must contain a
# 'videoId' column.

# Fetch a transcript for every video and store the results in a new
# 'transcript' column.
df['transcript'] = get_transcripts_with_check(df)

# Keep only the rows whose transcript is not null.
has_transcript = df['transcript'].notna()
df_with_transcripts = df[has_transcript]




Processing video ID: DApo30IxaCQ
Processing video ID: nMSUcC1d03M
Processing video ID: JtMVSBhfh0s
Processing video ID: xUaiH7VqPiY
No transcript available for video xUaiH7VqPiY
Processing video ID: sGVC5X5kw7A
Processing video ID: FWZtb423xQo
Processing video ID: Fc1PWIiwgbw
Processing video ID: g6Xq4pLIQaM
Processing video ID: SuTHSSqXN4M
Processing video ID: FWMHyS3iU70
Processing video ID: TpIENGVzB98
Processing video ID: pHhloo634s4
Processing video ID: BTzMeAVK_g0
Processing video ID: BEGgC9FyHY4
Processing video ID: dtB415VNL50
Processing video ID: hBbFKe4-EdQ
Processing video ID: q-02FZalpRs
Processing video ID: es42bHR4ANo
Processing video ID: kn2Cts0qtNQ
Processing video ID: PQ7iZmoXRFU
Processing video ID: JtWEJf1ScOE
Processing video ID: TWzJEilYdrQ
No transcript available for video TWzJEilYdrQ
Processing video ID: 2v1DrvLWuYc
Processing video ID: ANnWpy3fCSo
Processing video ID: lXs3mhYIdZI
No transcript available for video lXs3mhYIdZI
Processing video ID: LwUKtgKKO8U
Proc

In [16]:
# Write the rows that have a transcript to a CSV file in the Colab content
# folder, leaving the DataFrame index out of the file.
save_path = '/content/transcripts_available.csv'
df_with_transcripts.to_csv(path_or_buf=save_path, index=False)

In [27]:
# Download the saved CSV through the Colab files API.
# NOTE: this hands the file to the browser for a local download; it does not
# upload anything to Google Drive.
from google.colab import files
files.download(save_path)  # Downloads the file locally to your computer

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>