<a href="https://colab.research.google.com/github/Amer328/SemanticSearch2/blob/main/YtubeTranscript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import re



In [4]:
def extract_youtube_transcript(video_url):
    """
    Extracts the transcript from a given YouTube video URL.

    Failures while fetching or formatting the transcript are caught and
    reported in the returned string rather than raised, so callers can
    print the result directly.

    Args:
    video_url (str): The URL of the YouTube video.

    Returns:
    str: The formatted transcript of the video, or a message starting with
    "Error:" if the video ID cannot be extracted or the transcript is not
    available.
    """
    # Extract the video ID from the URL
    video_id = extract_video_id(video_url)

    if not video_id:
        return "Error: Invalid YouTube URL or unable to extract video ID."

    try:
        # The package is installed unpinned, so support both API generations:
        # older releases expose a static get_transcript(); newer releases
        # replace it with an instance-level fetch() whose result is converted
        # back to the same list-of-dicts shape via to_raw_data().
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

        return format_transcript(transcript)

    except Exception as e:
        # Deliberately broad: any library/network failure becomes a readable
        # message instead of a traceback in the notebook.
        return f"Error: Unable to fetch transcript. {str(e)}"


In [5]:

def extract_video_id(url):
    """
    Extracts the video ID from a YouTube URL.

    Recognised forms: the ``v=`` query parameter (in any position of the
    query string), ``/v/ID``, ``/e/ID``, ``embed/ID``, ``shorts/ID``,
    ``live/ID`` and ``youtu.be/ID``. The host name is not validated.

    Args:
    url (str): The YouTube video URL.

    Returns:
    str: The extracted video ID, or None if not found (including when
    ``url`` is not a string).
    """
    # Non-string input (e.g. None) cannot contain an ID; report "not found"
    # instead of letting re.search raise TypeError.
    if not isinstance(url, str):
        return None

    pattern = (
        r'(?:'
        # "v=" / "v/" must not be the tail of a longer name, otherwise a
        # parameter such as "rev=2" would be mistaken for the video ID.
        r'(?<![A-Za-z0-9_])v[=/]'
        r'|embed/|shorts/|live/|youtu\.be/|/e/'
        r')([A-Za-z0-9_-]+)'
    )

    match = re.search(pattern, url)
    return match.group(1) if match else None

def format_transcript(transcript):
    """
    Formats the raw transcript data into a readable format.

    Each entry becomes one line of the form ``[MM:SS] text``. Minutes are
    not wrapped into hours, so an entry starting at 3700 seconds is shown
    as ``[61:40]``.

    Args:
    transcript (list): The raw transcript data; each entry must provide
    'start' (seconds, convertible with int()) and 'text'. Other keys,
    such as 'duration', are ignored.

    Returns:
    str: The formatted transcript, one entry per line (an empty string for
    an empty transcript).
    """
    lines = []
    for entry in transcript:
        # Truncate fractional seconds, then split into minutes and seconds.
        minutes, seconds = divmod(int(entry['start']), 60)
        lines.append(f"[{minutes:02d}:{seconds:02d}] {entry['text']}")

    return "\n".join(lines)


In [7]:
# Example usage: fetch and print the transcript of a sample video.
# NOTE(review): this presumably performs a live request to YouTube, so the
# printed output depends on network access and on the video's captions.
video_url = "https://www.youtube.com/watch?v=Tbi6Mdp9rQo"
# `result` is either the formatted "[MM:SS] text" transcript or a message
# starting with "Error:"; extract_youtube_transcript reports fetch failures
# in its return value instead of raising.
result = extract_youtube_transcript(video_url)
print(result)



[00:00] it is the latest chapter heading in a
[00:03] long saga and I'm afraid this is going
[00:05] to go on for quite some time quite a
[00:07] roller coaster here um the latest of
[00:09] course is that Donald Trump wants to
[00:10] impose extra tariffs on car imports why
[00:14] well it comes back to this chart a chart
[00:16] that you really will not like at all
[00:18] this is showing you US exports of cars
[00:20] around the world going back to the 1960s
[00:23] and the key thing is comparing exports
[00:25] with imports look at that line has
[00:27] imports into the US of cars and of
[00:30] course the difference between that one
[00:31] line and the other one is the trade
[00:33] deficit so the US sucking in many more
[00:35] cars from overseas than it makes uh or
[00:37] sends out itself and that's what Donald
[00:39] Trump really wants to try to remedy and
[00:41] he wants to do it through tariffs
[00:42] raising the question what are the level
[00:44] of tariffs going to be