<a target="_blank" href="https://colab.research.google.com/github/DoIT-Artifical-Intelligence/colab-ytt-to-docs/blob/main/Colab_YouTube_Transcription_Extractor_And_Summarizer.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import os

# Read the COLAB_NOTEBOOK_ID environment variable (empty string when unset)
# and stop immediately when it mentions github.com: the notebook is meant to
# be executed from a copy saved in the user's Google Drive.
notebook_id = os.getenv("COLAB_NOTEBOOK_ID", "")
opened_from_github = "github.com" in notebook_id

if opened_from_github:
    raise Exception(
        "This notebook must be run from Google Drive. "
        "Please save a copy to your Drive ('File' > 'Save a copy in Drive') and run it from there."
    )

In [None]:
# Update the link for the YouTube video and the title (the title is only required if you don't have YOUTUBE_DATA_API_KEY set).
# Then hit "Run all" at the top, next to the right-pointing triangle.
YOUTUBE_URL = "https://www.youtube.com/watch?v=XXXXXXXXXXX"
# YOUTUBE_URL = "https://www.youtube.com/watch?v=KuPc06JgI_A"
VIDEO_TITLE_FALLBACK = "INSERT VIDEO NAME"
# VIDEO_TITLE_FALLBACK = "HGO Committee Session, 1/14/2025 #1"

In [None]:
from urllib.parse import parse_qs, urlparse


def _extract_video_id(url: str) -> str:
    """Return the video id embedded in a YouTube URL.

    Handles ``watch?v=<id>`` links with any extra query parameters (``&t=``,
    ``&list=`` ...), ``youtu.be/<id>`` short links and ``/embed/``,
    ``/shorts/``, ``/live/`` and ``/v/`` paths. Anything else falls back to
    the text after the last ``v=``, so inputs that worked before (including a
    bare video id) keep working.
    """
    url = url.strip()
    parsed = urlparse(url)

    # Standard watch URL: the id is the value of the `v` query parameter.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    segments = [segment for segment in parsed.path.split("/") if segment]
    # Short link: the id is the first path segment.
    if host in {"youtu.be", "www.youtu.be"} and segments:
        return segments[0]
    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>.
    if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
        return segments[1]

    return url.split("v=")[-1]


AI_SUMMARY = True  # Will only run if GEMINI_API_KEY exists in Secrets
AI_MODEL = "gemini-2.5-flash"
print(f"{AI_MODEL=}")
# The transcript text itself is appended to this prompt right before the model call.
PROMPT = (
    f"I have included a transcript for {YOUTUBE_URL} ({VIDEO_TITLE_FALLBACK})"
    "\n\n"
    "Can you please summarize this?"
)
print("PROMPT=")
print(PROMPT)

video_id = _extract_video_id(YOUTUBE_URL)
# Reject both the untouched placeholder URL and a URL with no id at all.
if not video_id or video_id == "XXXXXXXXXXX":
    raise ValueError("Please add a valid YouTube url")

# approx costs (9/11/2025) per 1M tokens in USD
costs = {"gemini-2.5-flash input": 0.3, "gemini-2.5-flash output": 2.5}

In [None]:
from google.auth import default
from google.colab import auth, runtime, userdata
from googleapiclient.discovery import build

# Authenticate the Colab user and build the Docs and Drive API clients from
# the default credentials.
auth.authenticate_user()
creds, _ = default()
docs_service = build("docs", "v1", credentials=creds)
drive_service = build("drive", "v3", credentials=creds)

# Best effort: look the video title up through the YouTube Data API. Any
# failure (missing secret, API error, unknown video id) is printed and the
# manually entered VIDEO_TITLE_FALLBACK is used instead.
try:
    # To get a YOUTUBE_DATA_API_KEY go to
    # https://console.cloud.google.com/marketplace/product/google/youtube.googleapis.com to enable the YouTube API
    # Then go to https://console.cloud.google.com/apis/credentials to create an API key for YouTube Data API v3
    # Go to the Secrets on the left (image of a key) and add your YouTube Data API Key there.
    youtube_service = build(
        "youtube", "v3", developerKey=userdata.get("YOUTUBE_DATA_API_KEY")
    )
    request = youtube_service.videos().list(part="snippet", id=video_id)
    response = request.execute()
    items = response.get("items") or []
    if not items:
        # Gives a readable reason instead of "list index out of range".
        raise LookupError(f"No YouTube video found for id {video_id!r}")
    video_title = items[0]["snippet"]["title"]
    print(f"Video Title: {video_title}")
    # str.replace with an empty pattern would insert the title between every
    # character of the prompt, so only substitute a non-empty placeholder.
    if VIDEO_TITLE_FALLBACK:
        PROMPT = PROMPT.replace(VIDEO_TITLE_FALLBACK, video_title)
except Exception as e:
    print(e)
    video_title = VIDEO_TITLE_FALLBACK
    if video_title == "INSERT VIDEO NAME":
        raise ValueError(
            "Please replace 'INSERT VIDEO NAME' manually and re-run the cell."
        )

In [None]:
!pip install youtube-transcript-api --quiet
# https://github.com/jdepoix/youtube-transcript-api
from youtube_transcript_api import YouTubeTranscriptApi
ytt_api = YouTubeTranscriptApi()

In [None]:
# Ask for an English transcript of the video ("en" or "en-US").
preferred_languages = ("en", "en-US")
transcript = ytt_api.fetch(video_id, languages=preferred_languages)

In [None]:
# Flatten the transcript into a single string: the "text" of every snippet,
# separated by one space (no leading or trailing space).
data = transcript.to_raw_data()
content = " ".join(snippet["text"] for snippet in data)
content

In [None]:
# Create a Google Doc titled "<video title> Transcript".
transcript_doc_body = {"title": video_title + " Transcript"}
doc = docs_service.documents().create(body=transcript_doc_body).execute()
document_id = doc.get("documentId")

# Insert the whole transcript text at index 1 of the new document.
insert_transcript = {"insertText": {"location": {"index": 1}, "text": content}}
requests = [insert_transcript]
batch_update = docs_service.documents().batchUpdate(
    documentId=document_id, body={"requests": requests}
)
batch_update.execute()

transcript_url = f"https://docs.google.com/document/d/{document_id}"
print(f"Shareable link: {transcript_url}")

In [None]:
# Gate for the AI summary cells below: release the Colab runtime instead of
# continuing when the summary is switched off or no Gemini key is configured.
if not AI_SUMMARY:
    print("AI_SUMMARY is disabled; skipping the AI summary.")
    runtime.unassign()
else:
    try:
        # To get a GEMINI_API_KEY go to
        # https://console.cloud.google.com/marketplace/product/google/generativelanguage.googleapis.com to enable the Gemini API
        # Then go to https://console.cloud.google.com/apis/credentials to create an API key for Generative Language API
        # Go to the Secrets on the left (image of a key) and add your Gemini API Key there.
        GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")
    except userdata.SecretNotFoundError:
        print("GEMINI_API_KEY was not found in Secrets; skipping the AI summary.")
        runtime.unassign()

In [None]:
!pip install google-genai --quiet

In [None]:
from google import genai
from google.genai import types

# Build the full prompt (instructions + transcript) and estimate what sending
# it will cost before the model is actually called.
client = genai.Client(api_key=GEMINI_API_KEY)
model = AI_MODEL
# Separate the instructions from the transcript with a blank line.
prompt = PROMPT + f"\n\n{content}"
contents = [types.Content(role="user", parts=[types.Part.from_text(text=prompt)])]
total_tokens = client.models.count_tokens(model=AI_MODEL, contents=prompt).total_tokens
print(f"Total input tokens: {total_tokens:,}")
# `costs` holds USD prices per one million tokens.
input_cost = (total_tokens / 1_000_000) * costs[f'{AI_MODEL} input']
print(f"Input cost: ${input_cost:.4f}")

In [None]:
# Generate the summary, then price the call from the token usage reported by
# the API: input tokens at the input rate, output tokens at the output rate.
response = client.models.generate_content(
    model=model,
    contents=contents,
)
usage = response.usage_metadata
# Usage counters can be missing (None); treat those as zero.
input_tokens = getattr(usage, "prompt_token_count", None) or 0
# Thinking tokens are billed as output tokens, so count them with the answer.
output_tokens = (getattr(usage, "candidates_token_count", None) or 0) + (
    getattr(usage, "thoughts_token_count", None) or 0
)
output_cost = (output_tokens / 1_000_000) * costs[f"{AI_MODEL} output"]
total_cost = (input_tokens / 1_000_000) * costs[f"{AI_MODEL} input"] + output_cost
print(f"Output cost: ${output_cost:.4f}")
print(f"Total cost: ${total_cost:.4f}")
ai_summary = response.text
ai_summary

In [None]:
from datetime import datetime
from googleapiclient.http import MediaFileUpload

# Wrap every paragraph of the prompt in asterisks (Markdown italics).
prompt_paragraphs = PROMPT.split("\n\n")
italicized_prompt = "\n\n".join(f"*{paragraph}*" for paragraph in prompt_paragraphs)

# Provenance header placed above the AI summary: generation date, model,
# prompt, link to the transcript document and link to the source code.
generated_on = datetime.now().strftime("%Y-%m-%d")
header = "".join(
    [
        f"*The summary below was generated on {generated_on} by AI using {AI_MODEL} with the prompt:*\n\n",
        f"'{italicized_prompt}\n\n*{{ TRANSCRIPT }}'*\n\n",
        f"*The data used was the YouTube transcript which was extracted to a Google Doc [here]({transcript_url}).*\n\n",
        "*The source code which created this document can be found [here](https://github.com/DoIT-Artifical-Intelligence/colab-ytt-to-docs).*\n\n",
        "---\n\n",
    ]
)

# Write header + summary to a temporary Markdown file for the Drive upload.
temp_md_filename = "tmp.md"
with open(temp_md_filename, "w", encoding="utf-8") as markdown_file:
    markdown_file.write(header + ai_summary)

try:
    # Upload the Markdown file and ask Drive to store it as a Google Doc.
    media = MediaFileUpload(temp_md_filename, mimetype="text/markdown", resumable=True)
    file_metadata = {
        "name": video_title + " AI Summary",
        "mimeType": "application/vnd.google-apps.document",
    }
    create_request = drive_service.files().create(
        body=file_metadata, media_body=media, fields="id, webViewLink"
    )
    doc = create_request.execute()
    shareable_link = doc.get("webViewLink")
    print(f"Shareable link: {shareable_link}")
finally:
    # Remove the temporary file whether or not the upload succeeded.
    os.remove(temp_md_filename)

In [None]:
# All work is done: release the Colab runtime assigned to this notebook.
runtime.unassign()