In [None]:
pip install transformers datasets accelerate torchaudio yt_dlp

In [None]:
import yt_dlp
import nbformat as nbf
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login


In [None]:

def download_audio_with_ytdlp(youtube_url, output_audio_path="audio.mp3"):
    """Download the audio of a video with yt-dlp and store it as an MP3 file.

    yt-dlp fetches the best available audio stream into ``temp_audio.<ext>`` and
    its ``FFmpegExtractAudio`` post-processor converts it to ``temp_audio.mp3``
    (192 quality setting). The result is then moved to ``output_audio_path``.

    Args:
        youtube_url: URL of the video to download.
        output_audio_path: Destination path of the MP3 file. An existing file
            at this path is overwritten.

    Returns:
        ``output_audio_path``, unchanged, for convenient chaining.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when the target exists — which happens as soon
    # as this function is called a second time with the same output path.
    os.replace("temp_audio.mp3", output_audio_path)
    return output_audio_path

def transcribe_audio_whisper(audio_path):
    """Transcribe an audio file with the ``openai/whisper-small`` ASR pipeline.

    A new pipeline is built on every call; it runs on CUDA device 0 when CUDA
    is available and on the CPU otherwise.

    Args:
        audio_path: Path of the audio file handed to the pipeline.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device=device_index,
    )
    # Timestamps are requested from the pipeline even though only the text is returned.
    transcription = asr(audio_path, return_timestamps=True)
    return transcription["text"]


# Run the two helpers end to end on one example video: fetch the audio, then transcribe it.
youtube_link = "https://youtu.be/yYALsys-P_w?si=MUMJ74ggsPW5PMu-"
audio_file = download_audio_with_ytdlp(youtube_link)
text = transcribe_audio_whisper(audio_file)

# Print the banner (with a blank line before and after it) followed by the transcript.
print("\n--- TRANSCRIPTION ---\n", text, sep="\n")


In [None]:

# Read the Hugging Face access token from the Kaggle secret named "hugging-face-key"
# instead of hard-coding it in the notebook.
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("hugging-face-key")

# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably needed so the Gemma checkpoint loaded later can be downloaded — confirm.
login(token=token)



In [None]:

model_id = "google/gemma-2b-it"

# Half precision is only a good default on a GPU. The Whisper step explicitly supports a
# CPU fallback, so mirror that check here and use float32 when CUDA is unavailable rather
# than forcing float16 weights onto the CPU.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" lets accelerate decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=model_dtype)

In [None]:
# Example usage: transcribe the demo video.
# The download and the Whisper pass are slow, so they are skipped when a transcript for
# this exact link is already in memory (e.g. from an earlier cell); otherwise the cell
# does the full download + transcription on its own.
example_link = "https://youtu.be/yYALsys-P_w?si=MUMJ74ggsPW5PMu-"
if globals().get("youtube_link") != example_link or "text" not in globals():
    youtube_link = example_link
    audio_file = download_audio_with_ytdlp(youtube_link)
    text = transcribe_audio_whisper(audio_file)


In [None]:
# Wrap the already-loaded Gemma model and tokenizer in a text-generation pipeline.
gemma = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Gemma chat-formatted prompt: one user turn containing the transcript and the notebook
# instructions, followed by the opening of the model turn. Written as one literal per
# output line; the concatenation is a single string.
prompt = (
    "<bos><start_of_turn>user\n"
    f"Make a Coursera-style educational Jupyter Notebook on this topic: {text}\n"
    "Start with:\n"
    "1. A short **Summary** explaining the overall idea.\n"
    "2. Clear breakdown of **Main Topics** with explanations and, when applicable, simple code examples.\n"
    "3. End with a set of **Exercises**:\n"
    "- Some questions where students need to **complete missing code** (use `____` as blanks).\n"
    "- Some **theory questions** about the concepts explained.\n"
    "\n"
    "Structure headings like this:\n"
    "# Summary\n"
    "## Topic 1: ...\n"
    "## Topic 2: ...\n"
    "...\n"
    "# Exercises\n"
    "- [Code Completion]\n"
    "- [Theory Question]\n"
    "\n"
    "Keep the tone friendly and instructional, like a Coursera or edX lab.\n"
    "\n"
    "Use Markdown and Python code cells just like in a real Jupyter Notebook.\n"
    "<end_of_turn>\n"
    "<start_of_turn>model\n"
)

# The prompt already starts with an explicit <bos>. Tell the pipeline not to let the
# tokenizer add special tokens as well, otherwise the model can see two BOS tokens.
outputs = gemma(
    prompt,
    max_new_tokens=1200,
    do_sample=True,  # sampling is unseeded, so the generated notebook differs between runs
    temperature=0.7,
    add_special_tokens=False,
)
output_text = outputs[0]["generated_text"]
# The pipeline returns prompt + completion; keep only the newly generated part.
notebook_content = output_text[len(prompt):]

print("done")

In [None]:

def _split_markdown_sections(markdown_text):
    """Split generated Markdown into sections that each start at a '## ' heading line.

    Only lines that begin with '## ' and sit outside fenced code blocks start a new
    section, so '### ' sub-headings and '## ' occurring mid-line or inside code are
    left alone. Text before the first '## ' heading (e.g. a '# Summary' part) is
    returned unchanged as its own section. Blank sections are dropped and every
    section is stripped of surrounding whitespace.
    """
    sections, current, in_fence = [], [], False
    for line in markdown_text.splitlines():
        if line.lstrip().startswith("```"):
            # Entering or leaving a fenced code block.
            in_fence = not in_fence
        elif not in_fence and line.startswith("## ") and current:
            sections.append("\n".join(current))
            current = []
        current.append(line)
    if current:
        sections.append("\n".join(current))
    return [section.strip() for section in sections if section.strip()]


nb = nbf.v4.new_notebook()

# One markdown cell per '## ' section of the generated text. The heading text is kept
# exactly as generated; nothing is prepended to the part before the first heading.
sections = _split_markdown_sections(notebook_content)
cells = [nbf.v4.new_markdown_cell(section) for section in sections]

nb['cells'] = cells

# Notebook files are JSON stored as UTF-8; be explicit so the platform default encoding
# cannot break on non-ASCII characters in the generated text.
with open('notebook.ipynb', 'w', encoding='utf-8') as f:
    nbf.write(nb, f)
