<a href="https://colab.research.google.com/github/AlvinKimata/ml-projects/blob/main/RAG/Langchain_transcription_with_sources.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Query the YouTube video transcripts, returning timestamps as sources to legitimize the answers.

### First, set the runtime to GPU

In [None]:
pip install pytube # For audio downloading

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytube
  Downloading pytube-12.1.3-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-12.1.3


In [None]:
pip install git+https://github.com/openai/whisper.git -q # Whisper from OpenAI transcription model

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [None]:
import whisper
import pytube

In [None]:
# YouTube video whose transcript will be queried.
url = "https://www.youtube.com/watch?v=Q8fLfQGCdlE"
video = pytube.YouTube(url)
# Shown as the cell output: file size of the highest-resolution stream
# (presumably in bytes — confirm against the pytube docs).
video.streams.get_highest_resolution().filesize

93249188

In [None]:
# Download only the audio stream of the video; `fn` holds the value returned
# by download(), which is the path of the saved file.
# pytube's `output_path` names the target *directory*, so give it a
# directory-like name rather than something that looks like a file.
audio = video.streams.get_audio_only()
fn = audio.download(output_path="tmp")

In [None]:
# Load Whisper's "base" model (the recorded run below downloaded a 139M file for it).
model = whisper.load_model("base")

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 123MiB/s]


In [None]:
# Transcribe the audio file downloaded above; `fn` is the path returned by
# audio.download(). An empty path cannot be transcribed.
transcription = model.transcribe(fn)

In [None]:
# Keep the per-segment records rather than the flat transcript string:
# store_segments() reads segment['text'] and segment['start'] from each item.
res = transcription['segments']

In [None]:
from datetime import datetime

def store_segments(segments):
  """Split transcript segments into parallel lists of texts and start times.

  Args:
    segments: iterable of mappings, each with a 'text' entry (the segment's
      text) and a 'start' entry (offset from the beginning of the audio,
      in seconds).

  Returns:
    A tuple ``(texts, start_times)`` of two equally long lists:
    ``texts[i]`` is the i-th segment's text and ``start_times[i]`` is its
    start offset formatted as "HH:MM:SS".
  """
  texts = []
  start_times = []

  for segment in segments:
    # 'start' is an elapsed duration, not a wall-clock instant, so format it
    # arithmetically. Going through datetime.fromtimestamp() would apply the
    # machine's local timezone and wrap the hours at 24.
    total_seconds = int(segment['start'])
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)

    texts.append(segment['text'])
    start_times.append(f"{hours:02d}:{minutes:02d}:{seconds:02d}")

  return texts, start_times

In [None]:
# Preview only: the returned (texts, start_times) tuple is shown as the cell
# output and is not stored in a variable here.
store_segments(res)

([' This program is brought to you by Stanford University.',
  ' Please visit us at stanford.edu.',
  ' Thank you.',
  " I'm honored to be with you today for your commencement from one of the finest universities",
  ' in the world.',
  " Truth be told, I never graduated from college and this is the closest I've ever gotten",
  ' to a college graduation.',
  ' Today I want to tell you three stories from my life.',
  " That's it.",
  ' No big deal.',
  ' Just three stories.',
  ' The first story is about connecting the dots.',
  ' I dropped out of Reed College after the first six months but then stayed around as a drop-in',
  ' for another 18 months or so before I really quit.',
  " So why'd I drop out?",
  ' It started before I was born.',
  ' My biological mother was a young unwed graduate student and she decided to put me up for adoption.',
  ' She felt very strongly that I should be adopted by college graduates so everything was all',
  ' set for me to be adopted at birth by a lawyer

In [None]:
# Keep the segment texts and their formatted start times; both lists are
# consumed together when the documents and their metadata are built.
texts, start_times = store_segments(res)

In [None]:
pip install langchain

In [None]:
pip install openai

In [None]:
pip install --upgrade faiss-gpu==1.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu==1.7.1
  Downloading faiss_gpu-1.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (89.7 MB)
[K     |████████████████████████████████| 89.7 MB 35 kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.1


In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chains import VectorDBQAWithSourcesChain
from langchain import OpenAI
import openai
import faiss

In [None]:
import os
from getpass import getpass

# Never store the key in the notebook. Reuse a key already present in the
# environment; otherwise prompt for it without echoing. Unconditionally
# assigning "" would overwrite a valid key with an empty one.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Split every transcript segment into chunks and record, for each chunk, the
# start time of the segment it came from. That start time is stored under the
# "source" metadata key.
assert len(texts) == len(start_times), "texts and start_times must align"

text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
docs = []
metadatas = []
for segment_text, start_time in zip(texts, start_times):
    splits = text_splitter.split_text(segment_text)
    docs.extend(splits)
    # Build a fresh dict per chunk: `[d] * n` would put the *same* dict object
    # in the list n times, so mutating one entry would change them all.
    metadatas.extend({"source": start_time} for _ in splits)
embeddings = OpenAIEmbeddings()

In [None]:
# Build the FAISS vector store from the chunks, using the OpenAI embeddings
# object and attaching each chunk's {"source": start_time} metadata.
store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)

In [None]:
# Question-answering-with-sources chain over the vector store; the OpenAI LLM
# is created with temperature=0.
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=store)

In [None]:
# Ask a question about the video; the result is read below through its
# 'answer' and 'sources' keys.
result = chain({"question": "How old was Steve Jobs when started Apple?"})

In [None]:
# Show the answer together with the sources reported for it.
print("Answer: {}  Sources: {}".format(result['answer'], result['sources']))

Answer:  Steve Jobs was 20 when he started Apple.  Sources: 00:05:47, 00:05:59
