In [1]:
!pip install pytube # For audio downloading
!pip install git+https://github.com/openai/whisper.git -q # Whisper from OpenAI transcription model
!pip install langchain
!pip install faiss-cpu
!pip install auto-gptq

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
!pip install sentence-transformers




In [3]:
import whisper
import pytube


# Address of the YouTube video to process.
url = "https://www.youtube.com/watch?v=ZXiruGOCn9s"

# pytube handle for that video; its streams are queried in the next cell.
video = pytube.YouTube(url=url)

In [4]:
# Select the audio-only stream of the video.
audio = video.streams.get_audio_only()

# Download just the audio track; the call evaluates to the saved file's path,
# which is displayed as the cell output.
audio.download(filename="tmp.mp3")

'/content/tmp.mp3'

In [5]:
# Load the "small" Whisper checkpoint used for transcription.
model = whisper.load_model(name="small")

In [6]:
# Transcribe the audio file written by the download cell. The relative path
# matches the `filename` passed to `download`, so the notebook no longer
# depends on the working directory being Colab's /content.
transcription = model.transcribe("tmp.mp3")

In [7]:
# Keep only the per-segment entries of the transcription result.
res = transcription["segments"]

In [8]:
from datetime import datetime

def store_segments(segments):
    """Split transcription segments into parallel lists of text and start time.

    Args:
        segments: Iterable of mappings, each providing a 'text' entry and a
            'start' entry (offset from the beginning of the audio, in seconds).

    Returns:
        A ``(texts, start_times)`` tuple of equal-length lists. Each start time
        is a zero-padded "HH:MM:SS" string; fractional seconds are dropped.
    """
    texts = []
    start_times = []

    for segment in segments:
        # 'start' is an elapsed offset, not a wall-clock timestamp. Converting
        # it with datetime.fromtimestamp() would apply the local timezone
        # (shifting every label on non-UTC machines) and wrap around after
        # 24 hours, so the fields are derived arithmetically instead.
        total_seconds = int(segment['start'])
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)

        # "".join() leaves a plain string unchanged and concatenates the
        # pieces if the text is given as an iterable of fragments.
        texts.append("".join(segment['text']))
        start_times.append(f"{hours:02d}:{minutes:02d}:{seconds:02d}")

    return texts, start_times

In [9]:
# Display the (texts, start_times) pair produced for the transcription segments.
store_segments(segments=res)

([" No, it's not those transformers, but they can do some pretty cool things.",
  ' Let me show you.',
  ' So, why did the banana cross the road?',
  ' Because it was sick of being mashed.',
  " I'm not sure that I quite get that one, and that's because it was created by a computer.",
  ' I literally asked it to tell me a joke, and this is what it came up with.',
  ' Basically I used a GPT-3, or a generative pre-trained transformer model.',
  ' The three here means that this is the third generation.',
  ' GPT-3 is an autoregressive language model that produces text that looks like it was',
  ' written by a human.',
  ' GPT-3 can write poetry, craft emails, and evidently come up with its own jokes.',
  ' Off you go.',
  " Now, while our banana joke isn't exactly funny, it does fit the typical pattern of",
  ' a joke with a setup and a punchline, and sort of kind of makes sense.',
  " I mean, who wouldn't cross the road to avoid getting mashed?",
  ' But look, GPT-3 is just one example o

In [11]:
# Keep the segment texts and their formatted start times for indexing below.
(texts, start_times) = store_segments(segments=res)

In [10]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain import LLMChain
from transformers import AutoTokenizer, pipeline, logging, AutoModelForCausalLM
import faiss

In [12]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token must never be committed inside the notebook. The
# token that used to be hardcoded here has to be treated as leaked and revoked
# in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when that is not
# set, it is requested interactively without being echoed.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

login(token=access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [13]:
# Hub identifier of the causal language model used to answer questions.
model_name_or_path = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=True,
)

# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# Whisper model; re-running the transcription cell after this one would call
# `transcribe` on the language model instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    use_safetensors=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Silence everything below CRITICAL from the transformers logger.
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline wrapping the model and tokenizer loaded above.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    # `temperature` and `top_p` only take effect in sampling mode; without
    # `do_sample=True` generation is greedy and both settings are ignored.
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15
)

In [15]:
# Wrap the transformers pipeline so it can be passed to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Embedding model used to vectorise the transcript chunks.
# 'auto' is not a valid torch device string, so no device is forced here;
# sentence-transformers then uses a GPU when one is available and the CPU
# otherwise.
embeddings = HuggingFaceEmbeddings(model_name='intfloat/multilingual-e5-small')



In [None]:
# Run every transcript segment through the splitter and tag each resulting
# piece with the start time of the segment it came from.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1500)
docs, metadatas = [], []
for idx, segment_text in enumerate(texts):
    chunks = text_splitter.split_text(segment_text)
    docs += chunks
    # One metadata entry per chunk, all pointing at the same segment start.
    source_meta = {"source": start_times[idx]}
    metadatas += [source_meta] * len(chunks)

In [None]:
# Embed the chunks and build the FAISS vector store, keeping the start-time
# metadata alongside each chunk.
store = FAISS.from_texts(
    texts=docs,
    embedding=embeddings,
    metadatas=metadatas,
)

# Persist the raw FAISS index to disk (only `store.index` is written here).
faiss.write_index(store.index, "docs.index")

In [None]:
# NOTE(review): bare expression that only displays the imported `faiss`
# module; looks like a leftover scratch cell.
faiss

In [None]:
# Question-answering chain that retrieves from the vector store and reports
# the sources of the chunks it used.
chain = VectorDBQAWithSourcesChain.from_llm(
    llm=llm,
    vectorstore=store,
)

In [None]:
# Ask the chain a question about the transcribed video.
query = {"question": "What is transformers"}
result = chain(query)

In [None]:
# Show the generated answer together with the source start times.
answer, sources = result['answer'], result['sources']
print(f"Answer: {answer}  Sources: {sources}")