# Video Transcript Analysis using LLM


## Please refer to the respective sections in the book for further details.


## Step 1. Install and set up the development environment

In [None]:
!pip install openai


In [None]:
import os

# Expose the OpenAI API key to this process through an environment variable.
os.environ.update(
    {"OPENAI_API_KEY": "XXXXXXXXXXXXXXX"}  ## INSERT YOUR OPENAI API KEY HERE
)

In [None]:
!pip3 install langchain


## Step 2. Data Loading


In [None]:
from langchain.document_loaders import DirectoryLoader

# Load the files matching "*.txt" from the directory passed as the first argument
# and split them into documents.
# NOTE(review): the path is an empty string, which presumably resolves to the
# current working directory — confirm. `text_documents` is not referenced by any
# later cell visible in this notebook.
text_loader = DirectoryLoader("", glob="*.txt")
text_documents = text_loader.load_and_split()


### Step 2.1 YouTube video transcription


In [None]:
!pip install youtube-transcript-api

In [None]:
from langchain.document_loaders import YoutubeLoader

from youtube_transcript_api import YouTubeTranscriptApi


In [None]:
# Load the English ("en") transcript of the YouTube video with ID "lrRt9uzWtqU"
# and split it into documents. `transcripts` is the input from which the Chroma
# vector store is built in Step 3.
yt_loader = YoutubeLoader(video_id="lrRt9uzWtqU", language="en")
transcripts = yt_loader.load_and_split()

In [None]:

# Fetch the transcript of the same video ID directly through youtube_transcript_api.
# The result is not assigned; as the cell's last expression it is only displayed.
YouTubeTranscriptApi.get_transcript("lrRt9uzWtqU")

## Step 3. Data processing




In [None]:
!pip install chromadb

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Build a Chroma vector store from the transcript documents, using OpenAI
# embeddings as the embedding model. `document_store` is used as the retriever
# for the question-answering chains in later cells.
# NOTE(review): OpenAIEmbeddings() is created without arguments; presumably it
# picks up OPENAI_API_KEY from the environment set in Step 1 — confirm.
embedding_model = OpenAIEmbeddings()
document_store = Chroma.from_documents(transcripts, embedding_model)


In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used to answer questions: gpt-3.5-turbo at temperature 0.9.
language_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.9)

# Retrieval QA chain of type "stuff" that answers with `language_model` over the
# documents returned by the Chroma store's retriever.
question_answer = RetrievalQA.from_chain_type(llm=language_model, 
                                              chain_type="stuff",
                                              retriever=document_store.as_retriever())


## Step 4. Question Answering

In [None]:
# Run a question through the retrieval QA chain; the answer is displayed as the
# cell output. Note that the name `query` is rebound to a function by the
# summarization cell at the end of this notebook.
query = "Who scored more goals in one Champions leage season?"
question_answer.run(query)

In [None]:
# Run a second question through the same retrieval QA chain; the answer is
# displayed as the cell output.
query = "Who scored more goal in one season between Ronaldo and Haaland?"
question_answer.run(query)

## Step 5. Post Processing

In [None]:
# Second chat model: same gpt-3.5-turbo model, but at temperature 0.2 instead of 0.9.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)

# Rebind `question_answer` to a new RetrievalQA chain of type "refine" (instead
# of "stuff") over the same Chroma retriever. This cell only builds the chain;
# no question is run through it here.
question_answer = RetrievalQA.from_chain_type(llm=llm, 
                                    chain_type="refine",
                                    retriever=document_store.as_retriever())

## Step 6. Transcript summarization

In [None]:
# Fetch and display the transcript of video "IaT-yOiSffI".
# NOTE(review): this video ID differs from the one loaded for summarization in
# the next cell ("awPT3ffcvX0") — confirm which video is intended.
YouTubeTranscriptApi.get_transcript("IaT-yOiSffI")

In [None]:
# Load the English ("en") transcript of the YouTube video with ID "awPT3ffcvX0"
# and split it into the documents used by the summarization cells.
loader = YoutubeLoader(video_id="awPT3ffcvX0", language="en")
summarization_docs = loader.load_and_split()

In [None]:
# Display the full list of loaded documents.
summarization_docs

In [None]:
# Preview the first 1000 characters of the first document's text.
summarization_docs[0].page_content[0:1000]

In [None]:
# Preview the first 3000 characters of the first document's string form
# (str() of the whole document object, not only its page_content).
str(summarization_docs[0])[0:3000]

In [None]:
# Preview the last 3000 characters of the first document's string form.
str(summarization_docs[0])[-3000:]

In [None]:
import requests
import json

# Feed only the transcript text to the summarizer. str() of the document object
# is its string representation rather than just the text, so take page_content
# (as the preview cell above does) and keep the first 1000 characters.
truncated_doc = summarization_docs[0].page_content[0:1000]

API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"

headers = {"Authorization": "Bearer XXXXXXXXXXXXXXX"}  ## INSERT YOUR HUGGINGFACE API TOKEN HERE

# Upper bound, in seconds, on one inference request so the cell cannot hang forever.
REQUEST_TIMEOUT_SECONDS = 120


def query(payload):
    """Send ``payload`` to the inference endpoint and return the decoded JSON body.

    Args:
        payload: JSON-serializable request body, posted to ``API_URL`` with the
            module-level ``headers``.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return response.json()


model_input = "Provide a summary for the following document:" + truncated_doc

json_data = {
    "inputs": model_input,
    "parameters": {"temperature": 0.5, "max_new_tokens": 300},
}

# Go through the helper instead of repeating the POST inline.
json_response = query(json_data)
print(json_response)

# The indexing below needs a non-empty list of generations; anything else
# (for example an error object) is reported explicitly instead of surfacing
# as an opaque KeyError/IndexError.
if not isinstance(json_response, list) or not json_response:
    raise RuntimeError(f"Unexpected inference API response: {json_response!r}")

model_output = json_response[0]["generated_text"]
model_output