### OS Setup

In [1]:
! pip install langchain yt_dlp openai pydub pypdf python-dotenv

Collecting langchain
  Downloading langchain-0.1.1-py3-none-any.whl.metadata (13 kB)
Collecting yt_dlp
  Downloading yt_dlp-2023.12.30-py2.py3-none-any.whl.metadata (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.7/160.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting openai
  Downloading openai-1.8.0-py3-none-any.whl.metadata (18 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting pypdf
  Downloading pypdf-4.0.0-py3-none-any.whl.metadata (7.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (7.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl.metadata (25 kB)
Collecting langchain-community<0.1,>=0.0.13 (from langchain)
  Downloading langcha

In [8]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

""" If Running Notebook in Google colab uncomment line below
# Mount your Google Drive
from google.colab import drive

# Mount Google Drive
drive.mount('/content/data')

# Specify the path to the .env file
dotenv_path = 'data/MyDrive/Notebooks/env/.env'

# Load environmental variables from .env
load_dotenv(dotenv_path) 

"""

# Set your OpenAI API key
os.environ['OPENAI_API_KEY'] = 'Update with your API Key'

### **Video to Documnents**

In [9]:
from langchain.document_loaders.generic import GenericLoader

#OpenAIWhisperParser : It use Openai Whisper model, spech to text model
# to convert youtube audio to txt format
from langchain.document_loaders.parsers import OpenAIWhisperParser

#YoutubeAudioLoader: Load audio file from youtube video
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [10]:
# Toatl Cost : $0.19
url="https://www.youtube.com/watch?v=ed9346Z4uLk"
save_dir="youtube/"
loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)

docs = loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=ed9346Z4uLk
[youtube] ed9346Z4uLk: Downloading webpage
[youtube] ed9346Z4uLk: Downloading ios player API JSON
[youtube] ed9346Z4uLk: Downloading android player API JSON
[youtube] ed9346Z4uLk: Downloading m3u8 information
[info] ed9346Z4uLk: Downloading 1 format(s): 140
[download] youtube//100 Years of NFL History In Under 4 Minutes!.m4a has already been downloaded
[download] 100% of    3.27MiB
[ExtractAudio] Not converting audio youtube//100 Years of NFL History In Under 4 Minutes!.m4a; file is already in target format m4a
Transcribing part 1!


In [11]:
docs[0].page_content[0:10]

'In 1920, t'

In [12]:
len(docs)

1

In [13]:
page = docs[0]

In [14]:
page.metadata

{'source': 'youtube/100 Years of NFL History In Under 4 Minutes!.m4a',
 'chunk': 0}

In [15]:
page.page_content[0:150]

'In 1920, the American Professional Football Association was born, becoming the NFL two years later. In 1933, the Chicago Bears defeated the New York G'

### **Document Splitting**

In [16]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [17]:
splits = text_splitter.split_documents(docs)

In [18]:
len(splits)

3

In [19]:
print(splits[0])


page_content="In 1920, the American Professional Football Association was born, becoming the NFL two years later. In 1933, the Chicago Bears defeated the New York Giants in the league's inaugural championship game. Though the league featured black players prior, in 1946, the NFL officially integrated. In the 1958 championship game, the Baltimore Colts defeated the New York Giants in sudden death in what became known as the greatest game ever played. 45 million people tuned in, starting an explosion in viewership for future decades. The NFL merged with the American Football League in 1966, with the two league champions playing each other in the first AFL-NFL championship game, which later became known as Super Bowl I. The 1967 championship game was also known as the Ice Bowl, played at negative 13 degrees, so cold it froze the commentators' coffee. This was the Green Bay Packers' fourth NFL title in six years. They then won their second straight Super Bowl title. By 1969, Kansas City Ch

In [20]:
print(splits[1])

page_content="in the Super Bowl era to finish undefeated. They beat the Redskins in Super Bowl VII to finish the season 17-0. In 1978, newly introduced rules made defending the pass more difficult. The Steelers capitalized, winning the next two Super Bowls, giving them four in six seasons, cementing a 1970s dynasty. Dwight Clark made the catch to take the 49ers to the 1982 Super Bowl, sparking their 80s dynasty in the era of the pass-heavy West Coast offense. The Cowboys were next in the 90s, winning Super Bowls in 93, 94, and 96. After the attacks of September 11th devastated America, many people found a semblance of normalcy in the 2001 NFL season. That year, a week-two injury to Drew Bledsoe of the Patriots allowed little-known second-year quarterback Tom Brady to take the NFL stage. That year, Brady and the Patriots won the first of six Super Bowls over an 18-season stretch. They entered the NFL's 100th season as defending champions, and Brady has the consensus choice as the greate

In [21]:
print(splits[2])

page_content="its 100th season. Twenty NFL teams have won at least one of the first 53 Super Bowls. During the season, NFL games are played three days a week and shown on TV networks and streaming services around the globe. The league's structure and revenue sharing amongst all teams have kept the NFL the most popular sports league in the U.S. for several decades and is poised to remain so for many more. Here's to the next 100 years." metadata={'source': 'youtube/100 Years of NFL History In Under 4 Minutes!.m4a', 'chunk': 0}


In [22]:
for doc in docs:
    print(doc.metadata)

{'source': 'youtube/100 Years of NFL History In Under 4 Minutes!.m4a', 'chunk': 0}


In [23]:
! pip install reportlab

Collecting reportlab
  Downloading reportlab-4.0.9-py3-none-any.whl.metadata (1.4 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.0.9-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.0.9


### **Embedding & Vector Store**

In [None]:
! pip install langchain_openai
! pip install langchain[docarray]
! pip install chromadb

In [25]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
persist_directory = 'docs/chroma/'

In [26]:
! rm -rf $persist_directory

In [27]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [28]:
print(vectordb._collection.count())

3


In [33]:
! pip install docarray

Collecting docarray
  Downloading docarray-0.40.0-py3-none-any.whl.metadata (36 kB)
Collecting orjson>=3.8.2 (from docarray)
  Using cached orjson-3.9.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (49 kB)
Collecting rich>=13.1.0 (from docarray)
  Using cached rich-13.7.0-py3-none-any.whl.metadata (18 kB)
Collecting types-requests>=2.28.11.6 (from docarray)
  Using cached types_requests-2.31.0.20240106-py3-none-any.whl.metadata (1.8 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=13.1.0->docarray)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=13.1.0->docarray)
  Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Downloading docarray-0.40.0-py3-none-any.whl (270 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsing cached orjson-3.9.12-cp311-cp311-manylinux_2_17_aarch64.manylinux20

In [34]:
from langchain.vectorstores import DocArrayInMemorySearch

# create vector database from data
db = DocArrayInMemorySearch.from_documents(
    docs,
    embeddings)



In [35]:
#Save this so we can use it later!
vectordb.persist()

In [36]:
question = "Write a summary in no more then 5 sentance."

result = vectordb.similarity_search(question,k=3)

len(result)

3

In [37]:
print(result)

[Document(page_content="its 100th season. Twenty NFL teams have won at least one of the first 53 Super Bowls. During the season, NFL games are played three days a week and shown on TV networks and streaming services around the globe. The league's structure and revenue sharing amongst all teams have kept the NFL the most popular sports league in the U.S. for several decades and is poised to remain so for many more. Here's to the next 100 years.", metadata={'chunk': 0, 'source': 'youtube/100 Years of NFL History In Under 4 Minutes!.m4a'}), Document(page_content="in the Super Bowl era to finish undefeated. They beat the Redskins in Super Bowl VII to finish the season 17-0. In 1978, newly introduced rules made defending the pass more difficult. The Steelers capitalized, winning the next two Super Bowls, giving them four in six seasons, cementing a 1970s dynasty. Dwight Clark made the catch to take the 49ers to the 1982 Super Bowl, sparking their 80s dynasty in the era of the pass-heavy Wes

### **QA with Data**

In [38]:
import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-3.5-turbo-0301"
else:
    llm_name = "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [40]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
    model_name=llm_name,
    temperature=0
    )

In [41]:
from langchain.prompts import PromptTemplate

#Build Prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [42]:
from langchain.chains import RetrievalQA

# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [43]:
question = "Which game are we talking about ?"

In [44]:
result = qa_chain({"query": question})

  warn_deprecated(
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


In [45]:
print(result)

{'query': 'Which game are we talking about ?', 'result': 'The game being referred to is the 1958 championship game between the Baltimore Colts and the New York Giants, also known as the greatest game ever played. Thanks for asking!'}


In [46]:
question = "What is Super Bowl ?"

result = qa_chain({"query": question})

print(result)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


{'query': 'What is Super Bowl ?', 'result': "The Super Bowl is the championship game of the National Football League (NFL), played annually between the champions of the league's two conferences. It is the culmination of the NFL season and is one of the most-watched sporting events in the United States. Thanks for asking!"}


In [47]:
question = "How many team has won Super Bowl ?"

result = qa_chain({"query": question})

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


In [48]:
print(result)

{'query': 'How many team has won Super Bowl ?', 'result': 'Twenty NFL teams have won at least one of the first 53 Super Bowls. Thanks for asking!'}


In [49]:
question = "Which team rank number 1 in total number of Super Bowl ?"

result = qa_chain({"query": question})

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


In [50]:
print(result)

{'query': 'Which team rank number 1 in total number of Super Bowl ?', 'result': 'The Pittsburgh Steelers rank number 1 in total number of Super Bowl wins with six championships. Thanks for asking!'}
