In [1]:
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings # updated code
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from dotenv import find_dotenv, load_dotenv
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import textwrap


# Load variables from the discovered .env file into the process environment
# before any OpenAI client object is constructed.
load_dotenv(dotenv_path=find_dotenv())

# Shared embedding model; the create_db_* helpers defined below read this
# module-level name when they build a FAISS index.
embeddings = OpenAIEmbeddings()

In [2]:
def create_db_from_youtube_video_url(video_url, chunk_size=2000, chunk_overlap=100):
    """Build a FAISS vector store from the transcript of a YouTube video.

    The transcript is loaded with ``YoutubeLoader``, split into overlapping
    chunks with ``RecursiveCharacterTextSplitter`` and indexed with the
    module-level ``embeddings`` object.

    Args:
        video_url: URL of the YouTube video whose transcript is indexed.
        chunk_size: ``chunk_size`` handed to the text splitter. Defaults to
            2000, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If the loader returns no transcript documents.
    """
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()
    if not transcript:
        # Nothing to index: report the problem here, together with the
        # offending URL, instead of trying to build an index from nothing.
        raise ValueError(f"No transcript could be loaded for {video_url!r}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(transcript)

    return FAISS.from_documents(docs, embeddings)

In [3]:
# Index the first video.
# NOTE(review): `db` is rebound by the create_db_and_transcript_* cell further
# down before any query cell uses it, so this index (and the embedding calls
# needed to build it) looks unused in a top-to-bottom run -- confirm.
video_url = "https://www.youtube.com/watch?v=fNxaJsNG3-s&list=PLQY2H8rRoyvzDbLUZkbudP-MFQZwNmU4S"
db = create_db_from_youtube_video_url(video_url=video_url)

In [10]:
def create_db_and_transcript_from_youtube_video_url(video_url, chunk_size=2000, chunk_overlap=100):
    """Load a YouTube transcript and build a FAISS vector store from it.

    Same pipeline as ``create_db_from_youtube_video_url`` (load, split,
    index with the module-level ``embeddings``), but the loaded transcript
    documents are returned as well so the caller can inspect the raw text.

    Args:
        video_url: URL of the YouTube video whose transcript is indexed.
        chunk_size: ``chunk_size`` handed to the text splitter. Defaults to
            2000, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        A ``(transcript, db)`` tuple: the documents returned by the loader
        (unsplit) and the vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If the loader returns no transcript documents.
    """
    loader = YoutubeLoader.from_youtube_url(video_url)
    transcript = loader.load()
    if not transcript:
        # Nothing to index: report the problem here, together with the
        # offending URL, instead of trying to build an index from nothing.
        raise ValueError(f"No transcript could be loaded for {video_url!r}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(transcript)

    db = FAISS.from_documents(docs, embeddings)
    return transcript, db

# Index the second video; this rebinds both `video_url` and `db`, so every
# query cell below runs against this transcript.
video_url = "https://www.youtube.com/watch?v=ZMudJXhsUpY&list=PLQY2H8rRoyvzDbLUZkbudP-MFQZwNmU4S&index=6"
transcript, db = create_db_and_transcript_from_youtube_video_url(video_url=video_url)

# Build a readable version of the transcript in three steps:
#   1. join the text of every loaded Document with single spaces,
#   2. turn the raw line breaks into spaces,
#   3. start a new paragraph after every ". " (no punctuation is added; the
#      existing sentence ends are only followed by a blank line).
transcript_string = (
    " ".join(doc.page_content for doc in transcript)
    .replace("\n", " ")
    .replace(". ", ".\n\n")
)

print(transcript_string)


LAURENCE MORONEY: Through this series so far, you've been learning the basics of NLP using TensorFlow.

You saw how to tokenize and then sequence text, preparing it to train neural networks.

You saw how sentiment in text can be represented with embeddings and how the semantics of text over long stretches might be learned using recurrent neural networks and LSTMs.

In this video, we'll put all of that together into a fun scenario.

We'll create a model and train it on the lyrics of traditional Irish songs.

From that, you'll see then if it can write its own poetry using those words.

Let's look at the steps involved.

First of all, this is our text.

Within the entire corpus are the lyrics to lots of Irish songs.

One of them, "Lanigan's Ball," is listed here, and you can see these words have a very distinctive style.

If we were to read them in, we could do it something like this.

And for simplicity, I'll just use this one song for now.

It's stored as a single string with slash n's 

In [None]:
def get_response_from_query(db, query, k=4, model_name="gpt-3.5-turbo-16k", temperature=0.2):
    """Answer a question about a video from the most relevant transcript chunks.

    The ``k`` chunks most similar to ``query`` are fetched from ``db``, joined
    into one string and placed in the system prompt; the question itself is
    sent as the human message.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` attribute.
        query: The question to ask about the video.
        k: Number of transcript chunks to retrieve. Defaults to 4.
        model_name: Chat model passed to ``ChatOpenAI``. Defaults to
            "gpt-3.5-turbo-16k", the value that used to be hard-coded here.
        temperature: Sampling temperature passed to ``ChatOpenAI``. Defaults
            to 0.2, the value that used to be hard-coded here.

    Returns:
        A ``(response, docs)`` tuple: the model's answer flattened to a
        single line, and the retrieved documents it was based on.
    """
    docs = db.similarity_search(query, k=k)
    docs_page_content = " ".join(d.page_content for d in docs)

    chat = ChatOpenAI(model_name=model_name, temperature=temperature)

    # Template to use for the system message prompt
    template = """
        You are a helpful assistant that can answer questions about youtube videos
        based on the video's transcript: {docs}

        Only use the factual information from the transcript to answer the question.

        If you feel like you don't have enough information to answer the question, say "I don't know".

        """

    system_message_prompt = SystemMessagePromptTemplate.from_template(template)

    # Human question prompt
    human_template = "Answer the following question: {question}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt]
    )

    # NOTE(review): LLMChain / .run() are reported as deprecated in recent
    # LangChain releases -- confirm against the installed version before
    # migrating to the runnable (`prompt | llm`) style.
    chain = LLMChain(llm=chat, prompt=chat_prompt)

    response = chain.run(question=query, docs=docs_page_content)
    # Flatten to one line by collapsing every whitespace run (newlines
    # included) into a single space. Deleting the newlines outright would
    # glue together the words on either side of each line break.
    response = " ".join(response.split())
    return response, docs

In [None]:
# NOTE(review): this rebinds ChatOpenAI, shadowing the import from
# langchain.chat_models in the first cell. get_response_from_query looks the
# name up when it is called, so from here on it uses this class. Consider
# moving the import to the top cell.
from langchain_openai import ChatOpenAI

# Ask for a high-level summary of the indexed video and print the answer
# wrapped at 50 columns.
query = "what is this video about?"
response, docs = get_response_from_query(db=db, query=query)
print(textwrap.fill(text=response, width=50))

In [None]:
# Ask the model for the whole transcript. get_response_from_query only puts
# the k most similar chunks (k defaults to 4) into the prompt, so the model
# sees those chunks rather than the complete transcript.
query = "show me the full transcript"
response, docs = get_response_from_query(db=db, query=query)
print(textwrap.fill(text=response, width=50))

In [None]:
# Ask what was said about AGI.
# NOTE(review): `db` was last built from the ZMudJXhsUpY video, whose printed
# transcript is about NLP with TensorFlow; this question may be left over from
# a different video -- confirm.
query = "what did talk about on AGI?"
response, docs = get_response_from_query(db=db, query=query)
print(textwrap.fill(text=response, width=50))

In [None]:
# Ask which topic takes up most of the video; the answer is printed wrapped
# at 50 columns.
query = "On what topic they spend most time?"
response, docs = get_response_from_query(db=db, query=query)
print(textwrap.fill(text=response, width=50))

In [None]:
# Ask who is speaking in the video; the answer is printed wrapped at 50
# columns.
query = "who are the speakers in this video?"
response, docs = get_response_from_query(db=db, query=query)
print(textwrap.fill(text=response, width=50))

In [None]:
# Ask what is said about Microsoft.
# NOTE(review): `db` was last built from the ZMudJXhsUpY video, whose printed
# transcript is about NLP with TensorFlow; this question may be left over from
# a different video -- confirm.
query = "what are they saying about Microsoft?"
response, docs = get_response_from_query(db=db, query=query)
print(textwrap.fill(text=response, width=50))

In [None]:
# Print the version string of the Python interpreter running this kernel.
# NOTE(review): this import sits in the last cell rather than with the other
# imports in the first cell.
import sys
print(sys.version)