## Loading the YouTube Video Transcript

In [1]:
from langchain.document_loaders import TextLoader

# Path to the timestamped transcript of the video.
# Replace with your actual file path.
file_path = "/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final/stats1 transcript.txt"

# Initialize the loader. The encoding is stated explicitly because the
# transcript contains non-ASCII characters (non-breaking spaces); without it
# the file is decoded with the platform default, which is not UTF-8 everywhere.
loader = TextLoader(file_path, encoding="utf-8")

# Load the document (a list of Document objects)
docs = loader.load()

docs[0]

Document(metadata={'source': '/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final/stats1 transcript.txt'}, page_content="00:00:00.000 Hey team, Justin Zeltzer here from\xa0zstatistics.com, where today I'm\xa0\xa0\n00:00:05.430 responding to a challenge that was issued to me.\xa0Someone asked me if I could explain statistics\xa0\xa0\n00:00:11.160 to them in under half an hour. While initially\xa0I thought that was a bit of an ambitious ask,\xa0\xa0\n00:00:17.250 I thought no that's actually a really good\xa0challenge, and one that I might do for everybody.\xa0\xa0\n00:00:23.280 So this is it! An introduction to statistics,\xa0with no maths, and done in under half an hour.\xa0\xa0\n00:00:30.300 Now you can probably see that the the timing of\xa0this video is a bit longer than that, but it is\xa0\xa0\n00:00:36.480 because I bunged on a little extra section at\xa0the end- which is a bit of an optional extra,\xa0\xa0\n00:00:41.220 but I think I get most of it done in under half\xa0an

Split Chunks

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Cut the transcript into overlapping chunks: a target size of 1000 with
# an overlap of 150 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
splits = text_splitter.split_documents(docs)

In [3]:
# Inspect the first chunk produced by the splitter.
print(splits[0])


page_content='00:00:00.000 Hey team, Justin Zeltzer here from zstatistics.com, where today I'm  
00:00:05.430 responding to a challenge that was issued to me. Someone asked me if I could explain statistics  
00:00:11.160 to them in under half an hour. While initially I thought that was a bit of an ambitious ask,  
00:00:17.250 I thought no that's actually a really good challenge, and one that I might do for everybody.  
00:00:23.280 So this is it! An introduction to statistics, with no maths, and done in under half an hour.  
00:00:30.300 Now you can probably see that the the timing of this video is a bit longer than that, but it is  
00:00:36.480 because I bunged on a little extra section at the end- which is a bit of an optional extra,  
00:00:41.220 but I think I get most of it done in under half an hour. But the idea is for you to develop your  
00:00:47.280 intuition around statistics, so it's great for those people who are just enrolling in a' metadata={'source': '/Users/emiliodu

Move Timestamps into Metadata

In [4]:
import re
from langchain.schema import Document

# Regex to match timestamps like 00:01:05.438 (the millisecond part is optional)
timestamp_pattern = r"\d{2}:\d{2}:\d{2}(?:\.\d{3})?"
# Compile once instead of re-parsing the pattern for every chunk.
timestamp_re = re.compile(timestamp_pattern)

updated_docs = []

# Most recent timestamp seen so far. It is used for a chunk that contains no
# timestamp of its own, so that start_time/end_time are never None: a None
# value would later fail in timestamp_to_seconds(), which calls ts.split(":").
last_seen_time = "00:00:00.000"

for doc in splits:
    text = doc.page_content

    # Find all timestamps in this chunk
    timestamps = timestamp_re.findall(text)

    # Strip all timestamps from the text that will be embedded
    cleaned_text = timestamp_re.sub('', text).strip()

    # First and last timestamps of the chunk, or the carried-over value when
    # the chunk has none.
    start_time = timestamps[0] if timestamps else last_seen_time
    end_time = timestamps[-1] if timestamps else last_seen_time
    last_seen_time = end_time

    # Copy the existing metadata and add the time range; the original
    # document's metadata dict is left untouched.
    updated_metadata = {
        **doc.metadata,
        "start_time": start_time,
        "end_time": end_time,
    }

    updated_docs.append(Document(page_content=cleaned_text, metadata=updated_metadata))

updated_docs[0]

Document(metadata={'source': '/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final/stats1 transcript.txt', 'start_time': '00:00:00.000', 'end_time': '00:00:47.280'}, page_content="Hey team, Justin Zeltzer here from\xa0zstatistics.com, where today I'm\xa0\xa0\n responding to a challenge that was issued to me.\xa0Someone asked me if I could explain statistics\xa0\xa0\n to them in under half an hour. While initially\xa0I thought that was a bit of an ambitious ask,\xa0\xa0\n I thought no that's actually a really good\xa0challenge, and one that I might do for everybody.\xa0\xa0\n So this is it! An introduction to statistics,\xa0with no maths, and done in under half an hour.\xa0\xa0\n Now you can probably see that the the timing of\xa0this video is a bit longer than that, but it is\xa0\xa0\n because I bunged on a little extra section at\xa0the end- which is a bit of an optional extra,\xa0\xa0\n but I think I get most of it done in under half\xa0an hour. But the idea is for you to devel

Embeddings

In [5]:
from sentence_transformers import SentenceTransformer
# Load the "BAAI/bge-large-en" model through sentence-transformers.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook (the vector store is built with HuggingFaceEmbeddings) — confirm
# whether this cell is still needed.
model = SentenceTransformer("BAAI/bge-large-en")

  from .autonotebook import tqdm as notebook_tqdm


Delete old Vectorstores

In [6]:
import os
import shutil

persist_directory = "/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final"

# Names (relative to persist_directory) of Chroma artifacts to remove
files_to_delete = ["chroma.sqlite3"]

for item in files_to_delete:
    path = os.path.join(persist_directory, item)

    # Regular file: unlink it.
    if os.path.isfile(path):
        os.remove(path)
        print(f"Deleted file: {item}")
        continue

    # Directory: remove it together with its contents.
    if os.path.isdir(path):
        shutil.rmtree(path)
        print(f"Deleted folder: {item}")
        continue

    # Neither a file nor a directory exists at this path.
    print(f"Not found: {item}")


Not found: chroma.sqlite3


Vectorstores

In [7]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


In [8]:
# Embedding function handed to the vector store; uses the "BAAI/bge-large-en" model.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

  embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")


In [9]:
# Directory where Chroma persists the vector store (same value as in the cleanup cell).
persist_directory = "/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final"

In [10]:
# Embed the timestamp-annotated chunks and store them in a Chroma collection
# under persist_directory.
vectordb = Chroma.from_documents(
    documents=updated_docs,
    embedding=embedding_model,
    persist_directory=persist_directory,
)

LLM Application

In [11]:
import openai
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


In [None]:
import getpass
import os

# Never hardcode the key in the notebook. Read it from the OPENAI_API_KEY
# environment variable and fall back to an interactive prompt when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [13]:
# Chat model used by the QA chain, created with temperature 0.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(
    model_name=llm_name,
    temperature=0,
    openai_api_key=openai.api_key,
)

  llm = ChatOpenAI(model_name = llm_name, temperature = 0, openai_api_key = openai.api_key)


In [14]:
# Prompt text for the QA chain. It asks for an answer of at most three
# sentences that ends with "thanks for asking!", and tells the model to say
# "I don't know" when it has no answer. {context} and {question} are the
# template's placeholders.
template = """Use the following pieces of context to answer the question at the end
If you don't know the answer, just say "I don't know". For the response, use three sentences max.
Keep the answer as concise as possible. Always say "thanks for asking!" at the end
of the answer.
Context: {context}
Question: {question}"""

# Build the PromptTemplate object from the text above.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [15]:
# Retrieval-augmented QA chain: documents come from the Chroma retriever and
# are passed to the LLM with the custom prompt. The retrieved source documents
# are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)

In [16]:
# Smoke-test the chain with a sample question.
question = "What is Hypothesis Testing"
# Use .invoke() rather than calling the chain object directly; the direct
# call form is deprecated and emits a warning.
result = qa_chain.invoke({"query": question})
result["result"]

  result = qa_chain({"query" : question})


'Hypothesis testing is a common topic in statistics where we seek evidence to support an alternate hypothesis by comparing it to a null hypothesis. The p-value in hypothesis testing measures how extreme our sample data is, helping us determine whether to reject the null hypothesis. Thanks for asking!'

In [17]:
# Show the three stored chunks the vector store ranks closest to the question.
vectordb.similarity_search(question, k=3)


[Document(metadata={'source': '/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final/stats1 transcript.txt', 'start_time': '00:21:54.950', 'end_time': '00:22:47.720'}, page_content="deviation is given the symbol s P is generally\xa0used for proportion R for correlation and B for\xa0\xa0\n the gradient so be prepared to see all of these\xa0particular lowercase Roman numerals to represent\xa0\xa0\n the sample values that estimate these parameters\xa0provided in Greek but I will say be prepared also\xa0\xa0\n for your statistics textbook to break all of those\xa0rules because this despite them being conventions\xa0\xa0\n sometimes you'll find they don't stick to them\xa0annoyingly all right so with that under our belt\xa0\xa0\n let's go and have a look at a very common topic\xa0in statistics called hypothesis testing now I'm\xa0\xa0\n gonna start you off with an example rather than\xa0give you some kind of hypothetical definition\xa0\xa0\n here but using the data we've just seen is t

Cut the matching clip from the downloaded YouTube video

In [18]:
from IPython.display import Video
from moviepy import VideoFileClip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
def timestamp_to_seconds(ts):
    """Convert an "HH:MM:SS" or "HH:MM:SS.mmm" string to seconds as a float."""
    hours, minutes, seconds = ts.split(":")
    whole_seconds = int(hours) * 3600 + int(minutes) * 60
    return whole_seconds + float(seconds)

def ask_question(question, input_path, output_path="output_clip.mp4"):
    """Answer a question from the transcript and show the matching video clip.

    Runs ``qa_chain`` on ``question`` and prints the answer. It then cuts
    ``input_path`` between the ``start_time`` and ``end_time`` stored in the
    metadata of the first returned source document, writes the clip to
    ``output_path`` and displays it.

    Args:
        question: Question text passed to the QA chain.
        input_path: Path of the source video file to cut the clip from.
        output_path: File the clip is written to (default "output_clip.mp4").

    Returns:
        None. The answer is printed and the clip is displayed as a side effect.
    """
    # Imported locally so the function does not rely on `display` having been
    # injected into the namespace by the notebook front end.
    from IPython.display import display

    # .invoke() replaces the deprecated direct call on the chain object.
    result = qa_chain.invoke({"query": question})
    print("Output:", result['result'])

    sources = result['source_documents']
    if not sources:
        print("No source documents were retrieved; skipping the clip.")
        return

    # The clip boundaries come from the first source document only.
    source_md = sources[0].metadata
    start_time = source_md.get('start_time')
    end_time = source_md.get('end_time')
    if start_time is None or end_time is None:
        print("The top source document has no timestamps; skipping the clip.")
        return

    start_sec = timestamp_to_seconds(start_time)
    end_sec = timestamp_to_seconds(end_time)

    # The context manager closes the video reader once the clip is written.
    with VideoFileClip(input_path) as video:
        clip = video.subclipped(start_sec, end_sec)
        clip.write_videofile(output_path, codec="libx264", audio_codec="aac")

    print("Clip:\n")
    # A bare `Video(...)` expression inside a function is never rendered;
    # it has to be passed to display() explicitly.
    display(Video(output_path, embed=True))

In [20]:
# Local copy of the video that the transcript belongs to.
video_path = "/Users/emiliodulay/Documents/DSU/Study Buddy - Stats Final/intro stats.mp4"

# Ask a question and cut the matching segment out of the video.
ask_question(
    question="Can you explain what hypothesis testing is?",
    input_path=video_path,
)

Output: Hypothesis testing is a statistical method used to determine if there is enough evidence to support a claim or hypothesis. It involves setting up a null hypothesis (the opposite of what is being tested) and an alternate hypothesis (what is being tested for), and then analyzing data to see if the null hypothesis can be rejected. Thanks for asking!


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

MoviePy - Building video output_clip.mp4.
MoviePy - Writing audio in output_clipTEMP_MPY_wvf_snd.mp4


chunk:   0%|          | 0/1134 [00:00<?, ?it/s, now=None]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


MoviePy - Done.
MoviePy - Writing video output_clip.mp4



                                                                           

MoviePy - Done !
MoviePy - video ready output_clip.mp4
Clip:

