# Phase 1 - Creating a Vector Database

In [1]:
# Import necessary libraries for subtitle processing, LangChain, ChromaDB, and CrewAI
import ast
import os
import time

import pandas as pd
import pysrt
from crewai import Agent, Task, Crew
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from tqdm import tqdm

In [3]:
def resolve_openai_api_key(placeholder='your-api-key-here'):
    """Return the OpenAI API key to use for this notebook.

    A non-empty ``OPENAI_API_KEY`` already present in the environment wins, so
    a real key never has to be pasted into the notebook. ``placeholder`` is
    returned only when the variable is unset or empty.
    """
    return os.environ.get('OPENAI_API_KEY') or placeholder


openai_api_key = resolve_openai_api_key()
# Re-export so libraries that look the key up in the environment see the same value.
os.environ['OPENAI_API_KEY'] = openai_api_key
# Presumably read by CrewAI to pick the agent's LLM — confirm against the CrewAI docs.
os.environ["OPENAI_MODEL_NAME"] = 'gpt-4o-mini'

### Data Collection and Preprocessing

In [4]:
# --- Part 1: Helper Function to Process SRT Files ---

def extract_text_from_srt(srt_path):
    """Return the subtitle text of an SRT file as one space-separated string.

    The file at ``srt_path`` is parsed with ``pysrt.open``; the ``text`` of
    every subtitle entry is collected in iteration order and the pieces are
    joined with a single space.
    """
    subtitle_texts = [entry.text for entry in pysrt.open(srt_path)]
    return " ".join(subtitle_texts)

In [5]:

# Define course names and their respective folder paths
# course_folders = {
#     "Introduction to Deep Learning using PyTorch": "/Users/apoorv/Desktop/AV/Code/GAI/AV_projects/Learners_Queries/Introduction_to_Deep_Learning_Using_Pytorch",
#     "Building Production-Ready RAG systems using LlamaIndex": "/Users/apoorv/Desktop/AV/Code/GAI/AV_projects/Learners_Queries/Building Production-Ready RAG systems using LlamaIndex",
#     "Introduction to LangChain - Building Generative AI Apps & Agents": "/Users/apoorv/Desktop/AV/Code/GAI/AV_projects/Learners_Queries/introduction_to_langchain_using_agentic_ai"
# }

# NOTE(review): these are absolute paths on the author's machine; point them at
# your own course folders before running.
course_folders = {
    "Biology": "/Users/apoorv/Desktop/AV/Code/GAI/YT_projects/Query Resolution/Biology",
    "Geography": "/Users/apoorv/Desktop/AV/Code/GAI/YT_projects/Query Resolution/Geography",
    "Mathematics": "/Users/apoorv/Desktop/AV/Code/GAI/YT_projects/Query Resolution/Mathematics"
}


def find_srt_files(folder_path):
    """Return the sorted paths of all .srt files under ``folder_path``.

    The folder is walked recursively with ``os.walk``. The extension check is
    case-insensitive, so ``.SRT`` files are included. Results are sorted
    because ``os.walk`` does not guarantee a listing order, and the order of
    the files decides how the transcripts are concatenated later.

    Returns an empty list when the folder has no subtitle files or does not
    exist (``os.walk`` yields nothing for a missing directory).
    """
    matches = []
    for root, _, files in os.walk(folder_path):
        matches.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(".srt")
        )
    return sorted(matches)


# Dictionary to store course names and their respective .srt file paths.
# Courses without any .srt file are left out.
course_srt_files = {}

# Iterate through course folder mappings
for course, folder_path in course_folders.items():
    srt_files = find_srt_files(folder_path)

    # Add to dictionary if there are .srt files
    if srt_files:
        course_srt_files[course] = srt_files

# Print or use the extracted dictionary
# print(course_srt_files)


### Chunking and Embedding Documents

In [6]:
# --- Part 2: Setup Persistent Vectorstore with Course SRT Files ---

# Define the persistent directory for ChromaDB (replace with your desired path)
persist_directory = "./sample_db"

# Embedding model used both when storing chunks and when querying them
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Splitter that cuts long transcripts into 1000-character chunks, with
# 200 characters shared between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# Chroma collection stored under `persist_directory`
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name="sample_course",
)

  embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
  vectorstore = Chroma(


### Storing in a Vector Database

In [7]:
# OpenAI Pricing (adjust based on the model being used)
COST_PER_1K_TOKENS = 0.0001  # Cost per 1K tokens for 'text-embedding-ada-002'
# NOTE(review): 750 tokens per 1000 characters looks high for English text
# (a common rule of thumb is roughly 4 characters per token) — confirm before
# relying on the printed estimate. The value is kept unchanged here.
TOKENS_PER_CHUNK_ESTIMATE = 750  # Approximate tokens per 1000-character chunk

# Running totals across every course added in this run
total_chunks = 0
total_tokens = 0
total_cost = 0

# Start timing
start_time = time.perf_counter()

# Add new courses to the vectorstore if they don't already exist
for course, srt_list in course_srt_files.items():
    # A course counts as stored when at least one document carries its name in
    # the "course" metadata; skipping it keeps re-runs from embedding (and
    # paying for) the same transcripts twice.
    existing_docs = vectorstore.get(where={"course": course})
    if existing_docs['ids']:
        print(f"Course already exists: {course}")
        continue

    # Merge all transcripts of the course into one document, then chunk it
    srt_texts = [extract_text_from_srt(srt) for srt in srt_list]
    course_text = "\n\n\n\n".join(srt_texts)  # Join SRT texts with four new lines
    doc = Document(page_content=course_text, metadata={"course": course})
    chunks = text_splitter.split_documents([doc])

    # Nothing to embed (e.g. empty subtitle files): do not send an empty batch
    if not chunks:
        print(f"No subtitle text found for course: {course}")
        continue

    # Estimate cost before adding documents
    chunk_count = len(chunks)
    batch_tokens = chunk_count * TOKENS_PER_CHUNK_ESTIMATE
    batch_cost = (batch_tokens / 1000) * COST_PER_1K_TOKENS
    total_chunks += chunk_count
    total_tokens += batch_tokens
    total_cost += batch_cost

    vectorstore.add_documents(chunks)
    print(f"Added course: {course} (Chunks: {chunk_count}, Cost: ${batch_cost:.4f})")

# End timing
end_time = time.perf_counter()

# Display cost and time
print("\nCourse Embeddings Update Completed! 🚀")
print(f"Total Chunks Processed: {total_chunks}")
print(f"Estimated Total Tokens: {total_tokens}")
print(f"Estimated Cost: ${total_cost:.4f}")
print(f"Total Time Taken: {end_time - start_time:.2f} seconds")


Added course: Biology (Chunks: 8, Cost: $0.0006)
Added course: Geography (Chunks: 7, Cost: $0.0005)
Added course: Mathematics (Chunks: 5, Cost: $0.0004)

Course Embeddings Update Completed! 🚀
Total Chunks Processed: 20
Estimated Total Tokens: 15000
Estimated Cost: $0.0015
Total Time Taken: 6.08 seconds


### Query Understanding and Retrieval

In [8]:
# Define retrieval tool with metadata filtering
def retrieve_course_materials(query: str, course=None, k: int = 3):
    """Retrieve the course material most similar to ``query``.

    Args:
        query: Free-text question to search the vectorstore with.
        course: Course name to restrict the search to, matched against the
            "course" metadata of the stored chunks. ``None`` (the default)
            searches without a metadata filter. The default is no longer tied
            to whatever a module-level ``course`` loop variable happened to
            hold when this function was defined.
        k: Maximum number of chunks to return (default 3).

    Returns:
        The ``page_content`` of the matching chunks joined by blank lines;
        an empty string when nothing matches.
    """
    filter_dict = {"course": course} if course is not None else None
    results = vectorstore.similarity_search(query, k=k, filter=filter_dict)
    return "\n\n".join(doc.page_content for doc in results)

In [9]:
# Sanity check: fetch the Biology chunks closest to a sample question
question = "What is DNA?"
course_name = "Biology"
context = retrieve_course_materials(course=course_name, query=question)
print(context)

DNA, or deoxyribonucleic acid, is the fundamental molecule that carries genetic information in all living organisms. It is composed of two long strands forming a double helix, a structure first described by Watson and Crick. The strands are made up of nucleotide bases: adenine, thymine, cytosine, and guanine, which pair specifically. DNA replication ensures that genetic information is faithfully copied before cell division occurs. Mutations can occur during replication or due to environmental factors, affecting the sequence and function of genes. Cells have intricate repair mechanisms to correct errors, maintaining the integrity of the genome. Gene expression involves transcription of DNA into RNA, which is then translated into proteins that perform cellular functions. Genetic diversity arises from processes like recombination and mutation, driving evolution and adaptation. Modern biotechnology uses DNA sequencing and genetic engineering in fields such as medicine, agriculture, and

ar

#### Building an Agent

In [10]:
# Define the agent with a well-structured role and backstory.
# The role, goal and backstory strings are passed to Agent verbatim, so their
# exact wording is part of the runtime behaviour and is left untouched here.
query_answer_agent = Agent(
    role = "Learning Support Specialist",
    goal = "You help learners with their queries with the best possible response",
    backstory = """You lead the Learners Query resolution department of an Ed tech company focussed on self paced courses on K12 school topics. You respond to learner queries related to course content, assignments, technical and administrative issues. You are polite, diplomatic and take ownership of things which could be imporved in your oragnisation. 
    
    """,
    # presumably turns off the agent's step-by-step logging — confirm against the CrewAI docs
    verbose = False,
 
)

In [11]:
# Task prompt for answering one learner query.
# The {query}, {relevant_content} and {learner_name} placeholders match the
# keys of the `inputs` dict passed to `response_crew.kickoff(...)`.
query_answering_task  = Task(
    description= """
    Answer the learner queries to the best of your abilities. Try to keep your response concise with less than 100 words. 
    Here is the query: {query}

    Here is similar content from the course extracted from subtitles, which you should use only when required: {relevant_content} .  Since this content is extracted from course subtitles, there may be spelling errors, make sure to correct these, while using this information in your response.

    This is the full name of the learner: {learner_name}
    Address each learner by their first name, if you are not sure what the first name is, simply start with Hi. 
    Also mention some appropriate and encouraging comforting lines at the end of the reponse, like "hope you found this helpful", "I hope this information is useful. Keep up the great work!", "Glad to assist! Feel free to reach out anytime." etc. 

    If you are not sure about the answer mention - "Sorry, I am not sure about this, I will get back to you"

    """,
    expected_output = "A crisp accurate response to the query",
    # The task is handled by the learner-support agent defined earlier
    agent=query_answer_agent)

In [12]:
# Assemble a single-agent crew that runs the one query-answering task
response_crew = Crew(
    tasks=[query_answering_task],
    agents=[query_answer_agent],
    verbose=False,
)

Exception while exporting Span batch.
Traceback (most recent call last):
  File "/opt/anaconda3/envs/lc/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
  File "/opt/anaconda3/envs/lc/lib/python3.10/site-packages/urllib3/connectionpool.py", line 488, in _make_request
    raise new_e
  File "/opt/anaconda3/envs/lc/lib/python3.10/site-packages/urllib3/connectionpool.py", line 464, in _make_request
    self._validate_conn(conn)
  File "/opt/anaconda3/envs/lc/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1093, in _validate_conn
    conn.connect()
  File "/opt/anaconda3/envs/lc/lib/python3.10/site-packages/urllib3/connection.py", line 741, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
  File "/opt/anaconda3/envs/lc/lib/python3.10/site-packages/urllib3/connection.py", line 920, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
  File "/opt/anaconda3/envs/lc/lib/pytho

In [13]:
# Sample learner query against the Mathematics course
learner_name = "John"
course_name = "Mathematics"
question = "What are some types of probability distributions?"

# Fetch the closest transcript chunks for this course, then let the crew answer
context = retrieve_course_materials(course=course_name, query=question)
crew_inputs = {
    "query": question,
    "relevant_content": context,
    "learner_name": learner_name,
}
response_result = response_crew.kickoff(inputs=crew_inputs)

# Show the question, the generated answer and the retrieved context
print('Q: ', question)
print('\n')
print('A: ', response_result)
print('\n\n')
print('Context: \n', context)

Q:  What are some types of probability distributions?


A:  Hi John! Some common types of probability distributions include the binomial distribution, which models the number of successes in a fixed number of independent trials; the Poisson distribution, used for counting the number of events in a fixed interval; and the normal distribution, which represents data that clusters around a mean. Understanding these distributions can greatly aid in analyzing data and making predictions. I hope this information is useful. Keep up the great work!



Context: 
 Probability is the mathematical study of uncertainty and likelihood. It measures the chance of an event occurring, expressed as a number between 0 and 1. The probability of an event is calculated as the ratio of favorable outcomes to total possible outcomes. Basic probability concepts include independent and dependent events, conditional probability, and Bayes' Theorem. Random variables help in modeling real-world uncertainties, using d

#### To upgrade this system further: 
1. Explore different methods of chunking
2. Query Enhancement
3. Image Processing Capability
4. Finding different approaches to select relevant documents.
5. Include past discussions in the database.
