# Task: MeetMind



## 1. Requirements:

In [2]:
# ! pip install -q langchain langchain_chroma langchain_community langchain_core langchain_huggingface transformers
# ! pip install -qU langchain_huggingface
# ! pip install pytube
# ! pip install --upgrade --quiet  youtube-transcript-api

# !pip install -qU langchain_pinecone
# !pip install -qU langchain_groq

## 2. Loading Libraries 

In [2]:
import os
import re
import time
from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.llms import HuggingFaceHub
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import YoutubeLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import HypotheticalDocumentEmbedder
from dataclasses import dataclass
from langchain.schema import Document


ModuleNotFoundError: No module named 'langchain'

## 3. Links

In [4]:
@dataclass
class Credentials:
    """Configuration values shared by the notebook cells.

    The API keys are read from environment variables when this class is
    defined, so no secret is stored in the notebook. Export the variables
    (or populate ``os.environ`` from a ``.env`` file) *before* running this
    cell; a missing variable yields an empty string.

    Attributes:
        token: Hugging Face API token, taken from ``HUGGINGFACEHUB_API_TOKEN``.
        yt_link: URL of the YouTube video whose transcript is analysed.
        groq_api: Groq API key, taken from ``GROQ_API_KEY``.
    """

    # SECURITY: the keys that used to be hardcoded here were committed in
    # plain text and must be considered leaked -- revoke and rotate them.
    token: str = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")
    yt_link: str = "https://www.youtube.com/live/UrfmlqHgZLA?si=gGsUOyKGz2zmltr9"
    groq_api: str = os.environ.get("GROQ_API_KEY", "")


## 4. Loading text from video

In [6]:
def get_word_count_from_youtube_url(yt_link):
    """Load the transcript of a YouTube video and count its words.

    Any exception raised while loading or counting is printed to stdout and
    reported to the caller through the ``(0, None)`` return value rather
    than being propagated.

    Args:
        yt_link (str): The URL of the YouTube video.

    Returns:
        tuple: ``(word_count, docs)``. ``word_count`` is the number of
        whitespace-separated tokens in the first loaded document and
        ``docs`` is the value returned by the loader's ``load()``.
        On failure the tuple is ``(0, None)``.
    """
    try:
        transcript_docs = YoutubeLoader.from_youtube_url(yt_link).load()
        first_transcript = transcript_docs[0].page_content
        n_words = len(first_transcript.split())
    except Exception as e:
        print(f"Error loading YouTube video: {e}")
        return 0, None

    return n_words, transcript_docs
    
    
word_count, docs = get_word_count_from_youtube_url(Credentials.yt_link)

# Fail fast: the loader signals an error by returning ``(0, None)``. Stopping
# here gives a clear message instead of a confusing ``TypeError`` when a later
# cell indexes ``docs[0]``.
if docs is None:
    raise RuntimeError("Could not load the YouTube transcript; see the error printed above.")

print(f"Number of words: {word_count}")

number of words: 13356


In [7]:
# Cleaning the text using regular expressions
def clean_text(text):
    """Strip disallowed characters from ``text`` and normalise its whitespace.

    Only ASCII letters, digits, whitespace and the symbols ``. , ? ! - ( )``
    are kept. Afterwards every run of whitespace is collapsed into a single
    space (leading/trailing whitespace is collapsed too, not removed).

    Args:
        text: The input text to be cleaned.

    Returns:
        The cleaned text.
    """
    # Negated character class: it matches everything that is NOT on the keep-list.
    disallowed_chars = re.compile(r"[^a-zA-Z0-9\s\.\,\?\!\-\(\)]")
    whitespace_run = re.compile(r'\s+')

    without_symbols = disallowed_chars.sub('', text)
    return whitespace_run.sub(' ', without_symbols)

# Clean the raw transcript first, then wrap the cleaned string in a Document.
# A separate name is used for the string so ``content`` only ever refers to
# the Document.
cleaned_transcript = clean_text(docs[0].page_content)
content = Document(page_content=cleaned_transcript)


In [8]:
# Save the cleaned transcript to a text file.
# The path is built with os.path.join so it works on every OS; the previous
# literal relied on a backslash, which is an invalid escape sequence and only
# acts as a directory separator on Windows.
transcript_path = os.path.join("texts", "lex-elon.txt")
# Create the target folder so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(transcript_path), exist_ok=True)

# ``content`` is a Document, so the text itself lives in ``page_content``.
with open(transcript_path, "w", encoding="utf-8") as obj:
    obj.write(content.page_content)

print(len(content.page_content.split()))

13356


## 5. Chunking and converting into embeddings

In [9]:
# Embedding model served through the Hugging Face endpoint API.
# The token is read from ``Credentials``; a bare ``token`` name is not defined
# anywhere in the notebook and raised NameError on a fresh kernel.
EmbedModel = HuggingFaceEndpointEmbeddings(
    model="intfloat/multilingual-e5-large",
    huggingfacehub_api_token=Credentials.token,
)


# Alternative embedding backend, kept for reference. The API key must come
# from the environment -- never paste it into the notebook.
# from langchain_pinecone import PineconeEmbeddings
# EmbedModel = PineconeEmbeddings(model="multilingual-e5-large", pinecone_api_key=os.environ["PINECONE_API_KEY"])

In [10]:
# Splitter with the library's default chunk size and overlap.
text_splitter = RecursiveCharacterTextSplitter()

# ``split_documents`` iterates over its argument and expects Document objects,
# so the single transcript Document has to be wrapped in a list.
chunck_list = text_splitter.split_documents([content])

## 6. Loading chunked docs to vector DB

In [11]:
# Number of chunks produced by the splitter (shown as the cell output).
len(chunck_list)

18

In [12]:
# Embed every chunk with ``EmbedModel`` and store the vectors in a Chroma
# collection that uses ./chroma_db as its persistence directory.
vector_store = Chroma.from_documents(
    documents=chunck_list,
    embedding=EmbedModel,
    persist_directory="./chroma_db",
)
# To reopen the already-persisted store without re-embedding, use instead:
# vector_store = Chroma(persist_directory="./chroma_db", embedding_function=EmbedModel)

## 7. Load the LLM

In [13]:
from langchain_groq import ChatGroq

# Chat model served by Groq.
# The key is read from ``Credentials``; a bare ``groq_api`` name is not defined
# anywhere in the notebook and raised NameError on a fresh kernel.
# (``os`` is already imported in the imports cell, so it is not re-imported here.)
llm = ChatGroq(
    groq_api_key=Credentials.groq_api,
    model_name="Llama3-8b-8192",
)

In [14]:
# llm = HuggingFaceEndpoint(
#     repo_id="mistralai/Mistral-7B-Instruct-v0.3",
#     task="text-generation",
#     max_new_tokens=3000,
#     do_sample=False,
#     huggingfacehub_api_token=token
    
# )

## 8. Creating the prompt templates

In [15]:
# Prompt for the question-rewriting step: given the chat history and the
# latest user message, the LLM is asked to restate the message as a standalone
# question. It is explicitly told not to answer it.
# Message order: system instruction, then the stored history, then the new input.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is."""),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [16]:
# In-memory registry of conversations: session id -> message history.
chat_history_store = {}


def get_chat_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``chat_history_store``.

    Returns:
        The history object registered for the session. A new, empty
        ``ChatMessageHistory`` is registered and returned when the session
        has not been seen before.
    """
    try:
        return chat_history_store[session_id]
    except KeyError:
        new_history = ChatMessageHistory()
        chat_history_store[session_id] = new_history
        return new_history

In [17]:
# Prompt for the answering step. ``{context}`` and ``{input}`` are the two
# template variables: the text placed between the <context> tags and the
# question to answer. The model is instructed to rely on that context only.
qa_prompt_template = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
I will tip you $1000 if the user finds the answer helpful.
<context>
{context}
</context>
Question: {input}
""")

## 9. Creating chains

In [18]:
# Answer generation: the documents are combined through ``qa_prompt_template``
# and sent to the LLM.
question_answer_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=qa_prompt_template,
)

# Retrieval: wrap the vector-store retriever with the LLM and the
# contextualization prompt.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=vector_store.as_retriever(),
    prompt=contextualize_q_prompt,
)

# Full RAG pipeline: retrieve, then answer.
retrieval_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

# Conversational wrapper: histories are looked up per session through
# ``get_chat_session_history`` and exposed to the chain as "chat_history".
conversational_rag_chain = RunnableWithMessageHistory(
    retrieval_chain,
    get_chat_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [19]:
# Question sent to the chain in the inference cell.
user_question = "Highlight the important points in the podcast!"
# Session key passed to the chain's config; the chat history is stored per session id.
session_id = "u1"


## 10. Inference 

In [20]:
# To ask interactively instead, read the question from the user:
# user_question = input("Enter: ")

# The session id selects which stored chat history the chain reads and updates.
invoke_config = {"configurable": {"session_id": session_id}}
response = conversational_rag_chain.invoke({"input": user_question}, config=invoke_config)
print(response["answer"])

Based on the provided context, here are the important points:

1. **Full Tex search**: The speaker mentions that using old-style full Tex search to find technical terms can improve recall.
2. **Repository links**: The speaker shares links to the repository, a blog post, and an older blog post, and encourages the audience to clone the repository, submit bugs, and ask questions on the GitHub discussion thread.
3. **Data generation and fine-tuning**: The speaker discusses the process of data generation and fine-tuning, including the generation of a dataset, reformating files, and uploading to the fine-tuning API.
4. **Splitting data**: The speaker explains the importance of splitting data into training, validation, and evaluation sets in machine learning.
5. **Fining deployment and evaluation**: The speaker provides an end-to-end demonstration of how to do data generation, fine-tuning, deployment, and evaluation fully as code.

These points highlight the main topics discussed in the podca