In [None]:
!pip install openai   # Install the OpenAI library
!pip install langchain   # Install the LangChain library
!pip install langchain_community   # Install the LangChain Community library
!pip install faiss-cpu   # Install the FAISS library for efficient similarity search
!pip install pypdf   # Install the PyPDF library for PDF processing
!pip install tiktoken   # Install the TikToken library for tokenization

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp39-cp39-win_amd64.whl.metadata (5.2 kB)
Downloading faiss_cpu-1.12.0-cp39-cp39-win_amd64.whl (18.2 MB)
   ---------------------------------------- 0.0/18.2 MB ? eta -:--:--
   - -------------------------------------- 0.5/18.2 MB 8.5 MB/s eta 0:00:03
   -- ------------------------------------- 1.0/18.2 MB 3.0 MB/s eta 0:00:06
   -- ------------------------------------- 1.0/18.2 MB 3.0 MB/s eta 0:00:06
   -- ------------------------------------- 1.3/18.2 MB 1.7 MB/s eta 0:00:11
   -- ------------------------------------- 1.3/18.2 MB 1.7 MB/s eta 0:00:11
   -- ------------------------------------- 1.3/18.2 MB 1.7 MB/s eta 0:00:11
   -- ------------------------------------- 1.3/18.2 MB 1.7 MB/s eta 0:00:11
   --- ------------------------------------ 1.6/18.2 MB 964.5 kB/s eta 0:00:18
   ---- ----------------------------------- 1.8/18.2 MB 987.4 kB/s eta 0:00:17
   ----- ---------------------------------- 2.4/18.2 MB 1.1 MB/s eta 0:00:15

Installing libraries

In [None]:
import openai
from langchain.document_loaders import PyPDFLoader # For loading PDF documents
from langchain.embeddings.openai import OpenAIEmbeddings # For generating embeddings using OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter # For splitting text into smaller chunks

In [11]:
from langchain.llms import OpenAI # For interacting with OpenAI's language models
import os # For accessing environment variables
import re # For regular expressions
from dotenv import load_dotenv # For loading environment variables from a .env file
load_dotenv() # Load environment variables from .env file
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") # Set the OpenAI API key from environment variable

In [None]:
from langchain_community.chat_models import ChatOpenAI # This library is used to interact with OpenAI's chat models

llm = ChatOpenAI(temperature=0.9, model_name="gpt-3.5-turbo") # Initialize the OpenAI language model with specified parameters

llm.invoke("Explain RAG in simple terms. In terms of its components, RAG consists of three main parts: retrieval, augmentation, and generation.") # Invoke the language model to explain RAG in simple terms

AIMessage(content='RAG stands for retrieval-augmented generation, a type of model in artificial intelligence that combines information retrieval and natural language generation techniques. \n\nRetrieval refers to the process of finding relevant information from a large dataset, while augmentation involves enhancing this information with additional context or details. Generation then uses this combined information to create a new output, such as a text response or recommendation. \n\nBy integrating these three components, RAG models are able to generate more accurate and contextually relevant responses compared to traditional generation models that rely solely on generating text from scratch.', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 111, 'prompt_tokens': 36, 'total_tokens': 147, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_

Using custom dataset

RecursiveCharacterTextSplitter is a text splitter that splits the text into chunks, trying to keep paragraphs togeher and avoid loosing context over pages

In [18]:
from langchain.vectorstores import FAISS   # For creating a vector store using FAISS. FAISS is a library for efficient similarity search and clustering of dense vectors.
from langchain.embeddings import OpenAIEmbeddings   # For generating embeddings using OpenAI. Embeddings are numerical representations of text that capture semantic meaning.

In [39]:
from langchain_community.document_loaders import PyPDFLoader # For loading PDF documents. 
from langchain.text_splitter import RecursiveCharacterTextSplitter # For splitting text into smaller chunks.
from langchain_community.vectorstores import Chroma # For creating a vector store using Chroma. Chroma is a vector database that allows for efficient storage and retrieval of vector embeddings.   
from langchain_openai import OpenAIEmbeddings, ChatOpenAI # For generating embeddings and interacting with OpenAI's chat models.
from langchain.chains import ConversationalRetrievalChain # For creating a conversational retrieval chain.
from langchain.prompts import PromptTemplate # For creating prompt templates.
from dotenv import load_dotenv # For loading environment variables from a .env file.
import os

In [40]:
# Loading PDF
pdf_path = r"C:\Users\allif\Downloads\GenAI-Project-ResearchRagCHAT\RAGPaper.pdf" # Path to the PDF file.
loader = PyPDFLoader(pdf_path) # Initialize the PDF loader with the specified file path.
documents = loader.load() # Load the PDF document into memory.

# Splitting into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Split documents into chunks of 1000 characters with 200 characters overlap.
chunks = splitter.split_documents(documents) # Split the loaded documents into smaller chunks for better processing.

print(f" Loaded {len(chunks)} text chunks from PDF")




 Loaded 92 text chunks from PDF


In [None]:
# Create embeddings and vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(chunks, embeddings)

# Initialize LLM
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)

# Create retrieval chain
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.

Chat History:
{chat_history}
Follow up Input: {question}
Standalone question:
""")

# Create the ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    return_source_documents=True,
    verbose=False
)

# Ask question about the PDF
chat_history = []
query = "Summarize the key points from the RAG paper."
result = qa.invoke({"question": query, "chat_history": chat_history})

# Print the answer
print("\ Answer:")
print(result["answer"])


\ Answer:
The RAG paper discusses the effectiveness of the Retrieval-Augmented Generation (RAG) model in question answering tasks. Key points include:
1. RAG can generate correct answers even when the answer is not in any retrieved document, achieving 11.8% accuracy in such cases.
2. RAG outperforms BART on Open MS-MARCO NLG by 2.6 Bleu points and 2.6 Rouge-L points.
3. RAG models generate factually correct text more often than BART and hallucinate less.
4. RAG generations are more diverse than BART generations.
5. RAG models are more factual and specific than BART for Jeopardy question generation.
6. RAG's retrieval mechanism improves results for all tasks, with a gold article present in the top 10 retrieved articles in 90% of cases.


In [42]:
# Asking another question
print("\n Sources used:")
for doc in result["source_documents"]:
    print("→", doc.metadata.get("source", "Unknown"), "| snippet:", doc.page_content[:150])


 Sources used:
→ C:\Users\allif\Downloads\GenAI-Project-ResearchRagCHAT\RAGPaper.pdf | snippet: to more effective marginalization over documents. Furthermore, RAG can generate correct answers
even when the correct answer is not in any retrieved d
→ C:\Users\allif\Downloads\GenAI-Project-ResearchRagCHAT\RAGPaper.pdf | snippet: to more effective marginalization over documents. Furthermore, RAG can generate correct answers
even when the correct answer is not in any retrieved d
→ C:\Users\allif\Downloads\GenAI-Project-ResearchRagCHAT\RAGPaper.pdf | snippet: in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.
4.5 Additional Results
Generation Diversity Section 4
→ C:\Users\allif\Downloads\GenAI-Project-ResearchRagCHAT\RAGPaper.pdf | snippet: in 71% of cases, and a gold article is present in the top 10 retrieved articles in 90% of cases.
4.5 Additional Results
Generation Diversity Section 4
