In [1]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from typing_extensions import Concatenate
from langchain_openai import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
import os
from dotenv import load_dotenv

# Pull the settings stored in a local .env file into the process environment,
# then look up the `api_key` variable (the result is None when it is not set).
load_dotenv()
api_key = os.environ.get('api_key')

In [23]:
# Chat model used to answer questions; authenticated with the key read from the environment.
llm = ChatOpenAI(model='gpt-4o', api_key=api_key)

# Embedding model handed to the FAISS vector store when building / loading an index.
embedding = OpenAIEmbeddings(model='text-embedding-3-small', api_key=api_key)

In [13]:
# Open the source PDF for text extraction.
# NOTE(review): this is a hard-coded absolute path on one specific machine; adjust it before running elsewhere.
pdf = PdfReader(r'C:\Users\Ai Sukmoren\Desktop\kmitl-chatbot\temp\The Impact of Discrimination in the Workplace-DESKTOP-UB36V5M.pdf')

In [24]:
# Send a one-word prompt to the chat model (presumably a quick connectivity check);
# the returned message is displayed as the cell output.
llm.invoke('hi')

AIMessage(content='Hello! How can I assist you today?', response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_4008e3b719', 'finish_reason': 'stop', 'logprobs': None}, id='run-86710721-11ba-4167-af41-b9e24dc81d3e-0', usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17})

In [15]:
# Concatenate the extracted text of every page, in page order.
# Pages for which extract_text() yields nothing (None / empty) contribute no text.
raw_text = ''.join(page.extract_text() or '' for page in pdf.pages)

In [17]:
# Split the text on newlines into chunks targeting 800 characters each, with
# 200 characters of overlap between consecutive chunks. Sizes are measured with
# len(), i.e. in characters rather than tokens; small chunks keep prompts short.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
texts = text_splitter.split_text(raw_text)

# Number of chunks produced (shown as the cell output).
len(texts)

68

In [18]:
# Build a FAISS vector store from the text chunks, using the embedding model configured earlier.
document_search = FAISS.from_texts(texts, embedding)

In [19]:
# Create a question-answering chain of type "stuff" driven by the chat model.
chain = load_qa_chain(llm, chain_type="stuff")

In [21]:
# Ask a question about the document: fetch the chunks most similar to the
# question, then let the QA chain answer from them (the answer is the cell output).
query = "Who made this paper"
docs = document_search.similarity_search(query)
chain.run(question=query, input_documents=docs)

'The paper was made by Ai Sukmoren (61100447), Kittapat Lochindarat (61100466), and Nattapat Chotirawi (61100480).'

In [None]:
def process_pdf_to_faiss_search(pdf_path, embedding_model, index_path="faiss_index",
                                chunk_size=800, chunk_overlap=200):
    """Build a FAISS vector store from the text of a PDF and save it to disk.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``PdfReader``.
        embedding_model: Embeddings object passed to ``FAISS.from_texts``.
        index_path: Folder the index is written to via ``save_local``.
            Defaults to ``"faiss_index"`` (the folder name used originally).
        chunk_size: Target chunk length in characters (measured with ``len``).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The FAISS vector store that was built and saved, so callers can query
        it immediately instead of reloading it from disk.

    Raises:
        ValueError: If no text could be extracted from the PDF, which would
            leave nothing to index.
    """
    pdf = PdfReader(pdf_path)

    # Join the text of all pages; pages with no extractable text contribute nothing.
    raw_text = ''.join(page.extract_text() or '' for page in pdf.pages)

    # Split on newlines into overlapping, character-sized chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)

    # Fail early with a clear message rather than trying to index zero chunks
    # (e.g. a scanned PDF that contains only images).
    if not texts:
        raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

    # Embed the chunks, persist the index, and hand the store back to the caller.
    document_search = FAISS.from_texts(texts, embedding_model)
    document_search.save_local(index_path)
    return document_search

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import FAISS

# Question to put to the previously indexed document.
query = "what the section olar Energy Offerings says"

# Open the FAISS index saved on disk. allow_dangerous_deserialization=True opts in
# to loading serialized index data, so only point this at an index you trust.
index_path = r"C:\Users\(Ai)AiSukmoren\Desktop\KMITL-present\faiss_index"
document_search = FAISS.load_local(index_path, embedding, allow_dangerous_deserialization=True)

# QA chain of type "stuff" driven by the chat model.
chain = load_qa_chain(llm, chain_type="stuff")

# Retrieve the chunks most similar to the question, then answer from them.
docs = document_search.similarity_search(query)
answer = chain.run(question=query, input_documents=docs)

print("Answer:", answer)


In [None]:
# Index location used when the caller does not supply one (the originally hard-coded path).
_DEFAULT_FAISS_INDEX_PATH = r"C:\Users\(Ai)AiSukmoren\Desktop\KMITL-present\faiss_index"


def gen_rag_answer(query: str, index_path: str = _DEFAULT_FAISS_INDEX_PATH) -> str:
    """Answer a question from a saved FAISS index using retrieval + a QA chain.

    Uses the notebook-level ``embedding`` and ``llm`` objects. The index is
    loaded from disk on every call.

    Args:
        query: The question to answer.
        index_path: Folder containing the saved FAISS index. Defaults to the
            path this function originally had hard-coded.

    Returns:
        The answer produced by the QA chain.
    """
    # NOTE: allow_dangerous_deserialization=True opts in to loading serialized
    # index data; only use this with index folders you created or trust.
    document_search = FAISS.load_local(index_path, embedding, allow_dangerous_deserialization=True)

    # Retrieve the chunks most similar to the question.
    docs = document_search.similarity_search(query)

    # Answer the question from the retrieved chunks with a "stuff" QA chain.
    chain = load_qa_chain(llm, chain_type="stuff")
    answer = chain.run(input_documents=docs, question=query)

    return answer

In [None]:
# Run the retrieval + QA helper on a sample question and keep the answer in `res`.
res = gen_rag_answer('what the section olar Energy Offerings says')

In [None]:
# Print the answer returned by gen_rag_answer.
print(res)