In [None]:
# Scratch cell: presumably a smoke test that the kernel is running; safe to delete.
print("hello")

In [None]:
# Show the working directory the kernel started in (before the chdir below).
%pwd

In [6]:
import os

# Move to the parent directory; the relative 'data/' path used later is
# resolved from there. The target is turned into an absolute path once and
# cached, so re-running this cell lands in the same directory instead of
# climbing one more level on every execution.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../")
os.chdir(PROJECT_ROOT)

In [None]:
# Confirm the working directory after the chdir above.
%pwd

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from  langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Load the PDF files of a directory into LangChain documents.
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files matching the glob
            ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [8]:
# Load the PDFs from 'data/' (relative to the current working directory).
extracted_data = load_pdf_file(data='data/')

In [9]:
# Split the extracted data into chunks.
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, e.g. the output of
            ``load_pdf_file``.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [None]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of text chunks" , len(text_chunks))

In [11]:
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [12]:
# Build the Hugging Face embedding model.
def download_hf_embeddings():
    """Return a ``HuggingFaceEmbeddings`` object for all-MiniLM-L6-v2.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Create the embedding model once; later cells reuse it for indexing and retrieval.
embeddings = download_hf_embeddings()

In [None]:
# Sanity check: embed a sample string and print the vector length.
# The Pinecone index below is created with dimension=384, so the printed
# length has to match that value.
query_result = embeddings.embed_query("Hello, world!")
print("Length", len(query_result))

In [None]:
# Load variables from a .env file into the process environment so the
# API keys can be read with os.environ below.
from dotenv import load_dotenv
load_dotenv()

In [35]:
# Read the API keys from the environment; each is None when its variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "mini-demo-vanilla-bot"

# Creating an index that already exists raises, which made this cell fail on
# every run after the first. Only create the index when it is missing so the
# notebook survives Restart & Run All.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# Debug check: NoneType here means OPENAI_API_KEY was not found in the environment.
type(OPENAI_API_KEY)

In [38]:
# Write the keys back into the environment for libraries that read them there.
# os.environ only accepts strings, so a missing key (None) used to surface as
# an opaque "str expected, not NoneType" TypeError; fail with a message that
# names the missing variable instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in your .env file or environment."
        )
    os.environ[_key_name] = _key_value

In [39]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and store the vectors in the Pinecone
# index named by `index_name`.
# NOTE(review): re-running this cell presumably uploads the chunks again —
# confirm, and prefer the from_existing_index cell below once the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [21]:
# Load an existing index.
# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_existing_index(
#     index_name=index_name,
#     embedding=embeddings
# )

In [None]:
# Display the vector store object to confirm it was created.
docsearch

In [41]:
# Similarity-search retriever over the vector store; k=3 documents per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [42]:
# Try the retriever on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What crime did Wesley Crusher commit??")

In [None]:
# Inspect the documents returned for the test query.
retrieved_docs

In [45]:
from langchain_openai import OpenAI
# LLM used to generate answers; temperature and max_tokens are fixed here.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [78]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the chatbot. The {context} placeholder is where the
# retrieved documents are expected to be inserted by the documents chain.
system_prompt = (
    "You are an AI chatbot and "
    "you are a big fan of the TV show Star Trek: The Next Generation. "
    "Use the following pieces of retrieved context to answer questions about the "
    "TV show. If you don't know the answer, say that you don't know."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions above, then the user's
# question in the {input} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),

    ]
)

In [79]:
# chat_chain combines the LLM with the prompt; rag_chain puts the retriever
# in front of it so each question is answered from retrieved documents.
chat_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, chat_chain)

In [None]:
# End-to-end check: same question as the retriever-only test, now answered by the full chain.
response = rag_chain.invoke({"input": "What crime did Wesley Crusher commit?"})
print(response["answer"])

In [None]:
# Another show-specific question sent through the full chain.
response = rag_chain.invoke({"input": "Is it possible to separate the saucer section at warp 9?"})
print(response["answer"])

In [None]:
# Another show-specific question sent through the full chain.
response = rag_chain.invoke({"input": "What did Q put humanity on trial for?"})
print(response["answer"])

In [None]:
# Question unrelated to the show: presumably checks that the system prompt's
# "say that you don't know" instruction is followed.
response = rag_chain.invoke({"input": "Who won the 2023 Superbowl?"})
print(response["answer"])