In [1]:
print("Hello World")

Hello World


In [11]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()

True

In [12]:
# Set your Pinecone API key
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
index_name = "mbot"

In [13]:
# Initialize Pinecone connection
pc = Pinecone(api_key=PINECONE_API_KEY)

In [14]:
# Check if the index exists; if not, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Adjust based on the embedding model used
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [15]:
# Extract data from PDFs
def load_pdf(data_directory):
    loader = DirectoryLoader(data_directory, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

extracted_data = load_pdf("Data/")

In [16]:
# Split the documents into chunks for better embedding
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)
print("Length of chunks:", len(text_chunks))

Length of chunks: 7486


In [17]:
# Load Hugging Face Embeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

In [18]:
# Create a Pinecone vector store from the text embeddings
docsearch = LangchainPinecone.from_texts(
    [t.page_content for t in text_chunks],
    embeddings,
    index_name=index_name
)

In [20]:
query = "What is Cancer?"
docs = docsearch.similarity_search(query, k=3)

# Print the results
print("Result", docs)

Result [Document(metadata={}, page_content='Ellen S. Weber, MSN\nBreast cancer\nDefinition\nBreast cancer is caused by the development of\nmalignant cells in the breast. The malignant cells origi-nate in the lining of the milk glands or ducts of the breast(ductal epithelium), defining this malignancy as a cancer.Cancer cells are characterized by uncontrolled divisionleading to abnormal growth and the ability of these cellsto invade normal tissue locally or to spread throughoutthe body, in a process called metastasis.\nDescription'), Document(metadata={}, page_content='Ellen S. Weber, MSN\nBreast cancer\nDefinition\nBreast cancer is caused by the development of\nmalignant cells in the breast. The malignant cells origi-nate in the lining of the milk glands or ducts of the breast(ductal epithelium), defining this malignancy as a cancer.Cancer cells are characterized by uncontrolled divisionleading to abnormal growth and the ability of these cellsto invade normal tissue locally or to sprea

In [9]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
load_dotenv()
# Set your Pinecone API key
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
print(os.getenv("PINECONE_API_KEY"))
index_name = "mbot"

# Initialize Pinecone connection
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if the index exists; if not, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Adjust based on the embedding model used
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Extract data from PDFs
def load_pdf(data_directory):
    loader = DirectoryLoader(data_directory, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

extracted_data = load_pdf("Data/")

# Split the documents into chunks for better embedding
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)
print("Length of chunks:", len(text_chunks))

# Load Hugging Face Embeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

# Create a Pinecone vector store from the text embeddings
docsearch = LangchainPinecone.from_texts(
    [t.page_content for t in text_chunks],
    embeddings,
    index_name=index_name
)

# Example query
query_result = embeddings.embed_query("hello")
print("Query result length:", len(query_result))

# You can now run retrieval-based queries using Langchain's RetrievalQA with your Pinecone vector store
qa_chain = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(),
    chain_type="stuff"  # You can replace this with your preferred chain type
)

# Example of using the QA chain
query = "What is the content of the PDF?"
result = qa_chain.run(query)
print("Query result:", result)


53f375aa-3693-47f8-923e-ab4bb4eab33d
Length of chunks: 7486
Query result length: 384


TypeError: BaseRetrievalQA.from_chain_type() missing 1 required positional argument: 'llm'

In [8]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
print(os.getenv("PINECONE_API_KEY"))


53f375aa-3693-47f8-923e-ab4bb4eab33d
