In [None]:
# Default PDF path; the upload step later in this notebook reassigns
# file_path to the uploaded file's name before any parsing happens.
file_path = "sample.pdf"



In [None]:
from langchain_community.document_loaders import PyPDFLoader


In [None]:
import pypdf


In [None]:

# Import Libraries
from langchain_community.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import spacy
import os
from google.colab import files
from dotenv import load_dotenv

# Read variables from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Assigning None into os.environ raises an opaque TypeError, so fail
    # early, with an actionable message, before any model is loaded.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["OPENAI_API_KEY"] = api_key

# Load Models
nlp = spacy.load('en_core_web_sm')
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Function to Parse and Extract Key Information from PDFs
def parse_and_extract(file_path):
    """Parse a PDF and extract parties, dates, and key clauses.

    The file is loaded with ``PyPDFLoader``, the loaded page contents are
    joined into one string, and that string is run through the module-level
    spaCy pipeline ``nlp``.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        A ``(extracted_info, documents)`` tuple. ``extracted_info`` is a dict
        holding three lists of strings:

        * ``"parties"`` - text of entities labelled ``ORG``.
        * ``"dates"`` - text of entities labelled ``DATE``.
        * ``"clauses"`` - sentences that mention "indemnity" or
          "confidentiality" (case-insensitive).

        Each list is de-duplicated and keeps first-occurrence order.
        ``documents`` is the list returned by ``PyPDFLoader.load()``.
    """
    clause_keywords = ("indemnity", "confidentiality")

    def _unique(items):
        # dict.fromkeys drops repeats while preserving first-seen order.
        return list(dict.fromkeys(items))

    # Load the PDF document
    documents = PyPDFLoader(file_path).load()

    # Merge all loaded content so entities and sentences are found in one pass
    text = " ".join(page.page_content for page in documents)

    # Use spaCy for Named Entity Recognition (NER) and sentence segmentation
    doc = nlp(text)

    # Use spaCy's sentence boundaries instead of splitting on ".", which
    # would cut sentences apart at abbreviations and decimal points.
    sentences = [sent.text.strip() for sent in doc.sents]

    extracted_info = {
        "parties": _unique(ent.text for ent in doc.ents if ent.label_ == "ORG"),
        "dates": _unique(ent.text for ent in doc.ents if ent.label_ == "DATE"),
        "clauses": _unique(
            sentence
            for sentence in sentences
            if any(keyword in sentence.lower() for keyword in clause_keywords)
        ),
    }
    return extracted_info, documents

# Function to Generate Embeddings and Store in FAISS
def process_and_store_embeddings(documents):
    """Split documents into chunks, embed them, and index them in FAISS.

    Args:
        documents: Documents to index, e.g. the second value returned by
            ``parse_and_extract``.

    Returns:
        A FAISS vector store built from the chunks using the module-level
        ``embedding_model``.

    Raises:
        ValueError: If splitting produces no chunks, meaning there is no
            text to embed.
    """
    # Split the documents into manageable, slightly overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # Building an index from nothing fails deep inside FAISS with an
        # unhelpful error, so report the real problem here instead.
        raise ValueError("No text chunks to embed; the document appears to contain no text.")

    # from_documents indexes each chunk's text together with its metadata,
    # whereas from_texts on the bare strings would drop the metadata.
    return FAISS.from_documents(chunks, embedding_model)

# Function to Create a Q&A Chain
def create_qa_chain(vector_store):
    """Build a conversational retrieval chain on top of ``vector_store``.

    The chain combines the store's default retriever with the ``OpenAI`` LLM
    configured for the "gpt-3.5-turbo-instruct" model at temperature 0.0.

    Args:
        vector_store: Vector store exposing ``as_retriever()``.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm``.
    """
    return ConversationalRetrievalChain.from_llm(
        llm=OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0),
        retriever=vector_store.as_retriever(),
    )

# Function to Answer Questions
def answer_question(qa_chain, question, chat_history):
    """Answer a user question using the retrieval chain.

    Args:
        qa_chain: Chain to query. Its ``invoke`` method is used when present;
            otherwise the object itself is called with the same inputs.
        question: The question text.
        chat_history: Earlier ``(question, answer)`` pairs of this
            conversation, passed to the chain unchanged.

    Returns:
        The value stored under the ``"answer"`` key of the chain's result.
    """
    inputs = {"question": question, "chat_history": chat_history}
    # Calling a chain object directly is deprecated in LangChain in favour of
    # invoke(); keep the direct call only for callables that lack invoke().
    run = getattr(qa_chain, "invoke", qa_chain)
    return run(inputs)["answer"]

# Step 1: Upload and Parse PDF
uploaded = files.upload()  # Upload PDF in Colab
if not uploaded:
    # Nothing came back from the upload widget; indexing into an empty
    # result would otherwise fail with a confusing IndexError.
    raise RuntimeError("No file was uploaded; please upload a PDF to continue.")
file_path = next(iter(uploaded))  # Name of the first uploaded file
extracted_info, documents = parse_and_extract(file_path)

# Step 2: Display Extracted Information
print("Extracted Parties:", extracted_info["parties"])
print("Extracted Dates:", extracted_info["dates"])
print("Extracted Clauses:", extracted_info["clauses"])

# Step 3: Generate Embeddings and Store in FAISS
vector_store = process_and_store_embeddings(documents)

# Step 4: Set Up Q&A
qa_chain = create_qa_chain(vector_store)

# Step 5: Interactive Q&A Loop
chat_history = []
while True:
    # Strip whitespace so that inputs like " exit " still end the loop.
    question = input("Ask a question (or type 'exit' to quit): ").strip()
    if question.lower() == "exit":
        break
    if not question:
        # Do not spend an LLM call on an empty prompt.
        continue
    answer = answer_question(qa_chain, question, chat_history)
    print(f"Answer: {answer}")
    # Keep the exchange so follow-up questions are answered in context.
    chat_history.append((question, answer))


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saving Potential-Paraquat-Claim.pdf to Potential-Paraquat-Claim.pdf
Extracted Parties: ['Dear Paraquat Client', 'Chevron', 'Paraquat', 'Chevron', 'Syngenta AG', 'MDL', 'Paraquat', 'Paraquat', 'Order', 'Pennsylvania State Court', 'Court', 'U.S. Supreme Court', 'Paraquat']
Extracted Dates: ['September 2023', 'October of of this year', 'June', 'August 20', 'October', 'the end of September', '2024']
Extracted Clauses: []


  result = qa_chain({"question": question, "chat_history": chat_history})


Answer:  Wagstaff Law Firm
Answer:  The status of the case for this main law firm is that they are currently gathering additional information from clients and negotiating a Bellwether process for cases filed in Pennsylvania. They are also involved in a hearing regarding scientific causation evidence in the MDL.
