In [16]:
# Import required libraries
import os
from transformers import AutoTokenizer, AutoModel
from PyPDF2 import PdfReader
import torch
import numpy as np

In [15]:
# Step 3: Initialize Pinecone
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key="pcsk_4qGAdo_HJpbWqnXgMp73CihYSLJS6eRtfRcRDgA7jWsivdJP3aYAkjikFuqxhVabLVMhVj",
    environment="us-west-1"
)
index_name = "realincgemma"


try:
    # Try to get the index
    index = pc.Index(index_name)
    print(f"Index '{index_name}' already exists")
except Exception as e:
    # If index doesn't exist, create it
    print(f"Creating index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=768,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-1'
        )
    )
    index = pc.Index(index_name)
    print(f"Index '{index_name}' created successfully")

Index 'realincgemma' already exists


In [19]:
# Load Legal-BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")



In [8]:
def get_embedding(text):
    # Tokenize the text
    inputs = tokenizer(
        text,
        return_tensors="pt",
        #max_length=512,
        padding=True,
        truncation=True
    )
    
    # Get the embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Use the [CLS] token embedding as the sentence embedding
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    
    print(embeddings)    
    # Convert to list and return the first (and only) embedding
    return embeddings.tolist()

In [9]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = []
    for page in reader.pages:
        text.append(page.extract_text())
    return " ".join(text)

In [10]:
# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

In [11]:
# Index the PDF content into Pinecone
def index_pdf_to_pinecone(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)
    for i, chunk in enumerate(chunks):
        embedding = get_embedding(chunk)
        index.upsert([(f"chunk-{i}", embedding, {"text": chunk})])
    print("PDF content indexed successfully.")

In [12]:
# Query the indexed content
def query_pdf(question, top_k=20):
    # Get and prepare embedding
    query_embedding = get_embedding(question)
        
    # Print debugging information
    print(f"Embedding type: {type(query_embedding)}")
    print(f"Embedding length: {len(query_embedding)}")
    print(query_embedding)

    
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True, namespace="", include_values=True)
    print(results)
   # Extract answers from results
    answers = []
    for match in results['matches']:
        if 'metadata' in match and 'text' in match['metadata']:
            answers.append(match['metadata']['text'])
        else:
            print(f"Warning: Match missing expected metadata structure: {match}")
                
        return answers


In [None]:
# Main function to run the pipeline
if __name__ == "__main__":
    # Path to your PDF file
    pdf_path = "pdf/DCPR_2034_13-09-2024.pdf"  # Replace with your PDF file path

    # Index the PDF
    print("Indexing the PDF...")
    index_pdf_to_pinecone(pdf_path)

    

In [6]:
from openai import OpenAI 

# Set your OpenAI API key
client = OpenAI(api_key='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')

# Generate final answer with ChatGPT-4
def generate_final_answer(context_chunks, question):
    prompt = (
        "You are a legal assistant with expertise in legal documents. Answer the following question using only the given context. "
        "If the information is not in the context, say 'The context does not provide sufficient information.' "
        "Ensure your response is precise, professional, and formatted as a legal answer.\n\n"
        "### Question:\n"
        f"{question}\n\n"
        "### Context:\n"
        f"{' '.join(context_chunks)}\n\n"
        "### Answer:"
    )
    print(prompt)
    #openai.api_key = "YOUR_OPENAI_API_KEY"  # Replace with your OpenAI API key
    # Generate response using GPT-4
    #openai.api_key = "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"
    response = client.chat.completions.create(
        model="gpt-4",  # or "gpt-3.5-turbo" if you don't have GPT-4 access
            messages=[
                {
                    "role": "system", 
                    "content": "You are a legal assistant with expertise in legal documents."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            max_tokens=300,
            temperature=0.2
    )
    # Debug print
    print("Response type:", type(response))
    print("Response structure:", dir(response))
    print("First choice type:", type(response.choices[0]))
    print("First choice structure:", dir(response.choices[0]))
    return response.choices[0].message.content

In [20]:
# Ask a question
print("You can now ask questions!")
question = "What is the definition of FSI"

# Retrieve top context chunks
context_chunks = query_pdf(question)
print(context_chunks)
# Generate final answer using ChatGPT-4
answer = generate_final_answer(context_chunks, question)
print("\nAnswer:")
print(answer)

You can now ask questions!
[[-1.62739098e-01 -4.01847064e-01 -3.37026119e-01  1.08735217e-02
  -7.14993656e-01 -1.12443745e+00 -2.32274979e-01 -6.13120079e-01
   2.98481405e-01 -3.03242058e-01  6.47006571e-01  4.92910534e-01
   7.79649198e-01 -4.17546749e-01  4.04043764e-01 -2.36624219e-02
  -2.68615156e-01 -6.97151661e-01 -2.14985430e-01  1.37993628e-02
  -8.84618819e-01 -1.49137735e-01 -3.71974260e-01 -7.28554368e-01
  -1.42775464e+00 -4.40231413e-01 -2.51180589e-01  4.97169435e-01
  -4.58148539e-01  1.34948894e-01 -1.32184303e+00 -9.15726185e-01
  -5.85193634e-02 -2.77128704e-02  1.68034449e-01  3.83340776e-01
   7.10458159e-01 -4.48536038e-01  3.27888846e-01  1.75402224e-01
   7.95468628e-01  6.76318884e-01  3.12676251e-01 -6.00406766e-01
  -5.42567819e-02  2.98990250e-01 -1.04630911e+00  3.90930831e-01
  -2.76229858e-01  1.55952096e-01  9.13416088e-01 -4.87388551e-01
  -9.71118689e-01  5.18412650e-01 -7.11058617e-01  7.85335749e-02
   6.88646376e-01  2.49177873e-01  7.74102747e-01

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************************************************************************************************************************ZlMA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}