In [42]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pinecone
from fastapi import FastAPI, Request
from pydantic import BaseModel
import uvicorn

In [41]:
# Step 1: Load the LegalBERT model
model_name = "nlpaueb/legal-bert-base-uncased"  # LegalBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)



In [53]:
# Step 3: Initialize Pinecone
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key="pcsk_4qGAdo_HJpbWqnXgMp73CihYSLJS6eRtfRcRDgA7jWsivdJP3aYAkjikFuqxhVabLVMhVj",
    environment="us-west-1"
)
index_name = "legalbertsearch"


try:
    # Try to get the index
    index = pc.Index(index_name)
    print(f"Index '{index_name}' already exists")
except Exception as e:
    # If index doesn't exist, create it
    print(f"Creating index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=768,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-1'
        )
    )
    index = pc.Index(index_name)
    print(f"Index '{index_name}' created successfully")

Index 'legalbertsearch' already exists


In [38]:
# Step 4: Load and preprocess PDF
def load_and_process_pdf(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split text into manageable chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)

    return chunks

file_path = "pdf/DCPR_2034_13-09-2024.pdf"  # Replace with your legal PDF file
chunks = load_and_process_pdf(file_path)

In [43]:
# Step 5: Generate embeddings using fine-tuned LegalBERT
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


In [55]:
import uuid
# Function to create embeddings and upsert to Pinecone
def upsert_to_pinecone(chunks, embedding_model):
    batch_size = 100  # Adjust based on your needs
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]
        ids = [str(uuid.uuid4()) for _ in batch]
        texts = [chunk.page_content for chunk in batch]
        metadatas = [chunk.metadata for chunk in batch]
        embeddings = embedding_model.encode(texts).tolist()
        
        to_upsert = list(zip(ids, embeddings, metadatas))
        
        # Upsert to Pinecone
        index.upsert(vectors=to_upsert)

# Use the function
upsert_to_pinecone(chunks, embedding_model)

AttributeError: 'HuggingFaceEmbeddings' object has no attribute 'encode'

In [None]:
# Step 7: Optimize Pinecone Queries
retriever = pinecone_index.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 results
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

In [None]:
from sentence_transformers import SentenceTransformer
import torch
from pinecone import Pinecone, ServerlessSpec
import numpy as np
from transformers import AutoModelForPreTraining, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

# Set your OpenAI API key
client = OpenAI(api_key='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')

pc = Pinecone(
    api_key="pcsk_4qGAdo_HJpbWqnXgMp73CihYSLJS6eRtfRcRDgA7jWsivdJP3aYAkjikFuqxhVabLVMhVj",
    environment="us-west-1"
)
index_name = "realincgemma"
try:
    # Try to get the index
    index = pc.Index(index_name)
    print(f"Index '{index_name}' already exists")
except Exception as e:
    # If index doesn't exist, create it
    print(f"Creating index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=768,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-1'
        )
    )
    index = pc.Index(index_name)
    print(f"Index '{index_name}' created successfully")
    
# Load the same model you used to create embeddings
model = SentenceTransformer('nlpaueb/legal-bert-base-uncased')

from transformers import AutoModelForCausalLM, AutoTokenizer
# Load ChatGPT model from Hugging Face
chatgpt_model_name = "nlpaueb/legal-bert-base-uncased"  # Replace with the actual Hugging Face model name for GPT-4
tokenizer = AutoTokenizer.from_pretrained(chatgpt_model_name)
chatgpt_model = AutoModelForCausalLM.from_pretrained("nlpaueb/legal-bert-base-uncased")

def query_pinecone(query_text, top_k=5):
    try:
        
    
        # Generate embedding for the query
        query_embedding = chatgpt_model.encode(query_text)
        
        # Convert to list, handling different possible types
        if isinstance(query_embedding, np.ndarray):
            query_embedding = query_embedding.tolist()
        elif isinstance(query_embedding, torch.Tensor):
            query_embedding = query_embedding.tolist()
        elif not isinstance(query_embedding, list):
            raise ValueError(f"Unexpected embedding type: {type(query_embedding)}")
        
        # Check index statistics
        stats = index.describe_index_stats()
        print(f"Total vectors in index: {stats['total_vector_count']}")

        # Check if index is ready
        if stats['total_vector_count'] == 0:
            return [{'score': 0, 'metadata': {}, 'text': 'No data available in the index yet'}]
        
        #print(query_embedding)
        # Query Pinecone
        query_response = index.query(
            vector=query_embedding,
            namespace="",
            top_k=top_k,
            include_values=True,
            include_metadata=True
        )
        #print("======== query_response start ========")
        #print(query_response)
        #print("======== query_response end ========")
        
        # Extract and return results
        results = []
        for match in query_response.matches:
            results.append({
                'score': match.score,
                'metadata': match.metadata,
                'text': match.metadata.get('text', 'No text available')
            })
        
        return results
    
    except Exception as e:
        print(f"Error querying Pinecone: {str(e)}")
        return [{'score': 0, 'metadata': {}, 'text': f'Error: {str(e)}'}]

# Example usage
query = "What is the Minimum gap from the adjacent wall to the hand rail"
results = query_pinecone(query)
print(results)
# Print results
for i, result in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Similarity Score: {result['score']:.4f}")
    print(f"Text: {result['text'][:200]}...")  # Print first 200 characters


def refine_with_gpt4(context, results):
    # Encode the context and results
    context_embedding = model.encode(context)
    result_embeddings = [model.encode(result['text']) for result in results]

    # Calculate cosine similarities
    similarities = cosine_similarity([context_embedding], result_embeddings)[0]

    # Sort results by similarity
    sorted_results = sorted(zip(results, similarities), key=lambda x: x[1], reverse=True)

    # Create a prompt with the most relevant information
    prompt = f"Query: {context}\n\nRelevant information:\n"
    for result, similarity in sorted_results[:5]:  # Take top 5 most similar results
        prompt += f"- {result['text']}... (Similarity: {similarity:.2f})\n"
    
    prompt += "\nBased on the above information, please provide a comprehensive and accurate answer to the query."

    print(prompt)
    # Generate response using GPT-4
    response = client.chat.completions.create(
        model="gpt-4",  # You can also use "gpt-3.5-turbo" if GPT-4 access is not available
        messages=[
            {"role": "system", "content": "You are a Expert legal assistant that provides accurate information based on the given context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,  # Adjust as needed
        n=1,
        stop=None,
        temperature=0.7,
    )

    return response.choices[0].message.content.strip()

# Usage
if results:
    refined_answer = refine_with_gpt4(query, results)
    print("\nFinal Answer:")
    print(refined_answer)
else:
    print("No results to refine.")
    




Index 'realincgemma' already exists


No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


Error querying Pinecone: 'BertForPreTraining' object has no attribute 'encode'
[{'score': 0, 'metadata': {}, 'text': "Error: 'BertForPreTraining' object has no attribute 'encode'"}]

Result 1:
Similarity Score: 0.0000
Text: Error: 'BertForPreTraining' object has no attribute 'encode'...
Query: What is the Minimum gap from the adjacent wall to the hand rail

Relevant information:
- Error: 'BertForPreTraining' object has no attribute 'encode'... (Similarity: 0.82)

Based on the above information, please provide a comprehensive and accurate answer to the query.

Final Answer:
I'm sorry, but the provided information doesn't contain any details related to the minimum gap from the adjacent wall to the hand rail. Please provide relevant details or context to get an accurate response.
