In [7]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from fastapi import FastAPI, Request
from pydantic import BaseModel
import uvicorn

In [2]:
# Step 1: Load the LegalBERT model
model_name = "nlpaueb/legal-bert-base-uncased"  # LegalBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# Step 2: Fine-tune LegalBERT
def fine_tune_legalbert(dataset_path, output_dir="fine_tuned_legalbert"):
    # Load dataset (assumes a text file with labeled legal data, one sentence per line)
    dataset = Dataset.from_text(dataset_path)

    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        save_steps=500,
        save_total_limit=2,
        logging_dir=f"{output_dir}/logs",
    )

    # Data collator for masked language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15
    )

    # Trainer API
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Fine-tune the model
    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model fine-tuned and saved to {output_dir}")

In [15]:
# Uncomment this to fine-tune
# fine_tune_legalbert("path_to_labeled_legal_data.txt")


from pinecone import Pinecone, ServerlessSpec

# Create a Pinecone client instance
pc = Pinecone(
    api_key="pcsk_4qGAdo_HJpbWqnXgMp73CihYSLJS6eRtfRcRDgA7jWsivdJP3aYAkjikFuqxhVabLVMhVj"
)

# If you're using serverless, you can specify it like this:
index = pc.create_index(
    name="legalbertsearch",
    dimension=768,  # adjust dimension as needed
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)


In [10]:
# Step 4: Load and preprocess PDF
def load_and_process_pdf(file_path):
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split text into manageable chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents(documents)

    return chunks

In [11]:
file_path = "pdf/DCPR_2034_13-09-2024.pdf"  # Replace with your legal PDF file
chunks = load_and_process_pdf(file_path)

In [16]:
# Step 5: Generate embeddings using fine-tuned LegalBERT
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

No sentence-transformers model found with name nlpaueb/legal-bert-base-uncased. Creating a new one with mean pooling.


In [37]:
from langchain.vectorstores import Pinecone as LangchainPinecone

# Step 6: Store embeddings in Pinecone
index_name = "legalbertsearch"
# Then create your vector store
pinecone_index = LangchainPinecone.from_documents(
    documents=chunks,
    embedding=embedding_model,
    index_name=index_name
)

KeyboardInterrupt: 

In [None]:
from langchain.llms import OpenAI  # Correct import
 # or any other LLM you prefer
from langchain.chains import RetrievalQA

# First, initialize your LLM
llm = OpenAI(openai_api_key = "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")  # You'll need to set OPENAI_API_KEY in your environment variables


# Step 7: Optimize Pinecone Queries
retriever = pinecone_index.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 results
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

In [None]:
from fastapi import FastAPI
import nest_asyncio
from fastapi.responses import JSONResponse
import uvicorn

# Allow the event loop to run in the Jupyter Notebook
nest_asyncio.apply()

# Create FastAPI app
app = FastAPI()

class QuestionRequest(BaseModel):
    question: str
    
@app.get("/")
async def read_root():
    return {"message": "Welcome to FastAPI running in Jupyter Notebook!"}

@app.post("/ask")
async def ask_question(request: QuestionRequest):
    question = request.question
    response = qa_chain.run(question)

    # Format response
    formatted_response = {
        "answer": response["result"],
        "sources": [
            {
                "document": source.metadata['source'],
                "page": source.metadata.get('page', "N/A")
            }
            for source in response["source_documents"]
        ]
    }
    return formatted_response

# Run FastAPI app in the notebook
uvicorn.run(app, host="0.0.0.0", port=8000)



Task exception was never retrieved
future: <Task finished name='Task-8' coro=<Server.serve() done, defined at c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\main.py", line 579, in run
    server.run()
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf

INFO:     127.0.0.1:49250 - "POST /ask HTTP/1.1" 500 Internal Server Error


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette\applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette

INFO:     127.0.0.1:49252 - "POST /ask HTTP/1.1" 500 Internal Server Error


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette\applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette

INFO:     127.0.0.1:49253 - "POST /ask HTTP/1.1" 500 Internal Server Error


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette\applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette

INFO:     127.0.0.1:49256 - "POST /get HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:49258 - "POST /ask HTTP/1.1" 500 Internal Server Error


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 406, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette\applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\Admin\Documents\pythonproj\GemmaPdf\venv\Lib\site-packages\starlette