In [1]:
import fitz
import torch
import re
from transformers import BertForQuestionAnswering, BertTokenizer

# Initialize the BERT question-answering model and tokenizer
bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForQuestionAnswering.from_pretrained(bert_model_name)
bert_model.eval()  # Set the model to evaluation mode

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = " ".join(page.get_text() for page in doc)
    return text

def clean_text(text):
    # Remove special characters, symbols, and extra whitespaces
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def answer_question_bert(document, question, model, tokenizer, max_length=512):
    # Clean the document text
    document = clean_text(document)

    # Use 'longest_first' truncation strategy
    input_dict = tokenizer.encode_plus(question, document, add_special_tokens=True, return_tensors="pt", max_length=max_length, truncation=True)
    outputs = model(**input_dict)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    answer = tokenizer.decode(input_dict["input_ids"][0][answer_start:answer_end+10])

    return answer

# Example usage with BERT:
pdf_path = "uploads/CASE OF ANAGNOSTAKIS v. GREECE.pdf"
question = "What was the outcome for the alleged violation of Article 8 in the case of ANAGNOSTAKIS?"
document = extract_text_from_pdf(pdf_path)
bert_answer = answer_question_bert(document, question, bert_model, bert_tokenizer)
print("BERT Answer:", bert_answer)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


BERT Answer: delay in the proceedings setting the contact schedule between the applicant and his child anagnostakis v greece judgment 2 the


In [3]:
from langchain.document_loaders import TextLoader  #for textfiles
from langchain.text_splitter import CharacterTextSplitter #text splitter
from langchain.embeddings import HuggingFaceEmbeddings #for u"sing HugginFace models
from langchain.vectorstores import FAISS  
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader  #load pdf
from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  #load urls into docoument-loader
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [4]:
# import csv from langchain.document_loaders 
import csv
from langchain.document_loaders import PyPDFLoader

# Load the PDF file from current working directory
loader = PyPDFLoader("uploads/CASE OF ANAGNOSTAKIS v. GREECE.pdf")
# Split the PDF into Pages
pages = loader.load_and_split()

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    separators=['\n\n', '\n', '(?=>\. )', ' ', '']
)
docs  = text_splitter.split_documents(pages)

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

In [7]:
#Create the vectorized db
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
db = FAISS.from_documents(docs, embeddings)

In [8]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":1, "max_length":1000000})
chain = load_qa_chain(llm, chain_type="stuff")

#QUERYING
query = ""
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)



'What is the name of the person that was awarded just satisfaction under Article 41 of the Convention?'

In [12]:
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", 
retriever=db.as_retriever(search_kwargs={"k": 3}))

In [13]:
query = "What atricle did the case concern?"
qa.run(query)


'Article 8'