In [6]:
import fitz
import torch
import re
from transformers import BertForQuestionAnswering, BertTokenizer

# Initialize the BERT question-answering model and tokenizer
bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForQuestionAnswering.from_pretrained(bert_model_name)
bert_model.eval()  # Set the model to evaluation mode

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = " ".join(page.get_text() for page in doc)
    return text

def clean_text(text):
    # Remove special characters, symbols, and extra whitespaces
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def answer_question_bert(document, question, model, tokenizer, max_length=512):
    # Clean the document text
    document = clean_text(document)

    # Use 'longest_first' truncation strategy
    input_dict = tokenizer.encode_plus(question, document, add_special_tokens=True, return_tensors="pt", max_length=max_length, truncation=True)
    outputs = model(**input_dict)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    answer = tokenizer.decode(input_dict["input_ids"][0][answer_start:answer_end+10])

    return answer

# Example usage with BERT:
pdf_path = "uploads/CASE OF ANAGNOSTAKIS v. GREECE.pdf"
question = "What was the outcome for the alleged violation of Article 8 in the case of ANAGNOSTAKIS?"
document = extract_text_from_pdf(pdf_path)
bert_answer = answer_question_bert(document, question, bert_model, bert_tokenizer)
print("BERT Answer:", bert_answer)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


BERT Answer: delay in the proceedings setting the contact schedule between the applicant and his child anagnostakis v greece judgment 2 the


In [7]:
from langchain.document_loaders import TextLoader  #for textfiles
from langchain.text_splitter import CharacterTextSplitter #text splitter
from langchain.embeddings import HuggingFaceEmbeddings #for using HugginFace models
from langchain.vectorstores import FAISS  
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader  #load pdf
from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  #load urls into docoument-loader
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [8]:
import csv
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("uploads/CASE OF ANAGNOSTAKIS v. GREECE.pdf")
pages = loader.load_and_split()

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    separators=['\n\n', '\n', '(?=>\. )', ' ', '']
)
docs  = text_splitter.split_documents(pages)

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

In [11]:

from langchain.vectorstores import FAISS
db = FAISS.from_documents(docs, embeddings)

In [12]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":1, "max_length":1000000})
chain = load_qa_chain(llm, chain_type="stuff")

query = "What was the outcome for the alleged violation of Article 8?"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)



'there has been no violation of Article 8 of the Convention'

In [13]:
from langchain.chains import RetrievalQA
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", 
retriever=db.as_retriever(search_kwargs={"k": 3}))

In [14]:
query = "What atricle did the case concern?"
qa.run(query)


'Article 8'

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

model_name = "google/flan-t5-xxl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

questions = [
    "What does Article 232 (a) of the Criminal Code relate to in this case?",
    "What did the applicant refuse regarding the psychiatric evaluation of the child?",
    "How did the applicant want the psychiatric evaluation to focus?",
    "What steps were taken by the public prosecutor to address the case?",
    "What was the Greek Ombudsman's role in this case?",
    "Why is the involvement of child-psychology experts considered important in contact disputes?",
    "How did the applicant's refusal to cooperate affect the case?",
    "What are the obligations of the State under Article 8 of the Convention in cases involving family life?",
    "What is the significance of a final court decision in this case?",
    "Why is the relationship between parents described as tense in the judgment?",
    "What principles guide the State's obligations in cases where children resist contact with a parent?",
    "How does the judgment view the necessity of cooperation in contact disputes?",
    "What are some examples of practical preparatory measures to facilitate contact between a parent and child?",
    "What is the primary consideration when determining the best interests of the child?",
    "How can the State balance the rights of parents and the best interests of the child in such cases?",
    "What is the significance of the Court's reference to Article 45 § 2 of the Convention in the judgment?",
    "In which cases may Article 8 of the Convention require phased measures?",
    "What role do child professionals and experts play in cases where children resist contact with a parent?"
]

answers = [
    "Article 232 (a) of the Criminal Code is mentioned in the case as a provision that may be used to initiate criminal proceedings related to the child's refusal to meet with the applicant.",
    "The applicant refused to attend the psychiatric evaluation of the child and requested that the order be revoked because he was not consulted prior to the decision.",
    "The applicant wanted the psychiatric evaluation to focus on the specific reasons behind the child's refusal to meet with him.",
    "The public prosecutor ordered social reports, a psychiatric evaluation, and referred the case to GONIS for consultation.",
    "The Greek Ombudsman contacted E.K., made recommendations, and suggested alternatives to GONIS for resolving the contact issue.",
    "In contact disputes, child-psychology experts can help identify and address the reasons behind a child's refusal to meet with a parent, contributing to resolution.",
    "The applicant's refusal to cooperate, including his refusal to attend the psychiatric evaluation and consider counseling, hindered efforts to resolve the contact issue.",
    "Under Article 8 of the Convention, the State has positive obligations to facilitate and maintain family life, including taking practical measures to resolve disputes.",
    "The final court decision of the Athens Court of First Instance granted contact rights to the applicant, making it a crucial element in the case.",
    "The relationship between the parents is described as tense in the judgment because it was characterized by mistrust and rivalry, which impacted the child's refusal to meet the applicant.",
    "States are guided by principles that require them to identify the causes of the child's resistance and take appropriate measures to address those causes.",
    "The judgment emphasizes the importance of cooperation and understanding among all parties involved in contact disputes.",
    "Practical preparatory measures may include counseling, mediation, gradual reintroduction plans, and emotional and psychological support for all parties involved.",
    "The best interests of the child are the primary consideration, ensuring that decisions and measures taken aim to benefit the child's well-being.",
    "The State must strike a balance between respecting the rights of parents and safeguarding the best interests of the child when addressing contact disputes.",
    "Article 45 § 2 of the Convention allows for the attachment of separate opinions to judgments, as seen in the dissenting opinion of Judge Serghides.",
    "Article 8 of the Convention may require phased measures in cases where reunification of a parent and child needs gradual and systematic reintroduction.",
    "Child professionals and experts play a vital role in assessing and addressing the child's needs and reasons for resisting contact, helping find solutions."
]

inputs = tokenizer(questions, answers, return_tensors="pt", padding=True, truncation=True)


optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

model.save_pretrained("fine_tuned_flan_t5")
