In [29]:
import torch
import numpy as np

# Query CUDA support once, report it, and select the matching torch device.
_cuda_ready = torch.cuda.is_available()
print("CUDA Available:", _cuda_ready)

device = torch.device("cuda") if _cuda_ready else torch.device("cpu")
print("Using device:", device)


CUDA Available: True
Using device: cuda


In [30]:
from bs4 import BeautifulSoup
import requests

# Download the HTML rendering of the document identified by CELEX 32018L1972.
url = "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32018L1972"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Every <p> element is collected; later cells filter and clean them.
sections = soup.find_all(['p'])

# Preview the first `lim` paragraphs, each prefixed with its raw text length.
lim = 10
for section in sections[:lim]:
    print(f"{len(section.text)} {section.text}")

13 17.12.2018   
2 EN
38 Official Journal of the European Union
8 L 321/36
93 
            DIRECTIVE (EU) 2018/1972 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL
         
29 of 11 December 2018
         
56 establishing the European Electronic Communications Code
8 (Recast)
25 (Text with EEA relevance)
62 THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,


In [31]:
# Keep only the paragraphs whose whitespace-stripped text is longer than
# 120 characters; everything shorter is dropped.
filtered_sections = list(
    filter(lambda tag: len(tag.text.strip()) > 120, sections)
)

In [32]:
# One translation table does all the character clean-up in a single pass:
# a non-breaking space becomes a plain space; '.', ';' and ',' are deleted.
_cleanup_table = str.maketrans('\xa0', ' ', '.;,')

# Strip surrounding whitespace first, then apply the table to each paragraph.
filtered_cleaned_sections = [
    tag.text.strip().translate(_cleanup_table) for tag in filtered_sections
]
filtered_cleaned_sections[:10]

['Directives 2002/19/EC (4) 2002/20/EC (5) 2002/21/EC (6) and 2002/22/EC (7) of the European Parliament and of the Council have been substantially amended Since further amendments are to be made those Directives should be recast in the interests of clarity',
 'The functioning of the five Directives which are part of the existing regulatory framework for electronic communications networks and services namely Directives 2002/19/EC 2002/20/EC 2002/21/EC and 2002/22/EC and Directive 2002/58/EC of the European Parliament and of the Council (8) is subject to periodic review by the Commission with a view in particular to determining the need for modification in light of technological and market developments',
 'In its communication or 6 May 2015 setting out a Digital Single Market Strategy for Europe the Commission stated that its review of the telecommunications framework would focus on measures that aim to provide incentives for investment in high-speed broadband networks bring a more consi

In [33]:
from sentence_transformers import SentenceTransformer, util
from transformers import BertForQuestionAnswering, BertTokenizer, pipeline

def fetch_most_relevant_paragraph(question, data, embedder = None):
    """Return the entry of ``data`` that is most similar to ``question``.

    The question and every entry of ``data`` are embedded with ``embedder``
    and compared by cosine similarity; the entry with the highest score wins.

    Args:
        question: The question, as a ``str``.
        data: The knowledge base, as a non-empty ``list`` of strings.
        embedder: Object exposing ``encode``. When ``None``, a
            ``SentenceTransformer('all-MiniLM-L6-v2')`` is created.

    Returns:
        The best-matching element of ``data``, or ``None`` (after printing a
        short message) when the arguments cannot be used.
    """
    question_ok = isinstance(question, str)
    data_ok = isinstance(data, list)

    # Validate before touching the embedder so that bad input never triggers
    # a model load.
    if not (question_ok and data_ok):
        if question_ok:
            print("Please enter knowledgebase as a list of strings")
        elif data_ok:
            print("Please enter question as a string")
        else:
            print("Unknown error")
        return None

    # With nothing to rank there is no best paragraph to return.
    if not data:
        print("Knowledgebase is empty")
        return None

    if embedder is None:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')

    data_embeddings = embedder.encode(data)
    question_embedding = embedder.encode(question)
    cos_scores = util.pytorch_cos_sim(question_embedding, data_embeddings)

    # Convert the argmax result to a plain int before indexing the list.
    return data[int(cos_scores.argmax())]
        
def ask_question_to_paragraph(question, data, model = None, tokenizer = None):
    """Extract the answer span for ``question`` from the paragraph ``data``.

    The question/paragraph pair is tokenized and passed through an extractive
    question-answering model; the tokens between the predicted start and end
    positions are converted back into a string.

    Args:
        question: The question text.
        data: The paragraph text to search for the answer.
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``. When ``None``,
            ``BertForQuestionAnswering`` is loaded from
            ``"nlpaueb/legal-bert-base-uncased"``.
        tokenizer: Matching tokenizer. When ``None``, ``BertTokenizer`` is
            loaded from the same checkpoint.

    Returns:
        The predicted answer span as a string.
    """
    # NOTE(review): the default checkpoint name does not indicate QA
    # fine-tuning -- confirm it is suitable as a fallback.
    if model is None:
        model = BertForQuestionAnswering.from_pretrained("nlpaueb/legal-bert-base-uncased")
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

    inputs = tokenizer(question, data, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Only one sequence is encoded, so work on row 0 of each logits tensor.
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    start_index = int(torch.argmax(start_logits))
    # Search for the end at or after the start. Two independent argmaxes can
    # place the end before the start, which would produce an empty answer.
    end_index = start_index + int(torch.argmax(end_logits[start_index:]))

    answer_ids = inputs.input_ids[0][start_index:end_index + 1]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
    return answer

def refine_answer(answer,summarizer=None):
    """Rewrite an extracted answer with a summarization pipeline.

    Args:
        answer: The raw answer text to refine.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=...)``
            that returns a list whose first item has a ``'summary_text'`` key.
            When ``None``, a ``"summarization"`` pipeline backed by
            ``facebook/bart-large-cnn`` is created.

    Returns:
        The ``'summary_text'`` of the first result, or ``""`` when ``answer``
        is empty or whitespace only.
    """
    # Nothing to summarize: skip the model call entirely.
    if not answer or not answer.strip():
        return ""

    if summarizer is None:
        # Use the first GPU when one is available; otherwise -1 selects the
        # CPU, so the function also works on machines without CUDA.
        pipeline_device = 0 if torch.cuda.is_available() else -1
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=pipeline_device)

    summary = summarizer(answer, max_length=500, min_length=1, do_sample=False)
    return summary[0]['summary_text']

In [34]:
# Two sample questions about the Directive, wrapped for readability
# (adjacent string literals are concatenated into a single string).
question1 = (
    "How does the Directive redefine electronic communications services "
    "to address evolving technologies and ensure end-users rights are "
    "protected when using functionally equivalent online services?"
)
question2 = (
    "How often shall commission review the functioning of this Directive "
    "and report to the European Parliament and to the Council?"
)

# The question that the following cells will ask.
current_question = question2

In [35]:
def ask_question(current_question,filtered_cleaned_sections):
    """Answer ``current_question`` from the paragraphs in ``filtered_cleaned_sections``.

    Steps: pick the most relevant paragraph, extract an answer span from it,
    refine that span with the summarizer, then print and return the result.

    A model and tokenizer are first loaded from ``./legal-bert-finetuned``;
    if that fails, ``None`` is passed on so that
    ``ask_question_to_paragraph`` uses its own default checkpoint.

    Note: every call loads the models again, both here and in the helper
    functions, which are invoked without a pre-built embedder or summarizer.

    Args:
        current_question: The question, as a string.
        filtered_cleaned_sections: The knowledge base, as a list of strings.

    Returns:
        The refined answer string, or ``None`` when no paragraph could be
        selected for the question.
    """
    path = "./legal-bert-finetuned"
    try:
        model = BertForQuestionAnswering.from_pretrained(path)
        tokenizer = BertTokenizer.from_pretrained(path)
    except Exception as load_error:
        # Deliberate best-effort fallback, but made visible; unlike a bare
        # `except:` this lets KeyboardInterrupt/SystemExit propagate.
        print(f"Could not load fine-tuned model from {path} ({type(load_error).__name__}); using the default checkpoint")
        model = None
        tokenizer = None

    paragraph = fetch_most_relevant_paragraph(current_question,filtered_cleaned_sections)
    if paragraph is None:
        # Retrieval rejected the input and has already printed the reason;
        # running the QA model without a paragraph would only produce noise.
        return None

    answer = ask_question_to_paragraph(current_question,paragraph,model=model,tokenizer=tokenizer)
    refined_answer = refine_answer(answer)
    print(f"\nQuestion: {current_question}\nResponse: {refined_answer}")
    return refined_answer

In [36]:
# Run the whole pipeline once on the selected question; ask_question prints
# the question/response pair and returns the refined answer.
chatbot_answer = ask_question(
    current_question=current_question,
    filtered_cleaned_sections=filtered_cleaned_sections,
)

Your max_length is set to 500, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)



Question: How often shall commission review the functioning of this Directive and report to the European Parliament and to the Council?
Response: The commission shall review the functioning of this directive every five years thereafter. The commission shall report to the european parliament and to the council. The directive must be in place by 21 december 2025.


In [37]:
# Simple interactive loop: keep answering questions until the user types "exit".
user_input = None
while True:
    try:
        # Strip surrounding whitespace so that e.g. "exit " is still recognised.
        user_input = input("enter \"exit\" if you wish to exit or enter a question").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted prompt: leave without a traceback.
        break
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Blank input: ask again instead of running the models on nothing.
        continue
    ask_question(user_input, filtered_cleaned_sections)

Your max_length is set to 500, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)



Question: How often shall commission review the functioning of this Directive and report to the European Parliament and to the Council?
Response: The commission shall review the functioning of this directive every five years thereafter. The commission shall report to the european parliament and to the council. The directive must be in place by 21 december 2025.
