In [1]:
import multiprocessing
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import PyPDF2
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm
W0613 10:21:16.650000 17276 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
# Directory that holds the raw documents to classify.
FOLDER_PATH = "./data/bronze"

# os.listdir() returns entries in arbitrary order and also lists
# sub-directories. Keep regular files only and sort them so every run
# processes the same documents in the same order.
docs = sorted(
    name
    for name in os.listdir(FOLDER_PATH)
    if os.path.isfile(os.path.join(FOLDER_PATH, name))
)

In [8]:
def read_pdf_file(file_path, safe=True):
    """Return the text extracted from the first page of a PDF file.

    Only page 0 is read; any further pages are ignored.

    Args:
        file_path: Path of the PDF to open (opened in binary mode).
        safe: When true (the default), any exception raised while opening or
            parsing the file is swallowed and an empty string is returned.
            When false, the exception propagates to the caller.

    Returns:
        The result of ``extract_text()`` for page 0, or ``""`` when reading
        fails in safe mode.
    """
    try:
        with open(file_path, "rb") as pdf_handle:
            first_page = PyPDF2.PdfReader(pdf_handle).pages[0]
            return first_page.extract_text()
    except Exception as exc:
        # Strict mode: let the caller see the original error.
        if not safe:
            raise exc
        return ""
        
def get_main_content(doc):
    """Extract the main body from the raw text of a document.

    Args:
        doc: Text extracted from a document; may be empty or ``None``.

    Returns:
        * ``"No content"`` when ``doc`` is falsy.
        * ``doc`` unchanged when it contains no ``"Subject:"`` marker.
        * Otherwise the text after the first ``"Subject:"`` and before the
          next ``"Submitted:"`` marker, with surrounding whitespace stripped.
    """
    if not doc:
        return "No content"
    if "Subject:" not in doc:
        return doc
    # maxsplit=1: split on the first marker only, so a later "Subject:" inside
    # the body does not truncate the extracted text.
    after_subject = doc.split("Subject:", 1)[1]
    return after_subject.split("Submitted:", 1)[0].strip()

In [19]:
# Zero-shot classifier backed by facebook/bart-large-mnli. Use the GPU when
# one is available and fall back to the CPU otherwise, so the notebook still
# runs on machines without CUDA.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [20]:
# First iteration of the label set: fourteen short, single-topic labels.
# NOTE(review): not referenced by any cell visible in this notebook; the
# classification cells use `candidate_labels`.
candidate_labels_v1 = [
    "foreign policy",
    "technology",
    "finance",
    "human rights",
    "fishing",
    "environment",
    "energy",
    "health",
    "agriculture",
    "transport",
    "industry",
    "social",
    "education",
    "culture",
]

# Second iteration of the label set: twenty-five fine-grained labels. The
# trailing comment on each entry is a reader note describing the intended
# scope of the label; it is not part of the label string.
# NOTE(review): not referenced by any cell visible in this notebook; the
# classification cells use `candidate_labels`.
candidate_labels_v2 = [
    "foreign relations",  # Diplomacy, geopolitical issues, intergovernmental agreements
    "trade and market",  # International trade, internal market, customs, tariffs
    "EU governance",  # Institutional affairs, EU law, rule of law, Council/Commission actions
    "security and defense",  # Military, terrorism, EU security policy, intelligence
    "migration and borders",  # Asylum, immigration, border control, refugees
    "justice and civil rights",  # Legal matters, fundamental rights, judicial cooperation
    "economic policy",  # Macroeconomics, budgets, fiscal policy, eurozone
    "taxation and finance",  # Tax policy, banking, financial regulation
    "employment and labor",  # Workers' rights, wages, job creation, working conditions
    "digital and technology",  # ICT, internet, AI, data protection, digital markets
    "research and innovation",  # Scientific research, Horizon Europe, academia-industry links
    "industry and enterprise",  # Manufacturing, SMEs, industrial strategy
    "energy and resources",  # Energy supply, renewables, energy security, raw materials
    "environment and climate",  # Climate change, emissions, environmental protection
    "agriculture and fisheries",  # Farming policy, CAP, rural development, fishing quotas
    "transport and infrastructure",  # Roads, rail, aviation, public transport, logistics
    "health and public safety",  # Public health, pandemics, food safety, pharmaceuticals
    "education and youth",  # Schools, universities, Erasmus, children’s rights
    "culture and media",  # Heritage, arts, journalism, disinformation
    "social affairs and welfare",  # Social protection, poverty, housing, inclusion
    "equality and minorities",  # Gender equality, LGBTIQ, ethnic/religious minorities
    "development and aid",  # Development policy, humanitarian aid, global cooperation
    "regional and urban policy",  # Cohesion funds, urban planning, regional investment
    "sports and leisure",  # Sports policy, events, youth engagement
    "religious and ethical issues",  # Religion, bioethics, animal rights, moral debates
]

# Third iteration of the label set: eight broad, combined labels. The
# trailing comments are reader notes on the intended scope of each label.
# NOTE(review): not referenced by any cell visible in this notebook; the
# classification cells use `candidate_labels`.
candidate_labels_v3 = [
    "foreign and security affairs",  # Diplomacy, war/conflict, development aid, defense, migration, borders
    "economic and monetary policy",  # Budget, taxation, employment, macroeconomics, eurozone
    "trade and internal market",  # Trade, SMEs, competition, customs, market regulation
    "digital and innovation",  # Tech, AI, data, research, digital infrastructure
    "infrastructure and industry",  # Transport, energy, manufacturing, logistics, industrial strategy
    "environment and climate",  # Climate policy, sustainability, agriculture, biodiversity
    "health and social policy",  # Healthcare, public safety, social protection, housing, labor
    "education, culture, and rights",  # Schools, universities, arts, media, civil rights, minorities, equality
]

# Label set passed to the classifier by the classification cells below. Only
# the label strings are handed to the model; the trailing comments are reader
# notes.
# NOTE(review): several notes are identical to the ones on the broader
# candidate_labels_v3 entries and overlap other labels in this list (e.g.
# "agriculture" under "environment and climate", "civil rights, minorities,
# equality" under "education") — confirm the intended scope of each label.
candidate_labels = [
    "foreign and security affairs",  # Diplomacy, war/conflict, development aid, defense, migration, borders
    "economics and trade",  # Budget, taxation, employment, macroeconomics, eurozone
    "technology",  # Tech, AI, data, research, digital infrastructure
    "infrastructure and industry",  # Transport, energy, manufacturing, logistics, industrial strategy
    "environment and climate",  # Climate policy, sustainability, agriculture, biodiversity
    "health",  # Healthcare, public safety, social protection, housing, labor
    "human rights",  # Civil rights, minorities, equality
    "education",  # Schools, universities, arts, media, civil rights, minorities, equality
    "agriculture",  # Farming policy, CAP, rural development, fishing quotas
]

In [21]:
# Build the dataset: read every document in parallel, then keep only the main
# content of each one. (Imports live in the first cell of the notebook.)

# Use all CPU cores but one, leaving a core free for the rest of the system.
n_cores = max(1, multiprocessing.cpu_count() - 1)

# executor.map yields results in input order, so doc_texts[i] belongs to docs[i].
with ThreadPoolExecutor(max_workers=n_cores) as executor:
    doc_texts = list(
        executor.map(lambda doc: read_pdf_file(os.path.join(FOLDER_PATH, doc)), docs)
    )

# Strip only the final extension so file names that contain additional dots
# keep their full stem as the document id.
doc_ids = [os.path.splitext(doc)[0] for doc in docs]

dataset = Dataset.from_dict({
    "id": doc_ids,
    "sequence": [get_main_content(doc) for doc in doc_texts]
})


In [22]:
# Drop documents without usable text: empty or whitespace-only sequences and
# the "No content" placeholder that get_main_content returns when a document
# could not be read. Comparing against "" alone lets the placeholder through,
# and it would then be classified as if it were a real document.
dataset = dataset.filter(lambda x: x["sequence"].strip() not in ("", "No content"))

Filter: 100%|██████████| 72155/72155 [00:00<00:00, 225238.91 examples/s]


In [23]:


# Classify the dataset in chunks so tqdm can report progress.
# NOTE(review): no batch_size is passed to the pipeline call itself — confirm
# whether forwarding one would speed up inference on the GPU.
batch_size = 8
results = []

for i in tqdm(range(0, len(dataset), batch_size)):
    # Slicing the dataset yields a dict mapping column name -> list of values.
    batch = dataset[i:i + batch_size]
    sequences = batch["sequence"]

    # multi_label=True is forwarded to the zero-shot pipeline.
    batch_results = classifier(sequences, candidate_labels, multi_label=True)

    # Guard: depending on the transformers version a single-sequence call may
    # return one dict instead of a list of dicts (TODO confirm); normalise it
    # so the loop below never iterates over dict keys.
    if isinstance(batch_results, dict):
        batch_results = [batch_results]

    # One output row per document: id, text, then one score column per label.
    for doc_id, sequence, resp in zip(batch["id"], sequences, batch_results):
        results.append({
            "id": doc_id,
            "sequence": sequence,
            **dict(zip(resp["labels"], resp["scores"])),
        })

# Alias kept because the export cell reads `data`.
data = results


100%|██████████| 9017/9017 [28:58:18<00:00, 11.57s/it]   


In [26]:
# Persist the per-document label scores. Create the output folder first so
# the write does not fail when data/silver does not exist yet.
os.makedirs("data/silver", exist_ok=True)
pd.DataFrame(data).to_csv("data/silver/questions_topics.csv", index=False)

In [30]:
# Reload the saved scores from disk and display summary statistics,
# transposed so that each described column becomes a row.
df = pd.read_csv("data/silver/questions_topics.csv")
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
human rights,72134.0,0.420413,0.310137,0.000138,0.148837,0.342849,0.680071,0.999586
foreign and security affairs,72134.0,0.274111,0.268652,7.5e-05,0.060868,0.1754,0.413403,0.999188
health,72134.0,0.355757,0.300915,9.2e-05,0.104783,0.26823,0.543292,0.998705
infrastructure and industry,72134.0,0.28847,0.206414,0.0001,0.128469,0.251861,0.407741,0.997486
agriculture,72134.0,0.185943,0.254229,4.1e-05,0.028846,0.086915,0.210658,0.999335
technology,72134.0,0.272025,0.227871,0.000164,0.1117,0.201599,0.352736,0.99913
environment and climate,72134.0,0.313685,0.285408,5e-05,0.082845,0.220895,0.468852,0.998911
economics and trade,72134.0,0.325959,0.290762,4.2e-05,0.077889,0.234852,0.51792,0.998896
education,72134.0,0.139724,0.181458,3.7e-05,0.031796,0.082332,0.171823,0.999166


In [104]:
from random import choice


# Spot-check: classify the main content of one randomly chosen document, then
# print its text followed by every label with its score.
resp = classifier(
    get_main_content(choice(doc_texts)),
    candidate_labels,
    multi_label=True,
)
# Text first, then two blank lines before the scores.
print(resp['sequence'], end='\n\n\n')
for label, score in zip(resp['labels'], resp['scores']):
    print(f"{label}: {score}")

Violation of freedom of expression by the occupying regime in Cyprus   
According to the issue of  ‘Fileleftheros’ newspaper of 31/7/2018, the Turkish government is 
persecuting and putting on trial the Turkish Cypriot journalists, Şener Levent and Ali Osman, of the 
newspaper ‘Afrika’.  Levent, a citizen of the Republic of Cyprus living in the occupied areas, is editor -
in-chief of this newspaper. Despite the persecution and threats of the occupying regime and Ankara,  
most recently an attempt to have him lynched on 22 January 2018, he has continued to issue it, 
strongly criticising the illegal Turkish occupation of Cyprus. The charges concern the front pag e of the 
newspaper ‘Afrika’, which described the Turkish military intervention of Afrin in Syria as ‘Turkey’s 
second invasion move’ and considered the Turkish invasion of Cyprus in 1974 to have been Turkey’s 
‘first invasion’, as well as an article by Leven t in which he inveighed against the Turkish government.  
 
 It should

In [None]:
# Sample question texts kept as string constants for ad-hoc classifier checks.
# The literals are reproduced as-is, including irregular spacing.
# NOTE(review): text1 is not referenced by any cell visible in this notebook.
text1 = """ Current situation in resolving the problems relating to the disbursement of direct 
agricultural payments in Slovakia
In December 2018, the CONT Committee carried out a fact-finding mission to Slovakia to investigate 
accusations by farmers from Eastern Slovakia regarding problems with the right to direct payments 
and observance of cross-compliance rules under the EU’s common agricultural policy. However, the 
situation has still not been resolved satisfactorily and in many cases direct payments have still not 
been made to eligible applicants.
Recently, we became aware of media reports about possible wrongdoing by some (former) 
employees of the PPA (Agricultural Paying Agency), who are suspected of a conflict of interest, of 
providing sensitive information to external persons, and of manipulating files or halting conflict and 
risk management processes within the agency.
The Commission states that it is monitoring and analysing the situation at the PPA in connection with 
implementation of the transformation plan and fulfilment of the accreditation criteria.
1. What conclusions has the Commission drawn from its observations?
2. What steps will the Commission take to resolve the situation in Slovakia?
3. Will this situation not jeopardise the position of the PPA with regard to its accreditation?
Supporter1
1 This question is supported by a Member other than the author: Tomáš Zdechovský (PPE)

"""

# text2 is the short sample that the following cell classifies.
text2 = """ Macedonian minority  
If a Member State contravenes Article 2 of the Treaty on European Union and Article 21 of the Charter 
of Fundamental Rights of the EU, and the Commission does not intervene, is the Commission 
comply ing with its obligations as the ‘guardian ’ of the Treaties?
"""

In [85]:
# Classify the short "Macedonian minority" sample. Reuse the text2 constant
# defined in the previous cell instead of repeating the same literal here.
classifier(text2, candidate_labels, multi_label=True)

{'sequence': ' Macedonian minority  \nIf a Member State contravenes Article 2 of the Treaty on European Union and Article 21 of the Charter \nof Fundamental Rights of the EU, and the Commission does not intervene, is the Commission \ncomply ing with its obligations as the ‘guardian ’ of the Treaties?\n\n\n\n',
 'labels': ['human rights',
  'foreign and security affairs',
  'environment and climate',
  'health',
  'infrastructure and industry',
  'agriculture',
  'economics and trade',
  'education',
  'technology'],
 'scores': [0.8970650434494019,
  0.15330223739147186,
  0.11451103538274765,
  0.02921704761683941,
  0.01909625716507435,
  0.01496758870780468,
  0.014105464331805706,
  0.013367978855967522,
  0.005283644888550043]}