### Converting the given XML descriptor file into tsv file (Acquiring the annotated dataset)

In [4]:
import xml.etree.ElementTree as Xet # for parsing and creating XML data
import pandas as pd
import os

cols = ['ID', 'EN'] # will be saving in a tsv with ids and their corresponding terms
rows = []

# parsing the xml file
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "data\\en\\descriptors\\desc_en.xml")
xml_parse = Xet.parse(temp_path)
root = xml_parse.getroot()

# iterate through the elements of xml file
for element in root:
    rows.append({"ID": element.find("DESCRIPTEUR_ID").text, "EN": element.find("LIBELLE").text})

# creating the tsv file
df = pd.DataFrame(rows, columns=cols)
df.to_csv('eurovoc.tsv', sep='\t', index=False) # using sep='\t' gives us a tsv file instead of csv

In [5]:
df

Unnamed: 0,ID,EN
0,594,AAMS countries
1,759,abandoned child
2,4444,abandoned land
3,3509,ABM Agreement
4,4333,abolition of customs duties
...,...,...
6792,6252,Åland
6793,8005,Örebro county
6794,8004,Östergötland county
6795,7874,Šiauliai county


In [8]:
!pip install flair

Collecting flair
  Using cached flair-0.12.2-py3-none-any.whl (373 kB)
Collecting transformer-smaller-training-vocab>=0.2.1
  Using cached transformer_smaller_training_vocab-0.2.3-py3-none-any.whl (12 kB)
Collecting bpemb>=0.3.2
  Using cached bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting gdown==4.4.0
  Using cached gdown-4.4.0-py3-none-any.whl
Collecting hyperopt>=0.2.7
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting pytorch-revgrad
  Using cached pytorch_revgrad-0.2.0-py3-none-any.whl (4.6 kB)
Collecting FuzzyTM>=0.4.0
  Using cached FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume
  Using cached pyFUME-0.2.25-py3-none-any.whl (67 kB)
Collecting simpful
  Using cached simpful-2.10.0-py3-none-any.whl (31 kB)
Collecting fst-pso
  Using cached fst_pso-1.8.1-py3-none-any.whl
Collecting miniful
  Using cached miniful-0.0.6-py3-none-any.whl
Installing collected packages: simpful, miniful, hyperopt, pytorch-revgrad, gdown, fst-pso, pyfume, transformer-smalle

In [15]:
!pip install -q datasets transformers

In [16]:
# functions
def original_to_annotated_transformer():
    return None

def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities


In [14]:
from flair.data import Corpus # function?
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings # these embeddings helps NER to perform better
import pandas as pd
import re

# getting our data
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "src\\main\\eurovoc.tsv")
data = pd.read_csv(temp_path , sep='\t')

corpus_functions = Corpus(data)
tag_type = 'ner'
tag_dictionary = corpus_functions.make_label_dictionary(label_type = tag_type, train = True)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### Data Preparation


In [3]:
#####################################################
####################################################
###################################################

def TsvDicProcessing(path):
    # !!! It only works with a 2-columns TSV file
    Dic = {}
    RevDic = {}
    list1 = []
    list2 = []
    with open(path, 'rt', encoding='utf8') as csvfile:
        myreader = csv.reader(csvfile, delimiter='\t')
        rcount = 0
        for row in myreader:
            rcount += 1
            ccount = 0
            if rcount > 1:
                for cells in row:
                    ccount += 1
                    if ccount ==1:
                        list1.append(cells)
                        key = cells
                    else:
                        list2.append(cells)
                        value = cells
                Dic[key] = value
                RevDic[value] = key
    return Dic, RevDic, list1, list2


def FolderListWithTerminaison(terminaison):
    DocList = []
    for doc in os.listdir():
        if re.search (r'.*\%s$' % terminaison, doc) is not None:
            DocList.append(doc)
    return DocList

def FolderListToDic(List):
    Dic = {}
    # the input should be a list of file contained in a folder
    for FileName in List:
        print('importing', FileName, '...')
        with open("%s" % FileName, "r", encoding='utf8') as myfile:
            text = myfile.read()
        Dic[FileName]= text
    return Dic

def TokenCleaning(token, stemmer):
    token = token.lower()
    token = stemmer_en.stem(token)
    return token

def RegexFromTerm(term, stemmer):

    # Regex Opening
    ################
    regex = r"\b("

    # Adding terms to regex
    ########################
    tokensList = nltk.word_tokenize(term)
    # in case of one-word term
    if len(tokensList) == 1:
        for token in tokensList:
            regex += TokenCleaning(token, stemmer)
    # if it is a multi-word term
    else:
        decount = len(tokensList)
        for token in tokensList:
            decount = decount -1
            # add between-words
            if decount != len(tokensList)-1:
                regex+= r'\w*\W\w*\W*'
            # add token
            regex += TokenCleaning(token, stemmer)

    # Regex Closure
    ################
    regex += '''\w{0,5})(\W)'''

    return regex

In [0]:
import os, csv, re, nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt') # unsupervised trainable model, which means it can be trained on unlabeled data (Data that has not been tagged with information identifying its characteristics, properties, or categories is referred to as unlabeled data.)

# creation of a Eurovoc dictionary from the TSF
TsvFile = "eurovoc.tsv"


EurovocDic, EurovocReverseDic, URIList, ConceptList = TsvDicProcessing(TsvFile)
print(EurovocReverseDic)
print('Eurovoc importated!')

In [0]:
#=====================

# move folder

print('moving to corpus folder...')

# detection of TXT in the folder

# storing document content in a dictionary
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "data\\en\\directives_txt\\Directive_(EU)_2016_343_en.txt")
DocList = [temp_path]

DocumentDic = FolderListToDic(DocList)

#=====================

In [2]:
# tagging by researching concept-regexed as a substring of the text

stemmer_en = SnowballStemmer("english")

for DocName in DocList:
    tagsList=[]
    taggedText = ""
    print('tagging', DocName,'...')
    text = DocumentDic[DocName]
    text = text.lower()
    taggedText = text # document's initial text


#  a concept tag will be done with a star (*), and the identifier with a +

    for concept in ConceptList:

        if concept != "": # IMPORTANT TO AVOID TAGGING ANYTHING

            # REGEX CREATION
            regex = RegexFromTerm(concept, stemmer_en)

            ####################
            # TEMPORARY TAGGING#
            ####################

            # semantically neutral symbols are chosen to prevent eurovoc concepts from matching tags
            if re.search(regex, text) != None:
                tagsList.append(concept)
                subRegex = r""
                subRegex += r'''<:><,>'''
                subRegex += EurovocReverseDic[concept] # insert the identifier which will then be used for the URL
                subRegex += r'''</,>\1</:>\2'''
                taggedText = re.sub(regex, subRegex, taggedText)

    #############################
    # POST PROCESSING TO REPORT #
    # FINAL HYPERTEXT TAGGING   #
    #############################

    # create a new file with the tagged file
    file = open("%s_TAGGED.html" % DocName, "w", encoding='utf8')
    htmlReportText = re.sub(r'''<:><,>''', r'''<span style="background-color: #FFFF00"><a href="http://eurovoc.europa.eu/''', taggedText)
    htmlReportText = re.sub(r'''</,>''', r'''">''', htmlReportText)
    htmlReportText = re.sub(r'''</:>''', r'''</a></span>''', htmlReportText)
    file.write("<html><body>")
    file.write(htmlReportText)
    file.write("</body><html>")
    file.close()

    print(len(tagsList), 'concepts found:', tagsList)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dnaen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'AAMS countries': '594', 'abandoned child': '759', 'abandoned land': '4444', 'ABM Agreement': '3509', 'abolition of customs duties': '4333', 'abortion': '4504', 'Abruzzi': '5075', 'absenteeism': '5339', 'absolute majority': '1746', 'abstentionism': '5984', 'abuse of power': '186', 'academic freedom': '3914', 'access to a profession': '545', 'access to Community information': '5399', 'access to education': '280', 'access to information': '453', 'access to the courts': '5400', 'accession criteria': '6706', 'accession negotiations': '6708', 'accession to an agreement': '5420', 'accession to the European Union': '12', 'accident in the home': '5314', 'accident prevention': '5810', 'accidental pollution': '6413', 'account': '61', 'accountant': '60', 'accounting': '54', 'accounting entry': '1333', 'accounting system': '4362', 'acculturation': '4873', 'acid': '5035', 'acid rain': '4165', 'acidification': '6407', 'acoustics': '3291', 'ACP countries': '5083', 'ACP-EC Committee of Ambassadors': 

In [None]:
# tempt
import PyPDF2

#create file object variable
#opening method will be rb
pdffileobj=open('1.pdf','rb')

#create reader variable that will read the pdffileobj
pdfreader=PyPDF2.PdfFileReader(pdffileobj)

#This will store the number of pages of this pdf file
x=pdfreader.numPages

#create a variable that will select the selected number of pages
pageobj=pdfreader.getPage(x+1)

#(x+1) because python indentation starts with 0.
#create text variable which will store all text datafrom pdf file
text=pageobj.extractText()

#save the extracted data from pdf to a txt file
#we will use file handling here
#dont forget to put r before you put the file path
#go to the file location copy the path by right clicking on the file
#click properties and copy the location path and paste it here.
#put "\\your_txtfilename"
file1=open(r"C:\Users\SIDDHI\AppData\Local\Programs\Python\Python38\\1.txt","a")
file1.writelines(text)