### Converting the given XML descriptor file into tsv file (Acquiring the annotated dataset)

In [4]:
import xml.etree.ElementTree as Xet # for parsing and creating XML data
import pandas as pd
import os

cols = ['ID', 'EN'] # will be saving in a tsv with ids and their corresponding terms
rows = []

# parsing the xml file
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "data\\en\\descriptors\\desc_en.xml")
xml_parse = Xet.parse(temp_path)
root = xml_parse.getroot()

# iterate through the elements of xml file
for element in root:
    rows.append({"ID": element.find("DESCRIPTEUR_ID").text, "EN": element.find("LIBELLE").text})

# creating the tsv file
df = pd.DataFrame(rows, columns=cols)
df.to_csv('eurovoc.tsv', sep='\t', index=False) # using sep='\t' gives us a tsv file instead of csv

In [5]:
df

Unnamed: 0,ID,EN
0,594,AAMS countries
1,759,abandoned child
2,4444,abandoned land
3,3509,ABM Agreement
4,4333,abolition of customs duties
...,...,...
6792,6252,Åland
6793,8005,Örebro county
6794,8004,Östergötland county
6795,7874,Šiauliai county


In [8]:
!pip install flair

Collecting flair
  Using cached flair-0.12.2-py3-none-any.whl (373 kB)
Collecting transformer-smaller-training-vocab>=0.2.1
  Using cached transformer_smaller_training_vocab-0.2.3-py3-none-any.whl (12 kB)
Collecting bpemb>=0.3.2
  Using cached bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting gdown==4.4.0
  Using cached gdown-4.4.0-py3-none-any.whl
Collecting hyperopt>=0.2.7
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Collecting pytorch-revgrad
  Using cached pytorch_revgrad-0.2.0-py3-none-any.whl (4.6 kB)
Collecting FuzzyTM>=0.4.0
  Using cached FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume
  Using cached pyFUME-0.2.25-py3-none-any.whl (67 kB)
Collecting simpful
  Using cached simpful-2.10.0-py3-none-any.whl (31 kB)
Collecting fst-pso
  Using cached fst_pso-1.8.1-py3-none-any.whl
Collecting miniful
  Using cached miniful-0.0.6-py3-none-any.whl
Installing collected packages: simpful, miniful, hyperopt, pytorch-revgrad, gdown, fst-pso, pyfume, transformer-smalle

In [15]:
!pip install -q datasets transformers

In [16]:
# functions
def original_to_annotated_transformer():
    return None

def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities


In [14]:
from flair.data import Corpus # function?
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings # these embeddings helps NER to perform better
import pandas as pd
import re

# getting our data
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "src\\main\\eurovoc.tsv")
data = pd.read_csv(temp_path , sep='\t')

corpus_functions = Corpus(data)
tag_type = 'ner'
tag_dictionary = corpus_functions.make_label_dictionary(label_type = tag_type, train = True)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### Data Preparation


In [3]:
#####################################################
####################################################
###################################################

def TsvDicProcessing(path):
    # !!! It only works with a 2-columns TSV file
    Dic = {}
    RevDic = {}
    list1 = []
    list2 = []
    with open(path, 'rt', encoding='utf8') as csvfile:
        myreader = csv.reader(csvfile, delimiter='\t')
        rcount = 0
        for row in myreader:
            rcount += 1
            ccount = 0
            if rcount > 1:
                for cells in row:
                    ccount += 1
                    if ccount ==1:
                        list1.append(cells)
                        key = cells
                    else:
                        list2.append(cells)
                        value = cells
                Dic[key] = value
                RevDic[value] = key
    return Dic, RevDic, list1, list2


def FolderListWithTerminaison(terminaison):
    DocList = []
    for doc in os.listdir():
        if re.search (r'.*\%s$' % terminaison, doc) is not None:
            DocList.append(doc)
    return DocList

def FolderListToDic(List):
    Dic = {}
    # the input should be a list of file contained in a folder
    for FileName in List:
        print('importing', FileName, '...')
        with open("%s" % FileName, "r", encoding='utf8') as myfile:
            text = myfile.read()
        Dic[FileName]= text
    return Dic

def TokenCleaning(token, stemmer):
    token = token.lower()
    token = stemmer_en.stem(token)
    return token

def RegexFromTerm(term, stemmer):

    regex = r"\b(" # Regex Opening

    # Adding terms to regex
    tokensList = nltk.word_tokenize(term)

    # in case of one-word term
    if len(tokensList) == 1:
        for token in tokensList:
            regex += TokenCleaning(token, stemmer)
    # if it is a multi-word term
    else:
        decount = len(tokensList)
        for token in tokensList:
            decount = decount-1
            # add between-words
            if decount != len(tokensList)-1:
                regex+= r'\w*\W\w*\W*'
            # add token
            regex += TokenCleaning(token, stemmer)


    regex += '''\w{0,5})(\W)''' # Regex Closure
    return regex

In [9]:
import os, csv, re, nltk
from nltk.stem.snowball import SnowballStemmer
from itertools import islice

nltk.download('punkt') # unsupervised trainable model, which means it can be trained on unlabeled data (Data that has not been tagged with information identifying its characteristics, properties, or categories is referred to as unlabeled data.)

# creation of a Eurovoc dictionary from the TSF
TsvFile = "eurovoc.tsv"

# getting info of ids and concepts
EurovocDic, EurovocReverseDic, URIList, ConceptList = TsvDicProcessing(TsvFile)
print('Eurovoc importated!')

EurovocDic:
{'594': 'AAMS countries', '759': 'abandoned child', '4444': 'abandoned land', '3509': 'ABM Agreement', '4333': 'abolition of customs duties', '4504': 'abortion', '5075': 'Abruzzi', '5339': 'absenteeism', '1746': 'absolute majority', '5984': 'abstentionism'}

EurovocReverseDic:
{'AAMS countries': '594', 'abandoned child': '759', 'abandoned land': '4444', 'ABM Agreement': '3509', 'abolition of customs duties': '4333', 'abortion': '4504', 'Abruzzi': '5075', 'absenteeism': '5339', 'absolute majority': '1746', 'abstentionism': '5984'}

URIList:
['594', '759', '4444', '3509', '4333']

ConceptList:
['AAMS countries', 'abandoned child', 'abandoned land', 'ABM Agreement', 'abolition of customs duties']

Eurovoc importated!


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dnaen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
#=====================

# move folder

print('moving to corpus folder...')

# detection of TXT in the folder

# storing document content in a dictionary
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "data\\en\\directives_txt\\Directive_(EU)_2016_343_en.txt")
DocList = [temp_path]

DocumentDic = FolderListToDic(DocList)
#=====================

In [16]:
# tagging by researching concept-regexed as a substring of the text

stemmer_en = SnowballStemmer("english")

for DocName in DocList:
    tagsList=[]
    taggedText = ""
    print('tagging', DocName,'...')
    text = DocumentDic[DocName]
    text = text.lower()
    taggedText = text # document's initial text


#  a concept tag will be done with a star (*), and the identifier with a +
    for concept in ConceptList:

        if concept != "": # IMPORTANT TO AVOID TAGGING ANYTHING
            # REGEX CREATION: creating regex of the concept such that it can be used to search in doc later
            regex = RegexFromTerm(concept, stemmer_en)

            # TAGGING#
            # semantically neutral symbols are chosen to prevent eurovoc concepts from matching tags
            if re.search(regex, text) != None:
                tagsList.append(concept)
                subRegex = r"[" + concept + r"]"
                subRegex += r"(" + EurovocReverseDic[concept] + r") " # insert the identifier
                taggedText = re.sub(regex, subRegex, taggedText)

# create a new file with the tagged file
    file = open("%s_TAGGED.txt" % DocName, "w", encoding='utf8')
    file.write(taggedText)
    file.close()


tagging C:\Users\dnaen\PycharmProjects\bachelor_thesis_23\data\en\directives_txt\Directive_(EU)_2016_343_en.txt ...


In [26]:
import re
def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M) # used to compile a regular expression pattern provided as a string into a regex pattern object
    # flags: re.IGNORECASE and re.MULTILINE

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token) # if no match then returns None

        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O")) # no match

    return tokens_with_entities

In [27]:
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "data\\en\\directives_txt\\Directive_(EU)_2016_343_en.txt_TAGGED.txt")


with open("%s" % temp_path, "r", encoding='utf8') as myfile:
    temp_text = myfile.read()

temp_text_entities = get_tokens_with_entities(temp_text)

In [29]:
class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        from datasets import Dataset, Features, Value, ClassLabel, Sequence
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

In [0]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

dm = NERDataMaker(temp_text.split("\n"))

In [32]:
print(f"total examples = {len(dm)}")
print(dm[0:20])


total examples = 702
[{'id': 0, 'ner_tags': [0], 'tokens': ['11.3.2016']}, {'id': 1, 'ner_tags': [0], 'tokens': ['']}, {'id': 2, 'ner_tags': [0], 'tokens': ['en']}, {'id': 3, 'ner_tags': [0], 'tokens': ['']}, {'id': 4, 'ner_tags': [14, 109, 0, 0, 59, 0], 'tokens': ['Official', 'Journal', 'of', 'the', 'EURES', 'union']}, {'id': 5, 'ner_tags': [0], 'tokens': ['']}, {'id': 6, 'ner_tags': [0, 0], 'tokens': ['l', '65/1']}, {'id': 7, 'ner_tags': [0], 'tokens': ['']}, {'id': 8, 'ner_tags': [0], 'tokens': ['i']}, {'id': 9, 'ner_tags': [0, 0], 'tokens': ['([legislation](1589)', 'acts)']}, {'id': 10, 'ner_tags': [0], 'tokens': ['']}, {'id': 11, 'ner_tags': [67, 67, 0, 0, 0, 0, 59, 32, 0, 0, 0, 0], 'tokens': ['directive', 'directive', '(eu)', '2016/343', 'of', 'the', 'EURES', 'parliament', 'and', 'of', 'the', 'council']}, {'id': 12, 'ner_tags': [0, 0, 22, 0], 'tokens': ['of', '9', 'Marches', '2016']}, {'id': 13, 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['on', 'th

In [33]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    weight_decay=0.01,
)

train_ds = dm.as_hf_dataset(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=train_ds, # eval on training set! ONLY for DEMO!!
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Map:   0%|          | 0/702 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# tempt
import PyPDF2

#create file object variable
#opening method will be rb
pdffileobj=open('1.pdf','rb')

#create reader variable that will read the pdffileobj
pdfreader=PyPDF2.PdfFileReader(pdffileobj)

#This will store the number of pages of this pdf file
x=pdfreader.numPages

#create a variable that will select the selected number of pages
pageobj=pdfreader.getPage(x+1)

#(x+1) because python indentation starts with 0.
#create text variable which will store all text datafrom pdf file
text=pageobj.extractText()

#save the extracted data from pdf to a txt file
#we will use file handling here
#dont forget to put r before you put the file path
#go to the file location copy the path by right clicking on the file
#click properties and copy the location path and paste it here.
#put "\\your_txtfilename"
file1=open(r"C:\Users\SIDDHI\AppData\Local\Programs\Python\Python38\\1.txt","a")
file1.writelines(text)