# NER

### 1. Converting the given XML descriptor file into a TSV file

In [1]:
# imports
import xml.etree.ElementTree as Xet # for parsing and creating XML data
import pandas as pd
import os, csv, re, nltk
from flair.data import Corpus # in order to use the functions tha flair has
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings # these embeddings helps NER to perform better
from nltk.stem.snowball import SnowballStemmer
from itertools import islice


Before running the cell below, the "desc_en.xml" file (downloaded from the EuroVoc website) must be added to the package folder "data/en/descriptors/...".

In [2]:
# Column layout of the output TSV: descriptor id and its English term.
cols = ['ID', 'EN']

# Build the path of the EuroVoc descriptor file by swapping the "src\main"
# part of the current working directory for the descriptor location.
temp_path = os.getcwd().replace("src\\main", "data\\en\\descriptors\\desc_en.xml")
xml_parse = Xet.parse(temp_path)
root = xml_parse.getroot()

# One row per direct child of the XML root: its id and its label text.
rows = [
    {"ID": element.find("DESCRIPTEUR_ID").text, "EN": element.find("LIBELLE").text}
    for element in root
]

# sep='\t' makes to_csv write a TSV file instead of a CSV file.
df = pd.DataFrame(rows, columns=cols)
df.to_csv('eurovoc.tsv', sep='\t', index=False)

Now we have a new 2-column (ID, EN) TSV file.

In [3]:
df

Unnamed: 0,ID,EN
0,594,AAMS countries
1,759,abandoned child
2,4444,abandoned land
3,3509,ABM Agreement
4,4333,abolition of customs duties
...,...,...
6792,6252,Åland
6793,8005,Örebro county
6794,8004,Östergötland county
6795,7874,Šiauliai county


### 2. Data Preparation - (Acquiring the annotated dataset)

In [4]:
# FUNCTIONS
# -*- coding: utf-8 -*-


# this function assumes the text is annotated as [entity_value](entity_name) and assigns the prefixes B- and I- to entity tokens and the label O (the letter) to every other token
def get_tokens_with_entities(raw_text: str):
    """Split annotated text into tokens and attach a BIO label to each one.

    Annotations are expected in the form ``[entity_value](entity_name)``.

    :param raw_text: text that may contain ``[value](entity)`` annotations
    :return: list of ``(token, label)`` tuples. Unannotated tokens get the
        label ``"O"``; the first token of an annotated value gets
        ``"B-<entity>"`` and every following token ``"I-<entity>"``.

    Notes:
        * Each whitespace character is a separator, so consecutive whitespace
          yields empty-string tokens labelled ``"O"``.
        * The annotation must start at the beginning of a token. A token such
          as ``([legislation](1589)`` is therefore kept whole and labelled
          ``"O"``.
    """
    # Split on whitespace, except whitespace inside square brackets, so a
    # multi-word entity value stays in one raw token for now.
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # Pattern for our notation [entity_value](entity_name).
    # Flags: re.IGNORECASE and re.MULTILINE.
    entity_value_pattern_compiled = re.compile(
        r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)", flags=re.I | re.M
    )

    tokens_with_entities = []
    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)  # None when the token is not annotated
        if match is None:
            tokens_with_entities.append((raw_token, "O"))
            continue

        raw_entity_name = match.group("entity")
        # B- marks the beginning of an entity, I- a continuation of it.
        # Raw string: "\s" in a normal string literal is an invalid escape.
        for i, raw_entity_token in enumerate(re.split(r"\s", match.group("value"))):
            entity_prefix = "B" if i == 0 else "I"
            tokens_with_entities.append((raw_entity_token, f"{entity_prefix}-{raw_entity_name}"))

    return tokens_with_entities

# NLTK VERSION
def regex_from_term_nltk(term, stemmer):
    """Build a regex source string that loosely matches *term* in running text.

    The term is tokenized with ``nltk.word_tokenize`` and every token is
    lower-cased and stemmed through ``TokenCleaning``.

    :param term: the concept to search for (one or several words)
    :param stemmer: stemmer object handed on to ``TokenCleaning``
    :return: regex source with two groups: group 1 is the matched term,
        group 2 the single non-word character that follows it
    """
    # Escape each stem so regex metacharacters inside a term (parentheses,
    # dots, plus signs, ...) are matched literally instead of changing the
    # structure of the pattern.
    stems = [re.escape(TokenCleaning(token, stemmer)) for token in nltk.word_tokenize(term)]

    # Between two consecutive stems: the rest of the current word, one
    # mandatory non-word character, then optional word / non-word characters.
    between_words = r"\w*\W\w*\W*"

    # After the last stem allow up to five more word characters (inflection),
    # then require one non-word character.
    return r"\b(" + between_words.join(stems) + r"\w{0,5})(\W)"

def TokenCleaning(token, stemmer):
    """Lower-case *token* and reduce it to its stem.

    :param token: a single token
    :param stemmer: object exposing ``stem(str) -> str``
    :return: the stemmed, lower-cased token
    """
    # Use the stemmer that was passed in; the previous version silently relied
    # on a global named ``stemmer_en`` and ignored this parameter.
    return stemmer.stem(token.lower())

# Functions for document processing, taken from @https://github.com/shashankmc/eurovoc_entity_link/blob/master/EurovocTagger.py
def TsvDicProcessing(path):
    """Load a two-column TSV file into lookup dictionaries and column lists.

    The first row is treated as a header and skipped. Rows with fewer than
    two cells (for example blank lines) are ignored; previously such a row
    re-used the key/value of the preceding row, or raised when it came first.

    :param path: path of the TSV file
    :return: tuple ``(Dic, RevDic, list1, list2)`` where ``Dic`` maps the
        first cell of a row to its last cell, ``RevDic`` is the reverse
        mapping, ``list1`` holds the first cells and ``list2`` all other cells
    """
    # !!! It is designed for a 2-columns TSV file
    Dic = {}
    RevDic = {}
    list1 = []
    list2 = []
    # newline='' is what the csv module asks for when reading from a file object
    with open(path, 'rt', encoding='utf8', newline='') as csvfile:
        myreader = csv.reader(csvfile, delimiter='\t')
        next(myreader, None)  # skip the header row (no-op on an empty file)
        for row in myreader:
            if len(row) < 2:
                continue
            key, *rest = row
            value = rest[-1]
            list1.append(key)
            list2.extend(rest)
            Dic[key] = value
            RevDic[value] = key
    return Dic, RevDic, list1, list2

def FolderListWithTerminaison(terminaison):
    """List the entries of the current working directory ending with a suffix.

    :param terminaison: suffix to look for, e.g. ``".txt"``
    :return: list of matching entry names, in ``os.listdir()`` order
    """
    # A plain suffix test replaces the former regex r'.*\%s$': that pattern
    # only escaped the first character of the suffix, so e.g. "txt" turned
    # into "\t" + "xt" (a tab) and matched nothing.
    return [doc for doc in os.listdir() if doc.endswith(terminaison)]

def FolderListToDic(List):
    """Read every file named in *List* and map each name to its content.

    :param List: iterable of file names / paths (UTF-8 text files)
    :return: dictionary ``{file name: full text of the file}``
    """
    contents = {}
    for file_name in List:
        print('importing', file_name, '...')
        with open("%s" % file_name, "r", encoding='utf8') as handle:
            contents[file_name] = handle.read()
    return contents

#### 2.1 Creation of dictionary: Concepts (descriptors) and their ids


In [15]:
# nltk.download('punkt') # unsupervised trainable model, which means it can be trained on unlabeled data (Data that has not been tagged with information identifying its characteristics, properties, or categories is referred to as unlabeled data.)
# Switch the working directory to "src\main": if the current directory contains
# "data\en\directives_txt_tagged" that part is replaced, otherwise the path is
# left unchanged and chdir stays where it is.
current_path = os.getcwd()
data_path = current_path.replace("data\\en\\directives_txt_tagged", "src\\main")
os.chdir(data_path)
print(os.getcwd())

# creation of a Eurovoc dictionary from the TSV file (looked up relative to the
# working directory set above)
TsvFile = "eurovoc_en.tsv"

# TsvDicProcessing returns, in this order: {first column: second column},
# the reverse mapping, the list of first-column cells and the list of
# second-column cells -> ids and concepts
EurovocDic, EurovocReverseDic, URIList, ConceptList = TsvDicProcessing(TsvFile)
print('Eurovoc importated!')

C:\Users\dnaen\PycharmProjects\bachelor_thesis_23\src\main
Eurovoc importated!


In [13]:
ConceptList

['AAMS countries',
 'abandoned child',
 'abandoned land',
 'ABM Agreement',
 'abolition of customs duties',
 'abortion',
 'Abruzzi',
 'absenteeism',
 'absolute majority',
 'abstentionism',
 'abuse of power',
 'academic freedom',
 'access to a profession',
 'access to Community information',
 'access to education',
 'access to information',
 'access to the courts',
 'accession criteria',
 'accession negotiations',
 'accession to an agreement',
 'accession to the European Union',
 'accident in the home',
 'accident prevention',
 'accidental pollution',
 'account',
 'accountant',
 'accounting',
 'accounting entry',
 'accounting system',
 'acculturation',
 'acid',
 'acid rain',
 'acidification',
 'acoustics',
 'ACP countries',
 'ACP-EC Committee of Ambassadors',
 'ACP-EC Convention',
 'ACP-EC Council of Ministers',
 'ACP-EC institution',
 'ACP-EC Joint Assembly',
 'ACP-EC Joint Committee',
 'acquisition of property',
 'action brought before an administrative court',
 'action brought before

In [14]:
# Run this cell for multiple documents
# os.chdir('data\\en\\directives_txt') # change path
# DocList = FolderListWithTerminaison('.txt') # detection of txt files in the folder
# DocumentDic = FolderListToDic(DocList) # storing document content in a dictionary

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data\\en\\directives_txt'

In [16]:
# Run this cell for only single document
# Start from the working directory ...
temp_path = os.getcwd()

# ... and swap its "src\main" part for the location of the one directive to tag.
temp_path = temp_path.replace("src\\main", "data\\en\\directives_txt\\Directive_(EU)_2016_343_en.txt")
DocList = [temp_path] # one-element list holding the path of the wanted txt file
DocumentDic = FolderListToDic(DocList) # {path: file content}

importing C:\Users\dnaen\PycharmProjects\bachelor_thesis_23\data\en\directives_txt\Directive_(EU)_2016_343_en.txt ...


#### 2.2 Creation of tagged doc with the use of dictionary

In [17]:
# This cell uses NLTK

# tagging by researching concept-regexed as a substring of the text
stemmer_en = SnowballStemmer("english")

for DocName in DocList:
    tagsList = []
    print('tagging', DocName, '...')
    print()
    text = DocumentDic[DocName]
    text = text.lower()
    taggedText = text  # tagging starts from the lower-cased document text

    # Every concept found in the document is rewritten as "[concept](id) ",
    # the notation get_tokens_with_entities() parses later on.
    for concept in ConceptList:

        if concept != "":  # IMPORTANT: an empty concept gives a regex that matches almost any short word
            # REGEX CREATION: regex of the concept, used to search the document
            regex = regex_from_term_nltk(concept, stemmer_en)

            # TAGGING
            # The search runs on the untouched text, the substitution on the
            # text that already carries the tags of earlier concepts.
            if re.search(regex, text) is not None:
                tagsList.append(concept)
                subRegex = r"[" + concept + r"]"
                subRegex += r"(" + EurovocReverseDic[concept] + r") "  # insert the identifier
                # A callable replacement is inserted verbatim, so a backslash
                # in a concept cannot be misread as an escape / group reference.
                taggedText = re.sub(regex, lambda _match: subRegex, taggedText)

    # Write the tagged text next to the source document; the context manager
    # closes the file even if writing fails.
    with open("%s_TAGGED.txt" % DocName, "w", encoding='utf8') as file:
        file.write(taggedText)


tagging C:\Users\dnaen\PycharmProjects\bachelor_thesis_23\data\en\directives_txt\Directive_(EU)_2016_343_en.txt ...



In [None]:
ConceptList[0:10]


In [84]:
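# NOTE(review): this cell is titled "uses HuggingFace", but the code below is the same NLTK / Snowball-stemmer tagging as the previous cell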

# tagging by researching concept-regexed as a substring of the text
stemmer_en = SnowballStemmer("english")

for DocName in DocList:
    tagsList = []
    print('tagging', DocName, '...')
    text = DocumentDic[DocName]
    text = text.lower()
    taggedText = text  # tagging starts from the lower-cased document text

    # Every concept found in the document is rewritten as "[concept](id) ",
    # the notation get_tokens_with_entities() parses later on.
    for concept in ConceptList:
        if concept != "":  # IMPORTANT: an empty concept gives a regex that matches almost any short word
            # REGEX CREATION: regex of the concept, used to search the document
            regex = regex_from_term_nltk(concept, stemmer_en)

            # TAGGING
            # The search runs on the untouched text, the substitution on the
            # text that already carries the tags of earlier concepts.
            if re.search(regex, text) is not None:
                tagsList.append(concept)
                subRegex = r"[" + concept + r"]"
                subRegex += r"(" + EurovocReverseDic[concept] + r") "  # insert the identifier
                # A callable replacement is inserted verbatim, so a backslash
                # in a concept cannot be misread as an escape / group reference.
                taggedText = re.sub(regex, lambda _match: subRegex, taggedText)

    # Write the tagged text next to the source document; the context manager
    # closes the file even if writing fails.
    with open("%s_TAGGED.txt" % DocName, "w", encoding='utf8') as file:
        file.write(taggedText)


tagging C:\Users\dnaen\PycharmProjects\bachelor_thesis_23\data\en\directives_txt\Directive_(EU)_2016_343_en.txt ...
AAMS countries
['AAMS', 'countries']
abandoned child
['abandoned', 'child']
abandoned land
['abandoned', 'land']
ABM Agreement
['ABM', 'Agreement']
abolition of customs duties
['abolition', 'of', 'customs', 'duties']
abortion
['abortion']
Abruzzi
['Abruzzi']
absenteeism
['absenteeism']
absolute majority
['absolute', 'majority']
abstentionism
['abstentionism']
abuse of power
['abuse', 'of', 'power']
academic freedom
['academic', 'freedom']
access to a profession
['access', 'to', 'a', 'profession']
access to Community information
['access', 'to', 'Community', 'information']
access to education
['access', 'to', 'education']
access to information
['access', 'to', 'information']
access to the courts
['access', 'to', 'the', 'courts']
accession criteria
['accession', 'criteria']
accession negotiations
['accession', 'negotiations']
accession to an agreement
['accession', 'to', 'a

KeyboardInterrupt: 

In [3]:
# checking
# Path of the tagged directive, derived from the working directory by
# replacing its "src\main" part.
temp_path = os.getcwd().replace(
    "src\\main",
    "data\\en\\directives_txt\\Directive_(EU)_2016_343_en.txt_TAGGED.txt",
)

# Read the whole tagged document and turn it into (token, label) pairs.
with open(temp_path, "r", encoding='utf8') as myfile:
    temp_text = myfile.read()

temp_text_entities = get_tokens_with_entities(temp_text)

In [5]:
temp_text_entities[0:20]

[('11.3.2016', 'O'),
 ('', 'O'),
 ('en', 'O'),
 ('', 'O'),
 ('Official', 'B-1533'),
 ('Journal', 'I-1533'),
 ('of', 'O'),
 ('the', 'O'),
 ('EURES', 'B-4054'),
 ('union', 'O'),
 ('', 'O'),
 ('l', 'O'),
 ('65/1', 'O'),
 ('', 'O'),
 ('i', 'O'),
 ('([legislation](1589)', 'O'),
 ('acts)', 'O'),
 ('', 'O'),
 ('directive', 'B-448'),
 ('directive', 'B-448')]

In [31]:
class NERDataMaker:
    """Turn ``[value](entity)``-annotated texts into token / label-id sequences.

    Attributes:
        unique_entities: sorted list of all label names found in the texts;
            ``"O"`` is given the empty sort key so it sorts first. The
            position of a label in this list is its numeric id.
        processed_texts: one list per input text, holding
            ``(token, label_id)`` pairs.
    """

    def __init__(self, texts):
        """Tokenize every text and encode its labels as integer ids.

        :param texts: iterable of annotated strings, each handed to
            ``get_tokens_with_entities``
        """
        temp_processed_texts = [get_tokens_with_entities(text) for text in texts]

        # dict.fromkeys keeps the first-seen order while de-duplicating with
        # constant-time membership tests (a list scan per token is quadratic).
        seen_entities = dict.fromkeys(
            ent for tokens_with_entities in temp_processed_texts for _, ent in tokens_with_entities
        )
        self.unique_entities = sorted(seen_entities, key=lambda ent: ent if ent != "O" else "")

        # Precompute label -> id once instead of calling list.index per token.
        entity_ids = {ent: position for position, ent in enumerate(self.unique_entities)}
        self.processed_texts = [
            [(t, entity_ids[ent]) for t, ent in tokens_with_entities]
            for tokens_with_entities in temp_processed_texts
        ]

    @property
    def id2label(self):
        """Mapping ``{label id: label name}``."""
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        """Mapping ``{label name: label id}``."""
        return {v: k for k, v in self.id2label.items()}

    def __len__(self):
        """Number of processed texts."""
        return len(self.processed_texts)

    # allows us to use the [] (indexer) operators. E.g., x[i] is roughly equivalent to type(x).__getitem__(x, i).
    def __getitem__(self, idx):
        """Return one example dict for an index, or a list of them for a slice.

        Each example has the keys ``"id"``, ``"ner_tags"`` (label ids) and
        ``"tokens"``. For a slice the ``"id"`` is the real position of the
        text, so open-ended slices such as ``dm[:2]`` work.
        """
        def _process_tokens_for_one_text(text_id, tokens_with_encoded_entities):
            return {
                "id": text_id,
                "ner_tags": [ent for _, ent in tokens_with_encoded_entities],
                "tokens": [t for t, _ in tokens_with_encoded_entities],
            }

        if isinstance(idx, slice):
            # slice.indices resolves missing / negative bounds and the step
            # against the real length; idx.start alone may be None.
            positions = range(*idx.indices(len(self.processed_texts)))
            return [_process_tokens_for_one_text(i, self.processed_texts[i]) for i in positions]
        return _process_tokens_for_one_text(idx, self.processed_texts[idx])

    def as_huggingface_dataset(self, tokenizer):
        """Build a tokenized ``datasets.Dataset`` from the processed texts.

        :param tokenizer: callable tokenizer; it is called with
            ``truncation=True, is_split_into_words=True`` and its result must
            offer ``word_ids(batch_index=...)``
        :return: the dataset after mapping ``tokenize_and_align_labels`` over
            it in batches, i.e. with an added ``"labels"`` column
        """
        from datasets import Dataset, Features, Value, ClassLabel, Sequence

        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None or word_idx == previous_word_idx:
                        # Special tokens and the non-first sub-tokens of a
                        # word get the placeholder label -100.
                        label_ids.append(-100)
                    else:
                        # Only the first sub-token of a word carries its label.
                        label_ids.append(label[word_idx])
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            # Unzip the (token, label_id) pairs into two parallel tuples.
            pt_tokens, pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

In [36]:
from transformers import AutoTokenizer
# Tokenizer of the checkpoint that is fine-tuned further below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Every line of the tagged document becomes one text (one example) of the data maker.
dm = NERDataMaker(temp_text.split("\n"))

#### Evaluation

In [64]:
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")

# Flatten the label names of all texts into a single list, text by text.
labels = [
    dm.id2label[tag_id]
    for text_index in range(len(dm))
    for tag_id in dm[text_index]["ner_tags"]
]

def compute_metrics(p):
    """Compute seqeval precision / recall / F1 / accuracy for one evaluation.

    :param p: pair ``(predictions, labels)``; the predictions are reduced with
        ``argmax`` over axis 2, and label positions equal to -100 are dropped
        from both sequences before scoring
    :return: dict with the keys ``precision``, ``recall``, ``f1``, ``accuracy``
    """
    scores, gold_rows = p
    predicted_rows = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_rows, gold_rows):
        # Keep only the positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([dm.id2label[pred] for pred, _ in kept])
        true_labels.append([dm.id2label[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Training

In [65]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

# Tokenizer and collator for token classification.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# The classification head is sized by the number of distinct labels found by
# the data maker, and the id <-> label mappings are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)

# Tokenized dataset with aligned "labels", built from all processed texts.
train_dataset = dm.as_huggingface_dataset(tokenizer=tokenizer)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Map:   0%|          | 0/702 [00:00<?, ? examples/s]

In [72]:
# parameters
# Hyper-parameters of the fine-tuning run; evaluation and checkpointing both
# happen once per epoch.
# NOTE(review): newer transformers releases may expect `eval_strategy` instead
# of `evaluation_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# NOTE(review): eval_dataset is the very same object as train_dataset, so the
# metrics reported by compute_metrics are measured on the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [73]:
# running
trainer.train()



Epoch,Training Loss,Validation Loss


TrainOutput(global_step=88, training_loss=0.5622047511014071, metrics={'train_runtime': 244.4003, 'train_samples_per_second': 5.745, 'train_steps_per_second': 0.36, 'total_flos': 12678801428544.0, 'train_loss': 0.5622047511014071, 'epoch': 2.0})

In [71]:
# Pipeline
from transformers import pipeline
# Wrap the fine-tuned model in an NER pipeline and run it on one sample sentence.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
pipe(text)

[]

### Trash

In [None]:
#################################################

# getting our data
# Path of eurovoc.tsv, derived from the working directory by replacing its
# "src\main" part with "src\main\eurovoc.tsv".
temp_path = os.getcwd()
temp_path = temp_path.replace("src\\main", "src\\main\\eurovoc.tsv")
data = pd.read_csv(temp_path , sep='\t')

# NOTE(review): a pandas DataFrame is passed straight to flair's Corpus here;
# confirm that Corpus accepts this input before relying on this cell.
corpus_functions = Corpus(data)
tag_type = 'ner'
tag_dictionary = corpus_functions.make_label_dictionary(label_type = tag_type, train = True)

#################################################

import PyPDF2

#create file object variable
#opening method will be rb
# Open the PDF in binary mode; the context manager closes it afterwards.
with open('1.pdf', 'rb') as pdffileobj:
    # reader object working on the opened PDF
    pdfreader = PyPDF2.PdfFileReader(pdffileobj)

    # number of pages of this pdf file
    x = pdfreader.numPages

    # Pages are addressed from 0 up to x - 1, so the former getPage(x + 1) was
    # always out of range. Collect the text of every page instead.
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # older PyPDF2 names used by this file -- confirm the installed version
    # still provides them.
    text = "".join(pdfreader.getPage(page_number).extractText() for page_number in range(x))

# Append the extracted text to a txt file.
# The path is a raw string (r"...") pointing at the target file; adapt it to
# the local machine before running.
with open(r"C:\Users\SIDDHI\AppData\Local\Programs\Python\Python38\\1.txt", "a") as file1:
    file1.write(text)

#################################################



# RE

In [None]:
from textblob import TextBlob
import os

temp_path = os.getcwd()
# NOTE(review): plain string concatenation without a path separator -- the
# result is "<cwd>src\main\eurovoc.tsv"; confirm this is the intended path.
custom_dict_path = temp_path + "src\\main\\eurovoc.tsv"

text = 'Barack Obama was born in Hawaii.'
# NOTE(review): `relations_path` and `noun_phrases.triples()` could not be
# confirmed as part of TextBlob's API -- verify before relying on this cell.
tb = TextBlob(text, relations_path=custom_dict_path)
triples = tb.noun_phrases.triples()
triples

In [None]:

# !pip install git+https://github.com/26hzhang/ClausIE.git

In [None]:

!pip install stanford_openie

In [None]:

from openie import StanfordOpenIE

# https://stanfordnlp.github.io/CoreNLP/openie.html#api
# Default value of openie.affinity_probability_cap was 1/3.
# Extra CoreNLP properties handed to the client.
properties = {
    'openie.affinity_probability_cap': 2 / 3,
}

# Annotate one sample text and print every triple it yields.
with StanfordOpenIE(properties=properties) as client:
    print("a")  # presumably a progress marker left from debugging
    text = 'Barack Obama was born in Hawaii. Richard Manning wrote this sentence.'
    print('Text: %s.' % text)
    print("b")  # presumably a progress marker left from debugging
    for triple in client.annotate(text):
        print('|-', triple)

In [None]:
import os

print(os.getenv('CORENLP_HOME'))  #prints None
print(os.environ['CORENLP_HOME'])  #Throws a KeyError exception

In [None]:

!pip install stanza

In [None]:

import stanza

stanza.download('en')  # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline('en')  # This sets up a default neural pipeline in English
doc = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
doc.sentences[0].print_dependencies()

In [None]:

!python -m pip install git+https://github.com/mmxgn/spacy-clausie.git

In [None]:

import spacy
import claucy

# Load the small English spaCy model and register the claucy component on it.
nlp = spacy.load("en_core_web_sm")
claucy.add_to_pipe(nlp)

text = "Bell, a telecommunication company, which is based in Los Angeles, makes and distributes electronic, computer and building products."
doc = nlp(text)
# Propositions of the first detected clause, requested with as_text=False.
propositions = doc._.clauses[0].to_propositions(as_text=False)

print(propositions)


In [None]:

from nltk.parse import corenlp

# NOTE(review): `corenlp` is imported from nltk.parse above; confirm that this
# module really provides a `CoreNLPClient` with this interface.
# `text` is whatever an earlier cell left in that global variable.
with corenlp.CoreNLPClient(annotators="tokenize ssplit lemma pos ner depparse natlog openie".split()) as client:
    ann = client.annotate(text)
# OpenIE triples of the first sentence (this variable is not used afterwards).
sentence = ann.sentence[0].openieTriple
# Print the OpenIE triples of every sentence.
for x in ann.sentence:
    print(x.openieTriple)


In [None]:
import spacy
import claucy

# NOTE(review): the earlier claucy cell loads "en_core_web_sm"; confirm that the
# short name "en" is still resolvable with the installed spaCy version.
nlp = spacy.load("en")
claucy.add_to_pipe(nlp)

doc = nlp("AE died in Princeton in 1955.")

print(doc._.clauses)
# Output:
# <SV, AE, died, None, None, None, [in Princeton, in 1955]>

# Propositions of the first clause, this time rendered as text.
propositions = doc._.clauses[0].to_propositions(as_text=True)

print(propositions)
# Output:
# [AE died in Princeton in 1955, AE died in 1955, AE died in Princeton]

In [None]:

import os
import tempfile
from pathlib import Path
from subprocess import Popen
from sys import stderr
from typing import Optional
from zipfile import ZipFile

import wget


class StanfordOpenIEHM:
    """Context-manager wrapper around a CoreNLP client used for OpenIE.

    On construction it makes sure a CoreNLP directory exists below the install
    directory (downloading and/or unzipping an archive when none is found),
    points ``CORENLP_HOME`` at it and creates the client.
    """

    def __init__(
            self,
            core_nlp_version: str = '4.5.3',  # https://stanfordnlp.github.io/CoreNLP/history.html
            install_dir_path: Optional[str] = None,
            *args,
            **kwargs
    ):
        """
        :param core_nlp_version: version used in the download URL when neither
            a directory nor a ZIP archive is present in the install directory
        :param install_dir_path: install directory; when None, the environment
            variable OPENIE_INSTALL_PATH is used, falling back to
            ``~/.stanfordnlp_resources``
        :param args: extra positional arguments passed on to CoreNLPClient
        :param kwargs: extra keyword arguments passed on to CoreNLPClient
        """
        if install_dir_path is None:
            default_path = Path('~/.stanfordnlp_resources').expanduser()
            # Environment values are plain strings; wrap them in Path so the
            # mkdir / glob calls below work for both sources.
            self.install_dir = Path(os.environ.get("OPENIE_INSTALL_PATH", default_path))
        else:
            self.install_dir = Path(install_dir_path)
        self.install_dir.mkdir(exist_ok=True)
        if len([d for d in self.install_dir.glob('*') if d.is_dir()]) == 0:
            # No coreNLP directories. Let's check for ZIP archives as well.
            zip_files = [d for d in self.install_dir.glob('*') if d.suffix == '.zip']
            if len(zip_files) == 0:
                # No dir and no ZIP. Let's download it with the desired core_nlp_version.
                remote_url = 'https://nlp.stanford.edu/software/stanford-corenlp-{}.zip'.format(core_nlp_version)
                print('Downloading from %s.' % remote_url)
                output_filename = wget.download(remote_url, out=str(self.install_dir))
                print('\nExtracting to %s.' % self.install_dir)
            else:
                output_filename = zip_files[0]
            print('Unzip %s.' % output_filename)
            # The context manager closes the archive even if extraction fails.
            with ZipFile(output_filename) as zf:
                zf.extractall(path=self.install_dir)
        target_dir = [d for d in self.install_dir.glob('*') if d.is_dir()][0]

        # glob() results already start with install_dir; joining them onto
        # install_dir again duplicated the prefix for relative install dirs.
        os.environ['CORENLP_HOME'] = str(target_dir)
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=['openie'], memory='8G', *args, **kwargs)

    def annotate(self, text: str, properties_key: Optional[str] = None, properties: Optional[dict] = None,
                 simple_format: bool = True):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full format of CoreNLP or a simple dict.
        :return: Depending on simple_format: full or simpler format of triples <subject, relation, object>.
        """
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        annotators = 'tokenize,pos,lemma,depparse,natlog,ner,coref,openie'
        core_nlp_output = self.client.annotate(
            text=text, annotators=[annotators], output_format='json',
            properties_key=properties_key, properties=properties
        )
        if not simple_format:
            return core_nlp_output

        # Keep only subject / relation / object of every triple of every sentence.
        return [
            {
                'subject': triple['subject'],
                'relation': triple['relation'],
                'object': triple['object']
            }
            for sentence in core_nlp_output['sentences']
            for triple in sentence['openie']
        ]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing is released here; the client is stopped in __del__.
        pass

    def __del__(self):
        if hasattr(self, 'client'):
            self.client.stop()
        # pop() instead of del: the variable is absent when __init__ failed
        # before setting it or when something else removed it already.
        os.environ.pop('CORENLP_HOME', None)


In [None]:

# https://stanfordnlp.github.io/CoreNLP/openie.html#api
# Default value of openie.affinity_probability_cap was 1/3.
# Extra CoreNLP properties; StanfordOpenIEHM forwards unknown keyword
# arguments such as `properties` to the CoreNLP client it creates.
properties = {
    'openie.affinity_probability_cap': 2 / 3,
}

# Annotate one sample text and print every triple it yields.
with StanfordOpenIEHM(properties=properties) as client:
    print("a")  # presumably a progress marker left from debugging
    text = 'Barack Obama was born in Hawaii. Richard Manning wrote this sentence.'
    print('Text: %s.' % text)
    print("b")  # presumably a progress marker left from debugging
    for triple in client.annotate(text):
        print('|-', triple)


In [None]:
import requests

# Send one sentence as form data to a service expected on localhost:9000.
# NOTE(review): the "/api" endpoint and the 'sentence' field are taken as-is;
# confirm them against the server that is actually running.
text = "John has a blue car."
url = "http://localhost:9000/api"
data = {'sentence': text}

r = requests.post(url, data=data)
result = r.json()

# The JSON answer is expected to hold the triples under the key 'openie'.
for triple in result['openie']:
    print(triple['subject'], triple['relation'], triple['object'])

In [None]:

!python setup.py install


In [None]:

!pip install git+https://github.com/AnthonyMRios/pyclausie.git

In [None]:

from pyclausie_modified.pyclausie import ClausIE

# Obtain a ClausIE instance and extract triples from a one-sentence list.
cl = ClausIE.get_instance()
sents = ['I learned that the 2012 Sasquatch music festival is scheduled for May 25th until May 28.']
triples = cl.extract_triples(sents)

# Print every extracted triple.
for triple in triples:
    print(triple)




In [None]:
# functions
# -*- coding: utf-8 -*-

# English: en, German: de, French: fr, ... -> creates the tsv of given descriptor of any language
def create_tsv_of_language(given_language):
    """Write ``eurovoc_<language>.tsv`` from the EuroVoc descriptor XML file.

    The file ``desc_<language>.xml`` (downloaded from the EuroVoc website)
    must already be stored under ``data/<language>/descriptors/``. Its path is
    derived from the working directory by replacing the ``src\\main`` part.

    :param given_language: language code such as ``"en"``, ``"de"``, ``"fr"``;
        its upper-cased form names the second TSV column
    """
    label_column = given_language.upper()

    # Location of the descriptor file relative to the project root.
    descriptor_file = f"data\\{given_language}\\descriptors\\desc_{given_language}.xml"
    xml_path = os.getcwd().replace("src\\main", descriptor_file)
    root = Xet.parse(xml_path).getroot()

    # One row per direct child of the XML root: its id and its label text.
    rows = [
        {"ID": element.find("DESCRIPTEUR_ID").text, label_column: element.find("LIBELLE").text}
        for element in root
    ]

    # sep='\t' makes to_csv write a TSV file instead of a CSV file.
    df = pd.DataFrame(rows, columns=['ID', label_column])
    df.to_csv(f"eurovoc_{given_language}.tsv", sep='\t', index=False)


# this function assumes the text is annotated as [entity_value](entity_name) and assigns the prefixes B- and I- to entity tokens and the label O (the letter) to every other token
def get_tokens_with_entities(raw_text: str):
    """Split annotated text into tokens and attach a BIO label to each one.

    Annotations are expected in the form ``[entity_value](entity_name)``.

    :param raw_text: text that may contain ``[value](entity)`` annotations
    :return: list of ``(token, label)`` tuples. Unannotated tokens get the
        label ``"O"``; the first token of an annotated value gets
        ``"B-<entity>"`` and every following token ``"I-<entity>"``.

    Notes:
        * Each whitespace character is a separator, so consecutive whitespace
          yields empty-string tokens labelled ``"O"``.
        * The annotation must start at the beginning of a token. A token such
          as ``([legislation](1589)`` is therefore kept whole and labelled
          ``"O"``.
    """
    # Split on whitespace, except whitespace inside square brackets, so a
    # multi-word entity value stays in one raw token for now.
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # Pattern for our notation [entity_value](entity_name).
    # Flags: re.IGNORECASE and re.MULTILINE.
    entity_value_pattern_compiled = re.compile(
        r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)", flags=re.I | re.M
    )

    tokens_with_entities = []
    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)  # None when the token is not annotated
        if match is None:
            tokens_with_entities.append((raw_token, "O"))
            continue

        raw_entity_name = match.group("entity")
        # B- marks the beginning of an entity, I- a continuation of it.
        # Raw string: "\s" in a normal string literal is an invalid escape.
        for i, raw_entity_token in enumerate(re.split(r"\s", match.group("value"))):
            entity_prefix = "B" if i == 0 else "I"
            tokens_with_entities.append((raw_entity_token, f"{entity_prefix}-{raw_entity_name}"))

    return tokens_with_entities


# NLTK VERSION
def regex_from_term_nltk(term, stemmer):
    """
    Build a regex (as a string) used to search for a term in a text.

    The term is tokenized with ``nltk.word_tokenize`` and every token is lower-cased and stemmed
    via ``token_cleaning``. The resulting pattern has two groups: group 1 holds the stems in
    order (each stem may be followed by further word characters, the last one by at most five),
    group 2 holds the single non-word character that must follow the match.

    :param term: the (possibly multi-word) term to build the regex for
    :param stemmer: object with a ``stem(token)`` method, passed on to ``token_cleaning``
    :return: the regex as a string
    """
    # escape each stem so that regex metacharacters occurring in a term
    # (parentheses, "+", ".", ...) are matched literally instead of altering the pattern
    stems = [re.escape(token_cleaning(token, stemmer)) for token in nltk.word_tokenize(term)]

    # between two stems: the rest of the previous word, one non-word character,
    # then optional further word / non-word characters
    between_words = r'\w*\W\w*\W*'

    # opening: word boundary + start of group 1; closure: up to 5 trailing word characters,
    # end of group 1, then one non-word character as group 2
    return r"\b(" + between_words.join(stems) + r"\w{0,5})(\W)"


def token_cleaning(token, stemmer):  # TODO instead do lemmatization?
    """
    Normalize a single token: lower-case it, then stem it.

    :param token: the token to normalize
    :param stemmer: object providing a ``stem(token)`` method
    :return: whatever ``stemmer.stem`` returns for the lower-cased token
    """
    return stemmer.stem(token.lower())


# Functions for document processing were taken from @https://github.com/shashankmc/eurovoc_entity_link/blob/master/EurovocTagger.py and were modified
def tsv_dic_processing(path):
    """
    Read a 2-column ([ID], [EN]) TSV file into dictionaries and lists.

    The first row is treated as a header and skipped. Rows with fewer than two cells (e.g. blank
    lines) are skipped, so an incomplete row can never be paired with a value left over from a
    previous row. If a row has more than two cells, every cell after the first is appended to
    the word list and the last cell is used as the dictionary value.

    :param path: the name of the eurovoc.tsv file
    :return: Dic: Dictionary in style of {ID: Word}
    :return: RevDic: Dictionary in style of {Word: ID}
    :return: list1: list of IDs
    :return: list2: list of words (concepts)
    """
    id_to_word = {}
    word_to_id = {}
    ids = []
    words = []

    # newline='' leaves line-ending handling to the csv module, as its documentation recommends
    with open(path, 'rt', encoding='utf8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        next(reader, None)  # skip the header row (no-op on an empty file)

        for row in reader:
            if len(row) < 2:
                continue  # blank or incomplete row: nothing to pair the ID with

            key, value = row[0], row[-1]
            ids.append(key)
            words.extend(row[1:])
            id_to_word[key] = value
            word_to_id[value] = key

    return id_to_word, word_to_id, ids, words


def find_folder_with_type(given_path, doc_type):  # returns all documents found in path
    """
    List the entries of a directory whose name ends with the given suffix.

    The suffix is compared literally with ``str.endswith``, so it works with or without a leading
    dot and characters such as "." or "+" in it carry no special meaning.

    :param given_path: directory to look into
    :param doc_type: suffix the entry names must end with, e.g. ".txt"
    :return: list of matching entry names (names only, not joined with ``given_path``),
             in the order returned by ``os.listdir``
    """
    return [doc for doc in os.listdir(given_path) if doc.endswith(doc_type)]


def folder_list_to_dic(given_path, given_list):
    """
    Read the given files of a folder into a dictionary.

    The files are opened via their path joined with ``given_path``; the working directory of the
    process is left untouched, so a failing read cannot leave the process in another directory.

    :param given_path: the folder that contains the files
    :param given_list: list of file names contained in that folder
    :return: dic in the style of {file_name: file_text}
    """
    dic = {}

    for file_name in given_list:
        print('importing', file_name, '...')
        with open(os.path.join(given_path, file_name), "r", encoding='utf8') as my_file:
            dic[file_name] = my_file.read()

    return dic


# tagging by researching concept-regexed as a substring of the text (by using NLTK)
def tagging_document(path_of_tagged, given_doc_list, given_doc_dic, given_concept_list, given_eurovoc_reverse_dic):
    """
    Tag every given document with the given concepts and write the result into the tagged folder.

    For each document the lower-cased text is searched with the regex of every non-empty concept
    (see ``regex_from_term_nltk``). Every occurrence of a found concept is replaced by
    ``[concept](id) `` and the tagged text is written to ``<doc_name>_TAGGED.txt`` inside
    ``path_of_tagged``. Additionally, the first text span matched for a concept is added to the
    returned concept list when it is not one of the original concepts, thus expanding the
    vocabulary we have.

    Notes:
    - the span added to the list is the whole regex match, i.e. it includes the trailing
      non-word character that the regex requires after the term;
    - spans are only checked against the original concepts, so the same span can be added more
      than once (e.g. when it is found in several documents);
    - files are written via their path joined with ``path_of_tagged``; the working directory of
      the process is not changed.

    :param path_of_tagged: the location (dir) of the tagged folder
    :param given_doc_list: a list of names of the documents
    :param given_doc_dic: a dic that contains the contents of the document i.e. {doc_name: doc_text}
    :param given_concept_list: the original concept list downloaded from Eurovoc
    :param given_eurovoc_reverse_dic: maps each concept to its identifier, so {concept: id}
    :return: new_concept: this is the new expanded concept list
    """
    stemmer_en = SnowballStemmer("english")
    known_concepts = set(given_concept_list)  # constant-time "is this an original concept?" check
    new_concept = given_concept_list.copy()

    # REGEX CREATION: a concept's regex does not depend on the document, so build and compile
    # it once instead of once per document.
    # An empty concept would tag everything, so it is left out.
    concept_patterns = [
        (concept, re.compile(regex_from_term_nltk(concept, stemmer_en)))
        for concept in given_concept_list
        if concept != ""
    ]

    for doc_name in given_doc_list:
        print('tagging', doc_name, '...')
        text = given_doc_dic[doc_name].lower()
        tagged_text = text  # document's initial text

        # TAGGING: a concept is tagged as [concept](identifier)
        for concept, pattern in concept_patterns:
            # always search the untagged text so that already inserted tags cannot match
            found = pattern.search(text)
            if found is None:
                continue

            match_in_text = found.group()
            if match_in_text not in known_concepts:
                new_concept.append(match_in_text)

            replacement = "[" + concept + "](" + given_eurovoc_reverse_dic[concept] + ") "
            # a function replacement inserts the string literally; a plain string would have
            # its backslashes interpreted as escapes / group references by re.sub
            tagged_text = pattern.sub(lambda _match, literal=replacement: literal, tagged_text)

        # create a new file with the tagged text
        tagged_file_path = os.path.join(path_of_tagged, "%s_TAGGED.txt" % doc_name)
        with open(tagged_file_path, "w", encoding='utf8') as tagged_file:
            tagged_file.write(tagged_text)

    return new_concept