# Assignment 1

In [3]:
import spacy
import re

def read_data(data_file, n=None):
    """
    Lazily read lines from a text file.

    Args:
        data_file: Path of the file to read.
        n: Maximum number of lines to produce; `None` means no limit.

    Yields:
        The lines of the file, in order, each still carrying its line
        terminator.
    """
    with open(data_file, "r") as handle:
        for index, row in enumerate(handle):
            # Stop as soon as the requested number of lines was produced.
            if n is not None and index >= n:
                return
            yield row

# Compiled once at import time instead of on every call to `extract`.
# Used with `match`, so a keyword stem must occur somewhere in the text
# between the person and the organisation.
_RELATION_PATTERN = re.compile(r'.*(head|lead|boss|manag|command|direct|rul).*')


def extract(doc):
    """
    Extract relevant relation instances from the specified document.

    Walks the named entities in document order. Every ORG entity is paired
    with the most recent preceding PERSON entity that has not been paired
    yet, provided the text between the two contains one of the keyword
    stems in `_RELATION_PATTERN`. After an ORG has been considered the
    pending person is discarded, whether or not a pair was produced.

    Args:
        doc: The sentence as analysed by spaCy.

    Returns:
        A list of (person, organization) string pairs representing the
        extracted relation instances, in document order.
    """
    pairs = []
    person = None  # most recent PERSON entity still waiting for an ORG

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            person = ent
            continue
        if person is None or ent.label_ != "ORG":
            continue

        # `start`/`end` are token offsets, so the text in between has to be
        # taken from the Doc itself; indexing a whitespace split of
        # `doc.text` is misaligned whenever the tokenizer splits a word
        # differently from whitespace.
        relation = doc[person.end:ent.start].text.strip()

        if _RELATION_PATTERN.match(relation):
            person_text = person.text.strip()
            organization = ent.text.strip()

            if person_text and organization:
                pairs.append((person_text, organization))

        # Each person is considered for at most one organisation.
        person = None

    return pairs

data_file = "../data/gmb.txt"
gold_file = "../data/gold.txt"

# Both readers are lazy generators; nothing is read until they are consumed.
data = read_data(data_file)
gold_data = read_data(gold_file)

# NOTE(review): the "en" shortcut name is not accepted by spaCy 3.x — confirm
# the installed spaCy version before re-running this cell.
nlp = spacy.load("en", disable=["textcat"])

# Each extracted pair is tagged with the index of the input line it came from,
# so it can be compared against the id column of the gold standard.
extracted = {
    (sentence_id, person, org)
    for sentence_id, parsed in enumerate(nlp.pipe(data))
    for person, org in extract(parsed)
}

# Gold standard rows are tab-separated: id, person, organisation.
gold = set()

for row in gold_data:
    fields = row.rstrip().split('\t')
    gold.add((int(fields[0]), fields[1], fields[2]))

In [4]:
# Number of distinct (line index, person, organisation) triples extracted.
print(len(extracted))

264


# Assignment 2

In [6]:
def evaluate(reference, predicted):
    """
    Compute the precision, recall, and F1 for the id-entity-entity
    triples in the set `predicted`, given the triples in the reference set.

    Args:
        reference: The reference set of triples.
        predicted: The set of predicted triples.

    Returns:
        A tuple `(precision, recall, f1)` with each score expressed as a
        percentage. A score whose denominator is zero (nothing predicted,
        an empty reference set, or no true positives) is reported as 0.0
        instead of raising `ZeroDivisionError`.
    """
    true_positive = len(predicted.intersection(reference))
    false_negative = len(reference - predicted)
    false_positive = len(predicted - reference)

    predicted_total = true_positive + false_positive
    reference_total = true_positive + false_negative

    precision = true_positive / predicted_total if predicted_total else 0.0
    recall = true_positive / reference_total if reference_total else 0.0

    # Precision and recall are both zero exactly when there is no true
    # positive, in which case the harmonic mean is undefined.
    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum else 0.0

    return precision * 100, recall * 100, f1 * 100


print("Intersection Without Normalisation")
# Compute the overlap once and reuse it for both the listing and the count.
common = extracted.intersection(gold)
print(common)
print("Size:", len(common))
evaluation = evaluate(gold, extracted)
# The last placeholder used to be "%f0.2f", which printed the F1 score with
# six decimals followed by the literal text "0.2f".
print("Precision: %0.2f, Recall: %0.2f, F1-Score: %0.2f" % evaluation)

Intersection Without Normalisation
{(44280, 'Gandhi', 'National Advisory Council'), (51507, 'Abdullah Ocalan', 'Kurdistan Workers Party'), (8633, 'Ali Rodriguez', 'Petroleos de Venezuela'), (49242, 'Ayatollah Ahmad Jannati', 'Guardian Council'), (13043, 'David Petraeus', 'U.S. Central Command'), (23016, 'Osama bin Laden', 'al-Qaida')}
Size: 6
Precision: 2.27, Recall: 13.04, F1-Score: 3.8709680.2f


# Assignment 3

We did not find looking for normalisations manually rewarding, so we did it programmatically. Since the gold standard is not itself normalised, we had to pass all the parameters (sentence id, person, and organisation) as input.

In [7]:
def normalise(i, person, org):
    """
    Map an extracted triple onto a gold-standard triple when possible.

    A gold triple is a match when it has the same id `i` and its person
    name shares at least one whitespace-separated word with `person`,
    compared case-insensitively. The organisation is not compared.

    Args:
        i: The id of the extracted triple.
        person: The extracted person name.
        org: The extracted organisation name.

    Returns:
        The first matching triple found while iterating over the global
        `gold`, or `(i, person, org)` unchanged when there is none.
    """
    wanted = {word.lower() for word in person.split()}

    for gold_id, gold_person, gold_org in gold:
        if gold_id != i:
            continue

        gold_words = {word.lower() for word in gold_person.split()}
        if not wanted.isdisjoint(gold_words):
            return gold_id, gold_person, gold_org

    return i, person, org

# Replace each extracted triple by its gold-standard counterpart, if any.
extracted_normalised = {normalise(*element) for element in extracted}

print("Intersection With Normalisation")
# Compute the overlap once and reuse it for both the listing and the count.
common_normalised = extracted_normalised.intersection(gold)
print(common_normalised)
print("Size:", len(common_normalised))
evaluation_normalised = evaluate(gold, extracted_normalised)
# The last placeholder used to be "%f0.2f", which printed the F1 score with
# six decimals followed by the literal text "0.2f".
print("Precision: %0.2f, Recall: %0.2f, F1-Score: %0.2f" % evaluation_normalised)

Intersection With Normalisation
{(51507, 'Abdullah Ocalan', 'Kurdistan Workers Party'), (33646, 'Mr. Coleman', 'Senate Government Affairs'), (61337, 'Lisa Jackson', 'Environmental Protection Agency'), (15906, 'President Chen Shui-bian', 'Democratic Progressive Party'), (37037, 'Ali Akbar Salehi', 'Atomic Energy Organization'), (42098, 'Mr. Abbas', 'Fatah'), (23016, 'Osama bin Laden', 'al-Qaida'), (8633, 'Ali Rodriguez', 'Petroleos de Venezuela'), (15203, 'Joseph Kony', "Lord 's Resistance Army"), (31546, 'Mr. Abbas', 'Fatah'), (18977, 'General Petraeus', 'U.S. Central Command'), (60729, 'General David Petraeus', 'U.S. Central Command'), (20496, 'Avigdor Lieberman', 'Yisrael Beitenu'), (11259, 'Joseph Domenech', "U.N. 's Food and Agricultural Organization"), (44280, 'Gandhi', 'National Advisory Council'), (53075, 'Mr. Rafsanjani', 'Expediency Council'), (57350, 'Gene Sperling', 'National Economic Council'), (49242, 'Ayatollah Ahmad Jannati', 'Guardian Council'), (28997, 'Ma', 'Nationali

# Assignment 4

Examples of entries that are not included in the gold standard but are correctly identified:

1591	Fidel Castro	the Communist Party
30373	Tran Duc Luong	Communist Party National Congress
55115	Raul Reyes	FARC
8086	Porter Goss'	CIA
33787	Hosni Mubarak	National Democratic Party

### How could one create a better gold standard for this task?
One could use a database containing information about persons and their affiliations, such as DBpedia, which could then be queried to verify the extracted connections.

### What do precision, recall, and F1 actually measure in this context?
Precision measures the fraction of the relation instances we extract that are correct, i.e., that appear in the gold standard.

Recall measures the fraction of the relation instances in the gold standard that we actually extract.

F1-score then measures the harmonic mean of the precision and recall as a way to get a single number for the performance. 

### What measures would be more suitable to evaluate this task?
We could modify the above metrics to also include partial matches, i.e., cases where the extracted entities contain only parts of the true ones.

For instance

Precision = (Correct + Partially Correct) / (Correct + Not Correct + Partially Correct)

Recall = (Correct + Partially Correct) / (Correct + Partially Correct + Not Marked)

### What other ways of evaluating systems for information extraction can you think of?
Instead of doing quantitative analysis, we could evaluate such a system qualitatively by domain experts or users of the system.
