# SpaCy Notebook

### Imports

In [1]:
import pandas as pd
import spacy
import random
from spacy.training import Example
from spacy.tokens import DocBin
from utils import read_jsonl, continuous_string, label_distribution, train_test_split_stratify

### Load and Format Data

In [2]:
# Read individual files
eng_1_50 = read_jsonl('annotations/90min_annotations_1-50.jsonl', 1, 50)
eng_51_100 = read_jsonl('annotations/90min_annotations_51-100.jsonl', 51, 100)
eng_101_150 = read_jsonl('annotations/90min_annotations_101-150.jsonl', 101, 150)
eng_151_165 = read_jsonl('annotations/90min_annotations_151-165.jsonl', 151, 165)
dan_1_50 = read_jsonl('annotations/bold_annotations_1-50.jsonl', 1, 50)
dan_51_100 = read_jsonl('annotations/bold_annotations_51-100.jsonl', 51, 100)

# Collect files together
eng_full = eng_1_50 + eng_51_100 + eng_101_150 + eng_151_165
dan_full = dan_1_50 + dan_51_100

# Check formatting
eng_full

[{'id': 1,
  'text': 'Paris Saint-Germain have confirmed the signing of French winger Ousmane Dembele from Barcelona. Dembele had been in talks over extending his Barcelona contract before PSG voiced a desire to trigger the    50m release clause in the Frenchman\'s contract, and although that clause expired last month, Dembele\'s desire to seal the move saw Barcelona agree to the same terms. Having entered the final year of his contract at Camp Nou, Barcelona preferred to cash in on Dembele when it became clear he would leave for free next summer if he remained at the club. For Dembele, it\'s a return to Ligue 1 for the first time since he departed Rennes in 2016, with the 26-year-old inking a five-year contract at Parc des Princes. feed "I\'m delighted to be joining Paris Saint-Germain and can\'t wait to play for my new club," Dembele told club media. "I hope I can continue to grow here and make all the club\'s fans proud." President Nasser Al-Khelaifi added: "We are delighted to welc

In some cases, we would prefer if the text data is a single string instead of seperate articles. The `continuous_string` function build in the `utils.py` file allows for this.

In [3]:
eng_continuous, eng_gold = continuous_string(eng_full)
dan_continuous, dan_gold = continuous_string(dan_full)

We also want to check and save the distribution of labels in our annotations. The function `label_distribution` from `utils` handles this.

In [4]:
eng_gold_dist = label_distribution(eng_gold)
eng_gold_dist

{'ORG': 3794,
 'PER': 2605,
 'MON': 500,
 'LOC': 358,
 'DAT': 271,
 'PCT': 22,
 'TIM': 8}

In [5]:
dan_gold_dist = label_distribution(dan_gold)
dan_gold_dist

{'PER': 411, 'ORG': 661, 'DAT': 102, 'LOC': 83, 'TIM': 4, 'MON': 16, 'PCT': 1}

We can definitely say that the data set is not balanced. 

### Load spaCy

In [6]:
# Get models
spacy_eng = spacy.load('en_core_web_trf')
spacy_dan = spacy.load('da_core_news_trf')

In [7]:
# The gold label set is:
''' ['LOC', 'PER', 'ORG', 'TIM', 'MON', 'PCT', 'DAT'] '''

# Define spacy label sets
spacy_eng_labelset = spacy_eng.get_pipe('ner').labels
spacy_dan_labelset = spacy_dan.get_pipe('ner').labels

print('English Model Label Set:')
for label in spacy_eng_labelset:
    print(label, ':', spacy.explain(label))

print('\n')
print('Danish Model Label Set:')
for label in spacy_dan_labelset:
    print(label, ':', spacy.explain(label))

English Model Label Set:
CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


Danish Model Label Set:
LOC : Non-GPE locations, mountain ranges, bodies of water
MISC : Miscellaneous entities, e.g. events, nationalities, products or works of art
ORG : Co

We see that the label sets don't match exactly. We will map those that are relevant and remove the others from the model.

In [8]:
eng_label_map = {
    'DATE': 'DAT',
    'FAC': 'LOC',
    'GPE': 'LOC',
    'LOC': 'LOC',
    'MONEY': 'MON',
    'ORG': 'ORG',
    'PERCENT': 'PCT',
    'PERSON': 'PER',
    'TIME': 'TIM'
}

dan_label_map = {
    'LOC': 'LOC',
    'ORG': 'ORG',
    'PER': 'PER'
}

In [9]:
# Test that it works
example_sent = eng_full[4]
example_doc = spacy_eng(example_sent['text'])

for ent in example_doc.ents:
    print(ent.start_char, ent.end_char, ent.text, eng_label_map.get(ent.label_, 'O'))

0 16 Saudi Pro League ORG
38 53 Michael Emenalo PER
131 149 the next two years DAT
223 230 Chelsea ORG
251 255 July DAT
274 290 the Pro League's ORG
304 309 £400m MON
375 388 Karim Benzema PER
390 406 Jordan Henderson PER
408 420 Riyad Mahrez PER
426 441 Roberto Firmino PER
450 463 Saudi Arabian O
470 481 this summer DAT
528 540 Saudi Arabia LOC
552 559 Benzema PER
561 573 N'Golo Kante PER
578 595 Cristiano Ronaldo PER
660 662 26 DAT
664 675 Ruben Neves PER
679 682 one O
767 782 Richard Masters PER
803 821 the Premier League ORG
874 888 Saudi Arabia's LOC
921 928 Emenalo PER
1060 1073 Kylian Mbappe PER
1101 1111 Harry Kane PER
1179 1185 Mbappe PER
1233 1236 259 MON
1236 1237 m MON
1247 1255 Al Hilal ORG
1277 1296 Paris Saint-Germain ORG
1368 1385 a couple of years DAT
1566 1573 Emenalo PER
1659 1666 English O
1668 1682 Premier League ORG
1728 1735 Emenalo PER
1752 1774 the Saudi Pro League's ORG
1799 1817 the Premier League ORG
1841 1849 LIV Golf ORG
1854 1866 the PGA Tour ORG
1920 192

In [10]:
# Compare to gold
for start_idx, end_idx, label in example_sent['label']:
    print(start_idx, end_idx, example_sent['text'][start_idx:end_idx], label)

0 16 Saudi Pro League ORG
38 53 Michael Emenalo PER
223 230 Chelsea ORG
251 255 July DAT
278 290 Pro League's ORG
304 309 £400m MON
375 388 Karim Benzema PER
390 406 Jordan Henderson PER
408 420 Riyad Mahrez PER
426 441 Roberto Firmino PER
528 539 Saudi Arabi LOC
552 559 Benzema PER
561 573 N'Golo Kante PER
578 595 Cristiano Ronaldo PER
664 675 Ruben Neves PER
767 782 Richard Masters PER
807 821 Premier League ORG
874 888 Saudi Arabia's LOC
921 928 Emenalo PER
1060 1073 Kylian Mbappe PER
1101 1111 Harry Kane PER
1179 1185 Mbappe PER
1232 1237 £259m MON
1247 1255 Al Hilal ORG
1277 1296 Paris Saint-Germain ORG
1566 1573 Emenalo PER
1668 1682 Premier League ORG
1728 1735 Emenalo PER
1756 1774 Saudi Pro League's ORG
1803 1817 Premier League ORG
1841 1849 LIV Golf ORG
1858 1866 PGA Tour ORG
1920 1926 Europe LOC
2031 2045 Premier League ORG
2232 2246 [Jurgen] Klopp PER
2256 2271 [Pep] Guardiola PER
2378 2381 Pep PER
2443 2455 Saudi Arabia LOC


### Test Model Metrics

We want to be able to test the performance of the models. This could in many cases be solved easily by the [`spacy.scorer.Scorer`](https://spacy.io/api/scorer) function. However, for our purposes, we have some contraints that would make this impractical, mainly that our gold labelset does not exactly match the spaCy models', and we map some of the model labels to different labels in our own annotations. It seems more reasonable to handle this in metric computations instead of in post-processing, so we build our own metric functions.

We first initiate spaCy docs of our data:

In [9]:
# Create spaCy doc of full english data set
eng_doc = spacy_eng(eng_continuous)

In [10]:
# Create spaCy doc of full danish data set, we try both the english and danish models
dan_doc_dan_model = spacy_dan(dan_continuous)
dan_doc_eng_model = spacy_eng(dan_continuous)

Precision, recall, and F1-score are commonly used metrics in NER evaluation, thus, we create functions use those:

In [31]:
# We start by creating a generic matching function which will return True if the given doc span matches the given label
def check_correct(span, gold_label, mapping):
    
    # Check that there are actually tokens in the span
    if span is not None:

        # Check if the model found any entities in the span
        # There is a possibility that the model split a single entity into two in the same span, this case will not be counted as correct
        if len(span.ents) == 1:

            # Check if label is correct
            model_label = span.ents[0].label_ 
            if mapping.get(model_label, None) == gold_label:

                return True
            
    # If a single condition fails, return false
    return False

In [32]:
### Precision: TP / (TP + FP)
def spacy_precision(doc: spacy.tokens.doc.Doc, gold_labels: list, mapping: dict):

    # Count total predictions that fall within the gold labelset, i.e., ignore predictions of irrelevant tags
    total = {tag: 0 for tag in mapping.values()}
    for ent in doc.ents:
        if ent.label_ in mapping.keys():
            total[mapping[ent.label_]] += 1

    # Store correct predictions
    correct = {tag: 0 for tag in mapping.values()}

    # Iterate over gold labels
    for start_idx, end_idx, gold_label in gold_labels:

        # Only consider relevant labels in gold data
        if gold_label in mapping.values():

            # Define span of tokens for given indices
            token_span = doc.char_span(start_idx, end_idx)

            # Increment correct counter if model label matches gold label
            correct[gold_label] += check_correct(token_span, gold_label, mapping)

    # Compute precision values
    precision = {tag: correct[tag] / total[tag] for tag in mapping.values() if total[tag] != 0}
    return precision

In [33]:
### Recall: TP / (TP + FN)
def spacy_recall(doc: spacy.tokens.doc.Doc, gold_labels: list, mapping: dict):
    # Store correct and total counts
    correct = {tag: 0 for tag in mapping.values()} # Amount of correct predictions
    total = {tag: 0 for tag in mapping.values()} # Amount of total labels in gold data

    # Iterate over gold labels
    for start_idx, end_idx, gold_label in gold_labels:

        # Only consider relevant labels in gold data
        if gold_label in mapping.values():

            # Define span of tokens for given indices
            token_span = doc.char_span(start_idx, end_idx)

            # Increment correct counter if model label matches gold label
            correct[gold_label] += check_correct(token_span, gold_label, mapping)

            # Increment total counter
            total[gold_label] += 1

    # Compute recall values
    recall = {tag: correct[tag] / total[tag] for tag in mapping.values() if total[tag] != 0}
    return recall

In [34]:
### F1-score: 2 * (precision * recall) / (precision + recall)
def spacy_f1(precision_scores, recall_scores):

    # Define harmonic mean function
    def harmonic_mean(a, b):
        if a + b == 0:
            return 0 # Return 0 if sum of precision and recall is 0
        return 2 * (a * b) / (a + b)
    
    # Use harmonic mean function to compute F1-score
    f1 = {tag: harmonic_mean(precision_scores[tag], recall_scores[tag]) for tag in precision_scores.keys()}
    return f1

We create a function which will nicely output the metric results as a matrix:

In [35]:
# Create function to output a metric matrix
def metrics_matrix(doc: spacy.tokens.doc.Doc, gold_labels: list, mapping: dict):

    # Compute metrics
    precision = spacy_precision(doc, gold_labels, mapping)
    recall = spacy_recall(doc, gold_labels, mapping)
    f1 = spacy_f1(precision, recall)

    # Convert dictionaries to Pandas Series
    precision_series = pd.Series(precision, name = 'Precision')
    recall_series = pd.Series(recall, name = 'Recall')
    f1_series = pd.Series(f1, name = 'F1-Score')

    # Concatenate Series into a DataFrame
    result_df = pd.concat([precision_series, recall_series, f1_series], axis = 1)
    return result_df

English data metrics with English language model:

In [16]:
eng_metrics = metrics_matrix(eng_doc, eng_gold, eng_label_map)
print(eng_metrics)

     Precision    Recall  F1-Score
DAT   0.153592  0.686347  0.251012
LOC   0.643182  0.790503  0.709273
MON   0.274363  0.366000  0.313625
ORG   0.940548  0.858988  0.897920
PCT   0.562500  0.409091  0.473684
PER   0.967108  0.981958  0.974476
TIM   0.032967  0.375000  0.060606


Danish data metrics with Danish language model:

In [17]:
dan_metrics_dan_model = metrics_matrix(dan_doc_dan_model, dan_gold, dan_label_map)
print(dan_metrics_dan_model)

     Precision    Recall  F1-Score
LOC   0.365482  0.867470  0.514286
ORG   0.838362  0.588502  0.691556
PER   0.909091  0.924574  0.916767


Danish data metrics with English language model:

In [18]:
dan_metrics_eng_model = metrics_matrix(dan_doc_eng_model, dan_gold, eng_label_map)
print(dan_metrics_eng_model)

     Precision    Recall  F1-Score
DAT   0.527132  0.666667  0.588745
LOC   0.262948  0.795181  0.395210
MON   0.714286  0.625000  0.666667
ORG   0.759369  0.582451  0.659247
PER   0.735537  0.866180  0.795531
TIM   0.222222  0.500000  0.307692
PCT        NaN  0.000000       NaN


### Fine-Tuning a Model

We want to see if we can fine-tune the English language model with the English data to perform better.

The main question here is whether we have enough training data. The full english annotated dataset consists of 165 articles, with around 7500 annotations. We'll see..

We first split the dataset into training and test data. We could attempt to stratisfy the sets, such that the distribution of labels is equal for the two sets. However, this seems somewhat unfeasable without having to use vectors and clustering, since the data is separated into articles, each having multiple annotations. That is not the purpose of this challenge. Additionally, we could hope that the article structure itself maintains some balance in the labels within each article resembling the global distribution.

In [25]:
# Define split variables
test_size = 0.2
split_idx = int(len(eng_full) * (1 - test_size))

eng_train = eng_full[:split_idx]
eng_test = eng_full[split_idx:]
eng_train

[{'id': 1,
  'text': 'Paris Saint-Germain have confirmed the signing of French winger Ousmane Dembele from Barcelona. Dembele had been in talks over extending his Barcelona contract before PSG voiced a desire to trigger the    50m release clause in the Frenchman\'s contract, and although that clause expired last month, Dembele\'s desire to seal the move saw Barcelona agree to the same terms. Having entered the final year of his contract at Camp Nou, Barcelona preferred to cash in on Dembele when it became clear he would leave for free next summer if he remained at the club. For Dembele, it\'s a return to Ligue 1 for the first time since he departed Rennes in 2016, with the 26-year-old inking a five-year contract at Parc des Princes. feed "I\'m delighted to be joining Paris Saint-Germain and can\'t wait to play for my new club," Dembele told club media. "I hope I can continue to grow here and make all the club\'s fans proud." President Nasser Al-Khelaifi added: "We are delighted to welc

Let's check that the label distribution is not completely skewed between the two sets.

In [20]:
label_distribution(continuous_string(eng_train)[1])

{'ORG': 3071,
 'PER': 2068,
 'MON': 409,
 'LOC': 284,
 'DAT': 215,
 'PCT': 20,
 'TIM': 7}

In [22]:
label_distribution(continuous_string(eng_test)[1])

{'PER': 537, 'ORG': 723, 'LOC': 74, 'MON': 91, 'DAT': 56, 'TIM': 1, 'PCT': 2}

It seems reasonable.

We will also format the training data for spaCy training, and split the test data into text and annotations. 

In [27]:
# Training data, format to use for spaCy training
eng_train_spacy_format = []
for article in eng_train:
    # Create list of entity labels
    entities = [(start_idx, end_idx, label) for start_idx, end_idx, label in article['label']]
    # Append tuple of text and annotations
    eng_train_spacy_format.append((article['text'], entities))

# Test data
eng_test_text, eng_test_gold = continuous_string(eng_full[split_idx:])

In [29]:
# Initialize a pre-trained model
# We use a small model here, as it will be less prone to overfitting when our training data set is relatively small
spacy_sm_untuned = spacy.load('en_core_web_sm')

In [36]:
# We'll first check it's performance on the test data before tuning
eng_doc_sm_untuned = spacy_sm_untuned(eng_test_text)
eng_sm_untuned_metrics = metrics_matrix(eng_doc_sm_untuned, eng_test_gold, eng_label_map)
print(eng_sm_untuned_metrics)

     Precision    Recall  F1-Score
DAT   0.125984  0.571429  0.206452
LOC   0.191919  0.770270  0.307278
MON   0.802817  0.626374  0.703704
ORG   0.655882  0.308437  0.419567
PCT   0.000000  0.000000  0.000000
PER   0.771226  0.608939  0.680541
TIM   0.100000  1.000000  0.181818


We see that it performs significantly worse than the large transformer model. Not surprising. We also notice that all metrics are 0 for the 'PCT' tag, as there are no instances of that tag in the test data. For now, we will ignore this, as the tag generally has very little representation in the gold data.

Next, we try to fine-tune the model with 10 epochs:

In [47]:
# Refine model again for tuning
spacy_sm_tuned = spacy.load('en_core_web_sm')

# # Disable other pipeline components during training to improve speed
# other_pipes = [pipe for pipe in spacy_sm_tuned.pipe_names if pipe != 'ner']

# # Create DocBin to save docs to disk
# with spacy_sm_tuned.disable_pipes(*other_pipes):

db = DocBin() # Define DocBin

# Iterate over articles in training data
for text, labels in eng_train_spacy_format:
    
    doc = spacy_sm_tuned.make_doc(text) # Create doc for current article
    ents = [] # Define list to store entities

    # Iterate over each annotation for current article
    for start_idx, end_idx, label in labels:
        
        # Define character span and append to entity list if valid
        span = doc.char_span(start_idx, end_idx, label = label) 
        if span is not None:
            ents.append(span)

    doc.ents = ents # Add entitites to doc
    db.add(doc) # Add doc to DocBin

# Format examples
train_examples = [
    Example.from_dict(
        doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}
    ) for doc in db.get_docs(spacy_sm_tuned.vocab)
]

# Begin training
spacy_sm_tuned.begin_training()
for epoch in range(100):
    random.shuffle(train_examples)
    for example in train_examples:
        spacy_sm_tuned.update([example])

# Save the fine-tuned model
spacy_sm_tuned.to_disk('./spacy_sm_tuned')

Let's see if the performance has been improved.

In [46]:
eng_doc_sm_tuned = spacy_sm_tuned(eng_test_text)
eng_sm_tuned_metrics = metrics_matrix(eng_doc_sm_tuned, eng_test_gold, eng_label_map)
print(eng_sm_tuned_metrics)

  matches = self.matcher(doc, allow_missing=True, as_spans=False)


     Precision    Recall  F1-Score
LOC   0.544304  0.581081  0.562092
ORG   0.743802  0.746888  0.745342
DAT        NaN  0.000000       NaN
MON        NaN  0.000000       NaN
PCT        NaN  0.000000       NaN
PER        NaN  0.000000       NaN
TIM        NaN  0.000000       NaN
