# Machine Learning with Limited Data (2023): Exam Submission

Made by Anders Giovanni Møller, PhD at IT University of Copenhagen.

Email: agmo@itu.dk

In [1]:
# Imports
%load_ext autoreload
%autoreload 2

import re
import os

import pandas as pd

import nltk
import spacy

from textblob import TextBlob
from textblob.taggers import PatternTagger

import skweak

from scripts.skweak_ner_eval import evaluate
from scripts.utils import load_data_split, get_frequent_words, tag_all, penntreebank2universal, compute_recall, compute_num_conflicts

pd.set_option('display.max_rows', 500)

# Task

For this specific task, I'll be training a POS-tagger in Danish, using the Danish UD dataset. The data has been extracted and put into the `corpus/` folder.

For labels, I will make labelling functions for determiners, numerals, proper nouns, adjectives, and nouns.

In [2]:
all_labels = ["DET", "NUM", "PROPN", "ADJ", "NOUN"]

# Load training data
train_docs = load_data_split("train", "UD_Danish-DDT", all_labels)

# Display first 3 docs
for doc in train_docs[:3]:
    skweak.utils.display_entities(doc)

# Labelling Functions

### Determiners LFs

I found a list of Danish determiners and put them into a json file.

In [3]:
tries = skweak.gazetteers.extract_json_data("det.json")
det_lf = skweak.gazetteers.GazetteerAnnotator("determiners", tries, case_sensitive=False)

Extracting data from det.json
Populating trie for class DET (number: 21)


###  Numerals LFs

In [4]:
# Use a regular expression pattern to look for digits
def num_detector(doc):
    for token in doc:
        if re.search("\d+", token.text):
            yield token.i, token.i + 1, "NUM"

# Check if the token is the word of a number from 1 to 10
def num_word_detector(doc):
    for token in doc:
        if token.text.lower() in ["én", "to", "tre", "fire", "fem", "seks", "syv", "otte", "ni", "ti"]:
            yield token.i, token.i + 1, "NUM"

num_lf1 = skweak.heuristics.FunctionAnnotator("numerals1", num_detector)
num_lf2 = skweak.heuristics.FunctionAnnotator("numerals2", num_word_detector)


### Adjective LFs

In [5]:
# Look for words that end with a suffix typical for adjectives
def adj_detector_suffixes_danish(doc):
    danish_adj_suffixes = ("-lig", "-sk", "-bar", "-fuld", "-løs", "-vis", "-en", "-som", "-agtig")
    for token in doc:
        if len(token.text) > 3 and token.text.endswith(danish_adj_suffixes):
            yield token.i, token.i + 1, "ADJ"

# Look for words that start with a prefix typical for adjectives
def adj_detector_prefixes_danish(doc):
    danish_adj_prefixes = ("u", "anti", "over", "under", "mis", "efter", "gen")
    for token in doc:
        if len(token.text) > 3 and token.text.lower().startswith(danish_adj_prefixes):
            yield token.i, token.i + 1, "ADJ"

# Look for words that are preceded by a form of the verb "to be"
def adj_detector_synt1_danish(doc):
    weak_labels = ["O"] * len(doc)
    for span in doc.spans["determiners"]:
        weak_labels[span.start] = span.label_

    for token in doc[1:]:
        if not token.is_punct:
            prev = doc[token.i - 1].text.lower()
            if prev in ["er", "var", "være", "bliver", "blev"] and (
                    not token.text.endswith("ende")) and weak_labels[token.i] == "O":
                yield token.i, token.i + 1, "ADJ"

# Look for words that are preceded by a determiner or a number
def adj_detector_synt2_danish(doc):
    weak_labels = ["O"] * len(doc)

    for span in doc.spans["determiners"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["numerals1"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["numerals2"]:
        weak_labels[span.start] = span.label_

    for token in doc[1:]:
        if not token.is_punct:
            if weak_labels[token.i - 1] != "O":
                yield token.i, token.i + 1, "ADJ"


adj_lf1 = skweak.heuristics.FunctionAnnotator("adjs1", adj_detector_suffixes_danish)
adj_lf2 = skweak.heuristics.FunctionAnnotator("adjs2", adj_detector_prefixes_danish)
adj_lf3 = skweak.heuristics.FunctionAnnotator("adjs3", adj_detector_synt1_danish)
adj_lf4 = skweak.heuristics.FunctionAnnotator("adjs4", adj_detector_synt2_danish)

### Proper nouns LFs

In [6]:
# Check if the fist letter of a word or the whole word is capitalized
def propn_detector(doc):
    for token in doc:
        if token.i == 0:
            # For the first word of a sentence, check if all letters are capitalized
            if token.text.isupper():
                yield token.i, token.i + 1, "PROPN"
        else:
            if token.text.isupper() or token.text[0].isupper():
                yield token.i, token.i + 1, "PROPN"


propn_lf = skweak.heuristics.FunctionAnnotator("proper_nouns", propn_detector)

### Noun LFs

In [7]:
# Look for words that end with a suffix typical for nouns
def noun_detector_suffixes_danish(doc):
    danish_noun_suffixes = (
        "-hed", "-dom", "-else", "-ing", "-ion",
        "-skab", "-eri", "-itet", "-ance", "-ens",
        "-age", "-ør", "-ist"
    )
    for token in doc:
        if len(token.text) > 3 and token.text.endswith(danish_noun_suffixes):
            yield token.i, token.i + 1, "NOUN"


# Look for words that start with a prefix typical for nouns
def noun_detector_prefixes_danish(doc):
    danish_prefixes = (
        "over", "under", "mis", "gen", "efter", "for", "u", "be", "til", "op", "af", "an", "ud", "fore", "om"
    )
    for token in doc:
        if len(token.text) > 3 and token.text.lower().startswith(danish_prefixes):
            yield token.i, token.i + 1, "NOUN"

# # If the previous word is labeld as DET, NUM or ADJ, then the current word is an noun
def noun_detector_synt(doc):
    weak_labels = ["O"] * len(doc)

    for span in doc.spans["determiners"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["numerals1"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["numerals2"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["adjs1"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["adjs2"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["adjs3"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["adjs4"]:
        weak_labels[span.start] = span.label_

    for token in doc[1:]:
        if not token.is_punct:
            if weak_labels[token.i - 1] != "O":
                yield token.i, token.i + 1, "NOUN"

noun_lf1 = skweak.heuristics.FunctionAnnotator("nouns1", noun_detector_suffixes_danish)
noun_lf2 = skweak.heuristics.FunctionAnnotator("nouns2", noun_detector_prefixes_danish)
noun_lf3 = skweak.heuristics.FunctionAnnotator("nouns3", noun_detector_synt)

# Apply LFs

In [8]:
# Put all LFs in a list
lfs = [
    det_lf, 
    num_lf1, num_lf2, propn_lf,
    adj_lf1, adj_lf2, adj_lf3, adj_lf4,
    noun_lf1, noun_lf2, noun_lf3
]


train_docs = tag_all(train_docs, lfs)

In [9]:
# Print some of the assigned weak labels
for doc in train_docs[:3]:
    skweak.utils.display_entities(doc, ["determiners", "nouns1", "nouns2", "nouns3", "adjs1", "adjs2", "adjs3", "adjs4", "proper_nouns", "numerals1", "numerals2"])

### Train HMM and Majority

In [10]:
# HMM
hmm = skweak.aggregation.HMM("hmm", all_labels)
hmm.fit(train_docs)

# Majority voting
mv = skweak.aggregation.MajorityVoter("mv", all_labels)

# Apply LFs, HMM and MV to the test docs
test_docs = load_data_split("test", "UD_Danish-DDT", all_labels)
test_docs = tag_all(test_docs, lfs + [mv, hmm])

Starting iteration 1
Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Finished E-step with 4080 documents
Starting iteration 2


         1 -114027.17492093             +nan


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Finished E-step with 4080 documents
Starting iteration 3


         2 -104557.26234337   +9469.91257756


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Finished E-step with 4080 documents
Starting iteration 4


         3 -100172.26492503   +4384.99741834


Number of processed documents: 1000
Number of processed documents: 2000
Number of processed documents: 3000
Number of processed documents: 4000
Finished E-step with 4080 documents


         4  -96757.66263620   +3414.60228883


In [11]:
df = evaluate(test_docs, all_labels, [
    "determiners", "proper_nouns", "nouns1", "nouns2", "nouns3", "adjs1", "adjs2", "adjs3", "adjs4", "numerals1", "numerals2"
])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,17.4 %,adjs1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,17.4 %,adjs2,0.085,0.033,0.048,,,,0.085,0.033,0.048
ADJ,17.4 %,adjs3,0.188,0.066,0.098,,,,0.188,0.066,0.098
ADJ,17.4 %,adjs4,0.274,0.337,0.302,,,,0.274,0.337,0.302
ADJ,17.4 %,determiners,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,17.4 %,nouns1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,17.4 %,nouns2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,17.4 %,nouns3,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,17.4 %,numerals1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,17.4 %,numerals2,0.0,0.0,0.0,,,,0.0,0.0,0.0


### Evaluation

Generally, we observe that the determiners, numerals, and proper nouns labelling functions work well, achieving relatively high F1 scores. Nouns and adjectives are more difficult POS tags.

### Aggregator Performance

In [12]:
df = evaluate(test_docs, all_labels, ["mv", "hmm"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,17.4 %,hmm,0.28,0.342,0.308,,,,0.28,0.342,0.308
ADJ,17.4 %,mv,0.193,0.042,0.068,,,,0.193,0.042,0.068
DET,13.6 %,hmm,0.603,0.78,0.68,,,,0.603,0.78,0.68
DET,13.6 %,mv,0.58,0.673,0.624,,,,0.58,0.673,0.624
NOUN,49.8 %,hmm,0.376,0.154,0.218,,,,0.376,0.154,0.218
NOUN,49.8 %,mv,0.365,0.225,0.278,,,,0.365,0.225,0.278
NUM,4.2 %,hmm,0.902,0.66,0.762,,,,0.902,0.66,0.762
NUM,4.2 %,mv,0.886,0.712,0.79,,,,0.886,0.712,0.79
PROPN,15.0 %,hmm,0.788,0.513,0.622,,,,0.788,0.513,0.622
PROPN,15.0 %,mv,0.666,0.82,0.736,,,,0.666,0.82,0.736


For adjectives and determiners, the HMM model performs better than majority voting. For numerals, they are more or less comparable, whereas majority voting is best for nouns and proper nouns. 

## Evaluate on other languages

#### Gaelic Scottish (unrelated)

From the numbers below, we see that numerals and proper nouns are actually found occationally, while adjectives, nouns, and determiners are not detected with the model.

In [13]:
test_docs_scottish = load_data_split("test", "UD_Scottish_Gaelic-ARCOSG", all_labels)
test_docs_scottish = tag_all(test_docs_scottish, lfs + [mv, hmm])

df_scottish = evaluate(test_docs_scottish, all_labels, ["mv", "hmm", "determiners", "proper_nouns", "nouns1", "nouns2", "nouns3", "adjs1", "adjs2", "numerals1", "numerals2", "adjs3", "adjs4"])
df_scottish.loc[['ADJ', 'DET', 'NOUN', 'NUM', 'PROPN']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,10.2 %,adjs1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,10.2 %,adjs2,0.028,0.005,0.008,,,,0.028,0.005,0.008
ADJ,10.2 %,adjs3,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,10.2 %,adjs4,0.042,0.005,0.008,,,,0.042,0.005,0.008
ADJ,10.2 %,determiners,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,10.2 %,hmm,0.042,0.005,0.008,,,,0.042,0.005,0.008
ADJ,10.2 %,mv,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,10.2 %,nouns1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,10.2 %,nouns2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,10.2 %,nouns3,0.0,0.0,0.0,,,,0.0,0.0,0.0


#### Turkish (unrelated)

The numbers below show that proper nouns are detected relatively frequently, while adjectives, numerals, and nouns sparsely are captured. Determiners are never found. 

In [14]:
test_docs_turkish = load_data_split("test", "UD_Turkish-IMST", all_labels)
test_docs_turkish = tag_all(test_docs_turkish, lfs + [mv, hmm])

df_turkish = evaluate(test_docs_turkish, all_labels, ["mv", "hmm", "determiners", "proper_nouns", "nouns1", "nouns2", "nouns3", "adjs1", "adjs2", "numerals1", "numerals2", "adjs3", "adjs4"])
df_turkish.loc[['ADJ', 'DET', 'NOUN', 'NUM', 'PROPN']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,20.2 %,adjs1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,20.2 %,adjs2,0.238,0.033,0.058,,,,0.238,0.033,0.058
ADJ,20.2 %,adjs3,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,20.2 %,adjs4,0.224,0.024,0.044,,,,0.224,0.024,0.044
ADJ,20.2 %,determiners,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,20.2 %,hmm,0.22,0.024,0.044,,,,0.22,0.024,0.044
ADJ,20.2 %,mv,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,20.2 %,nouns1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,20.2 %,nouns2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,20.2 %,nouns3,0.0,0.0,0.0,,,,0.0,0.0,0.0


#### Spanish (related)

We see how proper nouns and numerals are frequently found, and nouns are captured relatively often considering the substantial proportion.  

In [15]:
test_docs_spanish = load_data_split("test", "UD_Spanish-GSD", all_labels)
test_docs_spanish = tag_all(test_docs_spanish, lfs + [mv, hmm])

df_spanish = evaluate(test_docs_spanish, all_labels, ["mv", "hmm", "determiners", "proper_nouns", "nouns1", "nouns2", "nouns3", "adjs1", "adjs2", "numerals1", "numerals2", "adjs3", "adjs4"])
df_spanish.loc[['ADJ', 'DET', 'NOUN', 'NUM', 'PROPN']]



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,12.3 %,adjs1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.3 %,adjs2,0.269,0.031,0.056,,,,0.269,0.031,0.056
ADJ,12.3 %,adjs3,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.3 %,adjs4,0.022,0.039,0.028,,,,0.022,0.039,0.028
ADJ,12.3 %,determiners,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.3 %,hmm,0.023,0.039,0.028,,,,0.023,0.039,0.028
ADJ,12.3 %,mv,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.3 %,nouns1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.3 %,nouns2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.3 %,nouns3,0.0,0.0,0.0,,,,0.0,0.0,0.0


#### Swedish (related)

We observe how our model performs relatively well on Swedish, which is very related to Danish. Most of the labels obtain high F1 scores, except for nouns, which was also found to be challenging on Danish.

In [16]:
test_docs_swedish = load_data_split("test", "UD_Swedish-LinES", all_labels)
test_docs_swedish = tag_all(test_docs_swedish, lfs + [mv, hmm])

df_swedish = evaluate(test_docs_swedish, all_labels, ["mv", "hmm", "determiners", "proper_nouns", "nouns1", "nouns2", "nouns3", "adjs1", "adjs2", "numerals1", "numerals2", "adjs3", "adjs4"])
df_swedish.loc[['ADJ', 'DET', 'NOUN', 'NUM', 'PROPN']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,21.5 %,adjs1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,21.5 %,adjs2,0.12,0.031,0.05,,,,0.12,0.031,0.05
ADJ,21.5 %,adjs3,0.261,0.024,0.044,,,,0.261,0.024,0.044
ADJ,21.5 %,adjs4,0.322,0.277,0.298,,,,0.322,0.277,0.298
ADJ,21.5 %,determiners,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,21.5 %,hmm,0.323,0.277,0.298,,,,0.323,0.277,0.298
ADJ,21.5 %,mv,0.312,0.021,0.04,,,,0.312,0.021,0.04
ADJ,21.5 %,nouns1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,21.5 %,nouns2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,21.5 %,nouns3,0.0,0.0,0.0,,,,0.0,0.0,0.0


## Token Classification Model

#### Convert data to correct format

In [17]:
import pandas as pd


def transform_train(docs, aggregation_model="mv"): # also possible to use HMM

  for doc in docs:
      doc.set_ents(doc.spans.get(aggregation_model, []))

  df = []
  for sidx, doc in enumerate(docs):
      for tok in doc:
          if tok.ent_type_ == "":
              df.append([sidx, tok.text, "O"])
          else:
              df.append([sidx, tok.text, tok.ent_type_])

  df = pd.DataFrame(df, columns=["sentence_id", "words", "labels"])
  return df


def transform_test(all_labels):
    docs = load_data_split("test", "UD_Danish-DDT", all_labels)
    df = []

    for sidx, doc in enumerate(docs):
        for tok in doc:
            if tok.ent_type_ == "":
                df.append([sidx, tok.text, "O"])
            else:
                df.append([sidx, tok.text, tok.ent_type_])
    df = pd.DataFrame(df, columns=["sentence_id", "words", "labels"])
    return df

# Tag the training data with the HMM and MV
train_docs = tag_all(train_docs, [mv, hmm])

# Transform the training and test data into correct format
df_train = transform_train(train_docs, aggregation_model="mv")
df_test = transform_test(all_labels)

# Write to csv
df_train.to_csv("train.csv", sep=';', index=None)
df_test.to_csv("test.csv", sep=';', index=None)

#### Train Classifier

In [18]:
from simpletransformers.ner import NERModel

# Create a NERModel
model = NERModel(
    "bert",
    "bert-base-cased",
    use_cuda=False,
    labels=df_train.labels.unique().tolist(),
    args={"overwrite_output_dir": True, "reprocess_input_data": True, "train_batch_size": 64},
)

# # Train the model
model.train_model(df_train)

# # Evaluate the model
result, model_outputs, predictions = model.eval_model(df_test)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/9 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/71 [00:00<?, ?it/s]



In [19]:
result

{'eval_loss': 1.3793464252646541,
 'precision': 0.41172741679873215,
 'recall': 0.38329890823251694,
 'f1_score': 0.3970048899755501}

## Evaluation

We see from the evaluation metrics above, that the BERT-based model performs worse than MV and HMM, achieving a micro F1 score of 0.392. This is, however, not surprising as our labels in the training data is based on the MV, which itself does not perform well in regards to a micro F1 of 0.424. Oppositely, it is noteworthy that the BERT model doesn't manage to use the extensive language understanding from the pre-training, prior to this task-specific fine-tuning. This might be attributed to the lack of Danish data in the initial pre-training. 

This result highlights the need for high quality labels when fine-tuning a model. For this particular case, we would need additional and perhaps more complicated heuristics to obtain more accurate weak labels. 