# Imports & reading the data

In [None]:
!pip install gliner
from gliner import GLiNER
import json
import torch
from tqdm import tqdm
from prettytable import PrettyTable

import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pretrained "urchade/gliner_multi-v2.1" checkpoint, then move the model to `device`.
# NOTE(review): `device="cpu"` is forwarded to `from_pretrained`; presumably it only controls
# where the weights are first loaded — confirm against the installed gliner version.
model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1", device="cpu").to(device)

Collecting gliner
  Downloading gliner-0.1.12-py3-none-any.whl (26 kB)
Collecting huggingface-hub>=0.21.4 (from gliner)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flair==0.13.1 (from gliner)
  Downloading flair-0.13.1-py3-none-any.whl (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from gliner)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3>=1.20.27 (from flair==0.13.1->gliner)
  Downloading boto3-1.34.92-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m19.5 MB/s[0m eta [36

pytorch_model.bin:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

In [None]:
def _read_jsonl(path):
    """Read a JSON Lines file and return one parsed object per non-blank line."""
    # explicit UTF-8 so non-ASCII sentences decode the same way on every platform
    with open(path, encoding="utf-8") as json_file:
        return [json.loads(line) for line in json_file if line.strip()]


# sentences and ners from `train.jsonl`
train_sent = list()
train_ners = list()

for result in _read_jsonl("../train.jsonl"):
    sentence = result["sentences"]
    train_sent.append(sentence)
    # every ner is stored as (<text>, <start idx>, <end idx inclusive>, <type>);
    # the list is ordered by start idx ascending and, for equal starts, by end idx
    # descending (the longest span first) -- same order the two stable sorts produced
    train_ners.append(
        sorted(
            ((sentence[start:end+1], start, end, t) for start, end, t in result["ners"]),
            key=lambda l: (l[1], -l[2])
        )
    )

# sentences and corresponding ids from `test.jsonl`
test_sent = list()
test_idx = list()

for result in _read_jsonl("../test.jsonl"):
    # the test records are read through the misspelled key "senences";
    # a correctly spelled "sentences" key is preferred when it is present
    test_sent.append(result["sentences"] if "sentences" in result else result["senences"])
    test_idx.append(result["id"])

# stores all existing labels (required for model)
labels = {item[3] for ners in train_ners for item in ners}

# Model code

In [None]:
def get_entities(sent, recursion_depth=5, threshold=0.2):
    """
    Finds entities in a sentence and, up to `recursion_depth` levels deep, entities nested
    inside the ones already found. Duplicates are removed and only entities whose confidence
    is above `threshold` are kept.

    Nested entities are searched in every word-prefix of a multi-word entity
    (all leading words except the full span itself). Prefixes are sliced from the
    entity text itself, so the returned indices stay valid even when the words are
    separated by more than one whitespace character.

    Arguments:
        sent (str): a string where we need to find the entities
        recursion_depth (int): how many levels of nested calls are allowed;
                               0 (or a negative value) means only `sent` itself is analysed
        threshold (float): a minimum confidence value that all found entities should exceed
                           to be included in output

    Returns:
        List(Tuple(str, int, int, str)): a list of all found entities
                                         in a format (<word>, <start idx>, <end idx inclusive>, <label>),
                                         ordered by start idx, then longest span first
    """

    # The threshold is forwarded to the model as well: otherwise the model applies its own
    # default cut-off and values below that default would silently have no effect.
    # Labels are passed as a sorted list so their order does not depend on set iteration order.
    found = model.predict_entities(sent, sorted(labels), threshold=threshold)
    entities = [(d["text"], d["start"], d["end"]-1, d["label"]) for d in found if d["score"] > threshold]

    # stores entities from sub-calls
    sub_ents = list()

    # only descend while there is recursion budget left
    if recursion_depth > 0:
        for text, start, _, _ in entities:
            words = text.split()
            if len(words) < 2:
                continue

            # walk through the entity text word by word; `pos` is the end of the current prefix
            pos = 0
            for word in words[:-1]:
                pos = text.index(word, pos) + len(word)
                for sub in get_entities(text[:pos], recursion_depth-1, threshold):
                    # indices of a sub-entity are relative to the prefix, shift them into `sent`
                    sub_ents.append((sub[0], sub[1]+start, sub[2]+start, sub[3]))

    # deterministic order: start idx ascending, longer spans first, then label and text
    return sorted(set(entities + sub_ents), key=lambda e: (e[1], -e[2], e[3], e[0]))

# Fine-tuning

In [None]:
def eval_f1(pred, true):
    """
    Evaluation function for NER. Calculates the f1-score like this:
    true positives are the entities correctly classified
    false positives are predicted entities that aren't in ground truth
    false negatives are true entities that weren't predicted.
    precision = TP / (TP + FP), recall = TP / (TP + FN),
    f1 = 2 * precision * recall / (precision + recall)

    Format of input variables:
    List of tuples (<word>, <start char idx>, <end char idx (inclusive)>, <type>)

    Returns:
        float: the f1-score in [0, 1]; 0.0 when there is no true positive
               (this includes the case where both inputs are empty)
    """
    # duplicates inside either input are counted once
    true = set(true)
    pred = set(pred)

    true_positive = len(true & pred)

    # without a single hit both precision and recall are zero (or undefined)
    if not true_positive:
        return 0.0

    false_positive = len(pred - true)
    false_negative = len(true - pred)

    precision = true_positive / (true_positive+false_positive)
    recall = true_positive / (true_positive+false_negative)

    return 2*precision*recall / (precision+recall)

In [None]:
# Grid search over confidence thresholds and recursion depths on the first training items.
# never ask for more items than the training data contains
items_to_test = min(30, len(train_sent))
max_depth = 4
thres_to_test = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.7, 0.8]

# one row per threshold, one column per recursion depth (0 .. max_depth-1)
table = PrettyTable()
table.field_names = ["Threshold"] + [f"depth={x}" for x in range(max_depth)]

# the context manager closes the progress bar even if a prediction raises
with tqdm(total=max_depth*len(thres_to_test)*items_to_test, position=0) as pbar:
    for thres in thres_to_test:
        row = [f"Thr = {thres}"]

        for depth in range(max_depth):
            total_f1 = 0

            for idx in range(items_to_test):
                pred = get_entities(train_sent[idx], recursion_depth=depth, threshold=thres)
                true = train_ners[idx]
                total_f1 += eval_f1(pred, true)

                # reported after scoring, so the running mean covers exactly the idx+1 items seen so far
                pbar.set_postfix_str(f"Processing threshold={thres}, depth={depth}, item={idx+1}, f1-score: {round(100*total_f1/(idx+1), 3)}%")
                pbar.update(1)

            # mean f1-score over the evaluated items, as a percentage
            row.append(f"{round(100*total_f1/items_to_test, 2)}%")
        table.add_row(row)

print()
print(table)

  5%|▍         | 92/1950 [01:41<34:11,  1.10s/it, Processing threshold=0.3, depth=3, item=3, f1-score: 28.454%]
100%|██████████| 1080/1080 [09:52<00:00,  4.39it/s, Processing threshold=0.8, depth=3, item=30, f1-score: 25.174%]


+------------+---------+---------+---------+---------+
| Threshold  | depth=0 | depth=1 | depth=2 | depth=3 |
+------------+---------+---------+---------+---------+
| Thr = 0.3  |  39.66% |  39.66% |  39.66% |  39.66% |
| Thr = 0.35 |  39.66% |  39.66% |  39.66% |  39.66% |
| Thr = 0.4  |  39.66% |  39.66% |  39.66% |  39.66% |
| Thr = 0.45 |  39.66% |  39.66% |  39.66% |  39.66% |
| Thr = 0.5  |  39.66% |  39.66% |  39.66% |  39.66% |
| Thr = 0.55 |  38.31% |  38.31% |  38.31% |  38.31% |
| Thr = 0.6  |  37.23% |  37.23% |  37.23% |  37.23% |
| Thr = 0.7  |  33.38% |  33.38% |  33.38% |  33.38% |
| Thr = 0.8  |  26.37% |  26.37% |  26.37% |  26.37% |
+------------+---------+---------+---------+---------+


# Predictions

In [None]:
pbar = tqdm(
    total=len(test_sent), position=0
)

depth = 0
threshold = 0.6

with open("test.jsonl", "w") as f:
    for s, idx in zip(test_sent, test_idx):
        ners = get_entities(s, depth, threshold)
        ners = [ [x[1], x[2], x[3]] for x in ners]
        ners = {"ners": ners, "id": idx}
        ners = json.dumps(ners)
        f.write(f"{ners}\n")
        pbar.update(1)

print()
!zip test test.jsonl

100%|██████████| 65/65 [02:17<00:00,  2.12s/it]
100%|██████████| 65/65 [00:30<00:00,  1.43it/s]


updating: test.jsonl (deflated 77%)
