<a href="https://colab.research.google.com/github/DanielNorth/NLU-2023-Labs/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install sklearn_crfsuite
!pip install es_core_news_sm

Collecting es_core_news_sm
  Downloading es_core_news_sm-3.1.0-py3-none-any.whl (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy<3.2.0,>=3.1.0 (from es_core_news_sm)
  Downloading spacy-3.1.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.1.0,>=8.0.12 (from spacy<3.2.0,>=3.1.0->es_core_news_sm)
  Downloading thinc-8.0.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (659 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m659.5/659.5 kB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Collecting wasabi<1.1.0,>=0.8.1 (from spacy<3.2.0,>=3.1.0->es_core_news_sm)
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting typer<0.5.0,>=0.3.0 (from spacy<3.2.0,>=3.1.0->es_core_news_sm)
  Downloading typer-

In [10]:
import nltk
from nltk.corpus import conll2002
from sklearn_crfsuite import CRF
import spacy
from spacy.tokenizer import Tokenizer
import es_core_news_sm

import re

# !pip install sklearn_crfsuite
# !pip install es_core_news_sm


def stats():
    return {'cor': 0, 'hyp': 0, 'ref': 0}


def evaluate(ref, hyp, otag='O'):
    # evaluation for NLTK
    aligned = align_hyp(ref, hyp)
    return conlleval(aligned, otag=otag)


def align_hyp(ref, hyp):
    # align references and hypotheses for evaluation
    # add last element of token tuple in hyp to ref
    if len(ref) != len(hyp):
        raise ValueError("Size Mismatch: ref: {} & hyp: {}".format(len(ref), len(hyp)))

    out = []
    for i in range(len(ref)):
        if len(ref[i]) != len(hyp[i]):
            raise ValueError("Size Mismatch: ref: {} & hyp: {}".format(len(ref), len(hyp)))
        out.append([(*ref[i][j], hyp[i][j][-1]) for j in range(len(ref[i]))])
    return out


def conlleval(data, otag='O'):
    # token, segment & class level counts for TP, TP+FP, TP+FN
    tok = stats()
    seg = stats()
    cls = {}

    for sent in data:

        prev_ref = otag      # previous reference label
        prev_hyp = otag      # previous hypothesis label
        prev_ref_iob = None  # previous reference label IOB
        prev_hyp_iob = None  # previous hypothesis label IOB

        in_correct = False  # currently processed chunks is correct until now

        for token in sent:

            hyp_iob, hyp = parse_iob(token[-1])
            ref_iob, ref = parse_iob(token[-2])

            ref_e = is_eoc(ref, ref_iob, prev_ref, prev_ref_iob, otag)
            hyp_e = is_eoc(hyp, hyp_iob, prev_hyp, prev_hyp_iob, otag)

            ref_b = is_boc(ref, ref_iob, prev_ref, prev_ref_iob, otag)
            hyp_b = is_boc(hyp, hyp_iob, prev_hyp, prev_hyp_iob, otag)

            if not cls.get(ref) and ref:
                cls[ref] = stats()

            if not cls.get(hyp) and hyp:
                cls[hyp] = stats()

            # segment-level counts
            if in_correct:
                if ref_e and hyp_e and prev_hyp == prev_ref:
                    in_correct = False
                    seg['cor'] += 1
                    cls[prev_ref]['cor'] += 1

                elif ref_e != hyp_e or hyp != ref:
                    in_correct = False

            if ref_b and hyp_b and hyp == ref:
                in_correct = True

            if ref_b:
                seg['ref'] += 1
                cls[ref]['ref'] += 1

            if hyp_b:
                seg['hyp'] += 1
                cls[hyp]['hyp'] += 1

            # token-level counts
            if ref == hyp and ref_iob == hyp_iob:
                tok['cor'] += 1

            tok['ref'] += 1

            prev_ref = ref
            prev_hyp = hyp
            prev_ref_iob = ref_iob
            prev_hyp_iob = hyp_iob

        if in_correct:
            seg['cor'] += 1
            cls[prev_ref]['cor'] += 1

    return summarize(seg, cls)


def parse_iob(t):
    m = re.match(r'^([^-]*)-(.*)$', t)
    return m.groups() if m else (t, None)


def is_boc(lbl, iob, prev_lbl, prev_iob, otag='O'):
    """
    is beginning of a chunk

    supports: IOB, IOBE, BILOU schemes
        - {E,L} --> last
        - {S,U} --> unit

    :param lbl: current label
    :param iob: current iob
    :param prev_lbl: previous label
    :param prev_iob: previous iob
    :param otag: out-of-chunk label
    :return:
    """
    boc = False

    boc = True if iob in ['B', 'S', 'U'] else boc
    boc = True if iob in ['E', 'L'] and prev_iob in ['E', 'L', 'S', otag] else boc
    boc = True if iob == 'I' and prev_iob in ['S', 'L', 'E', otag] else boc

    boc = True if lbl != prev_lbl and iob != otag and iob != '.' else boc

    # these chunks are assumed to have length 1
    boc = True if iob in ['[', ']'] else boc

    return boc


def is_eoc(lbl, iob, prev_lbl, prev_iob, otag='O'):
    """
    is end of a chunk

    supports: IOB, IOBE, BILOU schemes
        - {E,L} --> last
        - {S,U} --> unit

    :param lbl: current label
    :param iob: current iob
    :param prev_lbl: previous label
    :param prev_iob: previous iob
    :param otag: out-of-chunk label
    :return:
    """
    eoc = False

    eoc = True if iob in ['E', 'L', 'S', 'U'] else eoc
    eoc = True if iob == 'B' and prev_iob in ['B', 'I'] else eoc
    eoc = True if iob in ['S', 'U'] and prev_iob in ['B', 'I'] else eoc

    eoc = True if iob == otag and prev_iob in ['B', 'I'] else eoc

    eoc = True if lbl != prev_lbl and iob != otag and prev_iob != '.' else eoc

    # these chunks are assumed to have length 1
    eoc = True if iob in ['[', ']'] else eoc

    return eoc


def score(cor_cnt, hyp_cnt, ref_cnt):
    # precision
    p = 1 if hyp_cnt == 0 else cor_cnt / hyp_cnt
    # recall
    r = 0 if ref_cnt == 0 else cor_cnt / ref_cnt
    # f-measure (f1)
    f = 0 if p+r == 0 else (2*p*r)/(p+r)
    return {"p": p, "r": r, "f": f, "s": ref_cnt}


def summarize(seg, cls):
    # class-level
    res = {lbl: score(cls[lbl]['cor'], cls[lbl]['hyp'], cls[lbl]['ref']) for lbl in set(cls.keys())}
    # micro
    res.update({"total": score(seg.get('cor', 0), seg.get('hyp', 0), seg.get('ref', 0))})
    return res


def read_corpus_conll(corpus_file, fs="\t"):
    """
    read corpus in CoNLL format
    :param corpus_file: corpus in conll format
    :param fs: field separator
    :return: corpus
    """
    featn = None  # number of features for consistency check
    sents = []  # list to hold words list sequences
    words = []  # list to hold feature tuples

    for line in open(corpus_file):
        line = line.strip()
        if len(line.strip()) > 0:
            feats = tuple(line.strip().split(fs))
            if not featn:
                featn = len(feats)
            elif featn != len(feats) and len(feats) != 0:
                raise ValueError("Unexpected number of columns {} ({})".format(len(feats), featn))

            words.append(feats)
        else:
            if len(words) > 0:
                sents.append(words)
                words = []
    return sents


def get_chunks(corpus_file, fs="\t", otag="O"):
    sents = read_corpus_conll(corpus_file, fs=fs)
    return set([parse_iob(token[-1])[1] for sent in sents for token in sent if token[-1] != otag])


# Function to extract features using sent2spacy_features
def sent2spacy_features(sent):
    nlp = es_core_news_sm.load()
    nlp.tokenizer = Tokenizer(nlp.vocab)
    spacy_sent = nlp(" ".join(sent2tokens(sent)))
    feats = []
    for token in spacy_sent:
        token_feats = {
            'bias': 1.0,
            'word.lower()': token.lower_,
            'pos': token.pos_,
            'lemma': token.lemma_
        }
        feats.append(token_feats)
    return feats

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

# Function to extract tokens using sent2tokens
def sent2tokens(sent):
    return [token for token, pos, iob in sent]

# Function to extract features for the tutorial
def word2features(sent, i):
    word = sent[i][0]
    pos = sent[i][1]
    return {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'pos': pos,
        'pos[:2]': pos[:2]
    }

# Function to extract features with a given window size
def sent2features_window(sent, window):
    return [word2features(sent, i - window) if i >= window else word2features(sent, i) for i in range(len(sent))]

# Function to train and test a CRF model with given features
def train_test_crf_model(features_train, labels_train, features_test):
    crf_model = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf_model.fit(features_train, labels_train)
    pred_labels = crf_model.predict(features_test)
    return pred_labels


# Add the "suffix" feature
def sent2spacy_suffix_features(sent):
    nlp = es_core_news_sm.load()
    nlp.tokenizer = Tokenizer(nlp.vocab)
    spacy_sent = nlp(" ".join(sent2tokens(sent)))
    feats = []
    for i, token in enumerate(spacy_sent):
        token_feats = {
            'bias': 1.0,
            'word.lower()': token.lower_,
            'pos': token.pos_,
            'lemma': token.lemma_,
            'suffix': token.text[-3:]
        }
        feats.append(token_feats)
    return feats

  ) from None


In [12]:
import pandas as pd

# Load the conll2002 dataset
nltk.download('conll2002')
trn_sents = conll2002.iob_sents('esp.train')
tst_sents = conll2002.iob_sents('esp.testa')

trn_sents = trn_sents[:10]
tst_sents = tst_sents[:10]

# Baseline using sent2spacy_features
trn_feats_baseline = [sent2spacy_features(s) for s in trn_sents]
tst_feats_baseline = [sent2spacy_features(s) for s in tst_sents]
trn_labels = [sent2labels(s) for s in trn_sents]

pred_labels_baseline = train_test_crf_model(trn_feats_baseline, trn_labels, tst_feats_baseline)
hyp_baseline = [[(tst_feats_baseline[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred_labels_baseline)]
results_baseline = evaluate(tst_sents, hyp_baseline)
pd_tbl_baseline = pd.DataFrame().from_dict(results_baseline, orient='index')
pd_tbl_baseline.round(decimals=3)

# Add the "suffix" feature
# def sent2spacy_suffix_features(sent):
#     nlp = es_core_news_sm.load()
#     nlp.tokenizer = Tokenizer(nlp.vocab)
#     spacy_sent = nlp(" ".join(sent2tokens(sent)))
#     feats = []
#     for i, token in enumerate(spacy_sent):
#         token_feats = {
#             'bias': 1.0,
#             'word.lower()': token.lower_,
#             'pos': token.pos_,
#             'lemma': token.lemma_,
#             'suffix': token.text[-3:]
#         }
#         feats.append(token_feats)
#     return feats

trn_feats_suffix = [sent2spacy_suffix_features(s) for s in trn_sents]
tst_feats_suffix = [sent2spacy_suffix_features(s) for s in tst_sents]

pred_labels_suffix = train_test_crf_model(trn_feats_suffix, trn_labels, tst_feats_suffix)
hyp_suffix = [[(tst_feats_suffix[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred_labels_suffix)]
results_suffix = evaluate(tst_sents, hyp_suffix)
pd_tbl_suffix = pd.DataFrame().from_dict(results_suffix, orient='index')
pd_tbl_suffix.round(decimals=3)

# Add all features used in the tutorial
trn_feats_tutorial = [sent2features_tutorial(s) for s in trn_sents]
tst_feats_tutorial = [sent2features_tutorial(s) for s in tst_sents]

pred_labels_tutorial = train_test_crf_model(trn_feats_tutorial, trn_labels, tst_feats_tutorial)
hyp_tutorial = [[(tst_feats_tutorial[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred_labels_tutorial)]
results_tutorial = evaluate(tst_sents, hyp_tutorial)
pd_tbl_tutorial = pd.DataFrame().from_dict(results_tutorial, orient='index')
pd_tbl_tutorial.round(decimals=3)

# Increase feature window to [-1, +1]
trn_feats_window_1 = [sent2features_window(s, 1) for s in trn_sents]
tst_feats_window_1 = [sent2features_window(s, 1) for s in tst_sents]

pred_labels_window_1 = train_test_crf_model(trn_feats_window_1, trn_labels, tst_feats_window_1)
hyp_window_1 = [[(tst_feats_window_1[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred_labels_window_1)]
results_window_1 = evaluate(tst_sents, hyp_window_1)
pd_tbl_window_1 = pd.DataFrame().from_dict(results_window_1, orient='index')
pd_tbl_window_1.round(decimals=3)

# Increase feature window to [-2, +2]
trn_feats_window_2 = [sent2features_window(s, 2) for s in trn_sents]
tst_feats_window_2 = [sent2features_window(s, 2) for s in tst_sents]

pred_labels_window_2 = train_test_crf_model(trn_feats_window_2, trn_labels, tst_feats_window_2)
hyp_window_2 = [[(tst_feats_window_2[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred_labels_window_2)]
results_window_2 = evaluate(tst_sents, hyp_window_2)
pd_tbl_window_2 = pd.DataFrame().from_dict(results_window_2, orient='index')
pd_tbl_window_2.round(decimals=3)


[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!
  ) from None


ValueError: ignored