In [None]:
!pip install evaluate seqeval sklearn-crfsuite

In [None]:
## Reference: sklearn-crfsuite tutorial — https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

In [None]:
import bz2
import random
import pandas as pd
import numpy as np
from sklearn_crfsuite import CRF
import evaluate
# seqeval metric handle; its .compute(...) is called on the predictions further down.
seqeval = evaluate.load('seqeval')

In [None]:
# Parse the WikiNER archives into two parallel lists:
#   token_docs[k] -> list of tokens of sentence k
#   tag_docs[k]   -> list of NER tags of sentence k (same length as token_docs[k])
token_docs = []
tag_docs = []
datasets = ["aij-wikiner-en-wp2.bz2", "aij-wikiner-en-wp3.bz2"]

for dataset in datasets:
    with bz2.open(f"../Data/{dataset}", "rb") as bz_file:
        # Iterate the file object directly so the whole archive is never held
        # in memory as a list of lines (readlines() would do that).
        for raw_line in bz_file:
            doc = raw_line.strip().decode()
            if len(doc) <= 1:
                # Blank / separator line: nothing to parse.
                continue

            sent_tokens = []
            sent_tags = []

            for seq in doc.split(" "):
                # Each item is "token|POS|tag". Split from the right so that a
                # token which itself contains "|" still yields exactly 3 fields.
                token, _pos, tag = seq.rsplit("|", 2)

                sent_tokens.append(token)
                sent_tags.append(tag)

            token_docs.append(sent_tokens)
            tag_docs.append(sent_tags)


In [None]:
# Working aliases for the full corpus; later cells rebind them to a subsample.
texts = token_docs
tags_l = tag_docs

In [None]:
# Size the sample from the full corpus (token_docs) rather than from `texts`:
# this cell rebinds `texts` at the end, so measuring len(texts) would shrink the
# sample (and change the index range) every time the cell is re-run.
n_total = len(token_docs)
print((f"Total amount of data = {n_total}"))
data_perc = round(n_total * 0.25)
print(f"Current sample of data = {data_perc}")

# Fixed seed so the same sentence indices are drawn on every run.
random.seed(100)
random_samples = random.sample(range(0, n_total), data_perc)
print(f"First sample index = {random_samples[0]}") ## 76372
texts, tags_l = [token_docs[i] for i in random_samples], [tag_docs[i] for i in random_samples]

In [None]:
# To calculate max len of sentences
# Number of tokens in the longest sentence of `texts` (0 when `texts` is empty).
m_len = max(map(len, texts), default=0)
print(f"Largest sentence by length = {m_len}")

In [None]:
from sklearn.model_selection import train_test_split

# First carve off 10% of the data as the test split ...
train_texts, test_texts, train_tags, test_tags = train_test_split(
    texts, tags_l, test_size=.1, random_state=100
)

# ... then take 10% of what remains as the validation split.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    train_texts, train_tags, test_size=.1, random_state=100
)

# Share of the sampled data that ended up in each split, as a whole percentage.
_n_sentences = len(texts)
train_perc, val_perc, test_perc = (
    round(100 * (len(_split) / _n_sentences))
    for _split in (train_texts, val_texts, test_texts)
)

print(f"{train_perc}% of data is TRAINING")
print(f"{val_perc}% of data is VALIDATION")
print(f"{test_perc}% of data is TESTING")

In [None]:
# CRF estimator: L-BFGS training with both L1 (c1) and L2 (c2) penalties.
crf = CRF(
    algorithm="lbfgs",
    c1=0.1,  # L1 regularization
    c2=0.1,  # L2 regularization
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
)

In [None]:
# Train on the training split.
# NOTE(review): X here is the raw token lists; the sklearn-crfsuite tutorial
# referenced at the top feeds per-token feature dicts (word2features) instead —
# confirm that training on bare token strings is intended.
CRF_model = crf.fit(X=train_texts, y=train_tags)

In [None]:
# Tag the held-out splits with the fitted model (test first, then validation).
pred_test_tags = crf.predict(X=test_texts)
pred_val_tags = crf.predict(X=val_texts)

In [None]:
# Both splits are scored with the same seqeval options (mode="strict", scheme="IOB1").
_seqeval_opts = {"mode": "strict", "scheme": "IOB1"}
result_test = seqeval.compute(
    predictions=pred_test_tags, references=test_tags, **_seqeval_opts
)
result_val = seqeval.compute(
    predictions=pred_val_tags, references=val_tags, **_seqeval_opts
)

In [None]:
def generate_metric_csv(result, csv_name):
    """Flatten a seqeval result dict into rows and write them to Results/<csv_name>.csv.

    Parameters
    ----------
    result : dict
        Output of ``seqeval.compute``. Values that are dicts are treated as
        per-entity score dicts; every other value is treated as a scalar metric.
    csv_name : str
        File name (without extension) of the CSV created under ``Results/``.

    The CSV has two columns, ``metric`` and ``val``. Each per-entity dict
    contributes one row per score among precision / recall / f1, named
    ``<ENTITY>_<score>``; each scalar entry contributes one row named after its
    key. All values are rounded to 4 decimals.
    """
    import os  # local import: only needed here, to create the output directory

    wanted_scores = ("precision", "recall", "f1")

    rows = []
    for key, value in result.items():
        if isinstance(value, dict):
            # One row PER score. Re-using a single dict for the whole entity
            # would keep only the last score and drop the others.
            for _m, score in value.items():
                if _m in wanted_scores:
                    rows.append({"metric": f"{key}_{_m}", "val": round(score, 4)})
        else:
            rows.append({"metric": key, "val": round(value, 4)})

    # to_csv does not create missing directories.
    os.makedirs("Results", exist_ok=True)
    pd.DataFrame(rows, columns=["metric", "val"]).to_csv(f"Results/{csv_name}.csv", index=False)

In [None]:
# Write one metrics CSV per evaluated split.
for _result, _csv_name in ((result_test, "test_crf"), (result_val, "validation_crf")):
    generate_metric_csv(_result, _csv_name)

In [None]:
def get_op_for_pred(test_t, test_tag, crf_predict, i):
    """Print sentence ``i`` followed by its non-"O" gold and predicted tags.

    Parameters
    ----------
    test_t : sequence of token lists
    test_tag : sequence of gold tag lists, indexed like ``test_t``
    crf_predict : sequence of predicted tag lists, indexed like ``test_t``
    i : index of the sentence to show

    Output goes to stdout: the space-joined sentence, then an ACTUAL section and
    a PREDICTION section, each listing "<token> <tag>" for every tag other than "O".
    """
    words = test_t[i]
    print(" ".join(words))

    # (heading, underline, tag source) for each section, in print order.
    sections = (
        ("\tACTUAL", "\t______", test_tag),
        ("\tPREDICTION", "\t__________", crf_predict),
    )
    for position, (heading, underline, tag_source) in enumerate(sections):
        if position > 0:
            # Blank line separating this section from the previous one.
            print()
        print(heading)
        print(underline)
        print()
        for j, label in enumerate(tag_source[i]):
            if label == "O":
                continue
            print(f"\t\t{words[j]} {label}")

In [None]:
# Fixed index into the test split, used to eyeball one prediction.
_random_index = 900
get_op_for_pred(
    test_t=test_texts,
    test_tag=test_tags,
    crf_predict=pred_test_tags,
    i=_random_index,
)