In [None]:
from src.definitions import PROJECT_ROOT
from pathlib import Path
import pandas as pd

# Let wide frames render up to 99 columns before pandas truncates them.
pd.set_option("display.max_columns", 99)

In [None]:
df = pd.read_csv(PROJECT_ROOT / "data/processed/labeled_passages.csv")

# Tags that are folded into the outside ("O") class before modelling.
IGNORED_LABELS = ["B-CellLine", "I-CellLine", "B-Mutation", "I-Mutation"]

# A single .loc assignment writes into `df` itself. The previous chained form
# (`df.labels[mask] = ...`) assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
df.loc[df["labels"].isin(IGNORED_LABELS), "labels"] = "O"

df.labels.value_counts()

In [None]:
# Transposed view of every row belonging to pubtator_id 79.
df.loc[df["pubtator_id"] == 79].T

In [None]:
# Distinct-value counts, in order: pubtator_id, words, POS, labels.
tuple(df[column].nunique() for column in ("pubtator_id", "words", "POS", "labels"))

In [None]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system

def _context_features(token, prefix):
    """Return the features of one neighbouring token, with keys starting with ``prefix``."""
    neighbor_word = str(token[0])
    neighbor_postag = token[1]
    return {
        f'{prefix}:word.lower()': neighbor_word.lower(),
        f'{prefix}:word.istitle()': neighbor_word.istitle(),
        f'{prefix}:word.isupper()': neighbor_word.isupper(),
        f'{prefix}:postag': neighbor_postag,
        f'{prefix}:postag[:2]': neighbor_postag[:2],
    }


def word2features(sent, i, window=5):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of tokens; element 0 of each token is the word and
            element 1 is its POS tag (any further elements are ignored).
        i: Index of the token to describe.
        window: Number of neighbouring tokens on each side whose features
            are included. Defaults to 5.

    Returns:
        dict mapping feature names to values: features of the token itself,
        ``'BOS'`` / ``'EOS'`` flags for the first / last token, and
        ``'-k:...'`` / ``'+k:...'`` features for each neighbour that exists
        within ``window`` positions.
    """
    # str() guards against non-string words (e.g. values parsed as numbers).
    word = str(sent[i][0])
    postag = sent[i][1]
    last_index = len(sent) - 1

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-5:]': word[-5:],
        'word[-4:]': word[-4:],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    if i == 0:
        features['BOS'] = True
    # The last valid index is len(sent) - 1; comparing against len(sent)
    # would never match, so EOS would never be emitted.
    if i == last_index:
        features['EOS'] = True

    for step in range(1, window + 1):
        if i - step >= 0:
            features.update(_context_features(sent[i - step], f'-{step}'))
        if i + step <= last_index:
            features.update(_context_features(sent[i + step], f'+{step}'))

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the label of every ``(token, postag, label)`` triple in ``sent``."""
    tags = []
    for _token, _postag, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token of every ``(token, postag, label)`` triple in ``sent``."""
    words = []
    for word, _postag, _label in sent:
        words.append(word)
    return words

In [None]:
# Collapse the token-level frame into one entry per (pubtator_id, passage_id):
# a list of (word, POS, label) triples in row order.
# Selecting the three token columns up front keeps the grouping keys out of
# the frame handed to `apply` (applying over grouping columns is deprecated
# in recent pandas) without changing the result.
grouped_df = (
    df.groupby(["pubtator_id", "passage_id"])[["words", "POS", "labels"]]
    .apply(
        lambda passage_rows: list(
            zip(
                passage_rows["words"].values.tolist(),
                passage_rows["POS"].values.tolist(),
                passage_rows["labels"].values.tolist(),
            )
        )
    )
)

# Iterating the Series yields its values (one triple list per passage);
# this replaces the implicit index-probing iteration over `.iloc`.
passages = list(grouped_df)

grouped_df

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Passages have different lengths, so the per-passage feature/label lists are
# ragged. Wrapping them in np.array raises ValueError on NumPy >= 1.24 (and
# only produced an object array with a deprecation warning before that).
# Plain lists are accepted by train_test_split and by the CRF alike.
X = [sent2features(s) for s in passages]
y = [sent2labels(s) for s in passages]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Number of passages in the train and test splits.
len(X_train), len(X_test)

In [None]:
# Side-by-side view of the second training passage: one row per token,
# feature columns first, gold label last.
sample_features = pd.DataFrame(X_train[1])
sample_labels = pd.DataFrame({'label': y_train[1]})
pd.concat([sample_features, sample_labels], axis=1)

In [None]:
import sklearn_crfsuite

# Hyper-parameters of the linear-chain CRF, gathered in one place.
crf_params = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 150,
    'all_possible_transitions': True,
    'verbose': True,
}

crf = sklearn_crfsuite.CRF(**crf_params)

In [None]:
# Train the CRF on the training split (the model was built with verbose=True,
# so training progress is printed).
crf.fit(X_train, y_train)

In [None]:
from sklearn_crfsuite import metrics as crf_metrics

y_pred = crf.predict(X_test)

# Report on every learned class except the outside tag 'O'.
labels = [*crf.classes_]
labels.remove('O')

report = crf_metrics.flat_classification_report(y_test, y_pred, labels=labels)
print(report)