In [3]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")
PACKAGE_ROOT = os.path.dirname(os.path.abspath(""))
# print(PACKAGE_ROOT)
sys.path.insert(0, PACKAGE_ROOT)

import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

from commons import constants as C
from feature_engineering import generate_simple_word_features
from feature_engineering import FeatureTransformer

# Directory holding train.csv / valid.csv / test.csv.  The location can be
# overridden with the NER_DATA_DIR environment variable so the notebook is not
# tied to one machine; the fallback is the folder the notebook was originally
# run against.
DATA_DIR = os.environ.get(
    "NER_DATA_DIR",
    r"C:\Users\sharm\Documents\nlp_assignment\ner-bilstm-crf\data",
)

# train data
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

# validation data
valid_df = pd.read_csv(os.path.join(DATA_DIR, "valid.csv"))

# test data
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [4]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(token, tag)`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns ``sentence_id``, ``token`` and ``tag``;
        rows sharing a ``sentence_id`` form one sentence.

    Attributes
    ----------
    grouped
        Result of grouping ``data`` by ``sentence_id``: one list of
        ``(token, tag)`` tuples per sentence.
    sentences : list
        The grouped sentences as a plain Python list.
    n_sent : int
        Key used by the next :meth:`get_next` call; starts at 1.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        self.grouped = self.data.groupby("sentence_id").apply(self._to_pairs)
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_pairs(group):
        """Pair every token of one sentence group with its tag."""
        return list(zip(group["token"].values.tolist(),
                        group["tag"].values.tolist()))

    def get_next(self):
        """Return the sentence stored under key ``n_sent`` and advance the key.

        Returns ``None`` when ``grouped`` has no entry for the current key.
        Only lookup failures are treated as "no more sentences"; anything else
        (including ``KeyboardInterrupt``) propagates instead of being silently
        swallowed.
        """
        # NOTE(review): the lookup uses n_sent as a key starting at 1, which
        # presumably matches integer sentence ids starting at 1 - confirm.
        try:
            sentence = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            return None
        self.n_sent += 1
        return sentence

In [5]:
# Group the training frame by sentence_id; `sentences` is the resulting list,
# one list of (token, tag) tuples per sentence.
getter = SentenceGetter(train_df)
sentences = getter.sentences

In [6]:
# sentences

In [7]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first element is the token string;
    only that first element is read here.  The result holds a constant bias,
    surface features of the token itself (lower-cased form, 3- and 2-character
    suffixes, upper/title/digit flags) and the lower/title/upper features of
    the previous and next tokens.  ``'BOS'`` replaces the previous-token
    features when there is no previous token, ``'EOS'`` replaces the next-token
    features when there is no next token.
    """
    token = sent[i][0]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()

    # Right context.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the second element of every 2-tuple in ``sent``, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the first element of every 2-tuple in ``sent``, in order."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens

In [8]:
# One list of feature dicts (X) and one list of tags (y) per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [9]:
from sklearn_crfsuite import CRF

# L-BFGS-trained CRF with small L1 (c1) and L2 (c2) regularisation
# coefficients, capped at 100 optimiser iterations.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [10]:
# Fit the CRF on the full X / y; the fitted estimator returned by fit() is the
# cell's displayed output.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [11]:
import eli5

In [13]:
# eli5.show_weights(crf, top=30)

In [8]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report


# Out-of-fold predictions from 3-fold cross-validation; n_jobs=-1 runs the
# folds on all available workers and verbose=4 prints joblib progress.
pred = cross_val_predict(
    estimator=crf,
    X=X,
    y=y,
    cv=3,
    n_jobs=-1,
    verbose=4,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min finished


In [9]:
# Per-tag precision / recall / f1 / support as a dict, then as a frame with
# one row per tag; the trailing expression displays it.
report = flat_classification_report(y_true=y, y_pred=pred, output_dict=True)
report_df = pd.DataFrame(report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
B-AerospaceManufacturer,0.833333,0.439815,0.575758,216.0
B-AnatomicalStructure,0.73224,0.345361,0.469352,388.0
B-ArtWork,0.526316,0.201005,0.290909,199.0
B-Artist,0.507392,0.573276,0.538325,3712.0
B-Athlete,0.445424,0.366425,0.402081,1793.0
B-CarManufacturer,0.709402,0.333333,0.453552,249.0
B-Cleric,0.645669,0.274247,0.384977,299.0
B-Clothing,0.888889,0.203046,0.330579,197.0
B-Disease,0.657534,0.387097,0.48731,372.0
B-Drink,0.577778,0.245283,0.344371,212.0


In [10]:
# !pip install eli5
# import eli5

In [11]:
# eli5.show_weights(crf, top=30)

In [12]:
# CRF with a much stronger L1 penalty (c1=10); c2, the optimiser and the
# iteration cap are 0.1, L-BFGS and 100.
crf = CRF(
    algorithm='lbfgs',
    c1=10,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

In [13]:
# Out-of-fold predictions from 3-fold cross-validation of the current `crf`;
# n_jobs=-1 runs the folds on all available workers.
pred = cross_val_predict(
    estimator=crf,
    X=X,
    y=y,
    cv=3,
    n_jobs=-1,
    verbose=4,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


In [14]:
# Per-tag precision / recall / f1 / support for the current `pred`, shown as a
# frame with one row per tag.
report = flat_classification_report(y_true=y, y_pred=pred, output_dict=True)
report_df = pd.DataFrame(report).T
report_df

Unnamed: 0,precision,recall,f1-score,support
B-AerospaceManufacturer,0.75,0.027778,0.053571,216.0
B-AnatomicalStructure,0.0,0.0,0.0,388.0
B-ArtWork,0.0,0.0,0.0,199.0
B-Artist,0.390076,0.412985,0.401204,3712.0
B-Athlete,0.463127,0.087563,0.14728,1793.0
B-CarManufacturer,1.0,0.008032,0.015936,249.0
B-Cleric,1.0,0.133779,0.235988,299.0
B-Clothing,0.0,0.0,0.0,197.0
B-Disease,0.578947,0.05914,0.107317,372.0
B-Drink,0.0,0.0,0.0,212.0
