In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")
PACKAGE_ROOT = os.path.dirname(os.path.abspath(""))
# print(PACKAGE_ROOT)
sys.path.insert(0, PACKAGE_ROOT)

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

from commons import constants as C
from feature_engineering import generate_simple_word_features
from feature_engineering import FeatureTransformer

# train data
train_df = pd.read_csv(C.CSV_TRAIN_PATH)

# validation data
valid_df = pd.read_csv(C.CSV_VALID_PATH)

# test data
test_df = pd.read_csv(C.CSV_TEST_PATH)

In [2]:
words = list(set(train_df["token"].values))
words.append("ENDPAD")
n_words = len(words)
print(f"Vocab size: {n_words}")

Vocab size: 34141


In [3]:
tags = list(set(train_df["tag"].values))
n_tags = len(tags)
print(f"Tags size: {n_tags}")

Tags size: 68


In [4]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["token"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [5]:
getter = SentenceGetter(train_df)
sentences = getter.sentences

In [6]:
max_len = max([len(sent) for sent in sentences])
max_len

68

In [7]:
# max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [8]:
# word2idx["Obama"]

In [9]:
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences, to_categorical

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [10]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
print(type(X_tr), type(y_tr), type(X_te), type(y_te))

<class 'numpy.ndarray'> <class 'list'> <class 'numpy.ndarray'> <class 'list'>


In [11]:
X_tr = np.array([np.array(row) for row in X_tr])
y_tr = np.array([np.array(row) for row in y_tr])
X_te = np.array([np.array(row) for row in X_te])
y_te = np.array([np.array(row) for row in y_te])

In [12]:
type(X_tr)

numpy.ndarray

In [13]:
from tensorflow.keras import Input, Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Dropout

In [14]:
# from tensorflow_addons.layers import CRF

In [15]:
# input = Input(shape=(max_len,))
# model = Embedding(input_dim=n_words + 1, output_dim=20,
#                   input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
# model = Bidirectional(LSTM(units=50, return_sequences=True,
#                            recurrent_dropout=0.1))(model)  # variational biLSTM
# model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
# crf = CRF(n_tags)  # CRF layer
# out = crf(model)  # output

# model = Model(input, out)
# model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

In [16]:
from tf2crf import CRF, ModelWithCRFLoss

In [17]:
input = Input(shape=(max_len,), dtype=np.int32)
model = Embedding(input_dim=n_words, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(units=n_tags)
output = crf(model)
base_model = Model(input, output)
model = ModelWithCRFLoss(base_model, sparse_target=True)
model.compile(optimizer="adam", loss=crf.accuracy_fn)

# model = Model(input, out)
# model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [19]:
model.fit(X_tr, y_tr)

AssertionError: in user code:

    File "c:\Users\sharm\anaconda3\envs\nlp\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\sharm\anaconda3\envs\nlp\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\sharm\anaconda3\envs\nlp\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\sharm\anaconda3\envs\nlp\lib\site-packages\tf2crf\model_wrapper.py", line 57, in train_step
        assert len(y.shape) == 2

    AssertionError: 
