# Character embedding model
#### Copied from https://www.depends-on-the-definition.com/lstm-with-char-embeddings-for-ner/

In [1]:
import os
import pandas as pd
import numpy as np
pd.set_option("max_columns",500)
pd.set_option("max_rows",500)

In [2]:
DATA_DIR="../../Data/SEC-filings/data/"
TRAIN_DIR=DATA_DIR+"train/"
TEST_DIR=DATA_DIR+"test/"
sentence=0
def convert_to_list(fName):
	global sentence
	df_list=[]
	with open(fName,"r") as f:
		for line in f:
			if line=="\n":
				sentence+=1
			else:
				df_list+=[[sentence]+line.strip().split(" ")]
	print(fName,sentence)
	return df_list

train_l=convert_to_list(TRAIN_DIR+"FIN5.txt")
test_l=convert_to_list(TEST_DIR+"FIN3.txt")

train=pd.DataFrame(train_l[1:],columns=["Sentence", "Word","POS","-","TAG"])
test=pd.DataFrame(test_l[1:],columns=["Sentence", "Word","POS","-","TAG"])

../../Data/SEC-filings/data/train/FIN5.txt 1169
../../Data/SEC-filings/data/test/FIN3.txt 1475


In [3]:
df=pd.concat([train,test], ignore_index=True, axis=0)

In [4]:

max_len = 69
max_len_char = 10

sentences=df.groupby("Sentence").apply(lambda x: list(zip(x["Word"].values,x["POS"].values,x["TAG"].values))).values
words=df['Word'].unique()
n_words=words.shape[0]
tags=df['TAG'].unique()
n_tags=tags.shape[0]

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [5]:
from keras.preprocessing.sequence import pad_sequences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

print(n_chars)

87


In [7]:
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

In [8]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')

In [9]:
from sklearn.model_selection import train_test_split

X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=1)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=1)

In [12]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)

char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                           input_length=max_len_char, mask_zero=True))(char_in)

char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)

x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)

model = Model([word_in, char_in], out)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 69, 10)       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 69)           0                                            
__________________________________________________________________________________________________
time_distributed_4 (TimeDistrib (None, 69, 10, 10)   890         input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 69, 20)       83420       input_3[0][0]                    
__________________________________________________________________________________________________
time_distr

In [13]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, validation_split=0.1, verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 1192 samples, validate on 133 samples
Epoch 1/10


InvalidArgumentError: Incompatible shapes: [2208] vs. [32,69]
	 [[{{node metrics/acc/Equal}}]]

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
from keras.optimizers import SGD
from keras.callbacks import TensorBoard, ModelCheckpoint, CSVLogger
from datetime import datetime


MAX_LEN=69
n_words=4169
n_tags=5

def trainM(model, mod_dir):
	MODEL_DIR=mod_dir
	N_EPOCHS=50
	B_SIZE=32

	if not os.path.exists(MODEL_DIR):
		os.makedirs(MODEL_DIR)

	chkpt_path=os.path.join(MODEL_DIR,"{epoch:02d}-{val_crf_accuracy:.4f}.hdf5")
	checkpointer = ModelCheckpoint(chkpt_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

	log_path=os.path.join(MODEL_DIR,"metrics.log")
	csv_logger = CSVLogger(log_path)

	tensorboard=TensorBoard(log_dir=MODEL_DIR)

	

	history = model.fit([X_word_tr, np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
						np.array(y_tr).reshape(len(y_tr), max_len, 1),
						batch_size=32, 
						epochs=10, 
						validation_split=0.1, 
						verbose=1,
						callbacks=[checkpointer,csv_logger,tensorboard])

In [None]:
from keras.optimizers import Adadelta,Adagrad, Adam, Adamax, Nadam, RMSprop, SGD
MODEL_NAME="Bi-LSTM"

opt=[]
opt.append(("Adadelta",Adadelta()))
opt.append(("Adagrad",Adagrad()))
opt.append(("Adam",Adam()))
opt.append(("Adamax",Adamax()))
opt.append(("Nadam",Nadam()))
opt.append(("rmsprop",RMSprop()))
opt.append(("SGD",SGD()))
opt.append(("SGDNesterov",SGD(nesterov=True)))

for c_opt, optimizer in opt:
	try:
		mod_dir="Models/"+MODEL_NAME+"/"+c_opt+"_"+datetime.now().strftime("%Y%m%d-%H%M%S")
		word_in = Input(shape=(max_len,))
		emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
							 input_length=max_len, mask_zero=True)(word_in)

		char_in = Input(shape=(max_len, max_len_char,))
		emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
								   input_length=max_len_char, mask_zero=True))(char_in)

		char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
										recurrent_dropout=0.5))(emb_char)

		x = concatenate([emb_word, char_enc])
		x = SpatialDropout1D(0.3)(x)
		main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
									   recurrent_dropout=0.6))(x)
		out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)

		model = Model([word_in, char_in], out)
		trainM(model,mod_dir)
	except Exception as e:
		print(e)