<a href="https://colab.research.google.com/github/DmitryKutsev/ml_hw/blob/master/hw8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#form https://docs.google.com/forms/d/e/1FAIpQLSdH_5zoN0BvuYJdUaIYjQDkz7niBVXR671zpUm3p-RqJdBDfA/viewform

#handout https://www.depends-on-the-definition.com/lstm-with-char-embeddings-for-ner/

#lect notebook https://colab.research.google.com/drive/1xb3OD0b8IPqqyVUqRtYu6NgE27Brgc_P?usp=sharing


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, SpatialDropout1D, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, InputLayer, Embedding, Conv1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import tensorflow as tf

In [0]:
from numpy.random import seed
seed(0)
from tensorflow.random import set_seed
set_seed(0)

In [69]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [0]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [0]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)

In [0]:
sent_train, sent_test, tag_train, tag_test = train_test_split(sentences, sentence_tags, test_size=0.2, random_state=0)

In [0]:
# vocab = Counter()
# for sent in sent_train:
#     sent = [word.lower() for word in sent]
#     vocab.update(sent)

In [0]:
vocab = Counter()
for sent in sent_train:
    sent = [word.lower() for word in sent]
    for word in sent:
      for i in word:
        vocab.update(sent)

In [0]:
filtered_vocab = {word for word in vocab if vocab[word] > 5}

In [0]:
vocab_test = Counter()
for sent in sent_test:
    sent = [word.lower() for word in sent]
    vocab_test.update(sent)
filtered_vocab_test = {word for word in vocab_test if vocab_test[word] > 5}

In [0]:
word2id = {'PAD':0,'UNK':1}    
for i,word in enumerate(filtered_vocab):
      word2id[word] = i + 2

id2word = {i:word for word, i in word2id.items()}

In [0]:
tag2id = {'PAD':0}  
for tags in tag_train:
    for tag in tags:
      if tag.lower() not in tag2id:
        tag2id[tag.lower()] = len(tag2id)

id2tag = {i:tag for tag, i in tag2id.items()}

In [0]:
def data2ints(data, smth2id):
  int_data = []
  for seq in data:
      int_seq = []
      for i in seq:
          try:
            int_seq.append(smth2id[i.lower()])
          except KeyError:
            int_seq.append(smth2id['UNK'])
  
      int_data.append(int_seq)
  return int_data
 

In [155]:
sent_train[0]

('Edward', 'L.', 'Kane', 'succeeded', 'Mr.', 'Taylor', 'as', 'chairman', '.')

In [156]:
X_train_ids, X_test_ids = data2ints(sent_train, word2id), data2ints(sent_test, word2id)
y_train_ids, y_test_ids = data2ints(tag_train, tag2id), data2ints(tag_test, tag2id)


print(X_train_ids[0])
print(X_test_ids[0])
print(y_train_ids[0])
print(y_test_ids[0])

[6020, 1050, 2590, 9712, 8679, 393, 5581, 1087, 4396]
[854, 7104, 6150, 7134, 6395, 888, 6269, 1, 8805, 5917, 2700, 8573, 443, 4396]
[1, 1, 1, 2, 1, 1, 3, 4, 5]
[18, 19, 21, 24, 10, 25, 24, 18, 21, 14, 3, 7, 15, 5]


In [157]:
X_train_symbols, X_test_symbols = sent_train, sent_test

y_train_symbols, y_test_symbols = tag_train, tag_test


print(X_train_symbols[0])
print(X_test_symbols[0])
print(y_train_symbols[0])
print(y_test_symbols[0])

('Edward', 'L.', 'Kane', 'succeeded', 'Mr.', 'Taylor', 'as', 'chairman', '.')
('You', 'do', "n't", 'want', '*-1', 'to', 'get', 'yourself', 'too', 'upset', 'about', 'these', 'things', '.')
('NNP', 'NNP', 'NNP', 'VBD', 'NNP', 'NNP', 'IN', 'NN', '.')
('PRP', 'VBP', 'RB', 'VB', '-NONE-', 'TO', 'VB', 'PRP', 'RB', 'JJ', 'IN', 'DT', 'NNS', '.')


In [0]:
MAX_LEN = max(len(x) for x in X_train_ids)

In [0]:
MAX_SYMB_LEN = max(len(x) for x in vocab.keys())

MAX_SEQ_LEN = max(len(x) for x in sent_train)

In [160]:
MAX_LEN, MAX_SEQ_LEN, MAX_SYMB_LEN 

(128, 128, 24)

In [0]:
X_train, X_test = pad_sequences(X_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(X_test_ids, maxlen=MAX_LEN, padding='post')
y_train_pad, y_test_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(y_test_ids, maxlen=MAX_LEN, padding='post')

In [0]:
X_train_symb, X_test_symb = pad_sequences(X_train_ids, maxlen=MAX_SYMB_LEN, padding='post'), pad_sequences(X_test_ids, maxlen=MAX_SYMB_LEN, padding='post')
y_train_symb_pad, y_test_symb_pad = pad_sequences(y_train_ids, maxlen=MAX_SYMB_LEN, padding='post'), pad_sequences(y_test_ids, maxlen=MAX_SYMB_LEN, padding='post')

In [163]:
print(X_train_symb.shape, y_train_symb_pad.shape, X_test_symb.shape, y_test_symb_pad.shape)

(3131, 24) (3131, 24) (783, 24) (783, 24)


In [164]:
print(X_train.shape, y_train_pad.shape, X_test.shape, y_test_pad.shape)

(3131, 128) (3131, 128) (783, 128) (783, 128)


In [0]:
y_train, y_test = to_categorical(y_train_pad, num_classes=len(tag2id)), to_categorical(y_test_pad, num_classes=len(tag2id))

In [0]:
y_train_symb, y_test_symb = to_categorical(y_train_pad, num_classes=len(tag2id)), to_categorical(y_test_pad, num_classes=len(tag2id))

In [220]:
inputs = tf.keras.layers.Input(shape=(MAX_LEN, ))
embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs, )
bilstm1 = tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True))(embeddings)
drop = tf.keras.layers.Dropout(0.2)(bilstm1)
bilstm2 = tf.keras.layers.Bidirectional(LSTM(128, return_sequences=True))(drop)
outputs1 = tf.keras.layers.TimeDistributed(Dense(len(tag2id), activation='softmax'))(bilstm2)

inputs2 = tf.keras.layers.Input(shape=(MAX_LEN, MAX_SYMB_LEN))
embeddings2 = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=100)(inputs2, )
conv = TimeDistributed(tf.keras.layers.Conv1D(kernel_size=5, filters=32, strides=1))(embeddings2)
flatten = TimeDistributed(tf.keras.layers.Flatten())(conv)
#dense = tf.keras.layers.Dense(50, activation='relu')(flatten)
outputs2 = tf.keras.layers.TimeDistributed(Dense(len(tag2id), activation='softmax'))(flatten)



concat = tf.keras.layers.concatenate([outputs1, outputs2])
concat = tf.keras.layers.Dropout(0.2)(concat)
main_lstm = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.6))(concat)
outputs = TimeDistributed(Dense(len(tag2id), activation="sigmoid"))(main_lstm)

inputs = [inputs, inputs2]
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='Adam', 
                   metrics=['accuracy'])
model.summary()


Model: "model_40"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_181 (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
embedding_250 (Embedding)       (None, 128, 100)     1007100     input_181[0][0]                  
__________________________________________________________________________________________________
input_182 (InputLayer)          [(None, 128, 24)]    0                                            
__________________________________________________________________________________________________
bidirectional_234 (Bidirectiona (None, 128, 512)     731136      embedding_250[0][0]              
___________________________________________________________________________________________

In [0]:
model.fit(X_train,y_train, validation_data=(X_test, y_test), batch_size=128, epochs=10)