<a href="https://colab.research.google.com/github/DmitryKutsev/ml_hw/blob/master/hw8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#form https://docs.google.com/forms/d/e/1FAIpQLSdH_5zoN0BvuYJdUaIYjQDkz7niBVXR671zpUm3p-RqJdBDfA/viewform

#handout https://www.depends-on-the-definition.com/lstm-with-char-embeddings-for-ner/

#lect notebook https://colab.research.google.com/drive/1xb3OD0b8IPqqyVUqRtYu6NgE27Brgc_P?usp=sharing


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, InputLayer, Embedding, Conv1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

In [0]:
from numpy.random import seed
seed(0)
from tensorflow.random import set_seed
set_seed(0)

In [23]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [0]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [0]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)

In [0]:
sent_train, sent_test, tag_train, tag_test = train_test_split(sentences, sentence_tags, test_size=0.2, random_state=0)

In [0]:
vocab = Counter()
for sent in sent_train:
    sent = [word.lower() for word in sent]
    vocab.update(sent)

In [0]:
filtered_vocab = {word for word in vocab if vocab[word] > 5}

In [0]:
vocab_test = Counter()
for sent in sent_test:
    sent = [word.lower() for word in sent]
    vocab_test.update(sent)
filtered_vocab_test = {word for word in vocab_test if vocab_test[word] > 5}

In [0]:
word2id = {'PAD':0,'UNK':1}    
for i,word in enumerate(filtered_vocab):
      word2id[word] = i + 2

id2word = {i:word for word, i in word2id.items()}

In [0]:
tag2id = {'PAD':0}  
for tags in tag_train:
    for tag in tags:
      if tag.lower() not in tag2id:
        tag2id[tag.lower()] = len(tag2id)

id2tag = {i:tag for tag, i in tag2id.items()}

In [0]:
def data2ints(data, smth2id):
  int_data = []
  for seq in data:
      int_seq = []
      for i in seq:
          try:
            int_seq.append(smth2id[i.lower()])
          except KeyError:
            int_seq.append(smth2id['UNK'])
  
      int_data.append(int_seq)
  return int_data
 

In [51]:
sent_train[0]

('Edward', 'L.', 'Kane', 'succeeded', 'Mr.', 'Taylor', 'as', 'chairman', '.')

In [32]:
X_train_ids, X_test_ids = data2ints(sent_train, word2id), data2ints(sent_test, word2id)
y_train_ids, y_test_ids = data2ints(tag_train, tag2id), data2ints(tag_test, tag2id)


print(X_train_ids[0])
print(X_test_ids[0])
print(y_train_ids[0])
print(y_test_ids[0])

[619, 1, 1, 1, 758, 1, 344, 732, 1273]
[566, 1378, 719, 1397, 877, 588, 800, 1, 851, 1, 169, 682, 311, 1273]
[1, 1, 1, 2, 1, 1, 3, 4, 5]
[18, 19, 21, 24, 10, 25, 24, 18, 21, 14, 3, 7, 15, 5]


In [52]:
X_train_symbols, X_test_symbols = sent_train, sent_test

y_train_symbols, y_test_symbols = tag_train, tag_test


print(X_train_symbols[0])
print(X_test_symbols[0])
print(y_train_symbols[0])
print(y_test_symbols[0])

('Edward', 'L.', 'Kane', 'succeeded', 'Mr.', 'Taylor', 'as', 'chairman', '.')
('You', 'do', "n't", 'want', '*-1', 'to', 'get', 'yourself', 'too', 'upset', 'about', 'these', 'things', '.')
('NNP', 'NNP', 'NNP', 'VBD', 'NNP', 'NNP', 'IN', 'NN', '.')
('PRP', 'VBP', 'RB', 'VB', '-NONE-', 'TO', 'VB', 'PRP', 'RB', 'JJ', 'IN', 'DT', 'NNS', '.')


In [0]:
MAX_LEN = max(len(x) for x in X_train_ids)

In [34]:
MAX_LEN

128

In [0]:
MAX_SYMB_LEN = 10
for i in (vocab + vocab_test).keys():
  if len(i)>MAX_SYMB_LEN:
    MAX_SYMB_LEN = len(i)


In [0]:
X_train, X_test = pad_sequences(X_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(X_test_ids, maxlen=MAX_LEN, padding='post')
y_train_pad, y_test_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(y_test_ids, maxlen=MAX_LEN, padding='post')

In [0]:
X_train_symb, X_test_symb = pad_sequences(X_train_ids, maxlen=MAX_SYMB_LEN, padding='post'), pad_sequences(X_test_ids, maxlen=MAX_SYMB_LEN, padding='post')
y_train_symb_pad, y_test_symb_pad = pad_sequences(y_train_ids, maxlen=MAX_SYMB_LEN, padding='post'), pad_sequences(y_test_ids, maxlen=MAX_SYMB_LEN, padding='post')

In [56]:
print(X_train_symb.shape, y_train_symb_pad.shape, X_test_symb.shape, y_test_symb_pad.shape)

(3131, 24) (3131, 24) (783, 24) (783, 24)


In [57]:
print(X_train.shape, y_train_pad.shape, X_test.shape, y_test_pad.shape)

(3131, 128) (3131, 128) (783, 128) (783, 128)


In [0]:
y_train, y_test = to_categorical(y_train_pad, num_classes=len(tag2id)), to_categorical(y_test_pad, num_classes=len(tag2id))

In [35]:
mask_model = Sequential()
mask_model.add(InputLayer(input_shape=(MAX_LEN)))
mask_model.add(Embedding(len(word2id),100, mask_zero=True))
mask_model.add(Bidirectional(LSTM(256, return_sequences=True)))
mask_model.add(Dropout(0.2))
mask_model.add(Bidirectional(LSTM(128, return_sequences=True)))
mask_model.add(TimeDistributed(Dense(len(tag2id))))


mask_model.add(InputLayer(input_shape=(MAX_SYMB_LEN)))
mask_model.add(Embedding(len(word2id),100, mask_zero=True))
onv_global = tf.keras.layers.Conv1D(kernel_size=5, filters=32, strides=1)(concat)
flatten = tf.keras.layers.Flatten()(conv_global)
mask_model.add(Bidirectional(LSTM(256, return_sequences=True)))
mask_model.add(Dropout(0.2))
mask_model.add(Bidirectional(LSTM(128, return_sequences=True)))
mask_model.add(TimeDistributed(Dense(len(tag2id))))


mask_model.add(Activation('softmax'))
mask_model.compile(loss='categorical_crossentropy', optimizer='Adam', 
                   metrics=['accuracy'])
mask_model.summary()
mask_model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 100)          168100    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 512)          731136    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 512)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128, 256)          656384    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 128, 47)           12079     
_________________________________________________________________
activation_1 (Activation)    (None, 128, 47)           0         
Total params: 1,567,699
Trainable params: 1,567,699
Non-trainable params: 0
____________________________________________

In [36]:
mask_model.fit(X_train,y_train, validation_data=(X_test, y_test), batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f053a14dc18>

In [0]:
#