<a href="https://colab.research.google.com/github/ElizavetaNosova/HSE_ML_homework/blob/master/CharCNN_(bi)LSTM_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [0]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU, Bidirectional, TimeDistributed, Input, Embedding, Conv1D, Flatten, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter

In [0]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [0]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)

In [0]:
sent_train, sent_test, tag_train, tag_test = train_test_split(sentences, sentence_tags, test_size=0.2, random_state=0)

In [0]:
from numpy.random import seed
seed(0)
from tensorflow.random import set_seed
set_seed(0)

In [0]:
vocab = Counter()
for sent in sent_train:
    sent = [word.lower() for word in sent]
    vocab.update(sent)

In [0]:
filtered_vocab = {word for word in vocab if vocab[word] > 5}

In [0]:
word2id = {'PAD':0,'UNK':1}    
for i,word in enumerate(filtered_vocab):
      word2id[word] = i + 2

id2word = {i:word for word, i in word2id.items()}

In [0]:
tag2id = {'PAD':0}  
for tags in tag_train:
    for tag in tags:
      if tag.lower() not in tag2id:
        tag2id[tag.lower()] = len(tag2id)

id2tag = {i:tag for tag, i in tag2id.items()}

In [0]:
def data2ints(data, smth2id):
  int_data = []
  for seq in data:
      int_seq = []
      for i in seq:
          try:
            int_seq.append(smth2id[i.lower()])
          except KeyError:
            int_seq.append(smth2id['UNK'])
  
      int_data.append(int_seq)
  return int_data

In [0]:
X_train_ids, X_test_ids = data2ints(sent_train, word2id), data2ints(sent_test, word2id)
y_train_ids, y_test_ids = data2ints(tag_train, tag2id), data2ints(tag_test, tag2id)


print(X_train_ids[0])
print(X_test_ids[0])
print(y_train_ids[0])
print(y_test_ids[0])

[976, 1, 1, 1, 462, 1, 349, 13, 1458]
[717, 1289, 317, 766, 516, 1241, 434, 1, 943, 1, 836, 118, 119, 1458]
[1, 1, 1, 2, 1, 1, 3, 4, 5]
[18, 19, 21, 24, 10, 25, 24, 18, 21, 14, 3, 7, 15, 5]


In [0]:
y_train, y_test = to_categorical(y_train_pad, num_classes=len(tag2id)), to_categorical(y_test_pad, num_classes=len(tag2id))

In [0]:
MAX_LEN = max(len(x) for x in X_train_ids) + 5 #в тесте могут быть более длинные предложения

In [0]:
X_train, X_test = pad_sequences(X_train_ids, maxlen=MAX_LEN), pad_sequences(X_test_ids, maxlen=MAX_LEN)
y_train_pad, y_test_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN), pad_sequences(y_test_ids, maxlen=MAX_LEN)

In [0]:
char2id = {'PAD':0,'UNK':1}
for sent in sent_train:
    for word in sent:
        for char in word:
            if char not in char2id:
                char2id[char] = len(char2id)

id2char = {i:char for char, i in char2id.items()}

In [0]:
MAX_WORD_LEN = max([len(word) for word in sent for sent in sent_train]) + 5 #в тесте могут быть более длинные слова

In [0]:
MAX_WORD_LEN

13

In [0]:
X_train_char_ids = [data2ints(sent, char2id) for sent in sent_train]
X_test_char_ids = [data2ints(sent, char2id) for sent in sent_test]

In [0]:
X_train_char_ids[0]

[[11, 3, 4, 5, 6, 3],
 [18, 8],
 [38, 5, 10, 11],
 [12, 13, 14, 14, 11, 11, 3, 11, 3],
 [22, 6, 8],
 [34, 5, 17, 18, 19, 6],
 [5, 12],
 [14, 20, 5, 21, 6, 22, 5, 10],
 [8]]

In [0]:
X_train_char = [pad_sequences(sent, maxlen = MAX_WORD_LEN) for sent in X_train_char_ids]
X_test_char = [pad_sequences(sent, maxlen = MAX_WORD_LEN) for sent in X_test_char_ids]

In [0]:
X_train_char_pad = pad_sequences(X_train_char, maxlen = MAX_LEN)
X_test_char_pad = pad_sequences(X_test_char, maxlen = MAX_LEN)

In [0]:
X_train_char_pad[7]

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 20, 21, 10],
       [ 0,  0,  0, ..., 21, 10, 12],
       [ 0,  0,  0, ...,  0,  0,  8]], dtype=int32)

In [0]:
input_token = Input((MAX_LEN), name='input_tokens')
embeddings_token =  Embedding(input_dim=len(word2id), output_dim=10, mask_zero=True)(input_token)
lstm_token = Bidirectional(LSTM(256, return_sequences=True))(embeddings_token)
drop_token = Dropout(0.05)(lstm_token)

input_char = Input(shape=(MAX_LEN, MAX_WORD_LEN), name='input_char')
embedding_char =  TimeDistributed(Embedding(len(char2id), MAX_WORD_LEN, input_length=MAX_WORD_LEN))(input_char)
conv_char = TimeDistributed(Conv1D(MAX_LEN, 3, 1, padding='same'))(embedding_char)
flat_char = TimeDistributed(Flatten())(conv_char)
drop_char = Dropout(0.05)(flat_char)

concat = concatenate([drop_token, drop_char])
lstm = Bidirectional(LSTM(256, return_sequences=True))(concat)
outputs =  TimeDistributed(Dense(len(tag2id), activation='softmax'))(lstm)

model = Model([input_token, input_char], outputs)
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics='accuracy')

In [0]:
import tensorflow as tf
# ModelCheckpoint сохраняет лучшие версии моделей
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.weights', # названия файла 
                                                monitor='val_accuracy', # за какой метрикой следить
                                                verbose=1, # будет печатать что происходит
                                                save_weights_only=True, # если нужно только веса сохранить
                                                save_best_only=True, # сохранять только лучшие
                                                mode='max', # если метрика должна расти, то тут max и min если наоборот
                                                save_freq='epoch' # как часто вызывать
                                               )

# EarlyStopping позволяет автоматически остановить обучение, если качество не улучшается 
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', 
                                              min_delta=0.01, # какая разница считается как улучшение
                                              patience=3, # сколько эпох терпеть отсутствие улучшений
                                              verbose=1, 
                                              mode='max',
                                              )

In [0]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_char (InputLayer)         [(None, 133, 13)]    0                                            
__________________________________________________________________________________________________
input_tokens (InputLayer)       [(None, 133)]        0                                            
__________________________________________________________________________________________________
time_distributed_24 (TimeDistri (None, 133, 13, 13)  1040        input_char[0][0]                 
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 133, 10)      1330        input_tokens[0][0]               
____________________________________________________________________________________________

In [0]:
model.fit([X_train, X_train_char_pad], y_train, validation_data = ([X_test, X_test_char_pad], y_test), epochs=50, callbacks = [checkpoint, early_stop])

Epoch 1/50
Epoch 00001: val_accuracy improved from -inf to 0.08645, saving model to model.weights
Epoch 2/50
Epoch 00002: val_accuracy improved from 0.08645 to 0.14822, saving model to model.weights
Epoch 3/50
Epoch 00003: val_accuracy improved from 0.14822 to 0.16327, saving model to model.weights
Epoch 4/50
Epoch 00004: val_accuracy improved from 0.16327 to 0.17561, saving model to model.weights
Epoch 5/50
Epoch 00005: val_accuracy improved from 0.17561 to 0.18162, saving model to model.weights
Epoch 6/50
Epoch 00006: val_accuracy improved from 0.18162 to 0.18398, saving model to model.weights
Epoch 7/50
Epoch 00007: val_accuracy improved from 0.18398 to 0.18471, saving model to model.weights
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f93ec673160>

In [0]:
def iterpretable_prediction(sentence_prediction):
    return [id2tag[np.argmax(i)] for i in sentence_prediction]

In [0]:
prediction = model.predict([X_test, X_test_char_pad])

In [0]:
prediction[0]

array([[2.0906666e-02, 2.1889850e-02, 2.1574924e-02, ..., 2.1194980e-02,
        2.0976990e-02, 2.1011991e-02],
       [2.0906666e-02, 2.1889850e-02, 2.1574924e-02, ..., 2.1194980e-02,
        2.0976990e-02, 2.1011991e-02],
       [2.0906666e-02, 2.1889850e-02, 2.1574924e-02, ..., 2.1194980e-02,
        2.0976990e-02, 2.1011991e-02],
       ...,
       [3.2371176e-08, 8.8042916e-06, 1.9811404e-08, ..., 3.2756786e-06,
        4.2776517e-08, 1.6002232e-06],
       [2.4476089e-08, 1.1895517e-05, 2.8251648e-07, ..., 1.2950598e-08,
        4.7003366e-09, 2.4635369e-07],
       [4.3388830e-09, 6.4117166e-06, 4.0674117e-10, ..., 9.2310620e-07,
        4.2549146e-08, 9.0667776e-08]], dtype=float32)

In [0]:
interpretation = [iterpretable_prediction(pred) for pred in list(prediction)]

In [0]:
def print_prediction(sentences, interpretation, correct, n=len(sentences)):
    for i in range(n):
      sentence = sentences[i]
      meaningful_interpretation = interpretation[i][-len(sentence):]
      correct_tag = correct[i]
      print('---')
      for j in range(len(sentence)):
        print(sentence[j], meaningful_interpretation[j], correct_tag[j])

In [0]:
print_prediction(sent_test, interpretation, tag_test, n=20)

---
You prp PRP
do vbp VBP
n't rb RB
want vb VB
*-1 -none- -NONE-
to to TO
get vb VB
yourself vb PRP
too rb RB
upset vbn JJ
about in IN
these dt DT
things nns NNS
. . .
---
Analysts nns NNS
calculate nnp VBP
Cray nnp NNP
Computer nnp NNP
's pos POS
initial jj JJ
book nnp NN
value nn NN
at in IN
about in IN
$ $ $
4.75 cd CD
*U* -none- -NONE-
a dt DT
share nn NN
. . .
---
A dt DT
study nn NN
by in IN
Tulane nnp NNP
Prof. nnp NNP
James nnp NNP
Wright nnp NNP
says vbz VBZ
0 -none- -NONE-
homelessness nnp NN
is vbz VBZ
due jj JJ
to to TO
a dt DT
complex nn JJ
array nn NN
of in IN
problems nns NNS
, , ,
with in IN
the dt DT
common jj JJ
thread nn NN
of in IN
poverty nn NN
. . .
---
`` `` ``
I prp PRP
would md MD
like vb VB
*-1 -none- -NONE-
to to TO
go vb VB
back rb RB
to to TO
1970 cd CD
. . .
---
Then rb RB
, , ,
just rb RB
as in IN
an dt DT
image nn NN
of in IN
the dt DT
statue nn NN
of in IN
Thomas nnp NNP
Jefferson nnp NNP
dissolves nns VBZ
from in IN
the dt DT
screen nnp NN
, , ,
the d

In [0]:
model.save('morpho_model')

INFO:tensorflow:Assets written to: morpho_model/assets


Если посчитать accuracy этого предсказания, игнорируя регистр, получится 0,98+. При этом паддинги предсказаны как '.', а не PAD. Считала в другой тетрадке, поэтому результата вывода нет. Видимо, внутри модели падинги считаются как неправильно размеченные элементы

In [0]:
def real_accuracy(prediction, correct_tags): #принимает 
    n = 0
    mistakes = 0
    for i in range(len(prediction)):
        correct_tags = tag_test[i]
        predicted_tags = prediction[i]
        meaningful_len = min(len(predicted_tags), len(correct_tags))
        meaningful_predicted_tags = predicted_tags[-meaningful_len:]
        for j in range(meaningful_len):
            n += 1
            if meaningful_predicted_tags[j].lower() != correct_tags[j].lower():
                mistakes += 1
      return mistakes/n

In [0]:
import json

In [0]:
with open('word2id.json', 'w') as f:
  json.dump(word2id, f)

In [0]:
with open('char2id.json', 'w') as f:
  json.dump(char2id, f)