# Setup

In [0]:
!mkdir elmo_twitter
!mkdir checkpoints

## Download pretrained ELMo

In [0]:
# Download the pretrained ELMo archive from files.deeppavlov.ai into the
# current directory (about 662 MB according to the recorded wget log).
!wget http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz

--2019-08-17 15:48:09--  http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 93.175.29.74
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|93.175.29.74|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 694200117 (662M) [application/octet-stream]
Saving to: ‘elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz’


2019-08-17 15:51:29 (3.34 MB/s) - ‘elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz’ saved [694200117/694200117]



In [0]:
!mv elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz elmo_twitter/elmo.tar.gz
%cd elmo_twitter
!tar -xvzf elmo.tar.gz
!rm elmo.tar.gz
%cd ..

/content/elmo_twitter
./
./tfhub_module.pb
./assets/
./variables/
./variables/variables.index
./variables/variables.data-00001-of-00002
./variables/variables.data-00000-of-00002
./saved_model.pb
/content


## Install libs

In [0]:
!pip install keras-self-attention
!pip install regex

Collecting keras-self-attention
  Downloading https://files.pythonhosted.org/packages/1b/1c/01599219bef7266fa43b3316e4f55bcb487734d3bafdc60ffd564f3cfe29/keras-self-attention-0.41.0.tar.gz
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.41.0-cp36-none-any.whl size=17289 sha256=6eca9a03857ad5e3e4d3bb1b5f6400b2d8fbf0b6d6867af53c9467e1a381219c
  Stored in directory: /root/.cache/pip/wheels/cc/dc/17/84258b27a04cd38ac91998abe148203720ca696186635db694
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.41.0
Collecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4e/1b178c38c9a1a184288f72065a65ca01f3154df43c6ad898624149b8b4e0/regex-2019.06.08.tar.gz (651kB)
[K     |████████████████████████████████| 655kB 5.2MB/s 
[?25hBuildin

## Authorize Google Drive access so checkpoints can be saved to it

In [0]:
# Install the PyDrive wrapper (quiet, upgrading if already present) and import
# the Drive / auth helpers. This only needs to be done once in a notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client.
# `gauth` is reused by the training and upload cells, which re-read the
# application-default credentials and rebuild `drive` before each upload.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

[?25l[K     |▎                               | 10kB 18.7MB/s eta 0:00:01[K     |▋                               | 20kB 3.2MB/s eta 0:00:01[K     |█                               | 30kB 4.6MB/s eta 0:00:01[K     |█▎                              | 40kB 3.0MB/s eta 0:00:01[K     |█▋                              | 51kB 3.7MB/s eta 0:00:01[K     |██                              | 61kB 4.3MB/s eta 0:00:01[K     |██▎                             | 71kB 5.0MB/s eta 0:00:01[K     |██▋                             | 81kB 5.5MB/s eta 0:00:01[K     |███                             | 92kB 6.1MB/s eta 0:00:01[K     |███▎                            | 102kB 4.9MB/s eta 0:00:01[K     |███▋                            | 112kB 4.9MB/s eta 0:00:01[K     |████                            | 122kB 4.9MB/s eta 0:00:01[K     |████▎                           | 133kB 4.9MB/s eta 0:00:01[K     |████▋                           | 143kB 4.9MB/s eta 0:00:01[K     |█████                     

# Start

## Imports

In [0]:
import os
import json
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow.keras.backend as K
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from keras_self_attention import SeqSelfAttention
import keras
import regex as re

Using TensorFlow backend.


## Hyperparameters

In [0]:
# Run configuration shared by the cells below.
batch_size = 25          # samples per batch passed to model.fit
max_seq_len = 200        # every text is truncated / padded to this many whitespace tokens
num_epochs = 30          # number of one-epoch fit() rounds in the training loop
category = 1             # index into `categories`; also part of data and checkpoint file names
learning_rate = 1e-6     # Adam learning rate
dropout_rate = 0.3       # rate of the Dropout layer in every model variant
model_name = "elmo_full+self-attention+dense"  # prefix of checkpoint file names

## Loading data from train and test files

In [0]:
# Read the train and test CSVs for the selected category. Both file names are
# the category index followed by "train.csv" / "test.csv".
df_train, df_test = (
    pd.read_csv(str(category) + split_file)
    for split_file in ("train.csv", "test.csv")
)

## Split data into x and y

In [0]:
# Label columns available in the CSVs; `category` (set in the hyperparameters
# cell) selects which one is used as the target.
categories = [
    'внешний вид',
    'экран',
    'камера',
    'батарея',
    'звук',
    'по',
    'цена',
    'производительность',
    'память',
    'тональность'
]

# Width of the one-hot targets. It must match the Dense(3) output layer of the
# models below. Passing it explicitly keeps y_train / y_test the same shape
# even when a split happens not to contain the highest label; without it
# to_categorical infers the width from the largest label present in each array.
num_classes = 3

# Raw texts (column 'текст') and one-hot encoded labels for the chosen category.
x_train = df_train['текст'].values
x_test = df_test['текст'].values
y_train = to_categorical(df_train[categories[category]].values, num_classes=num_classes)
y_test = to_categorical(df_test[categories[category]].values, num_classes=num_classes)

## Preprocessing

In [0]:
# URL pattern. The two raw-string pieces are joined by implicit literal
# concatenation. The previous version continued the pattern with a
# backslash-newline inside a single raw string, which embedded a literal
# newline plus the indentation spaces in the regex, so it never matched a URL.
# NOTE: checkpoints trained before this fix saw texts with URLs left in place.
re_url = re.compile(
    r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+"
    r"\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
    re.MULTILINE | re.UNICODE)
# Dotted-quad pattern; raw string so `\d` and `\.` reach the regex engine
# without relying on invalid string escapes.
re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")


def preprocess_text(text, seq_len, pad_token="NaN"):
  """Normalise one text and force it to exactly `seq_len` whitespace tokens.

  Args:
    text: raw input string.
    seq_len: number of tokens the result must contain.
    pad_token: token appended until `seq_len` tokens are reached.

  Returns:
    A single space-joined string in which URLs are replaced by "URL", IP
    addresses by "IPADDRESS", and the token list is truncated or padded to
    `seq_len` entries.
  """
  text = re_url.sub("URL", text)
  text = re_ip.sub("IPADDRESS", text)
  tokens = text.split()[:seq_len]
  tokens.extend([pad_token] * (seq_len - len(tokens)))
  return " ".join(tokens)


# Apply the same preprocessing to both splits, writing results back in place
# (as before) so x_train / x_test keep their identity and dtype.
for texts in (x_train, x_test):
  for i, text in enumerate(texts):
    texts[i] = preprocess_text(text, max_seq_len)

## Load tensorflow hub module for ELMo models

In [0]:
# Load the ELMo TF-Hub module from the local elmo_twitter/ directory that the
# setup cells extracted. The model cells below wrap `elmo` in a Lambda layer
# through their own `elmo_func` definitions.
# NOTE(review): the model summaries report 0 parameters for the Lambda layer,
# so the module's variables are presumably not updated by Keras despite
# trainable=True — confirm.
elmo = hub.Module("elmo_twitter", trainable=True)
# (An earlier, commented-out draft of elmo_func that called the module with the
# "tokens" signature used to live here; the live definitions are in the model
# cells.)

## Models

### ELMo (mean) + Dense

In [0]:
def elmo_func(x):
  """Embed a batch of sentences with ELMo and return its "default" output.

  Args:
    x: string tensor produced by the (batch, 1) Keras input layer.

  Returns:
    The module's "default" output: one 1024-dimensional vector per sentence
    (see the model summary), i.e. the "mean" variant named in this section.
  """
  # axis=1 drops only the singleton column. A bare tf.squeeze removes every
  # size-1 dimension, so a batch holding a single sample would collapse to a
  # scalar instead of a length-1 vector of sentences.
  sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
  return elmo(sentences, signature="default", as_dict=True)["default"]

# Build and compile the classifier: ELMo sentence vector -> dropout -> softmax.
# Running this cell rebinds `elmo_func` and `model`; the training cell uses
# whichever model cell was executed last.
input_layer = keras.layers.Input(shape=(1,), dtype=tf.string, name='input')
elmo_layer = keras.layers.Lambda(elmo_func, name='elmo')(input_layer)
dropout_layer = keras.layers.Dropout(dropout_rate, name='dropout')(elmo_layer)
output_layers = [
    keras.layers.Dense(3, activation='softmax', name='dense_' + str(category))(dropout_layer)
]
model = keras.models.Model(inputs=[input_layer], outputs=output_layers)
adam = keras.optimizers.Adam(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1)                 0         
_________________________________________________________________
elmo (Lambda)                (None, 1024)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 3075      
Total params: 3,075
Trainable params: 3,075
Non-trainable params: 0
_________________________________________________________________


### ELMo (full) + Dense

In [0]:
def elmo_func(x):
  """Embed a batch of sentences with ELMo and return its per-token "elmo" output.

  Args:
    x: string tensor produced by the (batch, 1) Keras input layer.

  Returns:
    The module's "elmo" output; the Lambda layer below declares its shape as
    (max_seq_len, 1024) per sample, which relies on every text having been
    padded / truncated to max_seq_len tokens during preprocessing.
  """
  # axis=1 drops only the singleton column. A bare tf.squeeze removes every
  # size-1 dimension, so a batch holding a single sample would collapse to a
  # scalar instead of a length-1 vector of sentences.
  sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
  return elmo(sentences, signature="default", as_dict=True)["elmo"]

# Build and compile the classifier: per-token ELMo -> flatten -> dropout -> softmax.
# Running this cell rebinds `elmo_func` and `model`; the training cell uses
# whichever model cell was executed last.
input_layer = keras.layers.Input(shape=(1,), dtype=tf.string, name='input')
elmo_layer = keras.layers.Lambda(elmo_func, name='elmo', output_shape=(max_seq_len, 1024))(input_layer)
flatten_layer = keras.layers.Flatten(name='flatten')(elmo_layer)
dropout_layer = keras.layers.Dropout(dropout_rate, name='dropout')(flatten_layer)
output_layers = [
    keras.layers.Dense(3, activation='softmax', name='dense_' + str(category))(dropout_layer)
]
model = keras.models.Model(inputs=[input_layer], outputs=output_layers)
adam = keras.optimizers.Adam(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1)                 0         
_________________________________________________________________
elmo (Lambda)                (None, 200, 1024)         0         
_________________________________________________________________
flatten (Flatten)            (None, 204800)            0         
_________________________________________________________________
dropout (Dropout)            (None, 204800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 614403    
Total params: 614,403
Trainable params: 614,403
Non-trainable params: 0
_________________________________________________________________


### ELMo (full) + Self-attention + Dense

In [0]:
def elmo_func(x):
  """Embed a batch of sentences with ELMo and return its per-token "elmo" output.

  Args:
    x: string tensor produced by the (batch, 1) Keras input layer.

  Returns:
    The module's "elmo" output; the Lambda layer below declares its shape as
    (max_seq_len, 1024) per sample, which relies on every text having been
    padded / truncated to max_seq_len tokens during preprocessing.
  """
  # axis=1 drops only the singleton column. A bare tf.squeeze removes every
  # size-1 dimension, so a batch holding a single sample would collapse to a
  # scalar instead of a length-1 vector of sentences.
  sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
  return elmo(sentences, signature="default", as_dict=True)["elmo"]

# Build and compile the classifier:
# per-token ELMo -> dropout -> sequence self-attention -> flatten -> softmax.
# Running this cell rebinds `elmo_func` and `model`; the training cell uses
# whichever model cell was executed last.
input_layer = keras.layers.Input(shape=(1,), dtype=tf.string, name='input')
elmo_layer = keras.layers.Lambda(elmo_func, name='elmo', output_shape=(max_seq_len, 1024))(input_layer)
dropout_layer = keras.layers.Dropout(dropout_rate, name='dropout')(elmo_layer)
self_attention_layer = SeqSelfAttention(attention_activation='sigmoid', name='self_attention_' + str(category))(dropout_layer)
flatten_layer = keras.layers.Flatten(name='flatten')(self_attention_layer)
output_layers = [
    keras.layers.Dense(3, activation='softmax', name='dense_' + str(category))(flatten_layer)
]
model = keras.models.Model(inputs=[input_layer], outputs=output_layers)
adam = keras.optimizers.Adam(lr=learning_rate)
# NOTE(review): `f_score` is defined in a cell further down this notebook. On a
# fresh kernel that cell has to be executed before this one, otherwise the
# compile call raises NameError; the definition should be moved above this cell.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy', f_score])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1)                 0         
_________________________________________________________________
elmo (Lambda)                (None, 200, 1024)         0         
_________________________________________________________________
dropout (Dropout)            (None, 200, 1024)         0         
_________________________________________________________________
self_attention_1 (SeqSelfAtt (None, 200, 1024)         65601     
_________________________________________________________________
flatten (Flatten)            (None, 204800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 614403    
Total params: 680,004
Trainable params: 680,004
Non-trainable params: 0
_________________________________________________________________


## Training

In [0]:
# Resume training from a saved checkpoint, one epoch per fit() call, saving the
# weights locally and uploading them to Google Drive after every epoch.
callbacks_list = []  # no Keras callbacks; checkpoints are written by hand below
initial_epoch = 36  # first epoch number used in the log line and checkpoint names
# Weights to resume from, read from the current working directory.
model_file_name = 'CatABSA_elmo_full+self-attention+dense_cat1_e36.hdf5'
# NOTE(review): with the current model_name / category, the first checkpoint
# written below is also tagged `_e36` and uploaded under the same Drive title as
# the file loaded here — confirm whether initial_epoch should be 37 when
# resuming from the e36 weights.
with tf.Session() as session:
    K.set_session(session)
    # Run the variable and table initializers first, then load the checkpoint,
    # so the loaded weights are what the model trains from.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights(model_file_name)
    for i in range(initial_epoch, initial_epoch + num_epochs):
        print("EPOCH " + str(i) + "\n")
        # epochs=1 per call so a checkpoint can be saved between epochs;
        # `history` only ever holds the most recent epoch's result.
        history = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs=1, batch_size=batch_size, callbacks=callbacks_list)
        cp_filename = model_name + '_cat' + str(category) + '_e' + str(i) + '.hdf5'
        model.save_weights('checkpoints/' + cp_filename)
        
        # Re-read the application-default credentials and rebuild the Drive
        # client before each upload, then push the checkpoint under a
        # 'CatABSA_'-prefixed title.
        gauth.credentials = GoogleCredentials.get_application_default()
        drive = GoogleDrive(gauth)
        uploaded = drive.CreateFile({'title': 'CatABSA_' + cp_filename})
        uploaded.SetContentFile('checkpoints/' + cp_filename)
        uploaded.Upload()
        print('Uploaded file with ID {}'.format(uploaded.get('id')))

EPOCH 36

Train on 31287 samples, validate on 3477 samples
Epoch 1/1
 2150/31287 [=>............................] - ETA: 40:20 - loss: 0.8732 - acc: 0.6065

KeyboardInterrupt: ignored

In [0]:
# Evaluate a saved checkpoint on the test split in a fresh session.
model_file_name = 'CatABSA_elmo_full+self-attention+dense_cat1_e36.hdf5'
with tf.Session() as session:
    K.set_session(session)
    # Initialise variables and lookup tables before loading the checkpoint.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights(model_file_name)
    # `res` holds whatever model.evaluate returns for the compiled loss / metrics.
    res = model.evaluate(x_test, y_test)
    



In [0]:
# Display the values returned by model.evaluate in the previous cell.
res

In [0]:
def f_score(y_true, y_pred):
    """Batch-level F-score metric for Keras.

    Predictions are binarised with tf.round (an implicit 0.5 threshold), then
    true, predicted and correctly predicted positives are summed over the
    whole batch and all classes together (no per-sample or per-class axis).
    Precision, recall and their harmonic mean are derived from those totals;
    a NaN result (from a zero denominator) is reported as 0.

    NOTE: this function is referenced by the compile call of the
    "ELMo (full) + Self-attention + Dense" model cell, so it has to be
    defined before that cell is run.

    Args:
        y_true: one-hot ground-truth tensor.
        y_pred: predicted class probabilities, same shape as y_true.

    Returns:
        A scalar tensor with the F-score of the batch.
    """
    truth = tf.cast(y_true, "int32")
    predicted = tf.cast(tf.round(y_pred), "int32")  # implicit 0.5 threshold via tf.round

    # Totals over the entire batch.
    n_true = tf.reduce_sum(truth)
    n_pred = tf.reduce_sum(predicted)
    n_hit = tf.reduce_sum(truth * predicted)

    precision = n_hit / n_pred
    recall = n_hit / n_true
    harmonic = 2 * precision * recall / (precision + recall)
    # Zero denominators yield NaN; report those batches as 0.
    harmonic = tf.where(tf.is_nan(harmonic), tf.zeros_like(harmonic), harmonic)
    return tf.reduce_mean(harmonic)

## Upload a single file

In [0]:
# Re-read the application-default credentials and rebuild the Drive client.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Push one specific checkpoint from checkpoints/ to Drive under a
# 'CatABSA_'-prefixed title and report the ID Drive assigned to it.
cp_filename = str(category) + "_cat_e1.hdf5"
file_metadata = {'title': 'CatABSA_' + cp_filename}
uploaded = drive.CreateFile(file_metadata)
uploaded.SetContentFile('checkpoints/' + cp_filename)
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Uploaded file with ID 1IsryA1uj6HaYme3OlHenIaNgYjotY8Zy
