In [None]:
from lxml import etree
from typing import List, Tuple


def load_sentirueval_2016(file_name: str) -> Tuple[List[str], List[int]]:
    """Read tweets and their sentiment labels from a SentiRuEval-2016 XML file.

    The parser walks ``root -> database -> table -> column``. Only the first
    ``database`` child of the root is read. Within each ``table`` element the
    column named ``text`` supplies the tweet, and the first column that is not
    ``id``, ``twitid``, ``date`` or ``text`` and whose value is ``-1``, ``0``
    or ``1`` supplies the label.

    Args:
        file_name: Path to the XML file.

    Returns:
        A ``(texts, labels)`` pair of parallel lists. Labels are class indices:
        ``-1`` becomes ``0``, ``0`` becomes ``1`` and ``1`` becomes ``2``.

    Raises:
        ValueError: If some ``table`` element has no text or no recognised label.
    """
    label_to_class = {'-1': 0, '0': 1, '1': 2}
    non_label_columns = {'id', 'twitid', 'date'}
    texts = []
    labels = []
    with open(file_name, mode='rb') as fp:
        xml_data = fp.read()
    root = etree.fromstring(xml_data)
    # Elements are iterated directly; `getchildren()` is deprecated in lxml.
    for database in root:
        if database.tag != 'database':
            continue
        for table in database:
            if table.tag != 'table':
                continue
            new_text = None
            new_label = None
            for column in table:
                column_name = column.get('name')
                if column_name == 'text':
                    new_text = str(column.text).strip()
                elif column_name not in non_label_columns and new_label is None:
                    # Values other than -1/0/1 are not labels: keep looking.
                    new_label = label_to_class.get(str(column.text).strip())
                # Stop scanning this row as soon as both parts are known,
                # whatever the label value is.
                if new_text is not None and new_label is not None:
                    break
            if (new_text is None) or (new_label is None):
                raise ValueError('File `{0}` contains some error!'.format(file_name))
            texts.append(new_text)
            labels.append(new_label)
        # Only the first `database` element is processed.
        break
    return texts, labels

In [None]:
from google.colab import drive


# Mount Google Drive inside the Colab filesystem. The dataset files loaded by
# the following cells are read from `drive/MyDrive/...`.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training corpus: the bank file followed by the tkk file, concatenated in that order.
texts, labels = load_sentirueval_2016('drive/MyDrive/bank_train_2016.xml')
texts1, labels1 = load_sentirueval_2016('drive/MyDrive/tkk_train_2016.xml')
texts += texts1
labels += labels1

In [None]:
# Test corpus: the banks etalon file followed by the tkk etalon file, concatenated in that order.
texts_for_testing, labels_for_testing = load_sentirueval_2016('drive/MyDrive/banks_test_etalon.xml')
texts_for_testing1, labels_for_testing1 = load_sentirueval_2016('drive/MyDrive/tkk_test_etalon.xml')
texts_for_testing += texts_for_testing1
labels_for_testing += labels_for_testing1

In [None]:
!rm -f ft_native_300_ru_twitter_nltk_word_tokenize.bin
!wget http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin

--2022-01-03 16:25:48--  http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 178.63.27.41
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin [following]
--2022-01-03 16:25:48--  https://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3417475450 (3.2G) [application/octet-stream]
Saving to: ‘ft_native_300_ru_twitter_nltk_word_tokenize.bin’


2022-01-03 16:28:04 (24.1 MB/s) - ‘ft_native_300_ru_twitter_nltk_word_tokenize.bin’ saved [3417475450/3417475450]



In [None]:
import gensim
from gensim.models.fasttext import FastText


FASTTEXT_MODEL_PATH = 'ft_native_300_ru_twitter_nltk_word_tokenize.bin'

# Load the native fastText binary through gensim's public loaders instead of
# creating an empty model, setting `file_name` and calling
# `load_binary_data()`, which is only available in old gensim releases.
try:
    from gensim.models.fasttext import load_facebook_model
except ImportError:
    # Older gensim without `load_facebook_model`: use the classmethod loader.
    fasttext_model = FastText.load_fasttext_format(FASTTEXT_MODEL_PATH)
else:
    fasttext_model = load_facebook_model(FASTTEXT_MODEL_PATH)

In [None]:
import numpy as np


def _embed_texts(source_texts, keyed_vectors):
    """Convert texts to a float32 array of shape (n_texts, vector_size, 1, 1).

    Every text is looked up in ``keyed_vectors`` as a single key. The vector
    length is taken from ``keyed_vectors.vector_size`` rather than hard-coded.
    """
    vectors = np.array([keyed_vectors[text] for text in source_texts], dtype=np.float32)
    # Two trailing singleton axes give the layout the Conv2D layers expect.
    return vectors.reshape(len(source_texts), keyed_vectors.vector_size, 1, 1)


# NOTE(review): each tweet is passed to fastText whole, as if it were one word;
# the text is not tokenized first. Confirm that this is intended.
x_train = _embed_texts(texts, fasttext_model.wv)
x_test = _embed_texts(texts_for_testing, fasttext_model.wv)

y_train = np.array(labels)
y_test = np.array(labels_for_testing)

In [None]:
import random
from sklearn.model_selection import train_test_split


import tensorflow as tf

RANDOM_SEED = 42

# Seed every random number generator the notebook relies on, not only Python's
# own: NumPy and TensorFlow keep separate generators.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Hold out 10% of the training samples for validation.
# NOTE: this overwrites x_train / y_train, so running the cell a second time
# without rebuilding them splits the already reduced training set again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=RANDOM_SEED)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Reshape
from tensorflow.keras.initializers import he_uniform, glorot_uniform
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D, SpatialDropout2D


# Number of sentiment classes (labels are 0, 1 and 2). The output layer must
# have exactly this many units; extra units would let argmax produce class
# indices that do not exist in the data.
NUM_CLASSES = 3

# Baseline CNN over the embedding vector, treated as a (vector_size, 1, 1) "image".
cnn = Sequential()

# Convolutional block 1: two 3x1 convolutions, pooling, spatial dropout.
cnn.add(Conv2D(32, (3, 1), padding='same', activation='relu', input_shape=x_train.shape[1:],
               kernel_initializer=he_uniform(seed=RANDOM_SEED), name='Conv_Block1_Layer1'))
cnn.add(Conv2D(32, (3, 1), activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
               name='Conv_Block1_Layer2'))
cnn.add(MaxPooling2D(pool_size=(2, 1), name='MaxPool1'))
cnn.add(SpatialDropout2D(rate=0.15, name='SpatialDropout1', seed=RANDOM_SEED))

# Convolutional block 2: same layout with twice as many filters.
cnn.add(Conv2D(64, (3, 1), padding='same', activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
               name='Conv_Block2_Layer1'))
cnn.add(Conv2D(64, (3, 1), activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
               name='Conv_Block2_Layer2'))
cnn.add(MaxPooling2D(pool_size=(2, 1), name='MaxPool2'))
cnn.add(SpatialDropout2D(rate=0.15, name='SpatialDropout2', seed=RANDOM_SEED))

# Classifier head.
cnn.add(Flatten())
cnn.add(Dense(512, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED), name='HiddenLayer'))
cnn.add(Dropout(rate=0.5, seed=RANDOM_SEED, name='DropoutAfterHidden'))
cnn.add(Dense(NUM_CLASSES, activation='softmax', kernel_initializer=glorot_uniform(seed=RANDOM_SEED),
              name='OutputLayer'))

# Labels are integer class indices, hence the sparse loss and metric.
cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])
cnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Conv_Block1_Layer1 (Conv2D)  (None, 100, 1, 32)       128       
                                                                 
 Conv_Block1_Layer2 (Conv2D)  (None, 98, 1, 32)        3104      
                                                                 
 MaxPool1 (MaxPooling2D)     (None, 49, 1, 32)         0         
                                                                 
 SpatialDropout1 (SpatialDro  (None, 49, 1, 32)        0         
 pout2D)                                                         
                                                                 
 Conv_Block2_Layer1 (Conv2D)  (None, 49, 1, 64)        6208      
                                                                 
 Conv_Block2_Layer2 (Conv2D)  (None, 47, 1, 64)        12352     
                                                      

In [None]:
BATCH_SIZE = 128

# Stop when the validation loss has not improved for 5 epochs and restore the
# weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

cnn.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=100,
    shuffle=True,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 00019: early stopping


<keras.callbacks.History at 0x7fac4c9081d0>

In [None]:
# Predicted class = index of the largest softmax output for each test sample.
test_probabilities = cnn.predict(x_test, batch_size=128)
y_pred = np.argmax(test_probabilities, axis=-1)

In [None]:
# Human-readable names for class indices 0, 1 and 2, in that order
# (the loader maps the source labels -1 / 0 / 1 to these indices).
label_names = ['negative', 'neutral', 'positive']

In [None]:
from sklearn.metrics import classification_report


# Per-class precision / recall / F1 of the baseline model on the test set.
baseline_report = classification_report(
    y_test,
    y_pred,
    target_names=label_names,
    digits=4,
)
print(baseline_report)

              precision    recall  f1-score   support

    negative     0.3425    0.4237    0.3788       767
     neutral     0.7214    0.7592    0.7398      2238
    positive     0.4444    0.0130    0.0252       308

    accuracy                         0.6121      3313
   macro avg     0.5028    0.3986    0.3813      3313
weighted avg     0.6080    0.6121    0.5898      3313



In [None]:
!pip install textattack[tensorflow,optional]

Collecting textattack[optional,tensorflow]
  Downloading textattack-0.3.4-py3-none-any.whl (373 kB)
[K     |████████████████████████████████| 373 kB 5.2 MB/s 
[?25hCollecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.1 MB/s 
Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 50.3 MB/s 
Collecting terminaltables
  Downloading terminaltables-3.1.10-py2.py3-none-any.whl (15 kB)
Collecting flair
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[K     |████████████████████████████████| 322 kB 54.1 MB/s 
Collecting lemminflect
  Downloading lemminflect-0.2.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 61.4 MB/s 
[?25hCollecting tqdm<4.50.0,>=4.27
  Downloading tqdm-4.49.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 6.8 MB/s 
[?25hCollecting language-tool-python
  Downloa

In [None]:
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapNeighboringCharacterSwap
from textattack.constraints.pre_transformation import RepeatModification
from textattack.augmentation import Augmenter


def aug_train_data(texts, labels, pct_words_to_swap=0.5, transformations_per_example=7):
  """Create character-swap augmentations of labelled texts.

  Each text is passed to a textattack ``Augmenter`` built from
  ``WordSwapNeighboringCharacterSwap`` and the ``RepeatModification``
  constraint. Every string the augmenter returns gets the label of the text it
  was produced from.

  Args:
    texts: Iterable of input strings.
    labels: Iterable of labels, parallel to ``texts``.
    pct_words_to_swap: Forwarded to ``Augmenter``.
    transformations_per_example: Forwarded to ``Augmenter``.

  Returns:
    ``(aug_texts, aug_labels)``: two parallel lists holding only what
    ``Augmenter.augment`` returned.
  """
  transformation = WordSwapNeighboringCharacterSwap()
  constraints = [RepeatModification()]
  # The constraint list was previously built but never handed to the augmenter.
  aug = Augmenter(
      transformation=transformation,
      constraints=constraints,
      pct_words_to_swap=pct_words_to_swap,
      transformations_per_example=transformations_per_example,
  )
  aug_texts = []
  aug_labels = []
  for text, label in zip(texts, labels):
    aug_text_options = aug.augment(text)
    aug_texts.extend(aug_text_options)
    aug_labels.extend([label] * len(aug_text_options))
  return aug_texts, aug_labels

In [None]:
# Augment only the training part of the corpus. Splitting the raw texts with
# the same test_size and random_state as the earlier train/validation split
# reproduces that partition, so no augmented copy of a validation tweet ends
# up in the training data.
train_texts, _, train_labels, _ = train_test_split(
    texts, labels, test_size=0.1, random_state=RANDOM_SEED
)
aug_texts, aug_labels = aug_train_data(train_texts, train_labels)

# Take the embedding size from the model instead of hard-coding it.
embedding_dim = fasttext_model.wv.vector_size
x_train = []
for text in aug_texts:
  try:
    x_train.append(fasttext_model.wv[text])
  except KeyError:
    # No vector is available for this string: use an all-zero embedding.
    x_train.append(np.zeros(embedding_dim, dtype=np.float32))
x_train = np.array(x_train, dtype=np.float32)
x_train = x_train.reshape(len(aug_texts), embedding_dim, 1, 1)
y_train = np.array(aug_labels)

In [None]:
# Number of sentiment classes (labels are 0, 1 and 2). The output layer must
# have exactly this many units; extra units would let argmax produce class
# indices that do not exist in the data.
NUM_CLASSES = 3

# Same architecture as the baseline CNN, trained on the augmented data.
cnn_with_augmentation = Sequential()

# Convolutional block 1: two 3x1 convolutions, pooling, spatial dropout.
cnn_with_augmentation.add(Conv2D(32, (3, 1), padding='same', activation='relu', input_shape=x_train.shape[1:],
                                 kernel_initializer=he_uniform(seed=RANDOM_SEED), name='Conv_Block1_Layer1'))
cnn_with_augmentation.add(Conv2D(32, (3, 1), activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
                                 name='Conv_Block1_Layer2'))
cnn_with_augmentation.add(MaxPooling2D(pool_size=(2, 1), name='MaxPool1'))
cnn_with_augmentation.add(SpatialDropout2D(rate=0.15, name='SpatialDropout1', seed=RANDOM_SEED))

# Convolutional block 2: same layout with twice as many filters.
cnn_with_augmentation.add(Conv2D(64, (3, 1), padding='same', activation='relu',
                                 kernel_initializer=he_uniform(seed=RANDOM_SEED), name='Conv_Block2_Layer1'))
cnn_with_augmentation.add(Conv2D(64, (3, 1), activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
                                 name='Conv_Block2_Layer2'))
cnn_with_augmentation.add(MaxPooling2D(pool_size=(2, 1), name='MaxPool2'))
cnn_with_augmentation.add(SpatialDropout2D(rate=0.15, name='SpatialDropout2', seed=RANDOM_SEED))

# Classifier head.
cnn_with_augmentation.add(Flatten())
cnn_with_augmentation.add(Dense(512, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
                                name='HiddenLayer'))
cnn_with_augmentation.add(Dropout(rate=0.5, seed=RANDOM_SEED, name='DropoutAfterHidden'))
cnn_with_augmentation.add(Dense(NUM_CLASSES, activation='softmax',
                                kernel_initializer=glorot_uniform(seed=RANDOM_SEED), name='OutputLayer'))

# Labels are integer class indices, hence the sparse loss and metric.
cnn_with_augmentation.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])
cnn_with_augmentation.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Conv_Block1_Layer1 (Conv2D)  (None, 100, 1, 32)       128       
                                                                 
 Conv_Block1_Layer2 (Conv2D)  (None, 98, 1, 32)        3104      
                                                                 
 MaxPool1 (MaxPooling2D)     (None, 49, 1, 32)         0         
                                                                 
 SpatialDropout1 (SpatialDro  (None, 49, 1, 32)        0         
 pout2D)                                                         
                                                                 
 Conv_Block2_Layer1 (Conv2D)  (None, 49, 1, 64)        6208      
                                                                 
 Conv_Block2_Layer2 (Conv2D)  (None, 47, 1, 64)        12352     
                                                      

In [None]:
BATCH_SIZE = 128

# Stop when the validation loss has not improved for 5 epochs and restore the
# weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

cnn_with_augmentation.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=100,
    shuffle=True,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 00033: early stopping


<keras.callbacks.History at 0x7fabd2f59350>

In [None]:
# Predicted class = index of the largest softmax output for each test sample.
# Note that this replaces the baseline model's `y_pred`.
augmented_test_probabilities = cnn_with_augmentation.predict(x_test, batch_size=128)
y_pred = np.argmax(augmented_test_probabilities, axis=-1)

In [None]:
# Per-class precision / recall / F1 on the test set for the current `y_pred`.
augmented_report = classification_report(
    y_test,
    y_pred,
    target_names=label_names,
    digits=4,
)
print(augmented_report)

              precision    recall  f1-score   support

    negative     0.4888    0.3954    0.4372     12606
     neutral     0.6455    0.7681    0.7015     22747
    positive     0.1498    0.0693    0.0948      3563

    accuracy                         0.5834     38916
   macro avg     0.4280    0.4109    0.4111     38916
weighted avg     0.5493    0.5834    0.5603     38916



Свёрточная нейросеть без аугментации показала лучшие результаты, чем с аугментацией, но худшие, чем логистическая регрессия.