#RuSentiment Dataset Preprocessing

In [1]:
from collections import defaultdict
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
max_features = 20000  # Only consider the top 20k words
maxlen = 20  # Only consider the first 200 words of text

In [3]:
df_preselected = pd.read_csv('rusentiment_preselected_posts.csv')
df_random = pd.read_csv('rusentiment_random_posts.csv')

df_train = pd.concat([df_preselected, df_random]).reset_index()
df_test = pd.read_csv('rusentiment_test.csv')

In [4]:
def data_preprocessing(df, fq = {}):
  df_np = df[(df['label']=='positive')|(df['label']=='negative')]
  text_tokenized = []
  all_wf = []
  for text in df_np['text']:
    text_tokens = word_tokenize(text)
    text_tokenized.append(text_tokens)
    all_wf.extend(text_tokens)
  df_np['text_tokenized'] = text_tokenized
  fq_dict = defaultdict(int)
  for wf in all_wf:
    fq_dict[wf] += 1
  if len(fq) > 0:
    fq_dict = fq
  x = []
  for text_tokens in df_np['text_tokenized']:
    emb = []
    for w in text_tokens:
      emb.append(fq_dict[w])
    x.append(emb)
  y = []
  for label in df_np['label']:
    if label=='negative':
      y.append(0)
    elif label=='positive':
      y.append(1)
  return x, y, fq_dict

In [5]:
x_train_ru, y_train_ru, fq = data_preprocessing(df_train)
x_test_ru, y_test_ru, fq = data_preprocessing(df_test, fq)

print(len(x_train_ru), "Training sequences")
print(len(x_test_ru), "Validation sequences")
x_train_ru = np.array(keras.preprocessing.sequence.pad_sequences(x_train_ru, maxlen=maxlen))
x_test_ru = np.array(keras.preprocessing.sequence.pad_sequences(x_test_ru, maxlen=maxlen))
y_train_ru = np.array(y_train_ru)
y_test_ru = np.array(y_test_ru)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


9764 Training sequences
794 Validation sequences


## BiLSTM

In [7]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of text

In [8]:
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
#model.summary()

In [9]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [11]:
model.compile("adam", "binary_crossentropy", metrics=["accuracy", f1_m, precision_m, recall_m])
model.fit(x_train_ru, y_train_ru, batch_size=32, epochs=2, validation_data=(x_test_ru, y_test_ru))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f66bdd83550>

##CNN+LSTM

In [12]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb

# Embedding
max_features = 20000
maxlen = 100
embedding_size = 128

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 2

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [13]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=["accuracy", f1_m, precision_m, recall_m])

print('Train...')
model.fit(np.array(x_train_ru), np.array(y_train_ru),
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(np.array(x_test_ru), np.array(y_test_ru)))

Train...
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f66b976c630>