# Sentiment Analysis of Forum Posts relating to Ukraine-Russia War

### Preprocessing

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

DATA_FILE = "parsed_data.csv"
df = pd.read_csv("parsed_data.csv")
titles = df["title"].values.astype(str)
povs = df["pov"].values


In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titles, pd.get_dummies(povs).values.astype(int), test_size=0.2, random_state=1000)

In [3]:
VOCAB_SIZE = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

Found 12358 unique tokens.


### Model Creation

In [4]:
OUTPUT_SIZE = 3

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=X_train.shape[1]),
    tf.keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(units=256),
    tf.keras.layers.LeakyReLU(alpha=0.3),
    tf.keras.layers.Dense(units=16),
    tf.keras.layers.LeakyReLU(alpha=0.3),
    tf.keras.layers.Dense(units=OUTPUT_SIZE),
    tf.keras.layers.Softmax()
])

In [5]:
loss_metric = 'categorical_crossentropy'

acc_metrics = [
    'accuracy'
]

model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
        loss=loss_metric, 
        metrics=acc_metrics,
    )

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          5000000   
                                                                 
 lstm (LSTM)                 (None, 512)               1255424   
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 16)                4112      
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 3)                 5

In [8]:
EPOCHS = 10
BATCH_SIZE = 64
model.fit(X_train, y_train, 
          validation_data=(X_test, y_test), 
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE, 
          validation_split=0.1, 
          callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/10
  2/157 [..............................] - ETA: 12:30 - loss: 0.8376 - accuracy: 0.4688

KeyboardInterrupt: 