# Sentiment Analysis of Forum Posts Relating to the Ukraine-Russia War

### Preprocessing

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Path to the parsed forum-post dataset. This is the single place to change
# when pointing the notebook at a different file.
DATA_FILE = "parsed_data.csv"
df = pd.read_csv(DATA_FILE)

# Post titles are the model's text input. Casting to str guarantees every
# entry is text (a missing title becomes the literal string "nan").
titles = df["title"].values.astype(str)
# Raw point-of-view label of each post, used as the classification target.
povs = df["pov"].values

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for testing. Labels are one-hot encoded (one
# column per distinct pov value) so they can be used with a softmax output.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    titles,
    pd.get_dummies(povs).to_numpy().astype(int),
    test_size=0.2,
    random_state=1000,
)

In [None]:
# Every title is padded or truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 50
# Length of the learned embedding vector for each token.
EMBEDDING_DIM = 60

# Raw string so the backslash in the filter set is a literal character rather
# than an (invalid) escape sequence.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
# Fit on the training titles only so no test-set vocabulary leaks into the model.
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# The vocabulary is the set of distinct *words* the tokenizer found, not the
# set of distinct titles.
VOCAB = set(word_index)
# Tokenizer word indices start at 1 and index 0 is used for padding, so the
# embedding layer needs len(word_index) + 1 rows to cover every index.
VOCAB_SIZE = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))
print(f'Vocab size is {VOCAB_SIZE}')

# NOTE: X_train / X_test are overwritten with their encoded form below, so this
# cell is not idempotent; re-run the train/test split cell before re-running it.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

### Model Creation

In [4]:
# Number of output classes, taken from the width of the one-hot labels so the
# softmax layer always matches however many distinct pov values the data has.
OUTPUT_SIZE = y_train.shape[1]

# (units, dropout rate) for each fully connected stage, widest first.
DENSE_STAGES = [(512, 0.5), (256, 0.5), (128, 0.5), (64, 0.4), (32, 0.3)]

# Embedding -> LSTM over the whole sequence -> flatten all timesteps.
model_layers = [
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=X_train.shape[1]),
    tf.keras.layers.LSTM(512, dropout=0.5, recurrent_dropout=0.5, return_sequences=True, return_state=False),
    tf.keras.layers.Flatten(),
]

# Each stage is Dense -> LeakyReLU -> Dropout, created in the same order as a
# hand-written stack would be.
for stage_units, stage_dropout in DENSE_STAGES:
    model_layers.extend([
        tf.keras.layers.Dense(units=stage_units),
        tf.keras.layers.LeakyReLU(alpha=0.3),
        tf.keras.layers.Dropout(stage_dropout),
    ])

# Classification head: one logit per class, normalised to probabilities.
model_layers.extend([
    tf.keras.layers.Dense(units=OUTPUT_SIZE),
    tf.keras.layers.Softmax(),
])

model = tf.keras.models.Sequential(model_layers)

In [5]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
loss_metric = 'categorical_crossentropy'

# Metrics reported during training and evaluation.
acc_metrics = ['accuracy']

# Configure training: Adam optimizer with a learning rate of 0.001.
model.compile(
    loss=loss_metric,
    metrics=acc_metrics,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [6]:
# Optional: uncomment to print each layer's output shape and parameter count.
# model.summary()

In [None]:
EPOCHS = 10
BATCH_SIZE = 32

# Stop when validation accuracy has failed to improve by at least min_delta
# for 3 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    min_delta=0.0001,
    verbose=1,
)

model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    # Validate on a 10% slice of the training data. The test set is kept out of
    # fit() on purpose: passing it as validation_data would override
    # validation_split and let early stopping tune on the test set, biasing the
    # test accuracy reported below.
    validation_split=0.1,
    callbacks=[early_stopping],
)

# Final metrics. X_test is only touched here, after training has finished.
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy:  {:.4f}".format(accuracy))