# Recurrent Neural Network Classifier
Finds the tone of an article based on the article's headline, abstract and key words

In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, Dense, Embedding, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [13]:
# Load training data
train_df = pd.read_csv('../data/cleaned_train.csv')
x_train = train_df['numerical_sequence'].apply(lambda x: [int(i) for i in x.split(',')])
y_train = np.array(train_df['BERT_sentiment_score'])

In [14]:
# Load test data
test_df = pd.read_csv('../data/cleaned_test.csv')
x_test = np.array(test_df['numerical_sequence'].apply(lambda x: [int(i) for i in x.split(',')]))
y_test = np.array(test_df['BERT_sentiment_score'])

In [15]:
# Get the longest sequence
max_length = max([len(seq) for seq in x_test] + [len(seq) for seq in x_train])

# Pad sequences with zeros ro have the same length
x_train = pad_sequences(x_train, maxlen= max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen= max_length, padding='post', truncating='post')

In [16]:
# Training and validation split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [17]:
# Define the EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_accuracy', 
                               # Number of epochs to wait for improvement
                               patience=3,  
                               verbose=1, 
                               # Restore the weights of the best epoch
                               restore_best_weights=True)  

In [18]:
#Hyperparameters
num_classes = 3
vocab_size = max(np.max(x_train), np.max(x_test)) + 1

In [19]:
# Define the LSTM model architecture
model = Sequential()
# Input layer
model.add(Embedding(input_dim=vocab_size, output_dim=32, input_length=x_train.shape[1]))
#Hidden layers
model.add(Bidirectional(LSTM(units=50, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=50, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Flatten())
# Output layer
model.add(Dense(10, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 227, 32)           1222944   
                                                                 
 bidirectional_3 (Bidirectio  (None, 227, 100)         33200     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 227, 100)          0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 227, 100)         60400     
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 227, 100)          0         
                                                                 
 bidirectional_5 (Bidirectio  (None, 227, 100)        

In [20]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, batch_size=16, epochs=2, validation_data=(x_val, y_val), 
         callbacks=[early_stopping])

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1c9ef3b5190>

In [21]:
# save the trained model
model.save('../models/RNN_classification_model.h5')

In [11]:
model = load_model('../models/RNN_classification_model.h5')



In [27]:
# Evaluate the model on validation data
loss, accuracy = model.evaluate(x_val, y_val)
print('Validation loss:', loss)
print('Validation accuracy:', accuracy)

Validation loss: 0.8374772667884827
Validation accuracy: 0.6369107365608215
