### Imports

In [1]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split

In [2]:
from nltk.corpus import stopwords

In [3]:
import nltk

In [4]:
# Import the corpus reader under an alias: the assignment below rebinds the
# name `stopwords` to a plain set, so re-running this cell against the bare
# name would fail with AttributeError ('set' object has no attribute 'words').
from nltk.corpus import stopwords as nltk_stopwords

# English stop-word list as a set for O(1) membership tests in remove_stopwords.
stopwords = set(nltk_stopwords.words('english'))

### Data import & Preprocessing

In [5]:
# Load the three CSV splits in one pass. quoting=3 is csv.QUOTE_NONE, i.e.
# quote characters in the files are treated as ordinary text.
train_data, eval_data, dev_data = [
    pd.read_csv(csv_path, quoting=3)
    for csv_path in (
        'data/train_2024.csv',
        'data/test_2024.csv',
        'data/dev_2024.csv',
    )
]

In [6]:
# Lower-case the 'text' column of every split so later stop-word matching and
# tokenisation see a single casing.
for _frame in (train_data, eval_data, dev_data):
    _frame['text'] = _frame['text'].str.lower()

In [7]:
def remove_stopwords(data):
    """Add a 'text_wo_stopwords' column holding 'text' with stop words removed.

    Each entry of ``data['text']`` is split on whitespace, tokens found in the
    module-level ``stopwords`` collection are dropped, and the remaining tokens
    are re-joined with single spaces.

    The frame is modified in place (the new column is assigned on ``data``)
    and the same object is returned.
    """
    def _drop_stopwords(sentence):
        kept_tokens = [token for token in sentence.split() if token not in stopwords]
        return ' '.join(kept_tokens)

    data['text_wo_stopwords'] = data['text'].apply(_drop_stopwords)
    return data

def remove_tags(test_str):
    """Return ``test_str`` with every punctuation character in ``punc`` removed.

    Despite the name, this strips individual punctuation characters (including
    the angle brackets of any markup), not whole tags. Characters that are not
    listed in ``punc`` (for example ``+`` or ``=``) are left untouched.

    Parameters
    ----------
    test_str : str
        Text to clean.

    Returns
    -------
    str
        The text with all listed punctuation characters deleted.
    """
    # The backslash is spelled "\\" so the literal contains no invalid escape
    # sequence; the resulting character set is the same as before.
    punc = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    # A single translate pass deletes all listed characters at once instead of
    # rescanning the string with one replace() call per punctuation hit.
    return test_str.translate(str.maketrans('', '', punc))

In [8]:
def get_X_Y(data):
    '''
        Clean the text of a dataframe and split it into features and labels.

        The frame must provide a 'text' column and a 'label' column. Stop words
        are removed first (remove_stopwords), then punctuation characters
        (remove_tags). Both intermediate results are stored on `data` itself as
        the columns 'text_wo_stopwords' and 'clean_text', so the caller's frame
        is modified in place.

        Returns a tuple (text_list, labels): the cleaned texts as a Python list
        and the 'label' column as a numpy array, in row order.
    '''
    data = remove_stopwords(data)
    data['clean_text'] = data['text_wo_stopwords'].apply(remove_tags)

    # Take the values positionally. Indexing the Series with 0..len-1 is a
    # label lookup and raises KeyError when the frame's index is not the
    # default RangeIndex (e.g. after filtering or shuffling rows).
    text_list = data['clean_text'].tolist()

    labels = np.array(data['label'])
    return text_list, labels

In [9]:
X_train, Y_train = get_X_Y(train_data)
X_eval, Y_eval = get_X_Y(eval_data)
X_dev, Y_dev = get_X_Y(dev_data)

# Pool the train and dev texts into one corpus. A new list is built instead of
# extending X_train in place, so X_train keeps the same length as Y_train and
# is not silently aliased to X.
X = X_train + X_dev
Y = np.concatenate((Y_train, Y_dev))

In [10]:
# Hold out 20% of the pooled corpus; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Sizes of the training and held-out splits.
tuple(len(split) for split in (xtrain, xtest))

(88000, 22000)

In [13]:
# Independent array copy of the 'id' column of the evaluation frame.
eval_data_ids = eval_data['id'].to_numpy(copy=True)

In [14]:
# Distinct whitespace-separated tokens that occur in the training texts.
vocab = {token for sentence in xtrain for token in sentence.split()}

vocab_size = len(vocab)

In [15]:
# Word-level tokenizer fitted on the training texts only; words it has not
# seen are mapped to the dedicated out-of-vocabulary token.
oov_tok = '<OOV>'
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
    char_level=False,
)
tokenizer.fit_on_texts(xtrain)

In [16]:
# Word -> integer index mapping held by the fitted tokenizer.
word_index = tokenizer.word_index
# Number of entries in that mapping; displayed as the cell output.
total_words = len(word_index)
total_words

88808

In [17]:
maxLen = 50


def _texts_to_padded(texts, max_len=maxLen):
    """Encode ``texts`` with the fitted tokenizer and pad/truncate to ``max_len``.

    Padding and truncation are both applied at the end of each sequence
    ('post'). Returns ``(sequences, padded)``: the raw integer sequences and
    the fixed-width array produced by ``pad_sequences``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences,
                           maxlen=max_len,
                           padding='post',
                           truncating='post')
    return sequences, padded


# One shared helper keeps the three splits encoded with identical settings.
training_sequences, training_padded = _texts_to_padded(xtrain)
testing_sequences, testing_padded = _texts_to_padded(xtest)
eval_sequences, eval_padded = _texts_to_padded(X_eval)

In [19]:
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)
# eval_padded is built from X_eval (the evaluation file), not from the dev
# split, so it is labelled accordingly.
print('Shape of eval tensor: ', eval_padded.shape)

Shape of training tensor:  (88000, 50)
Shape of testing tensor:  (22000, 50)
Shape of dev tensor:  (12001, 50)


In [20]:
from keras.models import Sequential
from keras.layers import Bidirectional


def createLSTMmodel(n_lstm=128, drop_lstm=0.2, embedding_dim=32):
    """Build the (uncompiled) bidirectional-LSTM binary classifier.

    Architecture: Embedding -> Bidirectional(LSTM) -> Dropout -> Dense(1, sigmoid).
    The embedding's input dimension and input length are read from the
    module-level ``vocab_size`` and ``maxLen``.

    Parameters
    ----------
    n_lstm : int, default 128
        Units of the LSTM layer wrapped by ``Bidirectional``.
    drop_lstm : float, default 0.2
        Dropout rate applied to the LSTM output.
    embedding_dim : int, default 32
        Size of each trainable embedding vector.

    Returns
    -------
    Sequential
        The assembled model; the caller is responsible for compiling it.
    """
    model2 = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=maxLen),
        # return_sequences=False: only the final state of each direction is
        # passed on to the dropout and output layers.
        Bidirectional(LSTM(n_lstm, return_sequences=False)),
        Dropout(drop_lstm),
        # Single sigmoid unit for binary classification.
        Dense(1, activation='sigmoid'),
    ])

    return model2

In [22]:
# Instantiate the BiLSTM classifier with its default hyperparameters.
model = createLSTMmodel()
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 32)            2843872   
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               164864    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 3008993 (11.48 MB)
Trainable params: 3008993 (11.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [24]:
# Train on the padded training split only; no validation data is passed, so
# the held-out split is scored separately after training.
num_epochs = 10
model.fit(x=training_padded, y=ytrain, epochs=num_epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1c18ee0f790>

In [25]:
import os

# Make sure the target folder exists before saving, so a missing directory
# cannot make the save fail after training has already completed.
os.makedirs('models', exist_ok=True)
model.save('models/biLSTM_trainableEmbeds.keras')

In [26]:
# Predicted probabilities for the held-out split, kept as a numpy array.
preds_test = np.asarray(model.predict(testing_padded))

# Threshold at 0.5 in one vectorised step instead of a Python loop; the result
# is still a flat list of 0/1 Python ints, one per sample.
ypred_test = (preds_test > 0.5).astype(int).ravel().tolist()



In [28]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy and F1 of the thresholded predictions against the held-out labels.
acc, f1 = (
    metric(ytest, ypred_test)
    for metric in (accuracy_score, f1_score)
)
print(acc, f1)

0.8938636363636364 0.8550679659859725


In [29]:
# Recomputes the held-out predictions and their 0/1 labels (the same
# computation as the earlier prediction cell).
preds_test = model.predict(testing_padded)

# A sample is labelled 1 when its predicted probability exceeds 0.5, else 0.
ypred_test = [1 if score > 0.5 else 0 for score in preds_test]

preds_test = np.array(preds_test)

