In [1]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split

In [2]:
from nltk.corpus import stopwords

In [3]:
import nltk

In [4]:
# Build the English stop-word set that remove_stopwords() reads.
# The NLTK corpus reader is imported here under a private alias so that the
# name `stopwords` is only ever bound to the resulting set. Rebinding the
# imported reader name itself made this cell raise AttributeError
# ('set' object has no attribute 'words') whenever it was run a second time.
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = set(_stopwords_corpus.words('english'))

Data Preprocessing

In [5]:
# Load the three CSV splits in one pass (train, then test, then dev).
# quoting=3 is the numeric value of csv.QUOTE_NONE, i.e. quote characters in
# the files are treated as ordinary text.
train_data, eval_data, dev_data = (
    pd.read_csv(csv_path, quoting=3)
    for csv_path in (
        'data/train_2024.csv',
        'data/test_2024.csv',
        'data/dev_2024.csv',
    )
)

In [11]:
def remove_stopwords(data):
    """Add a 'text_wo_stopwords' column holding 'text' with stop words removed.

    Each entry of data['text'] is split on whitespace, tokens found in the
    module-level `stopwords` collection are dropped (exact, case-sensitive
    match), and the remaining tokens are joined with single spaces.
    The frame is modified in place and also returned.
    """
    def _strip_stopwords(sentence):
        kept_tokens = [token for token in sentence.split() if token not in stopwords]
        return ' '.join(kept_tokens)

    data['text_wo_stopwords'] = data['text'].apply(_strip_stopwords)
    return data

# Characters stripped by remove_tags(). A raw string keeps the backslash as a
# literal character; in a normal string the backslash-comma pair is an invalid
# escape sequence that newer Python versions warn about.
_PUNC_TO_STRIP = r'''()-[]{};:'"\,<>/@#$%^&*_~'''
_PUNC_STRIP_TABLE = str.maketrans('', '', _PUNC_TO_STRIP)

def remove_tags(test_str):
    """Return `test_str` with every character listed in _PUNC_TO_STRIP removed.

    Characters outside that list (for example '.', '!', '?') are kept.
    The deletion is done in a single pass with str.translate instead of one
    str.replace call per offending character.

    Parameters: test_str - the string to clean.
    Returns: a new string; an empty input gives an empty output.
    """
    return test_str.translate(_PUNC_STRIP_TABLE)

def split_join(data):
    """Add a 'cleaner_text' column with words and symbol runs spaced apart.

    Each data['clean_text'] entry is cut into runs of word characters and runs
    of non-space, non-word characters; the pieces are then re-joined with a
    single space between them. The frame is modified in place and also returned.
    """
    def _respace(text):
        pieces = re.findall( r'\w+|[^\s\w]+', text)
        return ' '.join(pieces)

    data['cleaner_text'] = data['clean_text'].apply(_respace)
    return data

def get_X_Y(data):
    '''
        Clean the 'text' column of a dataframe and extract model inputs.

        Processing steps, in order:
          1. remove_stopwords - drops stop words ('text_wo_stopwords' column)
          2. remove_tags      - strips listed punctuation ('clean_text' column)
          3. split_join       - re-spaces words/symbol runs ('cleaner_text' column)
        These helper columns are written onto `data` itself.

        Parameters: data - DataFrame with a 'text' and a 'label' column.
        Returns: (text_list, labels) - the cleaned texts as a list in row
        order, and the labels as a numpy array.
    '''
    data = remove_stopwords(data)
    data['clean_text'] = data['text_wo_stopwords'].apply(remove_tags)
    data = split_join(data)

    # Read the column positionally. Indexing with data[col][i] for
    # i in range(len(data)) is a label lookup, so it raises KeyError (or picks
    # the wrong row) whenever the frame does not carry a default 0..n-1 index.
    text_list = data['cleaner_text'].tolist()

    labels = np.array(data['label'])
    return text_list, labels

In [12]:
# Run the text-cleaning pipeline on each split (train, then eval, then dev).
(X_train, Y_train), (X_eval, Y_eval), (X_dev, Y_dev) = (
    get_X_Y(frame) for frame in (train_data, eval_data, dev_data)
)

# Fold the dev examples into the training pool. X_train is extended in place
# and X is another name for that same list.
X_train += X_dev
X = X_train
Y = np.concatenate([Y_train, Y_dev])

In [14]:
# Hold out 20% of the combined train+dev examples as a local test split.
# random_state is fixed so the same split is produced on every run.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
# Number of examples in the training and held-out splits.
len(xtrain), len(xtest)

(88000, 22000)

In [16]:
# Ids of the evaluation rows as a numpy array.
# NOTE(review): the same name is assigned again from eval_data['id'] in the
# cell that builds the submission file, so this array is not what gets written.
eval_data_ids = np.array(eval_data['id'])

Tokenizer

In [18]:
# Count the distinct whitespace-separated tokens across the training texts.
vocab = {token for text in xtrain for token in text.split()}

vocab_size = len(vocab)
print(vocab_size)

100911


In [25]:
# Fit a word-level (char_level=False) Keras tokenizer on the training split only.
# num_words is set to the whitespace-token count of the training texts, and
# oov_tok is the placeholder token handed to the tokenizer for unseen words.
oov_tok = '<OOV>'
tokenizer = Tokenizer(num_words = vocab_size, char_level = False, oov_token = oov_tok)
tokenizer.fit_on_texts(xtrain)

# word -> integer index mapping built by fit_on_texts
words_to_index = tokenizer.word_index

In [26]:
# Number of entries in the tokenizer's word index. The recorded value is
# smaller than the whitespace-token count printed earlier — presumably because
# the tokenizer normalises text before counting; TODO confirm.
total_words = len(words_to_index)
print(total_words)

80426


In [27]:
maxLen = 50


def _encode_and_pad(texts):
    """Turn texts into token-id sequences and pad/truncate them to maxLen.

    Padding and truncation are both applied at the end ('post').
    Returns (sequences, padded).
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=maxLen, padding='post', truncating='post')
    return sequences, padded


# Same encoding for the training split, the held-out split and the eval set.
training_sequences, training_padded = _encode_and_pad(xtrain)
testing_sequences, testing_padded = _encode_and_pad(xtest)
eval_sequences, eval_padded = _encode_and_pad(X_eval)

In [29]:
# Sanity-check the padded tensor shapes: (number of examples, maxLen).
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)
# NOTE(review): the label below says 'dev', but the tensor printed is
# eval_padded (built from X_eval); the dev rows were merged into training.
print('Shape of dev tensor: ', eval_padded.shape)

Shape of training tensor:  (88000, 50)
Shape of testing tensor:  (22000, 50)
Shape of dev tensor:  (12001, 50)


Model

In [38]:
embedding_dim = 16
def toxicity_model(input_shape):
    """Build the (uncompiled) stacked-LSTM binary classifier.

    Layers, in order: Embedding(vocab_size, embedding_dim) -> LSTM(128) ->
    Dropout(0.2) -> LSTM(128) -> Dropout(0.2) -> LSTM(128) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    input_shape : int or sequence of int
        Shape of one padded sample, e.g. ``maxLen`` or ``(maxLen,)``.

    Returns
    -------
    A Keras ``Model`` mapping integer token ids to one sigmoid output per
    sample.

    Reads the module-level ``vocab_size`` and ``embedding_dim``.
    """
    # Accept a bare integer as well as a tuple/list, and always hand Input()
    # a proper shape tuple.
    if np.ndim(input_shape) == 0:
        input_shape = (int(input_shape),)
    X_indices = Input(shape=tuple(input_shape))

    # The sequence length is taken from the Input tensor. The former
    # input_length=maxLen argument is omitted: it duplicated that information
    # through a global instead of this function's parameter, and newer Keras
    # releases deprecate the argument.
    X = Embedding(vocab_size, embedding_dim)(X_indices)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(0.2)(X)
    X = LSTM(128, return_sequences=True)(X)
    X = Dropout(0.2)(X)
    # Last recurrent layer returns only its final state.
    X = LSTM(128)(X)
    X = Dense(1, activation='sigmoid')(X)

    model = Model(inputs=X_indices, outputs=X)

    return model

In [39]:
# Build the network for inputs of length maxLen.
model = toxicity_model(maxLen)

In [40]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 50, 16)            1614576   
                                                                 
 lstm_4 (LSTM)               (None, 50, 128)           74240     
                                                                 
 dropout_2 (Dropout)         (None, 50, 128)           0         
                                                                 
 lstm_5 (LSTM)               (None, 50, 128)           131584    
                                                                 
 dropout_3 (Dropout)         (None, 50, 128)           0         
                                                                 
 lstm_6 (LSTM)               (None, 128)               1315

In [41]:
# Adam optimizer (learning rate 0.001), binary cross-entropy loss, accuracy metric.
adam = keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# Train for 5 epochs with batches of 64. No validation_data / validation_split
# is passed, so nothing is monitored on held-out data during training.
model.fit(training_padded, ytrain, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x23c8911e090>

In [42]:
# Persist the trained model.
# NOTE(review): assumes the models/ directory already exists — TODO confirm.
model.save('models/trainable_embeds_lstm_5.keras')

In [43]:
test_preds = model.predict(testing_padded)

# Turn the per-example sigmoid scores into hard labels: 1 when the score is
# strictly above 0.5, otherwise 0. ravel() flattens the single-output column
# so the result is one label per example.
test_pred_labels = np.where(test_preds > 0.5, 1, 0).ravel()



In [44]:
from sklearn.metrics import accuracy_score, f1_score

# Score the thresholded predictions against the held-out labels.
acc = accuracy_score(ytest, test_pred_labels)
f1 = f1_score(ytest, test_pred_labels)
print(acc, f1)

0.9085454545454545 0.8756796836381611


Evaluating on the Eval Set (hidden test set)

In [46]:
# Sigmoid scores for the padded evaluation texts.
eval_preds = model.predict(eval_padded)



In [49]:
# Turn the sigmoid scores into hard labels: 1 when strictly above 0.5, else 0.
# ravel() flattens the single-output column to one label per eval row.
eval_pred_labels = np.where(eval_preds > 0.5, 1, 0).ravel()

preds_df = pd.DataFrame({'label': eval_pred_labels})

eval_data_ids = eval_data['id']
# Despite its name, dev_set_ids holds the ids of the eval set.
# The index is reset because pd.concat(axis=1) pairs rows by index label:
# with a non-default index on eval_data the ids and predictions would be
# misaligned and padded with NaN instead of being matched row by row.
dev_set_ids = pd.DataFrame({'id': eval_data_ids}).reset_index(drop=True)
assert len(dev_set_ids) == len(preds_df), "expected exactly one prediction per eval row"

# Two-column submission frame: id, label.
final_output = pd.concat([dev_set_ids, preds_df], axis=1)
final_output.to_csv('output2.csv',index=False)

In [55]:
# Write the same id/label frame again under a descriptive file name.
final_output.to_csv('TrainableEmbeddings_LSTM.csv',index=False)

In [47]:
# Load another submission file to compare this model's labels against.
fei_op = pd.read_csv('submission-stack.csv')

In [50]:
# Both submissions must have the same number of rows for the row-wise comparison below.
len(final_output) == len(fei_op)

True

In [51]:
# Count the rows on which this model's label equals the label loaded from
# submission-stack.csv, comparing the two columns position by position.
count_matches = int(
    np.count_nonzero(
        final_output['label'].to_numpy() == fei_op['label'].to_numpy()
    )
)

In [53]:
# Fraction of rows on which the two submissions agree.
count_matches/len(final_output)

0.9143404716273644

In [54]:
# Absolute number of agreeing rows.
count_matches

10973