Creating an LSTM using Train + Dev set

Used Glove embeddings for the words

Data preprocessing involved removing punctuation, converting to lowercase and removing stop words.

Working code - results submitted on Kaggle - output2.csv file

In [1]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split

In [2]:
from nltk.corpus import stopwords

In [3]:
import nltk

In [5]:
# Build the English stop-word set. The corpus is reached through `nltk.corpus`
# rather than the bare `stopwords` name because this assignment rebinds
# `stopwords` to a plain set; going through the module keeps the cell
# re-runnable (a second run would otherwise call `.words` on a set and fail).
stopwords = set(nltk.corpus.stopwords.words('english'))

In [6]:
# Load the three CSV splits in one pass. quoting=3 is csv.QUOTE_NONE, i.e. quote
# characters in the files are treated as ordinary text.
train_data, eval_data, dev_data = [
    pd.read_csv(csv_path, quoting=3)
    for csv_path in ('data/train_2024.csv', 'data/test_2024.csv', 'data/dev_2024.csv')
]

In [7]:
# Lower-case the raw text of every split in place (overwrites the 'text' column).
for split_frame in (train_data, eval_data, dev_data):
    split_frame['text'] = split_frame['text'].str.lower()

In [8]:
def remove_stopwords(data):
    """Add a 'text_wo_stopwords' column holding 'text' with stop words dropped.

    Each entry of data['text'] is split on whitespace, tokens found in the
    module-level `stopwords` collection are discarded, and the remaining tokens
    are re-joined with single spaces.

    Args:
        data: DataFrame with a string 'text' column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the extra column.
    """
    def _drop_stopwords(sentence):
        kept_tokens = [token for token in sentence.split() if token not in stopwords]
        return ' '.join(kept_tokens)

    data['text_wo_stopwords'] = data['text'].apply(_drop_stopwords)
    return data

def remove_tags(test_str):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for ele in test_str:
        if ele in punc:
            test_str = test_str.replace(ele, "")
    return test_str

In [9]:
def get_X_Y(data):
    '''
        Preprocess a dataframe and split it into model inputs and labels.

        The frame is passed through remove_stopwords (which provides the
        'text_wo_stopwords' column), then remove_tags is applied to every
        entry of that column and the result is stored in a new 'clean_text'
        column.

        Args:
            data: DataFrame with 'text' and 'label' columns.

        Returns:
            (text_list, labels): the cleaned texts as a Python list, in row
            order, and the 'label' column as a NumPy array.
    '''
    data = remove_stopwords(data)
    data['clean_text'] = data['text_wo_stopwords'].apply(remove_tags)

    # tolist() reads the column positionally, so this also works when the frame
    # does not carry a default 0..n-1 index (e.g. after filtering or shuffling);
    # looking rows up by label with range(len(data)) would fail or misorder there.
    text_list = data['clean_text'].tolist()

    labels = np.array(data['label'])
    return text_list, labels

In [10]:
# Run the shared preprocessing over each split, in the order train, test (eval), dev.
(X_train, Y_train), (X_eval, Y_eval), (X_dev, Y_dev) = map(
    get_X_Y, (train_data, eval_data, dev_data)
)

# Pool train and dev into one labelled set. X is the same list object as
# X_train, which from here on also contains the dev texts.
X_train += X_dev
X = X_train
Y = np.concatenate([Y_train, Y_dev])

In [11]:
# Hold out 20% of the pooled train+dev data with a fixed seed. Note that
# xtest/ytest are this held-out slice, not the test_2024.csv data (that is X_eval).
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [12]:
# Number of texts in the training partition and in the held-out partition.
len(xtrain), len(xtest)

(88000, 22000)

In [13]:
# Row ids of the test_2024.csv split as a NumPy array; used later to build the output file.
eval_data_ids = np.array(eval_data['id'])

In [14]:
# Build the vocabulary from the training partition only, so the held-out and
# eval texts do not influence the word index.
# NOTE(review): no oov_token is configured, so words unseen during fitting are
# presumably dropped by texts_to_sequences — confirm against the Keras docs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(xtrain)

# Mapping from word to its integer index in the fitted vocabulary.
words_to_index = tokenizer.word_index

In [15]:
def read_glove_vector(glove_vec):
    """Load word vectors from a GloVe-style text file.

    Every non-empty line is split on whitespace; the first field is taken as
    the token and the remaining fields as its vector components.

    Args:
        glove_vec: Path to the embeddings text file (read as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D float64 NumPy array.
    """
    word_to_vec_map = {}
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        for line in f:
            w_line = line.split()
            # Blank lines (such as a trailing empty line at end of file) hold no
            # token; skip them instead of failing on w_line[0].
            if not w_line:
                continue
            word_to_vec_map[w_line[0]] = np.array(w_line[1:], dtype=np.float64)

    return word_to_vec_map

In [16]:
# Load the GloVe vectors from the local text file into a word -> vector dict.
word_to_vec_map = read_glove_vector('data/glove.6B.50d.txt')

In [17]:
# Fixed sequence length used for padding/truncation throughout the notebook.
maxLen = 50
# One extra row beyond the vocabulary: row 0 is never assigned below and stays all-zero.
vocab_len = len(words_to_index) + 1
# Vector dimensionality, read off an arbitrary word present in the GloVe file.
embed_vector_len = word_to_vec_map['moon'].shape[0]

# Row i holds the GloVe vector of the word with tokenizer index i; words that
# have no GloVe entry keep an all-zero row.
emb_matrix = np.zeros((vocab_len, embed_vector_len))
for word, index in words_to_index.items():
    if word in word_to_vec_map:
        emb_matrix[index] = word_to_vec_map[word]

# Embedding layer initialised with the pre-trained matrix and kept frozen.
embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights=[emb_matrix],
    trainable=False,
)

In [22]:
def toxicity_model(input_shape):
    """Build the stacked-LSTM binary classifier.

    The network feeds token indices through the module-level `embedding_layer`,
    then two sequence-returning LSTM(128) layers (each followed by
    Dropout(0.2)), a final LSTM(128), and a single sigmoid output unit.

    Args:
        input_shape: Shape passed straight to keras `Input` for the token-index input.

    Returns:
        An uncompiled keras `Model` mapping token indices to one sigmoid output.
    """
    token_ids = Input(input_shape)
    hidden = embedding_layer(token_ids)

    # Two recurrent layers that emit the full sequence, each with dropout after it.
    for _ in range(2):
        hidden = LSTM(128, return_sequences=True)(hidden)
        hidden = Dropout(0.2)(hidden)

    # The last recurrent layer does not return the sequence; its output feeds
    # the sigmoid unit.
    hidden = LSTM(128)(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=token_ids, outputs=probability)

In [23]:
# Convert the training texts to index sequences, then pad/truncate them to
# maxLen with padding added at the end of each sequence.
X_train_indices = pad_sequences(
    tokenizer.texts_to_sequences(xtrain),
    maxlen=maxLen,
    padding='post',
)

In [24]:
# Instantiate the network; maxLen is forwarded as the input shape of the token-index input.
model = toxicity_model(maxLen)

In [25]:
# Binary classification setup: Adam optimiser, binary cross-entropy loss, accuracy metric.
adam = keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training partition only; no validation data is passed here, the
# held-out partition is scored separately in later cells.
model.fit(X_train_indices, ytrain, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1a2564947d0>

In [26]:
# Persist the trained model to disk.
model.save('models/glove_lstm_4.keras')

In [27]:
# Encode the held-out partition with the same tokenizer and padding settings
# that were used for the training partition.
X_test_indices = pad_sequences(
    tokenizer.texts_to_sequences(xtest),
    maxlen=maxLen,
    padding='post',
)

In [28]:
# Model outputs for the held-out partition; thresholded into labels in the next cell.
test_preds = model.predict(X_test_indices)



In [30]:
# Turn each model output into a hard label: 1 when strictly above 0.5, else 0.
# The result is flattened to a 1-D integer array with one entry per sample.
test_pred_labels = (test_preds > 0.5).astype(int).reshape(-1)

In [31]:
from sklearn.metrics import accuracy_score, f1_score

# Score the thresholded predictions against the held-out labels.
acc = accuracy_score(y_true=ytest, y_pred=test_pred_labels)
f1 = f1_score(y_true=ytest, y_pred=test_pred_labels)
print(acc, f1)

0.9118181818181819 0.8760858456821665


In [32]:
# Encode the test_2024.csv texts with the same tokenizer and padding settings
# that were used for the training partition.
X_eval_indices = pad_sequences(
    tokenizer.texts_to_sequences(X_eval),
    maxlen=maxLen,
    padding='post',
)

In [33]:
# Model outputs for the test_2024.csv texts; thresholded into labels in the next cell.
eval_preds = model.predict(X_eval_indices)



In [34]:
# Turn each model output into a hard label: 1 when strictly above 0.5, else 0.
# The result is flattened to a 1-D integer array with one entry per sample.
eval_pred_labels = (eval_preds > 0.5).astype(int).reshape(-1)

In [None]:
# NOTE(review): this cell performs the same export as the four single-step
# cells that follow it (build preds_df, build dev_set_ids, concat, to_csv);
# one of the two variants is redundant.
# Despite its name, dev_set_ids holds the ids of eval_data (test_2024.csv), not the dev split.
eval_data_ids = eval_data['id']
dev_set_ids = pd.DataFrame({'id': eval_data_ids})

preds_df = pd.DataFrame({'label': eval_pred_labels})

# Place ids and predicted labels side by side and write them without the index column.
final_output = pd.concat([dev_set_ids, preds_df], axis=1)
final_output.to_csv('output2.csv',index=False)

In [35]:
# Single-column frame with the predicted labels for the test_2024.csv rows.
preds_df = pd.DataFrame({'label': eval_pred_labels})

In [36]:
# Single-column frame with the row ids. Despite the name, eval_data_ids comes
# from eval_data (test_2024.csv), not from the dev split.
dev_set_ids = pd.DataFrame({'id': eval_data_ids})

In [37]:
# Put ids and predicted labels side by side (column-wise concat), then display the frame.
final_output = pd.concat([dev_set_ids, preds_df], axis=1)
final_output

Unnamed: 0,id,label
0,0,1
1,1,0
2,2,0
3,3,1
4,4,1
...,...,...
11996,11996,1
11997,11997,0
11998,11998,1
11999,11999,1


In [38]:
# Write the id/label table to disk without the DataFrame index column.
final_output.to_csv('output2.csv',index=False)

In [41]:
# Load a second prediction file; the cells below compare its 'label' column with final_output.
# NOTE(review): the origin of submission-stack.csv is not shown in this notebook — document it.
fei_op = pd.read_csv('submission-stack.csv')

In [42]:
# Sanity check: both prediction tables must have the same number of rows
# before they are compared row by row.
len(final_output) == len(fei_op)

True

In [43]:
# Count the rows on which both prediction files carry the same label. Despite
# the name, this measures agreement between the two files, not correctness
# against ground truth.
count_correct = sum(
    1
    for row in range(len(fei_op))
    if final_output['label'][row] == fei_op['label'][row]
)

In [44]:
# Number of rows where the two prediction files agree.
count_correct

11139

In [45]:
# Total number of rows in the comparison file.
len(fei_op)

12001

In [46]:
# Fraction of rows on which the two prediction files agree.
count_correct/len(final_output)

0.9281726522789767