# **Instructions to run**
- Ensure that you have the correct paths in the second and fourth code cells (train.csv, val.csv, glove.6B.100d.txt). If you have issues with a path, try the bare file name instead (e.g. "train.csv").
- Run each cell in order
- Don't run the last cell if you don't want to save the model



In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import utils as utils

In [2]:
# Relative paths to the training and validation CSV files (adjust as needed)
train_path = '../Data/train.csv'
val_path = '../Data/val.csv'

# Load both CSV files into pandas DataFrames
train_data = pd.read_csv(train_path)
val_data=pd.read_csv(val_path)

# Summary statistics of the training frame; as the last expression of the
# cell, the result is displayed as the cell output
train_data.describe()

Unnamed: 0,text,label
count,45706,45706
unique,45631,5
top,FEARLESS FRIDAYS MEGA THREAD. Here we discuss ...,self.depression
freq,15,15714


In [3]:
# Loads in pre-trained GloVe vectors
def load_glove_vectors(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated float components. Blank lines are ignored.

    Args:
        glove_file: Path to the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # A blank or whitespace-only line (e.g. a stray newline at the
            # end of the file) has no token; skip it instead of failing
            # with IndexError on values[0].
            if not values:
                continue
            word, *coefficients = values
            embeddings_index[word] = np.asarray(coefficients, dtype='float32')
    return embeddings_index

# Path to the GloVe file (adjust as needed); the file name indicates the
# 6B release with 100-dimensional vectors
glove_path = '../Data/glove.6B.100d.txt'
# Word -> vector lookup built from every line of the GloVe file
embeddings_index = load_glove_vectors(glove_path)

In [5]:
# Fit the tokenizer's vocabulary on the training texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Persist the fitted tokenizer to disk
with open('LSTMtokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Padding length: the largest whitespace-delimited word count among the
# training texts
max_length = max(len(text.split()) for text in train_data['text'])
# One extra slot because index 0 is reserved
vocab_size = 1 + len(tokenizer.word_index)

# Map every text to its sequence of word indices
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
val_sequences = tokenizer.texts_to_sequences(val_data['text'])

# Pad all sequences to the common length
X_data = pad_sequences(train_sequences, maxlen=max_length)
X_val = pad_sequences(val_sequences, maxlen=max_length)

# Integer-encode the labels; the encoder is fitted on the training labels
# and then reused unchanged for the validation labels
label_encoder = LabelEncoder()
y_data = label_encoder.fit_transform(train_data['label'])
y_val = label_encoder.transform(val_data['label'])

# Hold out 20% of the training data as a test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [7]:
# One output unit per class known to the label encoder
output_units = len(label_encoder.classes_)

# One-hot encode the integer labels of the train, test and validation splits
y_train_one_hot, y_test_one_hot, y_val_one_hot = (
    to_categorical(labels, output_units)
    for labels in (y_train, y_test, y_val)
)

In [8]:
# Build the embedding matrix: the row at a word's tokenizer index holds that
# word's GloVe vector
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # No pre-trained vector for this word: its row stays all zeros
        continue
    embedding_matrix[row] = glove_vector

In [10]:
# builds LSTM model
def build_model():
    """Create and compile a fresh three-layer LSTM classifier.

    Reads the module-level ``vocab_size``, ``embedding_dim``,
    ``embedding_matrix`` and ``output_units``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    network = Sequential()
    # Embedding layer initialised from embedding_matrix and kept frozen
    network.add(
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
    )
    # Single recurrent layer with 210 units
    network.add(LSTM(210, activation='tanh', recurrent_dropout=0))
    # Softmax over the classes
    network.add(Dense(output_units, activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [17]:
# One list per metric; a score is appended for every fraction evaluated
results = {metric: [] for metric in ('precision', 'recall', 'f1', 'accuracy')}

# Train and evaluate once for each listed fraction of the training split
for fraction in [0.999]:
    # Start from a freshly built model on every iteration
    model = build_model()
    partial_X_train, _, partial_y_train, _ = train_test_split(
        X_train,
        y_train_one_hot,
        train_size=fraction,
        random_state=42,
    )
    model.fit(
        partial_X_train,
        partial_y_train,
        epochs=8,
        batch_size=40,
        verbose=1,
        validation_data=(X_val, y_val_one_hot),
    )

    # Turn predicted probabilities and one-hot targets into class indices
    y_pred_prob = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred_prob, axis=1)
    y_test_integers = np.argmax(y_test_one_hot, axis=1)

    # Weighted-average metrics; zero_division=0 is passed to each of them
    weighted = {'average': 'weighted', 'zero_division': 0}
    precision = precision_score(y_test_integers, y_pred_classes, **weighted)
    recall = recall_score(y_test_integers, y_pred_classes, **weighted)
    f1 = f1_score(y_test_integers, y_pred_classes, **weighted)
    accuracy = accuracy_score(y_test_integers, y_pred_classes)

    # Record this iteration's scores
    for metric, score in (
        ('precision', precision),
        ('recall', recall),
        ('f1', f1),
        ('accuracy', accuracy),
    ):
        results[metric].append(score)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [18]:
# Print the collected metrics: a dict with one list of scores per metric
# (precision, recall, f1, accuracy)
print(results)

{'precision': [0.6572776306829258], 'recall': [0.6559833734412601], 'f1': [0.6562656988287998], 'accuracy': [0.6559833734412601]}


In [None]:
# Save the most recently trained model to 'LSTM.h5'; skip this cell if you do
# not want to write the file.
# NOTE(review): the .h5 file name presumably selects Keras' HDF5 format, which
# may be what triggers the saving_api warning in the cell output - confirm.
model.save('LSTM.h5')

  saving_api.save_model(
