**Instructions to run**


*   Ensure you have the correct paths to the required files (data.csv, val.csv, glove.6B.100d.txt)
*   Try changing the path if there are issues loading in the files
* run each cell in order
* don't run last cell if you don't want to save the model



In [24]:
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [25]:
# Train path
train_path = '../Data/train.csv'
# Validation path
val_path = '../Data/val.csv'

# Convert the labels to binary
train_data = pd.read_csv(train_path)
train_data['label'] = (train_data['label'] == 'self.SuicideWatch').astype(int)

val_data=pd.read_csv(val_path)
val_data['label'] = (val_data['label'] == 'self.SuicideWatch').astype(int)

# Describe the train data
train_data.describe()

Unnamed: 0,label
count,45706.0
mean,0.187459
std,0.390284
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [26]:
# Get majority and minority classes
df_majority = train_data[train_data['label'] == 0]
df_minority = train_data[train_data['label'] == 1]

# upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

# downsample majority class
df_majority_downsampled = resample(df_majority,
                                    replace=False,
                                    n_samples=len(df_minority),
                                    random_state=42)

# recreate the data set
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1, random_state=42)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(frac=1, random_state=42)

In [27]:
# helper function to load in GloVe vectors
def load_glove_vectors(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

# Path to the GloVe file
glove_path = '../Data/glove.6B.100d.txt'

# Load GloVe vectors
embeddings_index = load_glove_vectors(glove_path)

In [28]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_upsampled['text'])

# saving the tokenizer
with open('LSTMtokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# getting data set constants
max_length = max([len(s.split()) for s in df_upsampled['text']])
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(df_upsampled['text'])
val_sequences = tokenizer.texts_to_sequences(val_data['text'])

# pad the sequences
X_data = pad_sequences(train_sequences, maxlen=max_length)
X_val = pad_sequences(val_sequences, maxlen=max_length)

y_data = df_upsampled['label']
y_val = val_data['label']

#split to train set and test set
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,test_size=0.2, random_state=42)

In [29]:
# Create an embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [30]:
# Builds LSTM model
def build_model():
    model = Sequential([
        # 3 layers
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False),
        LSTM(210, activation='tanh'),
        Dense(1, activation='sigmoid')
    ])
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [31]:
# Dictionary to hold metrics
results = {'precision': [], 'recall': [], 'f1': [], 'accuracy': []}

# loop through each percentage (if there are multiple)
for fraction in [0.999]:
    model = build_model()  # Recreate the model for each iteration
    partial_X_train, _, partial_y_train, _ = train_test_split(X_train, y_train,train_size=fraction)
    # Train the model
    model.fit(partial_X_train, partial_y_train, epochs=10, batch_size=64, verbose=1,validation_data=(X_val, y_val))

     # Predict on X_test
    y_pred_prob = model.predict(X_test)
    y_pred_class = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]

    # Calculate metrics
    precision = precision_score(y_test, y_pred_class)
    recall = recall_score(y_test, y_pred_class)
    f1 = f1_score(y_test, y_pred_class)
    accuracy = accuracy_score(y_test, y_pred_class)

    # Store metrics
    results['precision'].append(precision)
    results['recall'].append(recall)
    results['f1'].append(f1)
    results['accuracy'].append(accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
# Displaying the metrics
print(results)

{'precision': [0.8685103708359523], 'recall': [0.9300040382285637], 'f1': [0.8982059282371295], 'accuracy': [0.8945880452342488]}


In [None]:
# Save the model
model.save('LSTMBinary.h5')

  saving_api.save_model(
