In [2]:
import pandas as pd
import numpy as np
import re
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import utils as utils

In [14]:
# Locations of the labelled training and validation CSV files.
train_path = 'train.csv'
val_path = 'val.csv'


def _with_binary_label(frame):
    """Replace frame['label'] with 1 where it equals 'self.SuicideWatch', else 0.

    The frame is modified in place and also returned so the call can be chained
    directly onto ``pd.read_csv``.
    """
    frame['label'] = frame['label'].eq('self.SuicideWatch').astype(int)
    return frame


train_data = _with_binary_label(pd.read_csv(train_path))
val_data = _with_binary_label(pd.read_csv(val_path))

# Cell output: summary statistics of the training frame's numeric columns.
train_data.describe()

Unnamed: 0,label
count,45706.0
mean,0.187459
std,0.390284
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [6]:
def load_glove_vectors(glove_file):
    """Read a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are parsed as the components of its vector.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field after the token cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # A blank / whitespace-only line (e.g. a trailing newline at the end
            # of the file) yields no fields; skip it instead of failing on values[0].
            if not values:
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

# Where the GloVe vectors live on disk; point this at your local copy of the file.
glove_path = 'glove.6B.100d.txt'
# Parse the whole file once into a token -> vector lookup used by later cells.
embeddings_index = load_glove_vectors(glove_file=glove_path)

In [34]:
# Fit the tokenizer on the training text only, so the vocabulary is built
# without looking at the validation data.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Save the fitted tokenizer so the same word index can be reloaded later.
with open('LSTMtokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Convert texts to integer sequences
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
val_sequences = tokenizer.texts_to_sequences(val_data['text'])

# Pad length is taken from the *tokenized* training sequences. Counting
# whitespace-separated words (s.split()) does not match the Tokenizer, which
# also filters punctuation and splits on it, so a sequence could be longer than
# the whitespace count and would then be silently truncated by pad_sequences.
max_length = max(len(seq) for seq in train_sequences)

X_data = pad_sequences(train_sequences, maxlen=max_length)
X_val = pad_sequences(val_sequences, maxlen=max_length)

y_val = val_data['label']

# Split the training data into a train set and a held-out test set.
# The positive class is a minority, so stratify to keep the class ratio the
# same in both parts; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    train_data['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_data['label'],
)

In [28]:
# NOTE(review): this repeats the vocab_size assignment already made in the
# tokenization cell (same expression); the cell looks redundant — confirm and remove.
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size: word_index entries plus one

In [35]:
# Width of each embedding row; the matrix starts as all zeros, one row per
# tokenizer index (vocab_size rows in total).
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pre-trained vector into the row of every token that has one.
# Tokens missing from embeddings_index keep their all-zero row.
for token, row in tokenizer.word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [43]:
def build_model():
    """Create and compile a fresh binary LSTM classifier.

    The network is a frozen Embedding layer initialised from the module-level
    ``embedding_matrix`` (sized by ``vocab_size`` x ``embedding_dim``), a
    200-unit LSTM, and a single sigmoid output unit. It is compiled with
    binary cross-entropy loss, the Adam optimizer and an accuracy metric.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    classifier = Sequential()
    # Pre-trained vectors are loaded as weights and kept fixed during training.
    classifier.add(
        Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
    )
    classifier.add(LSTM(200, activation='tanh', recurrent_dropout=0))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

In [45]:
# One list per metric; each loop iteration appends the score for its fraction.
results = {'precision': [], 'recall': [], 'f1': [], 'accuracy': []}

# Loop through each fraction of the training split to train on
for fraction in [0.999]:
    model = build_model()  # Recreate the model for each iteration

    # Seeded so the sub-sample (and therefore the reported metrics) is reproducible.
    partial_X_train, _, partial_y_train, _ = train_test_split(
        X_train, y_train, train_size=fraction, random_state=42
    )
    model.fit(
        partial_X_train,
        partial_y_train,
        epochs=8,
        batch_size=80,
        verbose=1,
        validation_data=(X_val, y_val),
    )

    # Predict on X_test
    y_pred_prob = model.predict(X_test)
    # The model ends in a single sigmoid unit, so predictions have one column.
    # argmax over that axis is always 0 (every sample predicted negative);
    # threshold the probability at 0.5 to get the class label instead.
    y_pred_classes = (y_pred_prob.ravel() > 0.5).astype(int)

    # Record the held-out test metrics for this fraction.
    results['precision'].append(precision_score(y_test, y_pred_classes))
    results['recall'].append(recall_score(y_test, y_pred_classes))
    results['f1'].append(f1_score(y_test, y_pred_classes))
    results['accuracy'].append(accuracy_score(y_test, y_pred_classes))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 23/286 [=>............................] - ETA: 1:29 - loss: 0.3067 - accuracy: 0.8618

KeyboardInterrupt: 

In [None]:
# Write the trained network to 'LSTMBinary.h5'. `model` is the object assigned
# inside the training loop, so this cell depends on that loop having run first.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format —
# confirm what downstream loaders expect before changing the file name/format.
model.save('LSTMBinary.h5')

  saving_api.save_model(
