<a href="https://colab.research.google.com/github/ArunKoundinya/SoulGuard/blob/master/jupyternotebooks/SuicideModelFineTuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dropout, Bidirectional, LSTM, Dense, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.initializers import GlorotUniform

from tensorflow.keras.models import Model


from sklearn.metrics import accuracy_score, classification_report


import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages ('stopwords', 'punkt', 'wordnet') that go with
# the NLTK imports above. Each call returns a status flag; the last one is the
# value the notebook echoes for this cell.
# NOTE(review): none of the cells visible in this notebook call the NLTK
# helpers, so these downloads may be leftovers — confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
import os

# Work from the project folder so that the relative file names used later in
# the notebook ('best_hyperparameters.json', 'suicide_detection_model.pkl')
# resolve inside it.
# NOTE(review): the path looks like a mounted Google Drive — presumably the
# mount happens in a cell that is not shown here; confirm.
PROJECT_DIR = "/content/drive/MyDrive/DeepLearning/Capstone-SoulGuard"
os.chdir(PROJECT_DIR)

In [6]:
# Number of token ids kept per text after padding/truncation.
MAX_SEQUENCE_LENGTH = 100

# --- Load the corpus ---------------------------------------------------------
SuicideDetection = pd.read_csv('https://media.githubusercontent.com/media/ArunKoundinya/SoulGuard/refs/heads/master/data/SoulG_Update.csv')

# Replace missing texts with an empty string *before* casting to str: casting
# first would turn every NaN into the literal word "nan", which the tokenizer
# would then learn and emit as if it were a real token.
X = SuicideDetection['cleaned_text'].fillna('').astype(str)

# Integer-encode the labels. Codes follow the order in which each label first
# appears in the file; class_names[i] is the original label behind code i, so
# predictions can be mapped back to readable names later.
y = SuicideDetection['class'].astype(str)
y, class_names = pd.factorize(y)

# --- Hold out 20 % of the rows (fixed seed so the split is repeatable) -------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Tokenise ----------------------------------------------------------------
# The vocabulary is learnt from the training texts only; words that occur only
# in the held-out texts are mapped to the "<UNK>" token.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

# Keras assigns word indices starting at 1, leaving 0 free. Registering an
# explicit '<PAD>' entry for index 0 makes len(word_index) equal to the number
# of rows the embedding matrix needs (indices 0 .. len - 1).
tokenizer.word_index['<PAD>'] = 0

X_sequences_train = tokenizer.texts_to_sequences(X_train)
X_sequences_test = tokenizer.texts_to_sequences(X_test)

# Short texts are zero-padded at the end ('post'); texts longer than
# MAX_SEQUENCE_LENGTH are cut using pad_sequences' default truncation side.
X_train_padded = pad_sequences(X_sequences_train, padding='post', maxlen=MAX_SEQUENCE_LENGTH)
X_test_padded = pad_sequences(X_sequences_test, padding='post', maxlen=MAX_SEQUENCE_LENGTH)

vocab_size = len(tokenizer.word_index)

In [10]:
def load_embeddings(glove_path):
    """Read a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every line is expected to hold a token followed by its whitespace-separated
    coefficients. Blank lines and lines that carry a token but no coefficients
    are skipped instead of aborting the whole load.

    Args:
        glove_path: Path of the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array. If a token
        occurs more than once, the last occurrence wins.
    """
    embedding_index = {}
    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            # Split off the token only; the remainder is parsed numerically.
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Empty line, or a token without any coefficients.
                continue
            word, coefs = parts
            embedding_index[word] = np.fromstring(coefs, "f", sep=" ")
    return embedding_index

def create_embedding_matrix(embedding_index, word2idx, vocab_size, embedding_dim):
    """Build the ``(vocab_size, embedding_dim)`` weight matrix for an embedding layer.

    Row ``i`` receives the pre-trained vector of the word whose index is ``i``.
    A row is left all-zero when the word has no entry in ``embedding_index``
    or its vector does not have exactly ``embedding_dim`` coefficients.
    Words whose index falls outside ``0 .. vocab_size - 1`` are skipped.
    NaN coefficients are replaced by 0.

    Args:
        embedding_index: Mapping ``word -> 1-D vector``.
        word2idx: Mapping ``word -> integer row index``.
        vocab_size: Number of rows of the resulting matrix.
        embedding_dim: Number of columns of the resulting matrix.

    Returns:
        A float numpy array of shape ``(vocab_size, embedding_dim)``.
    """
    mat = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2idx.items():
        vector = embedding_index.get(word)
        if vector is None:
            # Unknown word: keep the zero row (no reliance on None -> NaN coercion).
            continue
        if not 0 <= idx < vocab_size:
            # Index has no row in the matrix (e.g. a capped vocabulary).
            continue
        vector = np.asarray(vector, dtype=mat.dtype)
        if vector.shape != (embedding_dim,):
            # A malformed vector must not be broadcast across the row.
            continue
        mat[idx] = vector
    # Guard against NaN coefficients coming from the embeddings file.
    mat[np.isnan(mat)] = 0
    return mat

# Width of the vectors in the embeddings file selected below (its name ends in "200d").
embedding_dim = 200
glove_path = "/content/drive/My Drive/MSIS/IntroductiontoDeepLearning/Project/glove.6B/glove.twitter.27B.200d.txt"

# Parse the embeddings file once, then line the vectors up with the
# tokenizer's word indices so row i of the matrix belongs to word index i.
embedding_index = load_embeddings(glove_path)
word2idx = tokenizer.word_index
embedding_matrix = create_embedding_matrix(
    embedding_index=embedding_index,
    word2idx=word2idx,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
)

In [11]:
import json

# Read the stored hyperparameter dictionary from the current working
# directory. The 'tuner/...' keys in it suggest it was written by a
# hyperparameter-tuning run — presumably in another notebook; confirm.
hyperparameter_file = 'best_hyperparameters.json'
with open(hyperparameter_file, 'r') as handle:
    best_hyperparameters = json.load(handle)

# Bare last expression so the notebook renders the loaded dictionary.
best_hyperparameters

{'lstm_units_1': 32,
 'lstm_units_2': 64,
 'lstm_units_3': 28,
 'dense_units_1': 8,
 'dense_units_2': 8,
 'dropout_rate': 0.05,
 'learning_rate': 0.0022173357103291887,
 'tuner/epochs': 4,
 'tuner/initial_epoch': 2,
 'tuner/bracket': 2,
 'tuner/round': 1,
 'tuner/trial_id': '0037'}

In [13]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization

# Copy the values the model builder needs out of the dictionary into
# module-level names; any other keys in the dictionary are not read here.
(
    lstm_units_1,
    lstm_units_2,
    lstm_units_3,
    dense_units_1,
    dense_units_2,
    dropout_rate,
    learning_rate,
) = (
    best_hyperparameters[key]
    for key in (
        'lstm_units_1',
        'lstm_units_2',
        'lstm_units_3',
        'dense_units_1',
        'dense_units_2',
        'dropout_rate',
        'learning_rate',
    )
)


def build_model_with_l2(lstm_units_1, lstm_units_2, lstm_units_3, dense_units_1, dense_units_2, dropout_rate, learning_rate, max_len=100):
    """Build and compile the stacked-BiLSTM binary classifier.

    Architecture: frozen pre-trained embedding -> three bidirectional LSTM
    layers (each returning the full sequence and followed by batch
    normalisation) -> dropout -> flatten -> two ReLU dense layers -> one
    sigmoid unit. LSTM kernels and the two hidden dense kernels carry an L2
    penalty of 0.001.

    The embedding weights are read from the module-level ``embedding_matrix``;
    the layer's vocabulary size and vector width are taken from that matrix's
    shape so they cannot drift apart from it.

    Args:
        lstm_units_1: Units per direction of the first BiLSTM layer.
        lstm_units_2: Units per direction of the second BiLSTM layer.
        lstm_units_3: Units per direction of the third BiLSTM layer.
        dense_units_1: Width of the first hidden dense layer.
        dense_units_2: Width of the second hidden dense layer.
        dropout_rate: Dropout rate applied after the recurrent stack.
        learning_rate: Initial learning rate of the decay schedule.
        max_len: Length of the padded input sequences (defaults to 100).

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    l2_strength = 0.001
    num_tokens, embedding_width = embedding_matrix.shape

    inputs = Input(shape=(max_len,))

    # No `input_length` here: the argument is deprecated in Keras 3 and the
    # sequence length is already fixed by the Input layer above.
    embedding_layer = Embedding(input_dim=num_tokens, output_dim=embedding_width, weights=[embedding_matrix], trainable=False)(inputs)

    bilstm = Bidirectional(LSTM(lstm_units_1, activation='tanh', return_sequences=True, kernel_regularizer=l2(l2_strength)))(embedding_layer)
    bilstm = BatchNormalization()(bilstm)
    bilstm = Bidirectional(LSTM(lstm_units_2, activation='tanh', return_sequences=True, kernel_regularizer=l2(l2_strength)))(bilstm)
    bilstm = BatchNormalization()(bilstm)
    bilstm = Bidirectional(LSTM(lstm_units_3, activation='tanh', return_sequences=True, kernel_regularizer=l2(l2_strength)))(bilstm)
    bilstm = BatchNormalization()(bilstm)

    bilstm = Dropout(dropout_rate)(bilstm)

    # The last BiLSTM still returns one vector per time step, so the sequence
    # is flattened before it reaches the dense head.
    flatten = Flatten()(bilstm)
    dense = Dense(dense_units_1, activation="relu", kernel_regularizer=l2(l2_strength))(flatten)
    dense = Dense(dense_units_2, activation="relu", kernel_regularizer=l2(l2_strength))(dense)
    outputs = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=inputs, outputs=outputs)

    # Staircase exponential decay: the learning rate is multiplied by 0.9
    # once every 1000 optimizer steps.
    lr_schedule = ExponentialDecay(
        initial_learning_rate=learning_rate,
        decay_steps=1000,
        decay_rate=0.9,
        staircase=True,
    )

    optimizer = Adam(learning_rate=lr_schedule)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Instantiate the network with the hyperparameters extracted above.
best_model = build_model_with_l2(
    lstm_units_1=lstm_units_1,
    lstm_units_2=lstm_units_2,
    lstm_units_3=lstm_units_3,
    dense_units_1=dense_units_1,
    dense_units_2=dense_units_2,
    dropout_rate=dropout_rate,
    learning_rate=learning_rate,
)

# Fit for 20 epochs in mini-batches of 64. The held-out split is passed as
# validation data, so its loss and accuracy are reported after every epoch.
# The call is the cell's last expression, so the notebook echoes the History
# object it returns.
best_model.fit(
    X_train_padded,
    y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/20
[1m2698/2698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 35ms/step - accuracy: 0.8876 - loss: 0.5079 - val_accuracy: 0.9182 - val_loss: 0.2552
Epoch 2/20
[1m2698/2698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 35ms/step - accuracy: 0.9225 - loss: 0.2487 - val_accuracy: 0.9185 - val_loss: 0.2522
Epoch 3/20
[1m2698/2698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 35ms/step - accuracy: 0.9264 - loss: 0.2263 - val_accuracy: 0.9227 - val_loss: 0.2276
Epoch 4/20
[1m2698/2698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 34ms/step - accuracy: 0.9312 - loss: 0.2081 - val_accuracy: 0.9233 - val_loss: 0.2209
Epoch 5/20
[1m2698/2698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 35ms/step - accuracy: 0.9327 - loss: 0.2006 - val_accuracy: 0.9316 - val_loss: 0.1983
Epoch 6/20
[1m2698/2698[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 35ms/step - accuracy: 0.9357 - loss: 0.1896 - val_accuracy: 0.9313 - val_loss: 0.1980
Epo

<keras.src.callbacks.history.History at 0x7f7520276200>

In [15]:
import pickle

# Serialise the trained model into the current working directory.
# NOTE(review): reading this file back requires pickle.load, which can run
# arbitrary code embedded in the file — only load it from a trusted source.
# Keras' own `model.save(...)` format may be a better fit; confirm what the
# consumers of this file expect before switching.
model_pickle_path = 'suicide_detection_model.pkl'
with open(model_pickle_path, 'wb') as model_file:
    pickle.dump(best_model, model_file)