<a href="https://colab.research.google.com/github/ArunKoundinya/SoulGuard/blob/master/jupyternotebooks/SuicideClassification_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains both the Suicide Classification and Sentiment models. Although these models were developed independently, they are integrated here to support future CI/CD.

This notebook has the following sections:
1.   Loading Libraries
2.   Data Loading
3.   Loading Glove Embeddings
4.   Suicide Classification Model
5.   Pickling the Suicide Classification Model
6.   Data Cleaning for Sentiment Model
7.   Custom Made TextBlob, Vader & WorryWords
8.   Developing a sample py file for testing purpose for integration



## Loading Libraries

In [None]:
pip install vaderSentiment emoji textblob

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, vaderSentiment
Successfully installed emoji-2.14.0 vaderSentiment-3.3.2


In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dropout, Bidirectional, LSTM, Dense, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.initializers import GlorotUniform

from tensorflow.keras.models import Model


from sklearn.metrics import accuracy_score, classification_report


from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from collections import Counter
import emoji  # Make sure to import the emoji module

# Download VADER lexicon if not already done
nltk.download('vader_lexicon')
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Download the SoulGuard dataset and keep a reproducible (random_state=42) sample of 200,000 rows.
SuicideDetection = pd.read_csv('https://media.githubusercontent.com/media/ArunKoundinya/SoulGuard/refs/heads/master/data/SoulG_Update.csv')
SuicideDetection = SuicideDetection.sample(n=200000, random_state=42)
# Features: the 'cleaned_text' column, cast to str so every value handed to the tokenizer is a string.
X = SuicideDetection['cleaned_text']
X = X.astype(str)
# Labels: the 'class' column encoded as integer codes by pd.factorize.
# NOTE(review): factorize numbers classes in order of first appearance, so which class
# becomes 0 vs 1 depends on the sampled row order — confirm against downstream use.
y = SuicideDetection['class']
y = y.astype(str)
y = pd.factorize(y)[0]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The vocabulary is fitted on the training texts only; unseen words map to the "<UNK>" token.
tokenizer = Tokenizer(oov_token="<UNK>",)
tokenizer.fit_on_texts(X_train)

# Add an explicit '<PAD>' entry at index 0 so the saved vocabulary also names the padding id.
tokenizer.word_index['<PAD>'] = 0

X_sequences_train = tokenizer.texts_to_sequences(X_train)
X_sequences_test = tokenizer.texts_to_sequences(X_test)

# Pad at the end ('post') to a fixed length of 100 tokens.
X_train_padded = pad_sequences(X_sequences_train, padding='post', maxlen=100)
X_test_padded = pad_sequences(X_sequences_test, padding='post', maxlen=100)

# Size of the vocabulary including the '<PAD>' entry added above; used as the embedding input_dim.
vocab_size = len(tokenizer.word_index)

In [None]:
def load_embeddings(glove_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each usable line has the form ``word v1 v2 ... vn``. Blank lines and lines
    that contain a word but no coefficients are skipped instead of aborting the
    whole load with an unpacking error.

    Args:
        glove_path: Path to the UTF-8 encoded embedding text file.

    Returns:
        dict mapping each word to a 1-D float32 ``numpy.ndarray``.
    """
    embedding_index = {}
    with open(glove_path, encoding="utf8") as glove_file:
        for line in glove_file:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Blank line or a token without any coefficients: nothing to store.
                continue
            word, coefs = parts
            embedding_index[word] = np.fromstring(coefs, "f", sep=" ")
    return embedding_index

def create_embedding_matrix(embedding_index, word2idx, vocab_size, embedding_dim):
    """Build the ``(vocab_size, embedding_dim)`` weight matrix for an Embedding layer.

    Row ``i`` holds the pre-trained vector of the word whose index is ``i``.
    Rows stay all-zero when the word has no pre-trained vector or when the stored
    vector does not have exactly ``embedding_dim`` entries (e.g. a malformed line
    in the embedding file).

    Args:
        embedding_index: Mapping ``word -> 1-D vector``.
        word2idx: Mapping ``word -> row index``.
        vocab_size: Number of rows of the matrix.
        embedding_dim: Expected length of every vector.

    Returns:
        ``numpy.ndarray`` of shape ``(vocab_size, embedding_dim)`` without NaNs.

    Raises:
        IndexError: If an index in ``word2idx`` is outside ``vocab_size``.
    """
    mat = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2idx.items():
        vector = embedding_index.get(word)
        if vector is None:
            # Out-of-vocabulary for the pre-trained set: explicit zero row.
            mat[idx] = 0.0
            continue
        vector = np.asarray(vector)
        if vector.shape != (embedding_dim,):
            # A wrong-length vector would either raise or (length 1) silently
            # broadcast across the row; treat it as missing instead.
            mat[idx] = 0.0
            continue
        mat[idx] = vector
    # Pre-trained vectors may themselves contain NaNs; neutralise them.
    mat[np.isnan(mat)] = 0
    return mat

# Mount Google Drive (Colab) so the pre-trained embedding file stored there can be read.
drive.mount('/content/drive')
# NOTE(review): the file name refers to the GloVe Twitter 27B 200-d vectors although the
# containing folder is called glove.6B — confirm this is the intended file.
glove_path = f"/content/drive/My Drive/MSIS/IntroductiontoDeepLearning/Project/glove.6B/glove.twitter.27B.200d.txt"
embedding_index = load_embeddings(glove_path)

# Align the pre-trained vectors with the tokenizer's word -> index mapping.
word2idx = tokenizer.word_index
# Must equal the vector length stored in the embedding file (200d per the file name).
embedding_dim = 200
embedding_matrix = create_embedding_matrix(embedding_index, word2idx, vocab_size, embedding_dim)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Persist the tokenizer's word -> index mapping (including the added '<PAD>' entry) to Drive;
# the Testing section reloads this file as `vocab_dict` to encode new text.
with open('/content/drive/MyDrive/DeepLearning/Capstone-SoulGuard/vocab_dict.pkl', 'wb') as f:
  pickle.dump(tokenizer.word_index, f)

In [None]:
# Input layer: one sequence of 100 token ids per example (same length as the padding above).
inputs = Input(shape=(100,))

# Frozen embedding initialised from the pre-trained matrix (trainable=False keeps it fixed).
# NOTE(review): output_dim is hard-coded to 200 rather than reusing `embedding_dim`;
# `input_length` may be deprecated in newer Keras releases — confirm for the installed version.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=200, input_length=100, weights=[embedding_matrix], trainable=False)(inputs)
# Three stacked bidirectional LSTMs (16, 8, 4 units), each returning the full sequence.
bilstm = Bidirectional(LSTM(16, activation='tanh', return_sequences=True))(embedding_layer)
bilstm = Bidirectional(LSTM(8, activation='tanh', return_sequences=True))(bilstm)
bilstm = Bidirectional(LSTM(4, activation='tanh', return_sequences=True))(bilstm)
# Flatten the per-timestep outputs and classify with a small dense head.
flatten = Flatten()(bilstm)
dense = Dense(16, activation="relu")(flatten)
dense = Dense(4, activation="relu")(dense)
# Single sigmoid unit: binary classification output.
outputs = Dense(1, activation='sigmoid')(dense)

# Build the model
model_lstm_bi_embed = Model(inputs=inputs, outputs=outputs)

model_lstm_bi_embed.build(input_shape=(None, 100))  # Batch size unspecified

optimizer = Adam(learning_rate=0.001)
model_lstm_bi_embed.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): validation_data is the held-out test split, so the reported val_* metrics
# are measured on the test set. EarlyStopping / ReduceLROnPlateau are imported in this
# notebook but no callbacks are passed here.
model_lstm_bi_embed.fit(X_train_padded, y_train, epochs=10, validation_data=(X_test_padded, y_test))




Epoch 1/10
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 31ms/step - accuracy: 0.8868 - loss: 0.2764 - val_accuracy: 0.9281 - val_loss: 0.1872
Epoch 2/10
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 31ms/step - accuracy: 0.9343 - loss: 0.1708 - val_accuracy: 0.9347 - val_loss: 0.1684
Epoch 3/10
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 31ms/step - accuracy: 0.9399 - loss: 0.1560 - val_accuracy: 0.9360 - val_loss: 0.1672
Epoch 4/10
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 31ms/step - accuracy: 0.9448 - loss: 0.1455 - val_accuracy: 0.9367 - val_loss: 0.1647
Epoch 5/10
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 31ms/step - accuracy: 0.9487 - loss: 0.1347 - val_accuracy: 0.9343 - val_loss: 0.1748
Epoch 6/10
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 31ms/step - accuracy: 0.9519 - loss: 0.1256 - val_accuracy: 0.9353 - val_loss: 0.174

<keras.src.callbacks.history.History at 0x7ac834576ce0>

In [None]:
# Save the trained classifier to the same Drive folder as vocab_dict.pkl: the Testing
# section loads the model from this exact path, so writing it to the working directory
# would leave that cell reading a stale or missing file after a fresh run.
with open('/content/drive/MyDrive/DeepLearning/Capstone-SoulGuard/suicide_detection_model.pkl', 'wb') as f:
    pickle.dump(model_lstm_bi_embed, f)

## Sentiment Model


In [None]:
# Define stopwords and lemmatizer
# Corpus-specific filler/noise tokens removed in addition to NLTK's English stopwords.
# (Entries repeated in the literal are harmless: this is a set.)
custom_stopwords = {'knew', 'become', 'way', 'put', 'took', 'im', 'etc', 'went', 'got', 'yet',
                    'literally', 'na', 'even', 'gon', 'id', 'wan', 'due', 'instead', 've',
                    't', 'hes', 'ket', 'lot', 'ask', 'many', 'u', 'ni', 'cum', 'basically',
                    'cecil', 'tell', 'stuff', 'use', 'put', 'seem', 'yet', 'yeah', 'done', 'im',
                    'least', 'eve', 'let', 'may', 'actually', 'lol', 'cake', 'give',
                    'ta', 'na', 'give', 'got', 'something', 'like', 'ive', 'ye', 'filler', 'fillerfiller','ampx200b','gtpoplt',
                    'pog', 'penis', 'bacon', 'bruh', 'corn', 'title', 'discochocolate', 'fuck', 'sus', 'gtbyelt', 'as', 'gt', 'lt',
                    'pop', 'amp', 'ampx200b', 'gt', 'jake', 'paul', 'cheese', 'x200b','ur','1','cum', 'brazil'}

# Full removal list: NLTK English stopwords plus the custom tokens above.
stop_en = set(stopwords.words('english')).union(custom_stopwords)

# Negations and a few auxiliaries are taken back out of the removal list so they stay in the text.
# NOTE(review): custom_preprocess replaces apostrophes with spaces before tokenizing, so the
# apostrophe forms listed here (e.g. "don't") will not reach the stopword filter intact.
stop_words = set(stop_en) - { 'not', 'no', 'couldn', "couldn't", "wouldn't", "shouldn't", "isn't",
                                                "aren't", "wasn't", "weren't", "don't", "doesn't", "hadn't", "hasn't",
                                                 "won't", "can't", "mightn't","needn't","nor","shouldn","should've","should",
                                                 "weren","wouldn","mustn't","mustn","didn't","didn","doesn","did","does","hadn",
                                                 "hasn","haven't","haven","needn","shan't"}

# Shared lemmatizer instance used by custom_preprocess.
lemmatizer = WordNetLemmatizer()

# Function to preprocess the text
def custom_preprocess(text):
    """Clean one raw post for the sentiment pipeline and return it as a single string.

    Steps, in order: blank out URLs, listed punctuation/special characters, long
    binary-digit runs and characters repeated six or more times; lowercase and
    tokenize; drop tokens of 26+ characters and tokens in ``stop_words``;
    lemmatize the survivors as verbs; join with single spaces.
    """
    # Every pattern is replaced by one space, applied in this exact order.
    blanking_patterns = (
        r'http\S+|www\S+|https\S+',                    # URLs
        r'[!@#$%^&*()\-={}[\]\\|;:"\'<>,.?/`~]+',      # special characters / punctuation
        r'\b[01]{10,}\b',                              # long binary numbers
        r'(.)\1{5,}',                                  # same character repeated more than 5 times
    )
    for pattern in blanking_patterns:
        text = re.sub(pattern, ' ', text)

    cleaned = []
    for token in word_tokenize(text.lower()):
        # Skip over-long tokens (more than 25 characters) and stopwords.
        if len(token) >= 26 or token in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(token, pos='v'))

    return ' '.join(cleaned)

# Clean the raw 'text' column with custom_preprocess and store the result in a new
# 'cleaned_text1' column; the existing 'cleaned_text' column is left as it is.
SuicideDetection['cleaned_text1'] = SuicideDetection['text'].apply(custom_preprocess)


In [None]:
# WorryWords lexicon: read the CSV from Drive, keep only the terms whose 'Mean' is above 0,
# and map each term to its mean score.
# NOTE(review): calculation_custom_score divides the "worry" average by 3 and flips its sign,
# which presumably assumes 'Mean' is at most 3 — confirm against the lexicon's documentation.
Worry= pd.read_csv("/content/drive/MyDrive/DeepLearning/Capstone-SoulGuard/worrywords-v1.csv")
Worry = Worry[Worry['Mean']>0]
worrywords_dict = dict(zip(Worry['Term'],Worry['Mean']))

# Hand-written domain lexicon: word -> score in [-1.0, 1.0]; negative values mark
# distress-related words, positive values mark supportive/positive words.
custom_lexicon = {
    'suicide': -1.0, 'depression': -1.0, 'hurt': -0.8, 'pain': -0.8, 'loneliness': -0.8, 'struggle': -0.5, 'failure': -0.6, 'hope': 0.8, 'help': 0.7,
    'love': 0.9, 'support': 0.8, 'peace': 0.6, 'family': 0.7, 'friend': 0.8, 'happy': 1.0, 'life': 0.5, 'future': 0.5, 'escape': -0.4, 'numb': -0.6,
    'scared': -0.5, 'broken': -0.8, 'lost': -0.7, 'anxious': -0.5,
    'kill': -0.9, 'stop': -0.4, 'abuse': -0.9, 'guilty': -0.6, 'commit': -0.5, 'fake': -0.5, 'dead': -0.8, 'stress': -0.6, 'depress': -0.9, 'fail': -0.7,
    'death': -1.0, 'lose': -0.5, 'fear': -0.6, 'scar': -0.4, 'angry': -0.7, 'trauma': -0.8, 'cruel': -0.8, 'poison': -0.8, 'unlovable': -0.9,
    'lonely': -0.8, 'mistake': -0.5, 'destroy': -0.8, 'miserable': -0.9, 'mess': -0.4, 'die': -1.0, 'cry': -0.6, 'tear': -0.5, 'guilt': -0.6,
    'threat': -0.7, 'hopeless': -1.0, 'despair': -0.9, 'misery': -0.9, 'sorrow': -0.8, 'grief': -0.8, 'worthless': -0.9, 'anxiety': -0.7, 'upset': -0.5,
    'panic': -0.6, 'rage': -0.8, 'distress': -0.7, 'shattered': -0.9, 'inadequate': -0.7, 'rejected': -0.8, 'unloved': -0.9, 'cursed': -0.8,
    'burdened': -0.8, 'restless': -0.4, 'toxic': -0.8, 'suffer': -0.8, 'isolate': -0.7, 'discourage': -0.5, 'frighten': -0.6, 'struggling': -0.7,
    'manipulate': -0.5, 'cheat': -0.5, 'waste': -0.6, 'resent': -0.5, 'regret': -0.6, 'grudge': -0.6, 'detest': -0.7, 'void': -0.8, 'wreck': -0.7,
    'mourn': -0.8
}

#  custom emoji lexicon
emoji_lexicon = {'😂': 0.7, '😔': -0.5, '😏': 0.2, '😝': 0.6, '😘': 0.8, '❤': 0.9, '😳': 0.0, '😎': 0.6, '🥴': -0.3, '🙄': -0.1, '😭': -0.9,
                 '😬': -0.4, '🤭': 0.3, '😩': -0.6, '🤔': 0.0, '🥰': 0.9, '😀': 1.0, '🤗': 0.8, '😡': -0.8, '🤧': -0.6, '😐': 0.0,
                 '😁': 0.6, '😊': 0.7, '♥': 0.9, '😠': -0.7, '🥵': -0.5, '💜': 0.9, '💙': 0.8, '😈': -0.3, '💃': 0.5, '😍': 1.0,
                 '💕': 0.9, '🤯': -0.2, '🥳': 0.9, '😻': 1.0, '😤': -0.4, '🤣': 0.8, '😥': -0.7, '😖': -0.7, '🙂': 0.4, '😞': -0.8,
                 '😓': -0.6, '😪': -0.6}

def extract_emojis(text):
    """Return, in order of appearance, every character of ``text`` that ``emoji.is_emoji`` accepts."""
    found = []
    for character in text:
        if emoji.is_emoji(character):
            found.append(character)
    return found

In [None]:
# Function for calculating score for custom words
def calculation_custom_score(filter, customdict, text):
    """Average the lexicon scores of all entries of ``customdict`` found in ``text``.

    Args:
        filter: Kind of lexicon being applied.
            ``"worry"``: the mean is divided by 3 and its sign is flipped.
            ``"emoji"``: candidates are the individual emoji characters of
            ``text`` (via ``extract_emojis``) rather than word tokens.
            Any other value: plain mean over matching word tokens.
        customdict: Mapping from term (or emoji character) to numeric score.
        text: Raw input text.

    Returns:
        The mean score of the matched entries, or ``0`` when nothing matches.
    """
    if filter == "emoji":
        # Look emojis up character by character: a word tokenizer does not reliably
        # isolate an emoji that is attached to a word, to another emoji, or to a
        # variation selector, so token-level lookup would miss those.
        candidates = extract_emojis(text)
    else:
        candidates = word_tokenize(text.lower())

    scores = [customdict[item] for item in candidates if item in customdict]
    if not scores:
        return 0

    score = sum(scores) / len(scores)
    if filter == "worry":
        # Rescale the worry mean and invert it so that more worry lowers the score.
        score = score / 3
        score = score * -1
    return score

In [None]:
# Module-level VADER analyzer, created once and reused by every call below.
vader_analyzer = SentimentIntensityAnalyzer()


def hybrid_sentiment_analysis_worry(text):
    """Blend five sentiment signals for ``text`` into one score rescaled to 0..1.

    Signals and weights: TextBlob polarity (0.10), VADER compound (0.50),
    custom lexicon (0.10), emoji lexicon (0.10) and WorryWords (0.20).
    The weighted sum ``s`` is mapped to ``(s + 1) / 2``.
    """
    polarity = TextBlob(text).sentiment.polarity
    compound = vader_analyzer.polarity_scores(text)['compound']

    lexicon_part = calculation_custom_score("custom", custom_lexicon, text)
    worry_part = calculation_custom_score("worry", worrywords_dict, text)
    emoji_part = calculation_custom_score("emoji", emoji_lexicon, text)

    # Weighted blend; term order is kept fixed so the floating-point result is stable.
    blended = (polarity * 0.10 + compound * 0.50 +
               lexicon_part * 0.10 + emoji_part * 0.10 + worry_part * 0.20)

    # Shift and halve: -1 maps to 0, +1 maps to 1.
    return (blended + 1) / 2

In [None]:
SuicideDetection['text'][100]

'Check out this new horror short I made https://youtu.be/S50McngM1ws it took me a while to make so hope you enjoy it'

## Testing

In [None]:
# Reload the persisted classifier and vocabulary from Drive. The files are opened with
# context managers so the handles are closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code contained in the file — only load
# artifacts that were produced by this project.
with open("/content/drive/MyDrive/DeepLearning/Capstone-SoulGuard/suicide_detection_model.pkl", 'rb') as model_file:
    model_bistm_pretrained = pickle.load(model_file)
with open('/content/drive/MyDrive/DeepLearning/Capstone-SoulGuard/vocab_dict.pkl', 'rb') as vocab_file:
    vocab_dict = pickle.load(vocab_file)


In [None]:
import numpy as np
import pandas as pd
import pickle
import warnings
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess(text):
    """Normalise raw text for the classifier and return it as one space-joined string.

    The text is lowercased, stripped of every ``string.punctuation`` character,
    tokenized, filtered against ``stop_words`` and lemmatized with the
    lemmatizer's default part of speech.
    """
    without_punctuation = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(without_punctuation)
    kept = [token for token in tokens if token not in stop_words]
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in kept)

# Stopword list used by preprocess(): NLTK English stopwords minus negations and a few
# auxiliaries, so those words are kept in the text.
# NOTE(review): this reassigns the module-level `stop_words` that the sentiment cleaning
# function custom_preprocess also reads, and this version does not include
# `custom_stopwords` — re-running custom_preprocess after this cell uses this smaller list.
# NOTE(review): preprocess() strips all punctuation (including apostrophes) before
# filtering, so the apostrophe forms listed here will not appear as tokens.
stop_words = set(stopwords.words('english')) - { 'not', 'no', 'couldn', "couldn't", "wouldn't", "shouldn't", "isn't",
                                                "aren't", "wasn't", "weren't", "don't", "doesn't", "hadn't", "hasn't",
                                                 "won't", "can't", "mightn't","needn't","nor","shouldn","should've","should",
                                                 "weren","wouldn","mustn't","mustn","didn't","didn","doesn","did","does","hadn",
                                                 "hasn","haven't","haven","needn","shan't"}

def process_sentence(sentence, vocab=None):
    """Encode a whitespace-separated sentence as a list of vocabulary ids.

    Args:
        sentence: Already-cleaned text; it is split on whitespace.
        vocab: Mapping ``word -> id`` containing an ``"<UNK>"`` entry. When omitted,
            the module-level ``vocab_dict`` is used (the original behaviour).

    Returns:
        List of ids, one per word; words missing from the vocabulary map to the
        id stored under ``"<UNK>"``.
    """
    if vocab is None:
        vocab = vocab_dict
    return [vocab[word] if word in vocab else vocab["<UNK>"]
            for word in sentence.split()]

def format_examples(data1, vocab_dict, maxlen):
    """Turn the ``'cleaned_text'`` column of ``data1`` into padded id sequences.

    Args:
        data1: DataFrame with a ``'cleaned_text'`` column of strings.
        vocab_dict: Mapping ``word -> id`` used for encoding (previously ignored
            in favour of the global of the same name).
        maxlen: Length every sequence is padded/truncated to (previously ignored
            in favour of a hard-coded 100).

    Returns:
        The array produced by ``pad_sequences`` with ``padding='post'``.
    """
    sequences_data = data1['cleaned_text'].apply(
        lambda sentence: process_sentence(sentence, vocab_dict)
    ).tolist()
    padded_sequences_data = pad_sequences(sequences_data, padding='post', maxlen=maxlen)
    return padded_sequences_data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# End-to-end check on one raw post.
# NOTE(review): `[100]` on this column is presumably a lookup by index label (the frame keeps
# its original labels after .sample), not "the 100th sampled row" — confirm, or use .iloc.
text = SuicideDetection['text'][100]

# Clean the text with the inference-time preprocess() defined above.
cleanedtext = preprocess(text)

# Wrap the single example in a DataFrame because format_examples reads the 'cleaned_text' column.
df = pd.DataFrame({
    'text' : [text],
    'cleaned_text': [cleanedtext]
})

# Encode and pad to 100 tokens, then score with the reloaded classifier.
X_input = format_examples(df, vocab_dict, 100)
prediction = model_bistm_pretrained.predict(X_input).astype(float)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


In [None]:
prediction.max()

0.9699795842170715

In [None]:
hybrid_sentiment_analysis_worry(text)

0.6605810606060606