In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: the 'stopwords' corpus
# (read via stopwords.words('english')) and 'wordnet' (presumably the data
# behind WordNetLemmatizer). nltk.download reports when a package is already
# up to date, so re-running this cell is harmless.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Read the raw dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='/Suicide_Detection.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [None]:
# Keep only the model input ('text') and the target ('class'), and discard
# rows where either is missing.
# This is done as a single non-inplace chain ending in an explicit copy:
# calling dropna(inplace=True) on a frame derived from another frame is the
# classic trigger for pandas' SettingWithCopyWarning, and the later
# `data['text'] = ...` assignment would be exposed to the same warning.
data = data[['text', 'class']].dropna(subset=['text', 'class']).copy()

In [None]:
# Shared preprocessing resources, built once and reused by clean_text.
# The stop-word list is stored as a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
  """Normalize a raw post into a space-separated string of lemmatized tokens.

  Steps, in order: lowercase the text, replace every run of non-letter
  characters with a single space, split on whitespace, drop stop words,
  lemmatize each remaining word, and drop tokens of length 1.

  Non-letters are replaced by a space rather than deleted. Deleting them
  fuses words that are separated only by punctuation ("help,please" would
  become "helpplease") and turns contractions into tokens that the stop-word
  filter no longer recognizes ("don't" would become "dont"). With a space,
  "don't" splits into "don" and "t", which the stop-word and single-character
  filters can then remove.

  Relies on the module-level ``stop_words`` (a collection supporting ``in``)
  and ``lemmatizer`` (an object with a ``lemmatize(word)`` method).

  Args:
    text: The raw text as a ``str``.

  Returns:
    The cleaned text as a single string; empty if no token survives.
  """
  text = text.lower()

  # After lowercasing, anything outside a-z (digits, punctuation, whitespace,
  # non-ASCII symbols) is a separator; collapse each run into one space.
  text = re.sub(r'[^a-z]+', ' ', text)

  # Tokenize on whitespace (also discards leading/trailing spaces).
  words = text.split()

  # Remove stop words before lemmatizing, as the original pipeline did.
  words = [word for word in words if word not in stop_words]

  # Lemmatize words
  words = [lemmatizer.lemmatize(word) for word in words]

  # Remove single characters (like "a", "b", or a stray "t" from "don't").
  words = [word for word in words if len(word) > 1]

  return ' '.join(words)

In [None]:
# Run every post through clean_text, one element at a time.
data['text'] = data['text'].map(clean_text)

In [None]:
# Learn the label vocabulary from the 'class' column, then replace the
# column with the encoder's integer codes. The fitted encoder is kept so the
# codes can be mapped back to the original labels later.
label_encoder = LabelEncoder().fit(data['class'])
data['class'] = label_encoder.transform(data['class'])

In [None]:
# Hold out 20% of the rows for final testing; the fixed random_state keeps
# the split the same from run to run.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding
# layer as input_dim.
max_words = 10_000
# Sequence length in tokens: every text is padded or truncated to this.
max_length = 100

In [None]:
# Build the word index from the training texts only, so the held-out test
# texts do not influence the vocabulary. Words outside the vocabulary are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=max_words,
)
tokenizer.fit_on_texts(train_data['text'])

In [None]:
# Turn each cleaned text into a list of integer word ids, train split first
# and test split second, using the tokenizer fitted on the training data.
X_train, X_test = (
    tokenizer.texts_to_sequences(split['text'])
    for split in (train_data, test_data)
)

In [None]:
# Force every sequence to exactly max_length ids: shorter ones are padded at
# the end, longer ones are cut at the end.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (X_train, X_test)
)

In [None]:
# Pull the encoded labels out of each split as plain arrays.
y_train, y_test = (
    split['class'].values
    for split in (train_data, test_data)
)

In [None]:
# Binary text classifier: embedding -> bidirectional LSTM -> max-pool over
# time -> three L2-regularized dense layers -> sigmoid output.
# The input shape is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which is deprecated in Keras 3 and only
# produces a warning there. Declaring the input still lets the model build
# immediately, so model.summary() works before training.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=64),
    # return_sequences=True keeps one output per time step for the pooling layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    # Single sigmoid unit: one probability per text, matching the
    # binary_crossentropy loss used at compile time.
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# single sigmoid output, and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Save the model to disk whenever the validation loss reaches a new minimum,
# so the file always holds the best epoch seen so far.
checkpoint = ModelCheckpoint(
    filepath='best_suicidal_detection_model.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train for up to 10 epochs, holding out 20% of the training data for
# validation; the checkpoint callback saves the best epoch by val_loss.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    verbose=2,
    callbacks=[checkpoint],
    validation_split=0.2,
)

Epoch 1/10

Epoch 1: val_loss improved from inf to 0.16334, saving model to best_suicidal_detection_model.keras
4642/4642 - 520s - 112ms/step - accuracy: 0.9214 - loss: 0.2713 - val_accuracy: 0.9495 - val_loss: 0.1633
Epoch 2/10

Epoch 2: val_loss improved from 0.16334 to 0.15444, saving model to best_suicidal_detection_model.keras
4642/4642 - 557s - 120ms/step - accuracy: 0.9578 - loss: 0.1415 - val_accuracy: 0.9514 - val_loss: 0.1544
Epoch 3/10

Epoch 3: val_loss did not improve from 0.15444
4642/4642 - 553s - 119ms/step - accuracy: 0.9658 - loss: 0.1185 - val_accuracy: 0.9504 - val_loss: 0.1567
Epoch 4/10

Epoch 4: val_loss improved from 0.15444 to 0.14809, saving model to best_suicidal_detection_model.keras
4642/4642 - 505s - 109ms/step - accuracy: 0.9708 - loss: 0.1042 - val_accuracy: 0.9561 - val_loss: 0.1481
Epoch 5/10

Epoch 5: val_loss did not improve from 0.14809
4642/4642 - 556s - 120ms/step - accuracy: 0.9758 - loss: 0.0898 - val_accuracy: 0.9528 - val_loss: 0.1556
Epoch 6/

In [None]:
# Reload the checkpointed model (best val_loss, not the last epoch) and
# score it on the held-out test split.
best_model = tf.keras.models.load_model('best_suicidal_detection_model.keras')
loss, accuracy = best_model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy of Best Model: {accuracy:.2f}")