In [None]:
# Mount Google Drive inside the Colab VM so files under /content/drive
# (including the dataset CSV read further down) become accessible.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install wordninja; it is imported below and used by clean_text to split run-together words.
!pip install wordninja

In [None]:
import re
import wordninja
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import gensim.downloader as api
# Fetch the NLTK resources this notebook relies on: 'stopwords' feeds
# stopwords.words('english') and 'wordnet' is the corpus behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Read the raw dataset from the mounted Drive folder, then preview the first rows.
data = pd.read_csv(
    '/content/drive/MyDrive/machine learning/Suicide_Detection.csv'
)
data.head()

In [None]:
# Keep only the two columns the pipeline uses and discard rows missing either one.
data = data[['text', 'class']].dropna(subset=['text', 'class'])

In [None]:
def remove_url(text):
    """Return ``text`` with every ``http``-prefixed run of non-whitespace characters deleted."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

In [None]:
def remove_mail(text):
    """Return ``text`` with every token that has non-whitespace on both sides of an ``@`` deleted."""
    mail_pattern = re.compile(r'\S+@\S+')
    return mail_pattern.sub('', text)

In [None]:
# Character class covering the supplementary-plane emoji/pictograph blocks the
# notebook strips, plus the BMP variation selectors U+FE00-U+FE0F.
# The original class only listed U+1FE00-U+1FE0F, so the VS16 that commonly
# trails an emoji (U+FE0F) was left behind in the text; that range is kept for
# backward compatibility and the real selector range is added next to it.
_EMOJI_PATTERN = re.compile(
    r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F'
    r'\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F'
    r'\U0001FA70-\U0001FAFF\U0001FB00-\U0001FBFF\U0001FE00-\U0001FE0F\uFE00-\uFE0F\U0001F004]+'
)


def remove_emoji(text):
    """Delete emoji code points and emoji variation selectors from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed; all other characters are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Shared helpers for clean_text: the English stop-word list as a set (for fast
# membership tests) and a single reusable WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    """Normalise one raw post into a space-separated string of lemmas.

    Steps, in order: strip URLs, e-mail addresses and emoji; lower-case;
    collapse whitespace; delete every character that is not an ASCII letter
    or whitespace; split run-together words with wordninja; drop English
    stop words; lemmatize; drop results of length one.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # Run the three scrubbers defined in this notebook in their original order.
    for scrub in (remove_url, remove_mail, remove_emoji):
        text = scrub(text)

    collapsed = re.sub(r'\s+', ' ', text.lower()).strip()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', collapsed)

    kept = []
    for token in letters_only.split():
        # wordninja breaks concatenated tokens into their likely component words.
        for piece in wordninja.split(token):
            if piece in stop_words:
                continue
            lemma = lemmatizer.lemmatize(piece)
            # The length filter applies to the lemma, not the original piece.
            if len(lemma) > 1:
                kept.append(lemma)

    return ' '.join(kept)


In [None]:
# Replace every entry of the 'text' column with its cleaned form.
data['text'] = data['text'].map(clean_text)

In [None]:
# Fit a LabelEncoder on the 'class' column and overwrite the column with the encoded labels.
label_encoder = LabelEncoder().fit(data['class'])
data['class'] = label_encoder.transform(data['class'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# max_words is passed to the Tokenizer as num_words and sizes the embedding matrix.
max_words = 10_000
# max_length is the length every sequence is padded/truncated to.
max_length = 100

In [None]:
# Build the vocabulary from the training split only, so test text does not leak into it.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=max_words,
)
tokenizer.fit_on_texts(train_data['text'])

In [None]:
# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [None]:
# Reload the tokenizer from disk. NOTE: pickle.load executes arbitrary code,
# so only ever load a tokenizer.pkl that this notebook (or a trusted source) wrote.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Convert the cleaned text of each split into lists of integer token ids.
X_train, X_test = (
    tokenizer.texts_to_sequences(split['text'])
    for split in (train_data, test_data)
)

In [None]:
# Bring every sequence to exactly max_length ids: short ones are padded at the
# end and long ones are cut at the end.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (X_train, X_test)
)

In [None]:
# Pull the encoded labels out of each split as plain arrays.
y_train, y_test = train_data['class'].values, test_data['class'].values

In [None]:
# Embedding configuration: the table has one row per tokenizer id (max_words)
# and EMBEDDING_DIM columns, matching the 100-dimensional GloVe vectors loaded below.
MAX_NUM_WORDS = max_words
EMBEDDING_DIM = 100

# Download (or load from the gensim cache) the pre-trained GloVe vectors.
glove_gensim = api.load('glove-wiki-gigaword-100')

In [None]:
# Create the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. Rows start as zeros, so ids without a GloVe vector
# (and any id never assigned) simply stay all-zero.
gensim_weight_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

for word, index in tokenizer.word_index.items():
    # Ids at or beyond MAX_NUM_WORDS have no row in the matrix.
    if index >= MAX_NUM_WORDS:
        continue
    # key_to_index is a dict, so this membership test is O(1); the previous
    # check against the index_to_key list rescanned the whole GloVe vocabulary
    # for every word.
    if word in glove_gensim.key_to_index:
        gensim_weight_matrix[index] = glove_gensim[word]

In [None]:
# Frozen GloVe embedding -> bidirectional LSTM -> global max pooling ->
# L2-regularised dense head with dropout -> single sigmoid output.
#
# The embedding layer is created without the legacy `weights=` / `input_length=`
# constructor arguments: `input_length` is deprecated in Keras 3 and `weights=`
# is not accepted by every Keras 3 release. An explicit Input fixes the sequence
# length instead, and the pre-trained matrix is copied in with set_weights once
# the model (and therefore the layer) has been built.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=MAX_NUM_WORDS,
    output_dim=EMBEDDING_DIM,
    trainable=False,  # keep the pre-trained vectors fixed during training
)

model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_length,), dtype='int32'),
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Load the GloVe matrix into the (now built) embedding layer.
embedding_layer.set_weights([gensim_weight_matrix])

In [None]:
# Configure training: binary cross-entropy matches the single sigmoid output,
# optimised with Adam and tracked by accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Save the model to disk only when validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='best_suicidal_detection_model.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train for 10 epochs, holding out 20% of the training data for validation;
# the checkpoint callback writes the best model to disk as training proceeds.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=2,
)

In [None]:
# Reload the checkpointed model written during training and score it on the
# held-out test split.
best_model = tf.keras.models.load_model(
    'best_suicidal_detection_model.keras'
)
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy of Best Model: {accuracy:.2f}")