In [28]:
from google.colab import drive
drive.mount('/content/drive')

!mkdir /content/output
!cp -rf '/content/drive/MyDrive/AI Suicide Detection Project/input' /content



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/content/output’: File exists


In [29]:
!pip install neattext
!pip install tensorflow==2.16.1

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
import plotly.express as plx
from sklearn.metrics import classification_report
import keras
from keras.layers import Embedding,Dense,LSTM,GlobalMaxPooling1D,Input
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm



In [30]:
data=pd.read_csv('/content/input/Suicide_Detection.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [31]:
train_data,test_data=train_test_split(data,test_size=0.2,random_state=10)

In [32]:
print(f'Training data lenght: {len(train_data)}')
print(f'Testing data lenght: {len(test_data)} {test_data["class"].value_counts().tolist()[0]}:{test_data["class"].value_counts().tolist()[1]}')

Training data lenght: 185659
Testing data lenght: 46415 23209:23206


# **Data Visualisation**

In [33]:
plx.bar(train_data,
        x=train_data['class'].value_counts().index.values,
        y=train_data['class'].value_counts().tolist(),
        color=['Suicide','Not Suicide'])

# **Data Cleaning**

In [34]:
def clean_text(text):
    text_length=[]
    cleaned_text=[]
    for sent in tqdm(text):
        sent=sent.lower()
        sent=nfx.remove_special_characters(sent)
        sent=nfx.remove_stopwords(sent)
        text_length.append(len(sent.split()))
        cleaned_text.append(sent)
    return cleaned_text,text_length

In [35]:
cleaned_train_text,train_text_length=clean_text(train_data.text)
cleaned_test_text,test_text_length=clean_text(test_data.text)

100%|██████████| 185659/185659 [00:42<00:00, 4374.75it/s]
100%|██████████| 46415/46415 [00:05<00:00, 7760.10it/s] 


In [36]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(cleaned_train_text)

train_text_seq=tokenizer.texts_to_sequences(cleaned_train_text)
train_text_pad=pad_sequences(train_text_seq,maxlen=50)

test_text_seq=tokenizer.texts_to_sequences(cleaned_test_text)
test_text_pad=pad_sequences(test_text_seq,maxlen=50)

# **Glove Embeddings**

In [37]:
lbl_target=LabelEncoder()
train_output=lbl_target.fit_transform(train_data['class'])
test_output=lbl_target.transform(test_data['class'])

In [38]:
import pickle
with open('/content/input/glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

In [39]:
v=len(tokenizer.word_index)

embedding_matrix=np.zeros((v+1,300), dtype=float)
for word,idx in tokenizer.word_index.items():
    embedding_vector=glove_embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx]=embedding_vector

In [40]:
early_stop=EarlyStopping(patience=5)
reducelr=ReduceLROnPlateau(patience=3)

# **Keras Sequential Model Construction**

In [41]:
model=Sequential()
model.add(Input(shape=(50,))) # Modified the input shape to match the actual data
model.add(Embedding(v+1,300,weights=[embedding_matrix],trainable=False))
model.add(LSTM(20,return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer=keras.optimizers.SGD(0.1,momentum=0.09),loss='binary_crossentropy',metrics=['accuracy'])

In [42]:
model.summary()

# **Model Training and Evaluation**

In [43]:
r=model.fit(train_text_pad,train_output,validation_data=(test_text_pad,test_output),
            epochs=20,batch_size=500,callbacks=[early_stop,reducelr])

Epoch 1/20
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 188ms/step - accuracy: 0.7198 - loss: 0.5303 - val_accuracy: 0.8871 - val_loss: 0.2818 - learning_rate: 0.1000
Epoch 2/20
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 199ms/step - accuracy: 0.8831 - loss: 0.2878 - val_accuracy: 0.8825 - val_loss: 0.2880 - learning_rate: 0.1000
Epoch 3/20
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 196ms/step - accuracy: 0.8972 - loss: 0.2586 - val_accuracy: 0.9067 - val_loss: 0.2362 - learning_rate: 0.1000
Epoch 4/20
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 186ms/step - accuracy: 0.9040 - loss: 0.2412 - val_accuracy: 0.9023 - val_loss: 0.2421 - learning_rate: 0.1000
Epoch 5/20
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 192ms/step - accuracy: 0.9088 - loss: 0.2308 - val_accuracy: 0.9135 - val_loss: 0.2197 - learning_rate: 0.1000
Epoch 6/20
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [44]:
# Run the model on test data and test accuracy

predictions = model.predict(test_text_pad)
predicted_classes = [1 if value > 0.5 else 0 for value in predictions]


print(classification_report(test_output, predicted_classes, target_names=lbl_target.inverse_transform([0,1])))

[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step
              precision    recall  f1-score   support

 non-suicide       0.92      0.94      0.93     23209
     suicide       0.94      0.91      0.93     23206

    accuracy                           0.93     46415
   macro avg       0.93      0.93      0.93     46415
weighted avg       0.93      0.93      0.93     46415



In [45]:
# Test the model
posts = ['i am Smiling', 'Through these past years thoughts of suicide, fear, anxiety I’m so close to my limit', 'Hello my name is abdallah i am from egypt']
posts = tokenizer.texts_to_sequences(posts)
posts = pad_sequences(posts, maxlen=50)

predictions = model.predict(posts)
print(predictions)
predicted_classes = [1 if value > 0.5 else 0 for value in predictions]

print(predicted_classes)
string_prediction = ["Suicide" if value == 1 else "Non Suicide" for value in predicted_classes]
print(string_prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[[0.20670825]
 [0.86885995]
 [0.01206977]]
[0, 1, 0]
['Non Suicide', 'Suicide', 'Non Suicide']


In [49]:
# Save the tokenizer
tokenizer_json = tokenizer.to_json()
with open("/content/output/tokenizer.json", "w") as json_file:
    json_file.write(tokenizer_json)

In [47]:
# Save the model
model.save("/content/output/model.keras")

In [50]:
# copy the model and tokenizer to google drive
!cp -rf /content/output '/content/drive/MyDrive/AI Suicide Detection Project/'