In [19]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
import plotly.express as plx
from sklearn.metrics import classification_report
import keras
from keras.layers import Embedding,Dense,LSTM,GlobalMaxPooling1D,Input
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

In [20]:
data=pd.read_csv('Suicide_Detection.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [3]:
data['class'].value_counts()

suicide        116037
non-suicide    116037
Name: class, dtype: int64

In [21]:
data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

In [22]:
train_data,test_data=train_test_split(data,test_size=0.2,random_state=10)

In [23]:
train_data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

# **Data Visualisation**

In [24]:
plx.bar(train_data,x=train_data['class'].value_counts().index.values,
        y=train_data['class'].value_counts(),color=['Suicide','Not Suicide'])

# **Data Cleaning**

In [25]:
def clean_text(text):
    text_length=[]
    cleaned_text=[]
    for sent in tqdm(text):
        sent=sent.lower()
        sent=nfx.remove_special_characters(sent)
        sent=nfx.remove_stopwords(sent)
        text_length.append(len(sent.split()))
        cleaned_text.append(sent)
    return cleaned_text,text_length

In [26]:
cleaned_train_text,train_text_length=clean_text(train_data.text)
cleaned_test_text,test_text_length=clean_text(test_data.text)

100%|███████████████████████████████████████████████████████████████████████| 185659/185659 [00:26<00:00, 7003.54it/s]
100%|█████████████████████████████████████████████████████████████████████████| 46415/46415 [00:06<00:00, 6852.90it/s]


In [10]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(cleaned_train_text)

In [28]:
cleaned_train_text

['hey east cost ya guys doin whats snow like',
 'ate edible ate edible looking fun stuff sitting suggestions',
 'helplessness havelast week ive realised dont like cant stsy happy long dont feel like belong dont know want know people miss die thats stops dont know im meant knowing thatthe times recently ive felt close happy im drinking following day regardless happens feel worse suicidal feelings worse',
 'want win election labeled discussion fight insult personally im hoping trump fact helps expect vote biden helped',
 'toothpicksso thing cant lie doctors family basically push door doctors office came tumbling feel world better lives people im better leaves doctor contacting family saying im allowed things case needed supervision pretty predictable know knifes rope razor blades etc toothpicks mean im flattered think creative kill toothpick honestly feels excessive forcing bubble wrapped getting help making feel worse pressure puts family gonna push guilt able things away isnt fair',
 '

In [29]:
train_text_seq=tokenizer.texts_to_sequences(cleaned_train_text)
train_text_pad=pad_sequences(train_text_seq,maxlen=50)


test_text_seq=tokenizer.texts_to_sequences(cleaned_test_text)
test_text_pad=pad_sequences(test_text_seq,maxlen=50)

In [31]:
train_text_pad

array([[   0,    0,    0, ...,  176, 3027,    3],
       [   0,    0,    0, ...,  163,  508, 1642],
       [   0,    0,    0, ...,   77,  240,   96],
       ...,
       [   0,    0,    0, ...,  328,    2,    4],
       [   0,    0,    0, ...,   65,   26,   16],
       [   4,   46,   25, ...,    2,    4,   16]])

# **Glove Embeddings**

In [34]:
lbl_target=LabelEncoder()
train_output=lbl_target.fit_transform(train_data['class'])
test_output=lbl_target.transform(test_data['class'])

In [33]:
import pickle
with open('glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

In [37]:
v=len(tokenizer.word_index)

embedding_matrix=np.zeros((v+1,300), dtype=float)
for word,idx in tokenizer.word_index.items():
    embedding_vector=glove_embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx]=embedding_vector

In [38]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.074482  ,  0.58293003, -0.78233999, ..., -0.24984001,
        -0.096953  ,  0.66692001],
       [-0.35394999,  0.23051   , -0.62689   , ..., -0.20720001,
         0.52003002,  0.51129001],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.29547   , -0.21822999, -0.039817  , ...,  0.62642998,
         0.48798001, -0.47554001],
       [ 0.75085002, -0.35099   ,  0.37674999, ..., -0.066863  ,
         0.79632998, -0.05967   ]])

In [39]:
early_stop=EarlyStopping(patience=5)
reducelr=ReduceLROnPlateau(patience=3)

# **Keras Sequential Model Construction**

In [40]:
model=Sequential()
model.add(Input(shape=(40,)))
model.add(Embedding(v+1,300,weights=[embedding_matrix],trainable=False))
model.add(LSTM(20,return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer=keras.optimizers.SGD(0.1,momentum=0.09),loss='binary_crossentropy',metrics=['accuracy'])

In [41]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 300)           81560700  
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 20)            25680     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 20)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               5376      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 81,592,013
Trainable params: 31,313
Non-trainable params: 81,560,700
_________________________________________________________________


# **Model Training and Evaluation**

In [42]:
r=model.fit(train_text_pad,train_output,validation_data=(test_text_pad,test_output),
            epochs=20,batch_size=256,callbacks=[early_stop,reducelr])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


In [44]:
print('TESTING DATA CLASSIFICATION REPORT \n \n')
print(classification_report(test_output,model.predict_classes(test_text_pad),
                            target_names=lbl_target.inverse_transform([0,1])))

print('TRAINING DATA CLASSIFICATION REPORT \n \n')
print(classification_report(train_output,model.predict_classes(train_text_pad),
                           target_names=lbl_target.inverse_transform([0,1])))

TESTING DATA CLASSIFICATION REPORT 
 

              precision    recall  f1-score   support

 non-suicide       0.92      0.95      0.93     23209
     suicide       0.95      0.91      0.93     23206

    accuracy                           0.93     46415
   macro avg       0.93      0.93      0.93     46415
weighted avg       0.93      0.93      0.93     46415

TRAINING DATA CLASSIFICATION REPORT 
 

              precision    recall  f1-score   support

 non-suicide       0.95      0.94      0.94     92828
     suicide       0.94      0.95      0.94     92831

    accuracy                           0.94    185659
   macro avg       0.94      0.94      0.94    185659
weighted avg       0.94      0.94      0.94    185659



In [48]:
twt = ['i am happy']
twt = tokenizer.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=50)

prediction = model.predict(twt)[0][0]
print(prediction)

if(prediction > 0.5):
    print("Potential Suicide Post")
else:
    print("Non Suicide Post")

0.3648214
Non Suicide Post


In [49]:
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [50]:
model.save("model.h5")

In [51]:
token_form = pickle.load(open('tokenizer.pkl', 'rb'))

In [52]:
from keras.models import load_model

In [53]:
model_form = load_model("model.h5")

In [54]:

twt = ['Through these past years thoughts of suicide, fear, anxiety I’m so close to my limit']
twt = token_form.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=50)


prediction = model_form.predict(twt)[0][0]
print(prediction)

if(prediction > 0.5):
    print("Potential Suicide Post")
elif (prediction == 1):
    print("Non Suicide Post")

0.9583739
Potential Suicide Post
