In [None]:
!pip install transformers

In [9]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d nikhileswarkomati/suicide-watch

Downloading suicide-watch.zip to /content
100% 60.6M/60.6M [00:05<00:00, 15.0MB/s]
100% 60.6M/60.6M [00:05<00:00, 11.9MB/s]


In [7]:
import zipfile
zip_ref = zipfile.ZipFile('/content/suicide-watch.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [15]:
raw = pd.read_csv('/content/Suicide_Detection.csv')

In [16]:
df = raw.sample(10000 , random_state=29)

In [17]:
df= df.drop('Unnamed: 0',axis=1)

In [18]:
df['class'].replace({'suicide' : 1 , 'non-suicide' : 0}, inplace=True)

In [46]:
df

Unnamed: 0,text,class
213108,Nobody cares...Got to see / talk to my mom for...,1
214134,Lost and confusedSo this whole bout of depress...,1
136003,My life is falling apart and I don't know want...,1
192568,Mental BarrierHonestly I've stopped trying to ...,1
198908,I went ghost to focus on myself for a week and...,0
...,...,...
161542,I'm going through a lot stuff right now I'm go...,0
11809,I dont think I can keep goingmy ex and I recen...,1
175643,Bruh the girls that pull up their shorts and t...,0
69299,Anyone wanna chat rn ? Just vibin pm me if u w...,0


In [None]:
model = TFAutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [34]:
encoded = tokenizer(df['text'].tolist(), truncation=True, padding=True)
dataset = tf.data.Dataset.from_tensor_slices((encoded, df['class'].tolist()))

In [39]:
dataset

<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [58]:
BATCH_SIZE = 8

def order(inp, label):
  
    return {
        'input_ids': inp['input_ids'],
        'attention_mask': inp['attention_mask'],
        'token_type_ids': inp['token_type_ids']
    }, label



train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))

train_dataset = dataset.take(train_size).map(order)
val_dataset = dataset.skip(train_size).take(val_size).map(order)
test_dataset = dataset.skip(train_size + val_size).map(order)


train_dataset = train_dataset.map(order)
val_dataset = val_dataset.map(order)
test_dataset = test_dataset.map(order)


train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(1000)
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)


In [59]:
inp, out = next(iter(train_dataset)) 
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[  101,  3407,  4336, ...,     0,     0,     0],
       [  101, 10047,  5458, ...,     0,     0,     0],
       [  101,  2158,  6616, ...,     0,     0,     0],
       ...,
       [  101,  1045,  1005, ...,  2031,  1037,   102],
       [  101,  3531,  2831, ...,     0,     0,     0],
       [  101,  1045,  2196, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(8, 512), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>} 



In [60]:
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='sigmoid')
        
    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [61]:
classifier = BERTForClassification(model, num_classes=1)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)

In [62]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [63]:
classifier.evaluate(test_dataset)



[0.12009330093860626, 0.9620000123977661]

In [64]:
test_in = ["Today I felt good in the morning, everything was good, but in the evening, it rained, and as a result, I got stuck in the traffic; my life sucks",
           "Today, I felt good in the morning; everything was good, but in the evening, it rained, and as a result, I got stuck in traffic. My life sucks; I should end it; I should kill myself."]

In [115]:
inputs = tokenizer(test_in ,padding=True, truncation=True,
                  return_tensors='tf')
inputs

In [76]:
classifier.call(inputs)

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.00244915],
       [0.99890006]], dtype=float32)>

In [83]:
tf.keras.models.save_model(classifier, '/content/drive/MyDrive/Colab Notebooks/models')




In [85]:
classifier.save_weights('/content/drive/MyDrive/Colab Notebooks/models')

In [86]:
loaded_model = tf.keras.models.load_model('/content/models')

In [124]:
text = test_in[1]
input_ids = tokenizer.encode(text, max_length=512, padding='max_length', truncation=True, return_tensors='tf')
predictions = loaded_model({'input_ids': input_ids,
                            'attention_mask': tf.ones_like(input_ids),
                            'token_type_ids': tf.zeros_like(input_ids)})
print(text)
print(predictions)

Today, I felt good in the morning; everything was good, but in the evening, it rained, and as a result, I got stuck in traffic. My life sucks; I should end it; I should kill myself.
tf.Tensor([[0.99638176]], shape=(1, 1), dtype=float32)


In [132]:
def predict_sentiment(text):
    # loaded_model = tf.saved_model.load("path")
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    input_ids = tokenizer.encode(text, max_length=512, padding='max_length', truncation=True, return_tensors='tf')
    outputs = loaded_model({'input_ids': input_ids,
                                'attention_mask': tf.ones_like(input_ids),
                                'token_type_ids': tf.zeros_like(input_ids)})
    
    prediction = tf.math.round(outputs).numpy()[0][0]

   
    if prediction == 1:
        return "The Text Contains References to self-harm"
    else:
        return "The Text does not Contain References to self-harm"
    
    return sentiment


In [131]:
predict_sentiment(test_in[1])

'The Text Contains References to self-harm'