# 1. Extract the Kaggle dataset

In [1]:
!unzip /content/Suicide_Detection.csv.zip

Archive:  /content/Suicide_Detection.csv.zip
  inflating: Suicide_Detection.csv   


# 2. Install packages

In [2]:
# !pip install pandas
# !pip install numpy
# !pip install tensorflow
# !pip install tensorflow_hub

# 3. Configuration setup

In [3]:
# ========== Aditya Configuration ==========

class Config:
    """Static naming and path settings for this notebook's workspace."""

    # Name of the model; per the original note this is also the folder name
    # that holds its weights.
    model_name = "bert"

    # Working directory, expected to match the repository root.
    workspace_dir = "/content/Aditya_workspace_in_GPU/"

    # Directory that holds the custom library modules.
    module_dir = "/src"

    # Directory where model checkpoints are to be saved.
    checkpoint_dir = "/content/aditya/model_checkpoints/"

    # Directory the dataset is to be read from.
    data_dir = "/content/datasets/"

# 4. Import packages

In [12]:
# ========== Python libraries ==========

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub
from keras.utils import to_categorical
from tensorflow.python.client import device_lib

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from tokenization import *

# 5. Define parameters

In [7]:
# ========== Model and Learning Parameters ==========

# File name for the saved model weights.
output = "model.h5"

# TensorFlow device string used by the `tf.device(...)` scopes in this notebook.
device_name = "/device:GPU:0"

# TF-Hub handle of the BERT encoder (bert_en_uncased, L-12 / H-768 / A-12, version 2).
m_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"

# Google Drive direct-download link built from a file id.
url = "https://drive.google.com/uc?id={}".format(
    "1GDYhTNHhat-qa_6fm5OqWTfTUclHFo6Z"
)

# Random seed intended to make runs reproducible.
# NOTE(review): no visible cell passes SEED to a random generator - confirm it is applied.
SEED = 1291

# Number of samples per training batch.
BATCH_SIZE = 32

# Number of full passes over the training data.
N_EPOCHS = 10

# Early-stopping patience: epochs without improvement before training halts.
EARLY_STOP_EPOCHS = 5

# Optimizer step size.
LEARNING_RATE = 0.001

# 6. User defined functions


In [8]:
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names
    
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into the three fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    "[CLS]" / "[SEP]" markers, converted to ids and right-padded with zeros.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row.

    Returns:
        Tuple ``(token_ids, attention_masks, segment_ids)`` of NumPy arrays,
        one row per input text. Masks are 1 on real tokens and 0 on padding;
        segment ids are all zeros.
    """
    token_rows = []
    mask_rows = []
    segment_rows = []

    for raw_text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        n_real = len(sequence)
        padding = [0] * (max_len - n_real)

        token_rows.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        mask_rows.append([1] * n_real + padding)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)


def build_model(bert_layer, max_len=512, learning_rate=2e-5):
    """Build and compile a two-class classifier on top of a BERT layer.

    The network takes three int32 inputs of length ``max_len`` (word ids,
    mask, segment ids), feeds them to ``bert_layer``, takes the sequence
    output at position 0 and passes it through
    Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) -> Dropout(0.2)
    -> Dense(2, softmax).

    Args:
        bert_layer: Callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Sequence length of every input.
        learning_rate: Adam learning rate; defaults to the previously
            hard-coded 2e-5.

    Returns:
        A compiled ``tf.keras`` model using categorical cross-entropy loss
        and an accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The pooled output is not used; only the per-token sequence output is.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    # Representation at position 0, i.e. the slot where bert_encode puts "[CLS]".
    clf_output = sequence_output[:, 0, :]

    lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    lay = tf.keras.layers.Dense(32, activation='relu')(lay)
    lay = tf.keras.layers.Dropout(0.2)(lay)
    out = tf.keras.layers.Dense(2, activation='softmax')(lay)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# 7. Load dataset

In [9]:
# Read the extracted CSV and discard its 'Unnamed: 0' column.
df = pd.read_csv("/content/Suicide_Detection.csv").drop(columns=['Unnamed: 0'])

# 8. Configure BERT

In [10]:
# Load the BERT encoder from TF-Hub as a Keras layer; trainable=False is
# passed so the encoder weights are not updated during training.
bert_layer = hub.KerasLayer(m_url, trainable=False)
# Read the vocabulary file path and the lower-casing flag stored with the
# loaded module, so the tokenizer matches the encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# FullTokenizer comes from the star import of `tokenization`.
# NOTE(review): presumably the reference BERT WordPiece tokenizer - confirm.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Display the names of the local GPU devices TensorFlow reports.
get_available_gpus()

# 9. Prepare dataset for training

In [None]:
# Every encoded sequence is truncated / padded to this many tokens.
max_len = 250

# Rows 0-9999 form the training split and rows 10000-14999 the held-out split.
# NOTE(review): bert_encode builds Python lists and NumPy arrays, so the device
# scope probably has no effect here - confirm before relying on it.
with tf.device(device_name):
    train_input = bert_encode(df["text"].values[:10000], tokenizer, max_len=max_len)
    test_input = bert_encode(df["text"].values[10000:15000], tokenizer, max_len=max_len)

In [None]:
# Map the string class labels to integers, then one-hot encode them.
label = preprocessing.LabelEncoder()
train_y = label.fit_transform(df['class'][:10000])
# Fix the one-hot width to the number of classes seen in training so both
# splits always produce matrices of the same shape.
train_y = to_categorical(train_y, num_classes=len(label.classes_))

# Reuse the mapping learned on the training slice. Re-fitting on the held-out
# slice could assign different integers to the same class; `transform` raises
# on labels that were not seen in training instead.
test_y = label.transform(df['class'][10000:15000])
test_y = to_categorical(test_y, num_classes=len(label.classes_))
print(train_y[:5])

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]


# 10. Initialize BERT

In [16]:
# Build the classifier on top of the loaded BERT layer, using the same
# sequence length as the encoded inputs, and print its layer summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 250)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 250)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 250)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 250, 768)]                'input_mask[0][0]',         

# 11. Training the model

In [None]:
# Save the weights only when validation accuracy improves, and stop once it
# has not improved for EARLY_STOP_EPOCHS consecutive epochs. The file name and
# epoch counts come from the parameters cell instead of duplicated literals.
checkpoint = tf.keras.callbacks.ModelCheckpoint(output, monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=EARLY_STOP_EPOCHS, verbose=1)

# Non-deprecated replacement for tf.config.experimental_run_functions_eagerly.
tf.config.run_functions_eagerly(True)

with tf.device(device_name):
    # NOTE(review): the held-out split doubles as the validation set used for
    # checkpoint selection and early stopping, so val_accuracy is not an
    # unbiased test estimate.
    # NOTE(review): BATCH_SIZE (32) is defined in the parameters cell, but 16 is
    # kept here to preserve the original training behaviour - confirm intent.
    train_sh = model.fit(
        train_input, train_y,
        validation_data=(test_input, test_y),  # Keras expects a tuple here
        epochs=N_EPOCHS,
        callbacks=[checkpoint, earlystopping],
        batch_size=16,
        verbose=1,
    )

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.91080, saving model to model.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.91080 to 0.92040, saving model to model.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.92040 to 0.92220, saving model to model.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.92220 to 0.92540, saving model to model.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.92540 to 0.92740, saving model to model.h5
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.92740
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.92740
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.92740
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.92740
Epoch 10/10
Epoch 10: val_accuracy improved from 0.92740 to 0.93020, saving model to model.h5


# 12. Inference test

In [17]:
# Load previously saved weights from Google Drive into the model.
# NOTE(review): this file ("model_new.h5") is not the "model.h5" checkpoint
# written by the training cell, and it needs Drive mounted at /content/drive -
# confirm which weights are intended.
model.load_weights("/content/drive/MyDrive/model_new.h5")

In [33]:
def outcome(test):
    """Turn a batch of class probabilities into a label for its first row.

    Args:
        test: Indexable batch of predictions; only ``test[0][1]`` is read.

    Returns:
        "Suicide" when the first row's value at index 1 is above 0.5,
        otherwise "No suicide".
    """
    return "Suicide" if test[0][1] > 0.5 else "No suicide"

In [29]:
# Sample diary entry, stored as two string fragments, that ends with explicit
# self-harm statements.
text_1 = ["Mood Diary Input | Day 1 = Today, I felt good in the morning; everything was good,but in the evening",
        ", it rained, and as a result, I got stuck in traffic. My life sucks;I should end it; I should kill myself."]


# Similar diary entry without the self-harm statements, as a one-element list.
text_2 = ['''Mood Diary Input | Day 1 = "Today I felt good in the morning, everything was good, but in the evening, it rained, and as a result, I got stuck in the traffic; my life sucks"''']        

In [31]:
# Join the two fragments of text_1 into one string and encode it as a
# single-sample batch.
test_1 = bert_encode([text_1[0]+text_1[1]],tokenizer,max_len=max_len)

In [34]:
# Predict class probabilities for the first sample and map them to a label.
outcome(model.predict(test_1))



'Suicide'

In [35]:
# Encode the second sample (already a one-element list) as a single-sample batch.
test_2= bert_encode(text_2,tokenizer,max_len=max_len)

In [36]:
# Predict class probabilities for the second sample and map them to a label.
outcome(model.predict(test_2))



'No suicide'