In [1]:
# Installing a library that is used to create model
!pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 28.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 429 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting 

In [22]:
import numpy as np
from nltk.corpus import stopwords
import nltk
import re
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
# Fetch NLTK's stop-word corpus; the recorded output shows it was already cached.
# NOTE(review): no cell visible here reads `stopwords` — confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Training

In [23]:

import pandas as pd

# Read the labelled texts and discard every row that has a missing value.
df = pd.read_csv("/content/test_data.csv").dropna()

# Raw text column and the number of remaining examples.
sentences = df['text'].values
ex = sentences.shape[0]

# Binary target as floats: 1.0 where the label is 'suicide', 0.0 otherwise.
y = np.where(df['class'] == 'suicide', 1.0, 0.0)

# Keep the numeric label next to the text so it survives the train/test split.
df["coded_class"] = y

In [24]:
# Hold out a quarter of the rows for evaluation; the fixed seed makes the
# split repeatable across runs.
train_data, test_data = train_test_split(df, random_state=1000, test_size=0.25)

In [25]:
# Checkpoint to fine-tune and the token budget applied to every text.
model_name = 'bert-base-uncased'
max_length = 300

# Start from the checkpoint's configuration, with hidden-state outputs turned off.
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Tokenizer matching the checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# Pretrained TensorFlow BERT model built from the same configuration.
transformer_model = TFBertModel.from_pretrained(model_name, config=config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [26]:


# Use the first layer of the pretrained model as the encoder.
bert = transformer_model.layers[0]

# Token ids are the only model input. `shape` has to be a tuple; the previous
# `(max_length)` was just a parenthesised int, so the 1-tuple is spelled out.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}

# Element 1 of the encoder output is used as the sentence-level representation
# (presumably BERT's pooled output, as the variable name below suggests).
bert_model = bert(inputs)[1]
pooled_output = Dropout(0.2)(bert_model)

# Classification head: 100 -> 10 -> 2 units with dropout in between.
# NOTE(review): none of the Dense layers sets an activation, so the head is a
# stack of purely linear maps — confirm that this is intended.
issue = Dense(units=100)(pooled_output)
issue = Dropout(0.2)(issue)
issue = Dense(units=10)(issue)
issue = Dropout(0.2)(issue)

# Two raw logits, one per class; the loss is configured with from_logits=True.
product = Dense(units=2)(issue)
outputs = {'product': product}

model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')


In [27]:
# Adam with a small fine-tuning learning rate, learning-rate decay and
# gradient-norm clipping.
optimizer = Adam(learning_rate=5e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)

# Loss and metric are keyed by the name of the model's single output.
# The head emits raw logits, hence from_logits=True.
loss = {'product': CategoricalCrossentropy(from_logits=True)}
metric = {'product': CategoricalAccuracy('accuracy')}

model.compile(optimizer=optimizer, loss=loss, metrics=metric)



In [28]:


# One-hot encode the training labels. num_classes is fixed to 2 so the target
# always has two columns, matching the model's two logits, even if a split
# happens to contain only one class.
y_product = to_categorical(train_data['coded_class'], num_classes=2)

# Tokenize the input (takes some time).
# padding='max_length' pads every row to exactly `max_length` tokens so the
# tensor width matches the model's Input(shape=(max_length)) declaration;
# padding=True would only pad to the longest text in this particular set.
x = tokenizer(
    text=train_data['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)


In [29]:
# Train for five epochs in mini-batches of eight, holding back the last 20%
# of the training rows for validation.
history = model.fit(
    {'input_ids': x['input_ids']},
    {'product': y_product},
    batch_size=8,
    epochs=5,
    validation_split=0.2,
)

# Persist the fine-tuned weights.
model.save_weights("weights.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
#######################################
### ----- Evaluate the model ------ ###
# Ready test data.
# One-hot encode the held-out labels. num_classes is fixed to 2 so the target
# has two columns, matching the model's two logits, even when the test split
# contains a single class.
y_product_test = to_categorical(test_data['coded_class'], num_classes=2)

# Tokenize the input (takes some time).
# padding='max_length' pads every row to exactly `max_length` tokens, the
# width the model's input layer was declared with; padding=True would pad
# only to the longest test text.
test_x = tokenizer(
    text=test_data['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

# Loss and accuracy on the held-out rows.
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'product': y_product_test}
)



In [32]:
def set_array(ar):
  """Return a list holding, for each row of `ar`, the index of its largest value.

  `ar` is any iterable of score vectors (e.g. logits or one-hot rows); an
  empty iterable yields an empty list.
  """
  return [np.argmax(scores) for scores in ar]
  

In [38]:
# Score the test rows, then reduce each row of 'product' outputs to the index
# of its largest value.
predicted = set_array(
    model.predict(x={'input_ids': test_x['input_ids']})['product']
)

In [39]:
# Display the predicted class index for every test row.
predicted

[1, 1, 1]

In [42]:
# Class index of every test row. The labels are rebuilt from `test_data`
# rather than from `y_product_test` itself, so re-running this cell always
# gives the same result; the self-referential form would argmax the
# already-decoded scalars on a second run and return all zeros.
y_product_test = set_array(to_categorical(test_data['coded_class'], num_classes=2))

In [44]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes. Passing labels=[0, 1]
# keeps the matrix 2x2 with a fixed row/column order even when one class is
# missing from both the true and the predicted labels.
confusion_matrix(y_product_test, predicted, labels=[0, 1])

array([[0, 3],
       [0, 0]])