# Multiclass Text Classification using BERT and TensorFlow

## 1. Initial setup

In [1]:
# Mount Google Drive so the files under MyDrive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Root project folder on the mounted Google Drive.
path = '/content/drive/MyDrive/Finalsem'
# Dataset folder; the cells below append '/<file name>' to it.
data_path = path + '/data'
# Model folder. The trailing slash is required: the save and load cells build
# their target as `model_path + model_name` with no separator of their own, so
# without it the model ends up at '<path>/modelstext_classifier_v1_2e' instead
# of inside the 'models' folder.
model_path = path + '/models/'

In [3]:
!pip install pandas numpy tensorflow-text scikit-learn nltk requests bs4 matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorflow<2.13,>=2.12.0
  Downloading tensorflow-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (585.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3
  Downloading protobuf-4.22.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 KB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from keras import backend as K
import json

## 2. Data preparation

### 2.1 Load the dataset

In [None]:
# Read the case dataset from Drive and display it.
dataset_csv = data_path + '/tosdr_case_data.csv'
df = pd.read_csv(dataset_csv)
df

Unnamed: 0.1,Unnamed: 0,Text,Case
0,0,YOU WAIVE YOUR RIGHT TO BRING OR PARTICIPATE I...,You waive your right to a class action.
1,1,YOU AND COMPANY WAIVE THE RIGHT TO BRING OR PA...,You waive your right to a class action.
2,2,CLASS WAIVER THIS SECTION AND THE PREVIOUS SEC...,You waive your right to a class action.
3,3,You wave your right to a class action.,You waive your right to a class action.
4,4,ALL CLAIMS AND DISPUTES WITHIN THE SCOPE OF TH...,You waive your right to a class action.
...,...,...,...
23571,23571,"For European Residents, please note that the p...",Your data is processed and stored in a country...
23572,23572,"By accessing CL or providing us data, you agre...",Your data is processed and stored in a country...
23573,23573,The personal information we collect is stored ...,Your data is processed and stored in a country...
23574,23574,"A data transfer to the USA takes place, when w...",Your data is processed and stored in a country...


### 2.2 Mapping cases to labels

In [None]:
# Number of distinct (non-null) values in the "Case" column; used later as the
# width of the one-hot labels and of the model's output layer.
num_classes = df["Case"].nunique()

In [None]:
# Case-text -> integer-label lookup. The `with` block closes the file handle as
# soon as the JSON has been parsed instead of leaving it open for the session.
with open(data_path + '/cases_map.json') as cases_file:
    cases_map = json.load(cases_file)

In [None]:
# Swap the textual "Case" column for its integer id from `cases_map`, stored
# under "Labels" (a case missing from the map yields NaN), then preview.
df = df.assign(Labels=df['Case'].map(cases_map)).drop(columns=["Case"])

df.head()

Unnamed: 0.1,Unnamed: 0,Text,Labels
0,0,YOU WAIVE YOUR RIGHT TO BRING OR PARTICIPATE I...,10
1,1,YOU AND COMPANY WAIVE THE RIGHT TO BRING OR PA...,10
2,2,CLASS WAIVER THIS SECTION AND THE PREVIOUS SEC...,10
3,3,You wave your right to a class action.,10
4,4,ALL CLAIMS AND DISPUTES WITHIN THE SCOPE OF TH...,10


### 2.3 Split into train and test sets

In [None]:
# One-hot encode the integer labels into `num_classes` columns.
y = tf.keras.utils.to_categorical(df["Labels"].values, num_classes=num_classes)

# Fixed seed so the 75/25 split, and every metric computed on it, is the same
# on each Restart & Run All.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'], y, test_size=0.25, random_state=SPLIT_SEED
)

## 3. Data modeling

### 3.1 Load BERT with TensorFlow Hub

In [None]:
!pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# TF Hub handles of the multilingual universal-sentence-encoder (CMLM) pair:
# the text preprocessing model and the encoder it feeds.
PREPROCESSOR_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
ENCODER_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1"

preprocessor = hub.KerasLayer(PREPROCESSOR_URL)
encoder = hub.KerasLayer(ENCODER_URL)


def get_embeddings(sentences):
  '''Embed raw text with the hub preprocessor + encoder pair.

  Args:
    - sentences: list of strings
  Output:
    - the encoder's 'pooled_output': tf.Tensor of shape=(len(sentences), 768)
  '''
  encoder_inputs = preprocessor(sentences)
  encoder_outputs = encoder(encoder_inputs)
  return encoder_outputs['pooled_output']


# Smoke test: embed one sample sentence and display the resulting tensor.
sample_sentence = "We may provide you with the option to register with us using your existing social media account details, like your Facebook, Twitter, or other social media account."
get_embeddings([sample_sentence])

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-2.04660296e-01, -3.37948710e-01, -2.26528421e-01,
        -2.77026951e-01, -8.58247578e-02, -3.03020060e-01,
        -3.48053336e-01,  1.17398478e-01,  3.44240040e-01,
        -3.53154898e-01, -4.90266860e-01, -2.29241267e-01,
        -4.31633055e-01, -5.20016432e-01, -5.14222324e-01,
         3.83768789e-02,  3.98128092e-01, -3.16091478e-01,
         1.77234545e-01, -1.15023017e-01,  5.37374020e-01,
         1.52755320e-01, -2.93543160e-01,  2.16986448e-01,
        -4.05859262e-01, -8.67774785e-01,  3.10210407e-01,
         5.73749579e-02, -1.94481552e-01, -7.15773880e-01,
        -1.60223290e-01, -1.62515745e-01, -5.11625707e-01,
        -2.89124042e-01, -2.84561753e-01,  1.21384501e-01,
        -3.32728803e-01,  1.56743050e-01, -1.11458533e-01,
        -1.85460791e-01,  2.69308556e-02,  3.59443761e-02,
        -3.27055275e-01,  2.80556083e-01, -1.23945074e-02,
        -2.78026126e-02, -3.21841747e-01, -1.84350267e-01,
      

### 3.2 Create Model

#### 3.2.1 Metrics for evaluation

In [None]:
def balanced_recall(y_true, y_pred):
    """Recall averaged over classes, computed on the tensors passed in.

    For each class column: recall = TP / (TP + FN), where predictions are
    rounded to 0/1 first. A class with no positives in `y_true` contributes 0
    (zero hits over epsilon).
    """
    n_classes = y_pred.shape[1]
    recall_sum = 0
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        predicted = y_pred[:, class_idx]
        # hits: samples of this class whose rounded prediction is also this class
        hits = K.sum(K.round(K.clip(truth * predicted, 0, 1)))
        actual = K.sum(K.round(K.clip(truth, 0, 1)))
        # epsilon keeps the division defined when the class is absent
        recall_sum = recall_sum + hits / (actual + K.epsilon())
    # unweighted mean over all classes
    return recall_sum / n_classes

def balanced_precision(y_true, y_pred):
    """Precision averaged over classes, computed on the tensors passed in.

    For each class column: precision = TP / (TP + FP), where predictions are
    rounded to 0/1 first. A class that is never predicted contributes 0
    (zero hits over epsilon).
    """
    n_classes = y_pred.shape[1]
    precision_sum = 0
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        predicted = y_pred[:, class_idx]
        # hits: samples of this class whose rounded prediction is also this class
        hits = K.sum(K.round(K.clip(truth * predicted, 0, 1)))
        flagged = K.sum(K.round(K.clip(predicted, 0, 1)))
        # epsilon keeps the division defined when nothing was predicted as this class
        precision_sum = precision_sum + hits / (flagged + K.epsilon())
    # unweighted mean over all classes
    return precision_sum / n_classes

def balanced_f1_score(y_true, y_pred):
    """F1 score built from the class-averaged precision and recall:
    2 * P * R / (P + R), with epsilon guarding the P + R == 0 case.
    """
    recall = balanced_recall(y_true, y_pred)
    precision = balanced_precision(y_true, y_pred)
    ratio = (precision * recall) / (precision + recall + K.epsilon())
    return 2 * ratio


#### 3.2.2 Define Model

In [None]:
# The model takes raw strings: preprocessing and encoding are part of the graph,
# followed by dropout and a softmax layer with one unit per class.
# Descriptive names replace the throwaway globals `i`/`x`, which later cells
# rebind to unrelated values (a loop index and the list of epochs).
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
encoder_outputs = encoder(preprocessor(text_input))
pooled = tf.keras.layers.Dropout(0.2, name="dropout")(encoder_outputs['pooled_output'])
class_probs = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(pooled)

model = tf.keras.Model(text_input, class_probs)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


#### 3.2.3 Compile and Train the model

In [None]:
# Maximum number of passes over the training data.
n_epochs = 2

# Metrics reported every epoch in addition to the loss.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    balanced_recall,
    balanced_precision,
    balanced_f1_score,
]

# Halt when val_loss has not improved for 3 epochs, keeping the best weights.
# NOTE(review): with n_epochs = 2 a patience of 3 can never be reached — confirm intended.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=METRICS,
)

In [None]:
# Train on the training split; the held-out split is used as validation data,
# which is what the early-stopping callback monitors.
fit_options = dict(
    epochs=n_epochs,
    validation_data=(x_test, y_test),
    callbacks=[earlystop_callback],
)
model_fit = model.fit(x_train, y_train, **fit_options)

Epoch 1/2

#### 3.2.4 Visualization of metrics vs epochs


In [None]:
import matplotlib.pyplot as plt

history = model_fit.history
metric_list = list(history.keys())
# Training metrics are the keys without the "val_" prefix; pairing by name
# instead of by position does not depend on the ordering of the history keys.
train_metrics = [name for name in metric_list if not name.startswith("val_")]
num_metrics = len(train_metrics)

# Use the number of epochs actually recorded: early stopping can end training
# before n_epochs, and plotting against range(n_epochs) would then fail on
# mismatched x/y lengths.
epochs_run = max((len(values) for values in history.values()), default=0)
x = list(range(1, epochs_run + 1))

# squeeze=False always returns a 2-D grid, so indexing works even when only one
# metric is tracked; `ax` is the single row of that grid.
fig, axes_grid = plt.subplots(nrows=1, ncols=max(num_metrics, 1), figsize=(30, 5), squeeze=False)
ax = axes_grid[0]

for col, metric in enumerate(train_metrics):
  val_metric = "val_" + metric
  ax[col].plot(x, history[metric], marker="o", label=metric.replace("_", " "))
  # The validation curve exists only when validation data was passed to fit().
  if val_metric in history:
    ax[col].plot(x, history[val_metric], marker="o", label=val_metric.replace("_", " "))
  ax[col].set_xlabel("epochs",fontsize=14)
  ax[col].set_title(metric.replace("_", " "),fontsize=20)
  ax[col].legend(loc="lower left")

### 3.3 Save model

In [None]:
# Export the trained model to Drive. The target is the plain concatenation of
# model_path and model_name (no separator is inserted here).
model_name = 'text_classifier_v1_2e'
export_path = model_path + model_name
model.save(export_path)

## 4. Perform inference

### 4.1 Load model

#### 4.1.1 Evaluation metrics

In [None]:
def balanced_recall(y_true, y_pred):
    """Recall averaged over classes, computed on the tensors passed in.

    For each class column: recall = TP / (TP + FN), where predictions are
    rounded to 0/1 first. A class with no positives in `y_true` contributes 0
    (zero hits over epsilon).
    """
    n_classes = y_pred.shape[1]
    recall_sum = 0
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        predicted = y_pred[:, class_idx]
        # hits: samples of this class whose rounded prediction is also this class
        hits = K.sum(K.round(K.clip(truth * predicted, 0, 1)))
        actual = K.sum(K.round(K.clip(truth, 0, 1)))
        # epsilon keeps the division defined when the class is absent
        recall_sum = recall_sum + hits / (actual + K.epsilon())
    # unweighted mean over all classes
    return recall_sum / n_classes

def balanced_precision(y_true, y_pred):
    """Precision averaged over classes, computed on the tensors passed in.

    For each class column: precision = TP / (TP + FP), where predictions are
    rounded to 0/1 first. A class that is never predicted contributes 0
    (zero hits over epsilon).
    """
    n_classes = y_pred.shape[1]
    precision_sum = 0
    for class_idx in range(n_classes):
        truth = y_true[:, class_idx]
        predicted = y_pred[:, class_idx]
        # hits: samples of this class whose rounded prediction is also this class
        hits = K.sum(K.round(K.clip(truth * predicted, 0, 1)))
        flagged = K.sum(K.round(K.clip(predicted, 0, 1)))
        # epsilon keeps the division defined when nothing was predicted as this class
        precision_sum = precision_sum + hits / (flagged + K.epsilon())
    # unweighted mean over all classes
    return precision_sum / n_classes

def balanced_f1_score(y_true, y_pred):
    """F1 score built from the class-averaged precision and recall:
    2 * P * R / (P + R), with epsilon guarding the P + R == 0 case.
    """
    recall = balanced_recall(y_true, y_pred)
    precision = balanced_precision(y_true, y_pred)
    ratio = (precision * recall) / (precision + recall + K.epsilon())
    return 2 * ratio


#### 4.1.2 Load the model

In [None]:
model_name = 'text_classifier_v1_2e'

# Objects handed to Keras so the metrics the model was compiled with can be
# resolved while loading.
custom_metrics = {
    'accuracy': tf.keras.metrics.CategoricalAccuracy(name = "accuracy"),
    'balanced_recall': balanced_recall,
    'balanced_precision': balanced_precision,
    'balanced_f1_score': balanced_f1_score,
}

saved_model_location = model_path + model_name
model = tf.keras.models.load_model(saved_model_location, custom_objects=custom_metrics)

### 4.2 Preprocessing

#### 4.2.1 Extract content from webpage

In [None]:
import requests
from bs4 import BeautifulSoup

def getTextFromURL(url, timeout=30):
  """Download a web page and return its visible text, one non-empty chunk per line.

  Args:
    url: address of the page to fetch.
    timeout: seconds to wait for the server before giving up (default 30).
      Without a timeout `requests.get` can block indefinitely.

  Returns:
    str: the page text with <script>/<style> content removed, whitespace
    trimmed, and blank lines dropped; chunks are joined with newlines.

  Raises:
    requests.exceptions.RequestException: on connection failure, timeout, or
      a 4xx/5xx response (so an error page is never mistaken for the document).
  """
  # Make a request to the webpage and fail loudly on HTTP errors
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()

  # Parse the HTML content of the page
  soup = BeautifulSoup(response.content, 'html.parser')

  # Remove script and style elements: their contents are not visible text
  for tag in soup(["script", "style"]):
    tag.extract()

  # Prefer the <body>; fall back to the whole document when the page has none
  # (find() returns None in that case and .get_text() would raise)
  body = soup.find('body')
  root = soup if body is None else body
  text = root.get_text()

  # break into lines and remove leading and trailing space on each
  lines = (line.strip() for line in text.splitlines())

  # break multi-headlines (separated by double spaces) into a line each
  chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

  # drop blank lines
  return '\n'.join(chunk for chunk in chunks if chunk)

#### 4.2.2 Sentence level tokenization

In [None]:
import nltk

# Fetch the 'punkt' tokenizer data.
nltk.download('punkt')

def tokenize(text):
  """Split `text` into a list of sentences using NLTK's sentence tokenizer."""
  sentences = nltk.tokenize.sent_tokenize(text)
  return sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#### 4.2.3 Process Predictions

In [None]:
import json

# Case-text -> integer-label lookup, reloaded here for the inference section.
# The `with` block closes the file handle once the JSON has been parsed.
with open(data_path + '/cases_map.json') as cases_file:
    cases_map = json.load(cases_file)

def getCaseFromID(id):
    """Reverse lookup in `cases_map`.

    Returns the first key whose value equals `id`, or None when no entry matches.
    """
    matching_cases = (case for case, label in cases_map.items() if label == id)
    return next(matching_cases, None)

In [None]:
import numpy as np

def process_predictions(predictions, sentences):
  """Turn per-sentence score vectors into records sorted by ascending confidence.

  Each record is a tuple:
    (sentence index, sentence, predicted label id, case text for that id, top score)
  where the label id is the argmax of the score vector and the top score its max.
  """
  def build_record(idx):
    scores = predictions[idx]
    label_id = np.argmax(scores)
    return (idx, sentences[idx], label_id, getCaseFromID(label_id), np.max(scores))

  records = [build_record(idx) for idx in range(len(predictions))]

  # the last tuple field is the confidence; the most confident records end up last
  return sorted(records, key = lambda rec: rec[4])

### 4.3 Predict

In [None]:
def predict(url):
  """Classify every sentence of the web page at `url`.

  Pipeline: fetch the page text, split it into sentences, run the model on the
  sentences, and post-process the raw scores.

  Returns:
    list of records as produced by `process_predictions`, sorted by ascending
    confidence; an empty list when the page yields no sentences.
  """
  text = getTextFromURL(url)
  sentences = tokenize(text)
  if not sentences:
    # Nothing to classify: skip the model call, which cannot run on an empty batch.
    return []
  raw_predictions = model.predict(sentences)
  return process_predictions(raw_predictions, sentences)

In [None]:
# Run the full pipeline on a sample privacy-policy page.
policy_url = "https://alegria.co.in/app-privacy-policy"
predictions = predict(policy_url)



In [None]:
# `predictions` is sorted by ascending confidence, so the tail of the list
# holds the 20 most confident records.
most_confident = predictions[-20:]
for record in most_confident:
  print(record)

(81, "The updated version will be indicated by an updated 'Revised' date and the updated version will be effective as soon as it is accessible.", 60, 'Instead of asking directly, this Service will assume your consent merely from your usage.', 0.25077367)
(65, 'In Short: You may review, change, or terminate your account at any time.If you are located in the EEA or UK and you believe we are unlawfully processing your personal information, you also have the right to complain to your local data protection supervisory authority.', 68, 'You can request access, correction and/or deletion of your data', 0.25643393)
(21, 'The easiest way to exercise your rights is by filling out our data subject request form available here: https://forms.gle/8x9h26x5SPhcPsMu7 , or by contacting us.', 68, 'You can request access, correction and/or deletion of your data', 0.3031188)
(69, 'below.However, please note that this will not affect the lawfulness of the processing before its withdrawal nor, when applicab