<a href="https://colab.research.google.com/github/ArazShilabin/sentiment_analysis_using_bert_tokenizer/blob/main/sentiment_analysis_bert_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### mount google drive

In [1]:
""" 
Use this javascript code in inspect>console so you wont need to click the page every 15 min:

########################
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
########################

"""
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder and dataset stored on Drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Show the current working directory as the cell output.
%pwd


Mounted at /content/drive


'/content'

### change current path to where the working project folder is at

In [2]:
# Move into the project folder on the mounted Drive. The path is relative, so
# this cell expects to be run from the directory shown by the previous cell.
%cd drive/MyDrive/projects/bert_tokenizer_SA/
# Confirm the new working directory.
%pwd

/content/drive/MyDrive/projects/bert_tokenizer_SA


'/content/drive/MyDrive/projects/bert_tokenizer_SA'

# Step 0: Get The Data

### upload the data to our current path and unzip it (uncomment and run this only once)

In [3]:
# # data is from: http://help.sentiment140.com/for-students you can use this or just upload your own data
# %cd data
# !wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip # download
# !unzip trainingandtestdata.zip # unzip the downloaded file
# %cd .. 
# %pwd

# Step 1: Importing Dependencies

### Imports

In [4]:
import numpy as np
import pandas as pd
import re
import math
import time
from bs4 import BeautifulSoup # we use this library to turn the tweets to texts
import random

### Install Bert

In [5]:
# this is not the official one, it's a lighter one
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/18/d3/820ccaf55f1e24b5dd43583ac0da6d86c2d27bbdfffadbba69bafe73ca93/bert-for-tf2-0.14.7.tar.gz (41kB)
[K     |████████                        | 10kB 14.9MB/s eta 0:00:01[K     |████████████████                | 20kB 19.0MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 21.5MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 18.6MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.6MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

### Add tensorflow packages and Bert

In [6]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub # all the pretrained models are installed in tensorflow hub
import bert

# Step 2: Data Preprocessing

## A) Load Data

In [7]:
# Column names for the header-less Sentiment140 CSV.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Read the raw file, then keep only the label and the tweet text.
# engine="python" and encoding="latin1" are required for this specific file.
data = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1",
).drop(columns=["id", "date", "query", "user"])

data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## B) Clean

In [8]:
def clean_tweet(tweet):
    """Normalize one raw tweet into plain text for tokenization.

    Steps, in order: strip markup with BeautifulSoup, drop @mentions, drop
    URLs, replace every character that is not a letter or one of ``.!?'``
    with a space, and collapse runs of spaces into a single space.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned tweet as a string (it may keep a leading/trailing space).
    """
    # "lxml" is the parser BeautifulSoup uses; get_text() returns the text
    # content without any markup
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # remove the @mentions; '_' is included so handles like @foo_bar do not
    # leave "bar" behind (*: appear 0 or more)
    tweet = re.sub(r"@[A-Za-z0-9_]*", ' ', tweet)
    # remove whole URLs: everything from "http://" or "https://" up to the
    # next whitespace, so the domain/path (".com/xyz") is not left in the text
    tweet = re.sub(r"https?://\S*", ' ', tweet)
    # get rid of everything that is not a letter or ".!?'"
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # replace >=2 white spaces with one (+: appear at least once)
    tweet = re.sub(" +", ' ', tweet)
    return tweet

In [9]:
# Clean every tweet once up front.
data_clean = [clean_tweet(tweet) for tweet in data['text']]
# Work on a copy of the label column: writing into `.values` directly would
# modify `data` through a view (and fails when pandas hands out a read-only
# array, e.g. with Copy-on-Write enabled).
data_labels = data['sentiment'].values.copy()
# the labels in this dataset are 0's and 4's instead of 0's and 1's, so we turn the 4's to 1's
data_labels[data_labels == 4] = 1

## C) Tokenization (using Bert)

### Load tokenizer (add '[CLS]' & '[SEP]' too)

In [10]:
# Tokenizer class shipped with the bert-for-tf2 package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Pretrained models are published on TensorFlow Hub as SavedModels; many
# different kinds and versions are available there. Here we load the uncased
# BERT base model (L-12, H-768, A-12) with frozen weights. This instance is
# used to build the tokenizer and to inspect BERT's output further down.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)

# The SavedModel carries its own vocabulary file and lower-casing flag, so the
# tokenizer is guaranteed to match the loaded weights.
vocab_file =  bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy() # whether the model expects lower-cased text
tokenizer = FullTokenizer(vocab_file, do_lower_case) # tokenizer used by every cell below


In [11]:
def encode_sentences(sentence):
    """Tokenize one sentence and wrap it in BERT's [CLS] ... [SEP] markers.

    Uses the notebook-level `tokenizer`; returns the list of string tokens.
    """
    wrapped = ["[CLS]"]
    wrapped += tokenizer.tokenize(sentence)
    wrapped += ["[SEP]"]
    return wrapped

In [12]:
# Token lists ([CLS] ... [SEP]) for every cleaned tweet, in the same order as data_clean.
data_inputs = [encode_sentences(sentence) for sentence in data_clean] # encode inputs

### Test our tokenizer

In [13]:
# Quick sanity check: show the word pieces for a sample sentence and the
# vocabulary ids they map to.
test_text = "Tensorflow is great, ain't it?"
test_tokens = tokenizer.tokenize(test_text)
print(test_tokens)
print(tokenizer.convert_tokens_to_ids(test_tokens))


['tensor', '##flow', 'is', 'great', ',', 'ain', "'", 't', 'it', '?']
[23435, 12314, 2003, 2307, 1010, 7110, 1005, 1056, 2009, 1029]


## D) Dataset creation

### we need to create 3 different inputs for each sentence (ids, mask, sentence_segments)

In [14]:
def get_ids(tokens):
    """Map a list of string tokens to their vocabulary ids via the notebook-level `tokenizer`."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

def get_mask(tokens):
    """Build the attention mask for a token list: 0 where the token is "[PAD]", 1 elsewhere.

    Returns a NumPy integer array with one entry per token.
    """
    is_real_token = np.char.not_equal(tokens, "[PAD]")
    return is_real_token.astype(int)

def get_segments(tokens):
    """Assign a segment id (0 or 1) to every token.

    The id starts at 0 and flips after each "[SEP]" token; the "[SEP]" itself
    keeps the id of the segment it closes. For a sentence pair this yields
    0...0 1...1; for a single sentence every id is 0.
    """
    segment_ids = []
    segment = 0
    for token in tokens:
        segment_ids.append(segment)
        if token == "[SEP]":
            # switch 0 -> 1 or 1 -> 0 for the tokens that follow the separator
            segment = 1 - segment
    return segment_ids

### shuffle then sort them so we can choose similar size batches

In [15]:
# Build [tokens, label, n_tokens] rows; the length is kept so that sequences of
# similar size end up next to each other and need little padding later.
data_with_len = [[tokens, label, len(tokens)]
                 for tokens, label in zip(data_inputs, data_labels)]
# Shuffle first so that equal-length rows are in random order, then sort by length.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda row: row[2])

# Turn every remaining row into ([ids, mask, segments], label). Rows with 3 or
# fewer tokens (the count includes [CLS] and [SEP]) are dropped: such short
# sentences probably carry no sentiment.
sorted_all = [([get_ids(tokens), get_mask(tokens), get_segments(tokens)], label)
              for tokens, label, n_tokens in data_with_len
              if n_tokens > 3]

### Dataset generator (because our data isn't the same size, needs padding)

In [16]:
# Wrap the Python list in a tf.data.Dataset. from_generator is used because the
# examples have different lengths; it needs a callable, hence the lambda that
# simply returns the (iterable) list. Each element is ((ids, mask, segments), label).
all_dataset = tf.data.Dataset.from_generator(generator=lambda: sorted_all, # generator needs to be callable
                                             output_types=(tf.int32,tf.int32)) # (inputs, labels) both are int
# Inspect the dataset type and its first element.
print(type(all_dataset))
print(next(iter(all_dataset)))

<class 'tensorflow.python.data.ops.dataset_ops.FlatMapDataset'>
(<tf.Tensor: shape=(3, 4), dtype=int32, numpy=
array([[ 101, 2021, 4283,  102],
       [   1,    1,    1,    1],
       [   0,    0,    0,    0]], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


### Batching & Padding

In [17]:
# Number of examples per batch.
BATCH_SIZE = 32
# Group consecutive examples into batches, padding every example in a batch to
# the length of the longest one in that batch.
all_batched = all_dataset.padded_batch(
                batch_size=BATCH_SIZE,
                padded_shapes=((3,None),())
                # inputs: 3 rows (ids, mask, segments); None means pad the token axis as
                # much as needed (it depends on the longest sentence in the batch).
                # labels: scalars, so nothing to pad.
)
# Inspect the batched dataset type and its first batch.
print(type(all_batched))
print(next(iter(all_batched)))
# In the first batch every example already has 4 tokens (the data is sorted by
# length), so nothing is padded; later batches can mix lengths, e.g. 4 and 5,
# and the shorter examples are padded up to the longer ones.

<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>
(<tf.Tensor: shape=(32, 3, 4), dtype=int32, numpy=
array([[[  101,  2021,  4283,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101,  2149,  2243,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101,  2428,  1029,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101,  4957,  1029,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101,  7989, 13825,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101,  2023, 19237,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101, 15624,  3599,   102],
        [    1,     1,     1,     1],
        [    0,     0,     0,     0]],

       [[  101,  4067,  2017,   102],
        [    1,     1,     1,  

### train-test split for batches

In [18]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE) # total number of batches (e.g. ceil(3.2) == 4)
NB_BATCHES_TEST = NB_BATCHES // 10 # keep 10% of the batches for testing
# Shuffle whole batches (buffer of NB_BATCHES, not NB_BATCHES*BATCH_SIZE, so the
# similar-length examples inside each batch stay together).
# Dataset.shuffle returns a NEW dataset, so the result must be kept; calling it
# without using the return value leaves the data sorted by length and the
# "test set" would be just the shortest tweets.
# reshuffle_each_iteration=False freezes the order, so take()/skip() select the
# same, disjoint batches on every epoch and no test batch leaks into training.
all_shuffled = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)
test_data_set = all_shuffled.take(NB_BATCHES_TEST) # the first 10% of the shuffled batches are the test set
train_data_set = all_shuffled.skip(NB_BATCHES_TEST) # the remaining 90% are the training set

# Step 3: Model Building

### let's see what the bert_layer gives as its output (VERY IMPORTANT)

In [19]:
sample_sent = "Roses are red."
my_sent = ["[CLS]"] + tokenizer.tokenize(sample_sent) +["[SEP]"]
# Build the three BERT inputs (ids, mask, segments) as int32 tensors and add a
# leading axis of size 1 to simulate a batch of one sentence.
sample_features = [
    tf.expand_dims(tf.cast(make_feature(my_sent), tf.int32), 0)
    for make_feature in (get_ids, get_mask, get_segments)
]
bert_layer(sample_features)
# The layer returns two tensors:
# 1) the [CLS] output, shape (1, 768): 1 is the batch size and 768 is BERT's embedding dimension
#    (*** this is the embedding of the whole sequence ***)
# 2) the per-token outputs, shape (1, S, 768): 1 is the batch size, S is the number of tokens
#    in the sentence and 768 is BERT's embedding dimension.

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-9.27935600e-01, -4.10335928e-01, -9.65755284e-01,
          9.07318294e-01,  8.12914550e-01, -1.74174860e-01,
          9.11234915e-01,  3.41952562e-01, -8.74521971e-01,
         -9.99989390e-01, -7.78410733e-01,  9.69385445e-01,
          9.86160517e-01,  6.36964917e-01,  9.48631406e-01,
         -7.51193941e-01, -4.58340079e-01, -7.08104968e-01,
          4.62099075e-01, -6.57927752e-01,  7.60415316e-01,
          9.99994755e-01, -3.96862566e-01,  3.44166636e-01,
          6.16489470e-01,  9.94400144e-01, -7.76634395e-01,
          9.38316643e-01,  9.59452212e-01,  7.32879639e-01,
         -6.93437755e-01,  2.93080807e-01, -9.93785501e-01,
         -1.64552301e-01, -9.67019856e-01, -9.95549619e-01,
          5.32936037e-01, -6.88061118e-01,  1.34707419e-02,
          2.98189707e-02, -9.18356717e-01,  4.20526713e-01,
          9.99988854e-01,  2.52677470e-01,  6.06236696e-01,
         -3.50750387e-01, -9.99999762e-01,  4.975

### Model

In [20]:
class DCNN_BERT_Embedding(tf.keras.Model):
    """CNN text classifier that uses a frozen BERT model as its embedding layer.

    The per-token BERT outputs are fed to three parallel 1-D convolutions
    (kernel sizes 2, 3 and 4), each followed by global max pooling. The pooled
    features are concatenated and passed through a dense layer, dropout and a
    final classification layer.

    Args:
        nb_filters: number of filters of each convolution.
        FFN_units: number of units of the hidden dense layer.
        nb_classes: number of target classes; 2 gives a single sigmoid unit,
            anything else gives a softmax over `nb_classes` units.
        dropout_rate: rate of the dropout applied after the hidden dense layer.
        training: unused; kept only so existing callers that pass it keep
            working. The training flag that matters is the one given to `call`.
        name: Keras model name.
    """

    def __init__(self,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name='DCNN_BERT_Embedding'):
        super().__init__(name=name)

        # Instead of a trainable layers.Embedding we use BERT's (frozen)
        # contextual embeddings.
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=False)

        # bigram is a convolution over 2 consecutive tokens (kernel_size=2)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding='valid',
                                    activation='relu')

        # trigram: 3 consecutive tokens
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding='valid',
                                     activation='relu')

        # fourgram: 4 consecutive tokens. With padding='valid' the input needs
        # at least 4 tokens per sequence.
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding='valid',
                                      activation='relu')

        # One pooling layer is shared by the three branches (it has no weights).
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units,
                                    activation='relu')
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Softmax would work for the binary case too, but a single sigmoid
        # unit is the usual choice, so it is used here.
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation='sigmoid')
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation='softmax')

    def embed_with_bert(self, all_tokens):
        """Return BERT's per-token embeddings; plays the role of an embedding layer.

        Args:
            all_tokens: integer tensor of shape (batch, 3, seq_len) whose rows
                along axis 1 are the token ids, the mask and the segment ids.

        Returns:
            The second output of the BERT layer: one vector per token,
            shape (batch, seq_len, 768).
        """
        # The first output ('_') is the embedding of the whole sentence; here
        # we want the per-token embeddings instead.
        _, embs = self.bert_layer([all_tokens[:, 0, :],   # token ids
                                   all_tokens[:, 1, :],   # mask
                                   all_tokens[:, 2, :]])  # segment ids
        return embs

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: integer tensor of shape (batch, 3, seq_len), see
                `embed_with_bert`.
            training: forwarded to the dropout layer; when it is False no
                dropout is applied.

        Returns:
            Class probabilities: shape (batch, 1) for the binary (sigmoid)
            head, (batch, nb_classes) for the softmax head.
        """
        x = self.embed_with_bert(inputs)  # (batch, seq_len, 768)

        # Each kernel size is applied independently; the results are
        # concatenated afterwards.
        x_1 = self.bigram(x)    # (batch, seq_len - 2 + 1, nb_filters)
        x_1 = self.pool(x_1)    # (batch, nb_filters)

        x_2 = self.trigram(x)   # (batch, seq_len - 3 + 1, nb_filters)
        x_2 = self.pool(x_2)    # (batch, nb_filters)

        x_3 = self.fourgram(x)  # (batch, seq_len - 4 + 1, nb_filters)
        x_3 = self.pool(x_3)    # (batch, nb_filters)

        merged = layers.concatenate([x_1, x_2, x_3], axis=-1)  # (batch, 3 * nb_filters)

        merged = self.dense_1(merged)  # (batch, FFN_units)
        merged = self.dropout(merged, training=training)

        output = self.last_dense(merged)

        return output

# Step 4: Training

### Hyper Parameters

In [21]:
NB_FILTERS = 100 # filters per convolution (passed as nb_filters to the model)
FFN_UNITS = 256 # units of the hidden dense layer (passed as FFN_units)
NB_CLASSES = 2 # number of target classes; 2 selects the sigmoid head and the binary loss
DROPOUT_RATE = 0.25 # dropout rate applied after the hidden dense layer
NB_EPOCHS = 2 # number of passes over the training set

### Compile

In [22]:
# Build the model with the hyper parameters defined in the previous cell.
Dcnn = DCNN_BERT_Embedding(
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)


In [23]:
# The loss and metric must match the model head: a single sigmoid unit for two
# classes, a softmax over integer labels otherwise.
binary_task = NB_CLASSES == 2
Dcnn.compile(
    loss='binary_crossentropy' if binary_task else 'sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy' if binary_task else 'sparse_categorical_accuracy'])

### Checkpoints

In [24]:
# Directory (relative to the current working directory) where checkpoints are written.
checkpoint_path = "ckpt2"

# Track the model in a checkpoint object.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# The manager writes checkpoints into checkpoint_path and keeps only the most recent one.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint when one exists in checkpoint_path.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest chekpoint has been restored!")

### Custom callback

In [25]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint at the end of every epoch.

    `on_epoch_end` is a hook defined by tf.keras.callbacks.Callback; overriding
    it tells Keras what to do once an epoch has finished. The checkpoint is
    written through the notebook-level `ckpt_manager`.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Save the model checkpoint and report it; `epoch` and `logs` are not used."""
        ckpt_manager.save()
        print("Checkpoint saved!")

### Train

In [26]:
# Train on the training batches, evaluate on the held-out batches after every
# epoch, and save a checkpoint at the end of each epoch through the callback.
Dcnn.fit(train_data_set,
         validation_data=test_data_set,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/2
Checkpoint saved!
Epoch 2/2
Checkpoint saved!


<tensorflow.python.keras.callbacks.History at 0x7f529b69a2b0>