<a href="https://colab.research.google.com/github/ArazShilabin/SA_using_bert_tokenizer/blob/main/SA_bert_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### mount google drive

In [1]:
""" 
Use this javascript code in inspect>console so you wont need to click the page every 15 min:

########################
function ConnectButton(){
    console.log("Connect pushed"); 
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
setInterval(ConnectButton,60000);
########################

"""
from google.colab import drive
drive.mount('/content/drive')
%pwd


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content'

### change current path to where the working project folder is at

In [2]:
%cd drive/MyDrive/projects/bert_tokenizer_SA/
%pwd

/content/drive/MyDrive/projects/bert_tokenizer_SA


'/content/drive/MyDrive/projects/bert_tokenizer_SA'

# Step 0: Get The Data

### upload the data to our current path and unzip it (uncomment and run this only once)

In [3]:
# # data is from: http://help.sentiment140.com/for-students you can use this or just upload your own data
# %cd data
# !wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip # download
# !unzip trainingandtestdata.zip # unzip the downloaded file
# %cd .. 
# %pwd

# Step 1: Importing Dependencies

### Imports

In [4]:
import numpy as np
import pandas as pd
import re
import math
import time
from bs4 import BeautifulSoup # we use this library to turn the tweets to texts
import random

### Install Bert

In [5]:
# this is not the official one, it's a lighter one
!pip install bert-for-tf2
!pip install sentencepiece



### Add tensorflow packages and Bert

In [6]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub # all the pretrained models are installed in tensorflow hub
import bert

# Step 2: Data Preprocessing

## A) Load Data

In [7]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "data/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python", # we need this for this specific data
    encoding="latin1") # the encoded format

data.drop(["id","date","query","user"], axis=1, inplace=True) # drop the useless cols
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## B) Clean

In [8]:
def clean_tweet(tweet):
    # the lmxl is the encoding it has, so beautifulsoup turns it to readable txt
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # remove the @'s (*: appear 0 or more)
    tweet = re.sub(r"@[A-Za-z0-9]*", ' ', tweet)
    # remove "http://" or "https://" (this s? means that 's' can be or not)
    tweet = re.sub(r"https?://[A-Za-z0-9]*", ' ', tweet)
    # get rid of everything that is not a letter or ".!?'"
    tweet = re.sub(r"[^a-zA-Z.!?']",' ', tweet)
    # replace >=2 white spaces with one (+: appear at least once)
    tweet = re.sub(" +", ' ', tweet)
    return tweet

In [9]:
data_clean = [clean_tweet(tweet) for tweet in data['text']]
data_labels = data['sentiment'].values
data_labels[data_labels == 4] = 1 # for some reason the labels instead of 0's and 1's, they are 0's and 4's so we turn the 4's to 1's

## C) Tokenization (using Bert)

### Load tokenizer

In [10]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
# all the pretrained models are installed in tensorflow hub, we can find many 
# diffenet kinds in Hub, we can choose differet versions from hub, now we 
# got the weights of the layer (these are the SavedModels)
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)

# get the vocabulary from the layer
vocab_file =  bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy() # do lowercase
tokenizer = FullTokenizer(vocab_file, do_lower_case) # get the tokenizer


In [11]:
def encode_sentences(sentence): # e.g. 'tensorflow is' -> [23435, 12314, 2003]
    # the full explenation is in the 'Test our tokenizer' section
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))

In [12]:
data_inputs = [encode_sentences(sentence) for sentence in data_clean] # encode inputs

### Test our tokenizer

In [13]:
test_text = "Tensorflow is great, ain't it?"
print(tokenizer.tokenize(test_text))
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_text)))


['tensor', '##flow', 'is', 'great', ',', 'ain', "'", 't', 'it', '?']
[23435, 12314, 2003, 2307, 1010, 7110, 1005, 1056, 2009, 1029]


## D) Dataset creation

### Shuffle and get rid of small datasets

In [14]:
# we get [text_encoded, label, len_text] we want this for padding purposes later
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len) # we shuffle the data because it's labels were 000....0011....111
data_with_len.sort(key=lambda x: x[2]) # we sort it based on lenghts
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 3] # short sentences probably have no sentiment

### Dataset generator (because our data isn't the same size, needs padding)

In [15]:
all_dataset = tf.data.Dataset.from_generator(generator=lambda: sorted_all, # generator needs to be callable
                                             output_types=(tf.int32,tf.int32)) # (inputs, labels) both are int
print(type(all_dataset))
print(next(iter(all_dataset)))

<class 'tensorflow.python.data.ops.dataset_ops.FlatMapDataset'>
(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 2026,  2132, 13403,  1012], dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)


### Batching & Padding

In [16]:
BATCH_SIZE = 32
# turn it to padded
all_batched = all_dataset.padded_batch(
                batch_size=BATCH_SIZE,
                padded_shapes=((None,),())
                # None here means pad the first dim and keep the same vals for the next dims.
                # for output we dont want to use it for padding so we choose it to be empty
)
print(type(all_batched))
print(next(iter(all_batched)))
# here they are all size 4 because it's the first batch, but later there
# will be a mixture of size for example 3&4 so the 3's will be padded to 4

<class 'tensorflow.python.data.ops.dataset_ops.PaddedBatchDataset'>
(<tf.Tensor: shape=(32, 4), dtype=int32, numpy=
array([[ 2026,  2132, 13403,  1012],
       [ 6160,  3582, 19338,  2063],
       [ 2039,  2001,  7078,  6429],
       [ 2017, 20482,  8808,  1029],
       [ 2204,  2851, 10474,  2155],
       [ 2074,  2288,  8412,  7136],
       [ 5404,  4989, 19237,  2295],
       [ 3335,  2078, 22794, 22017],
       [ 2003,  5962,  2000,  2189],
       [ 1045,  2540,  2703,  5232],
       [ 2746,  2091,  2007,  2242],
       [ 1045,  2342,  8771,  2205],
       [ 2012,  2026,  4624,  2551],
       [ 2003,  1999,  3009,  3499],
       [ 1045,  2439,  2026,  4950],
       [ 1060,  2033,  2205,  2080],
       [ 2307, 19431,  4037,   999],
       [ 1043,  2305,   999,  1060],
       [ 2107,  1037,  4030,  2154],
       [ 2053,  2028,  2182,  1029],
       [ 2204,  2851,  3071,   999],
       [ 5477,  2197,  4957,  3631],
       [ 1045,  2342,  1037,  2166],
       [ 1045,  2572,  2085,   99

### train-test split for batches

In [17]:
NB_BATCHES = math.ceil(len(sorted_all)/BATCH_SIZE) # get number of batches (ceil(3.2)==4)
NB_BATCHES_TEST = NB_BATCHES // 10 # we just want the 10% of the data for testing purposes
all_batched.shuffle(NB_BATCHES) # shuffle it (we give it NB_BATCHES, and not NB_BATCHES*batch_size because we want to keep the integrity of each batch)
test_data_set = all_batched.take(NB_BATCHES_TEST) # take the fist 10% as test
train_data_set = all_batched.skip(NB_BATCHES_TEST) # skip the 10% to get the other 90% as train

# Step 3: Model Building

In [18]:
class DCNN(tf.keras.Model):
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name='dcnn'):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size, emb_dim)
        
        # bigram is conv on 2-grams (kernel_size=2)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding='valid',
                                    activation='relu')

        self.trigram =  layers.Conv1D(filters=nb_filters,
                                      kernel_size=3,
                                      padding='valid',
                                      activation='relu')
        
        self.fourgram =  layers.Conv1D(filters=nb_filters,
                                       kernel_size=4,
                                       padding='valid',
                                       activation='relu')
        
        self.pool = layers.GlobalMaxPool1D()     

        self.dense_1 = layers.Dense(units=FFN_units,
                                    activation='relu')
        self.dropout = layers.Dropout(rate=dropout_rate)

        # u could use softmax for binary too, but sigmoid with only 1 unit is
        # used always so why not just use it here too...
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation='sigmoid')
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation='softmax')

    def call(self, inputs, training):
        # training is boolean, when it's false we don't do dropout
        x = self.embedding(inputs) # (batch_size, d_embeded(128))

        # we get outputs for all 3 kernelsizes independentaly (we concat them later)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1) # (batch_size, nb_filters, 128-2+1)

        x_2 = self.trigram(x)
        x_2 = self.pool(x_2) # (batch_size, nb_filters, 128-3+1)

        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3) # (batch_size, nb_filters, 128-4+1)

        merged = layers.concatenate([x_1,x_2,x_3], axis=-1) # (batch_size, 3 * nb_filters, (128*3-2-3-4+3))

        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        
        output = self.last_dense(merged)

        return output

# Step 4: Training

### Hyper Parameters

In [19]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHS = 1

### Compile

In [20]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)


In [21]:
if NB_CLASSES == 2:
     Dcnn.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])    
else:
     Dcnn.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['sparse_categorical_accuracy'])

### Checkpoints

In [22]:
checkpoint_path = "ckpt"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest chekpoint has been restored!")

Latest chekpoint has been restored!


### Custom call back

In [23]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    # we basically tell tensorflow what it should print or do after each epoch ends
    # this on_epoch_end actually exists in tf.keras.callbacks so we just call it
    # that MyCustomCallback has inherited
    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print(f"Checkpoint saved!")

### Train

In [24]:
Dcnn.fit(train_data_set,
         validation_data=test_data_set,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Checkpoint saved!


<tensorflow.python.keras.callbacks.History at 0x7f9dc3ccf748>

# Step 5: Evaluation

In [40]:
def predict(example_sentence):
    example_sentence = encode_sentences(example_sentence)
    # we add this extra dim to simulate the batch
    example_sentence = np.expand_dims(example_sentence,axis=0)
    prediction = Dcnn(example_sentence, training=False)
    # the prediction will be between 0 to 1 ([0 to 0.5] is 0 AND [0.5 to 1] is 1)
    # so we mult it by 2 then do floor to either get 0 or 1
    Sentiment = math.floor(prediction*2)
    if Sentiment >= 1:
        print(f"positive - certainty: {(prediction - 0.5) * 2.0}")
    else:
        print(f"negative - certainty: {(0.5 - prediction) * 2.0}")

In [41]:
negative_example = "The world is ending"
predict(negative_example)

negative - certainty: [[0.8656031]]


In [51]:
negative_example = "this world is a great place to live in"
predict(negative_example)

positive - certainty: [[0.9342562]]
