# Importing Dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/18/d3/820ccaf55f1e24b5dd43583ac0da6d86c2d27bbdfffadbba69bafe73ca93/bert-for-tf2-0.14.7.tar.gz (41kB)
[K     |████████                        | 10kB 16.4MB/s eta 0:00:01[K     |████████████████                | 20kB 21.0MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 12.9MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 10.2MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 4.5MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [3]:
# Ask the runtime for TensorFlow 2.x through the %tensorflow_version line magic.
# Any failure is swallowed on purpose so the imports below still run
# (presumably for environments where the magic is unavailable - TODO confirm).
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

# Data Loading

In [4]:
# Mount Google Drive; the dataset CSV and the checkpoint directory used below
# are both read from paths under /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# The CSV is read with header=None, so the column names are supplied explicitly.
cols = ["sentiment", "id", "date", "query", "user", "text"]
# The file is decoded as latin1 and parsed with pandas' Python engine.
data = pd.read_csv(
    "/content/drive/MyDrive/Project/BERT/sentiment_data/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [6]:
# Keep only the label and the tweet text.
# Selecting the wanted columns (rather than dropping the others in place) keeps
# this cell re-runnable: a second in-place drop would raise KeyError because
# the columns are already gone.
data = data[["sentiment", "text"]]

In [7]:
# Preview the first five rows of the remaining columns.
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


# Preprocessing

> Cleaning


In [8]:
def clean_tweet(tweet):
    """Normalize a raw tweet into text made of letters and basic punctuation.

    Steps, in order: extract the text from any HTML markup, replace @mentions
    with a space, replace URLs with a space, replace every character other
    than ASCII letters and ``. ! ? '`` with a space, then collapse runs of
    spaces into one.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet as a string.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @mentions. Handles may contain underscores; without "_" in
    # the class, "@foo_bar" would leave "bar" behind as a spurious word.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Removing the URL links, including path/query characters such as
    # - _ ~ % = & # + ? so that no URL fragments survive as pseudo-words.
    tweet = re.sub(r"https?://[A-Za-z0-9./\-_~%=&#+?]+", ' ', tweet)
    # Keeping only letters and the punctuation . ! ? '
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [9]:
# Run the cleaning function over every tweet, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [10]:
# Work on a copy of the label column: remapping through `.values` directly
# would silently rewrite `data` itself, and fails outright when pandas hands
# back a read-only view (copy-on-write mode).
data_labels = data.sentiment.values.copy()
data_labels[data_labels == 4] = 1  # changing from 4 to 1 (0=negative, 1=positive)



> Tokenization



We need to create a BERT layer to get access to the metadata required by the tokenizer (such as the vocabulary file and the lower-casing flag).

In [11]:
# Tokenizer class shipped with the bert-for-tf2 package.
FullTokenizer = bert.bert_tokenization.FullTokenizer
# Load the uncased BERT base module (L-12, H-768, A-12) from TensorFlow Hub.
# It is loaded with trainable=False and is only used here to read the
# tokenizer assets below.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Path of the vocabulary file bundled with the hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# Lower-casing flag stored alongside the module; passed to the tokenizer.
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [12]:
# Sanity check: the text is lower-cased and "strawberries" is split into two sub-word pieces.
tokenizer.tokenize("My dog loves strawberries.")

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [13]:
# Same sentence, with each token mapped to its integer vocabulary id.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog loves strawberries."))

[2026, 3899, 7459, 13137, 20968, 1012]

In [14]:
def encode_sentence(sent):
    """Turn a sentence into a list of vocabulary ids using the notebook's `tokenizer`."""
    word_pieces = tokenizer.tokenize(sent)
    return tokenizer.convert_tokens_to_ids(word_pieces)

In [15]:
# Encode every cleaned tweet as a list of token ids, preserving order.
data_inputs = list(map(encode_sentence, data_clean))



> Dataset Creation



We will create padded batches, i.e. sentences are padded per batch rather than across the whole dataset, so that as few padding tokens as possible are added. To do that, we sort the sentences by length, apply `padded_batch`, and then shuffle the batches.

In [16]:
# Pair every encoded sentence with its label and its token count.
data_with_len = [[tokens, label, len(tokens)]
                 for tokens, label in zip(data_inputs, data_labels)]
# Shuffle first so that, after the stable sort below, examples of equal length
# are not left in their original (sentiment-grouped) order.
random.shuffle(data_with_len)
# Order by the token count stored in the last slot.
data_with_len.sort(key=lambda entry: entry[2])
# Keep (tokens, label) pairs for sentences longer than 7 tokens only.
sorted_all = [(tokens, label)
              for tokens, label, length in data_with_len
              if length > 7]

In [17]:
# Wrap the (token ids, label) pairs in a tf.data.Dataset; both elements are read as int32.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [18]:
# Inspect the first (token ids, label) element of the dataset.
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([22555,  2001,  2725,  7929,  2077,  2059,  2205,  1012],
       dtype=int32)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [19]:
#Batch size is 32
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [20]:
# Inspect the first padded batch (token ids and labels).
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[22555,  2001,  2725,  7929,  2077,  2059,  2205,  1012],
        [15775,  2721, 12871,  2025,  2061, 12476,  4402,  4402],
        [ 7459,  1996,  2614,  1997,  1996, 10474,  2075,  5055],
        [12476,   999,  2017,  1005,  2222,  2031,  1037,  8479],
        [ 2672,  2183,  2000,  2166, 13102,  4140,  3892,  1012],
        [ 2074,  2587,  1998,  4033,  2102,  2288,  1037,  9789],
        [ 2009,  2356,  2033,  2000,  3443,  2019,  4070,  1012],
        [18072,  2129,  2017,  3110,  2651,  1029,  3866,  1029],
        [ 5409,  2739,  2412,  3806, 18886,  7315,  2003,  2067],
        [ 1045,  2113,  3599,  2054,  8038,  2812,  2611,  2666],
        [ 1996,  2914,  2003,  2024, 15180,  2229,  8462,   999],
        [ 2188,  2013,  2147,  2525,  2651,  2253,  3435,   999],
        [ 2339,  1996,  3109,  2572,  1045,  2145,  8300,  1029],
        [ 2074,  2985,  1997,  2619,  2842,  2015,  2769,  1012],
        [10474,  2772,  2039

In [21]:
#Number of batches
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
#Shuffling the batches
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

# Model Building

In [22]:
class DCNN(tf.keras.Model):
    """Text classifier built from parallel 1-D convolutions over token embeddings.

    Token ids are embedded, passed through three Conv1D branches with kernel
    sizes 2, 3 and 4, each branch is max-pooled over the sequence axis, and
    the pooled features are concatenated and fed to a dense layer, dropout and
    a final classification layer.

    Args:
        vocab_size: Number of distinct token ids (size of the embedding table).
        emb_dim: Dimension of the token embeddings.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Units of the hidden dense layer.
        nb_classes: Number of classes. With 2, the head is a single sigmoid
            unit; otherwise it is a softmax over `nb_classes` units.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        training: Unused; kept so existing callers that pass it keep working.
        name: Name of the Keras model.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # Convolutional layer (2 consecutive words)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        # Convolutional layer (3 consecutive words)
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        # Convolutional layer (4 consecutive words)
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        # Stateless pooling layer, shared by the three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary case: one probability for the positive class.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Integer token ids, shape (batch_size, seq_len). Because the
                convolutions use "valid" padding, seq_len must be at least 4
                (the largest kernel size).
            training: Forwarded to the dropout layer. Defaults to None so the
                model can also be called without the argument.

        Returns:
            Tensor of shape (batch_size, 1) for the binary head, or
            (batch_size, nb_classes) for the softmax head.
        """
        x = self.embedding(inputs)  # (batch_size, seq_len, emb_dim)
        x_1 = self.bigram(x)  # (batch_size, seq_len-1, nb_filters)
        x_1 = self.pool(x_1)  # (batch_size, nb_filters)
        x_2 = self.trigram(x)  # (batch_size, seq_len-2, nb_filters)
        x_2 = self.pool(x_2)  # (batch_size, nb_filters)
        x_3 = self.fourgram(x)  # (batch_size, seq_len-3, nb_filters)
        x_3 = self.pool(x_3)  # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

# Training

In [23]:
#Hyperparameters
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

In [24]:
# Build the model from the hyperparameters defined above.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [25]:
# The loss and metric follow the output head: binary cross-entropy for the
# two-class sigmoid head, sparse categorical cross-entropy otherwise.
is_binary = NB_CLASSES == 2
Dcnn.compile(
    loss="binary_crossentropy" if is_binary else "sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"] if is_binary else ["sparse_categorical_accuracy"],
)

In [26]:
# Directory on Drive where checkpoints are stored.
checkpoint_path = "/content/drive/MyDrive/Project/BERT/ckpt_bert_tok"

# Track the model in a checkpoint and keep only the most recent one on disk.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint when one exists.
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print("Latest checkpoint restored!!")

In [27]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint through `ckpt_manager` after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint and print the checkpoint directory."""
        ckpt_manager.save()
        message = "Checkpoint saved at {}.".format(checkpoint_path)
        print(message)

In [28]:
# Train on the training split; the callback saves a checkpoint after each epoch.
Dcnn.fit(train_dataset, epochs=NB_EPOCHS, callbacks=[MyCustomCallback()])

Epoch 1/5
  37196/Unknown - 3572s 96ms/step - loss: 0.4295 - accuracy: 0.8025Checkpoint saved at /content/drive/MyDrive/Project/BERT/ckpt_bert_tok.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fa744f23eb8>

# Evaluation

In [29]:
# Evaluate on the held-out batches; the result lists the loss followed by the
# metric configured in compile().
results = Dcnn.evaluate(test_dataset)
print(results)

[0.4268627166748047, 0.8121823668479919]


In [30]:
def get_prediction(sentence):
    """Print the model output and the predicted sentiment for one sentence.

    The sentence is encoded with `encode_sentence`, run through `Dcnn` in
    inference mode, and classified as positive when the sigmoid output is at
    least 0.5, negative otherwise. Intended for the binary (single sigmoid
    output) configuration of the model.

    Args:
        sentence: Raw text to classify.

    Returns:
        None. The result is printed.
    """
    tokens = encode_sentence(sentence)
    # The widest convolution spans 4 tokens with "valid" padding, so shorter
    # (or empty) inputs cannot be convolved. Right-pad them with id 0, the
    # default fill value of padded_batch.
    min_tokens = 4
    if len(tokens) < min_tokens:
        tokens = tokens + [0] * (min_tokens - len(tokens))
    inputs = tf.expand_dims(tokens, 0)

    output = Dcnn(inputs, training=False)

    # Threshold the probability directly. floor(output * 2) yields 2 when the
    # sigmoid saturates to exactly 1.0, which matched neither branch and made
    # the function print nothing for the most confident positive predictions.
    probability = float(tf.squeeze(output))
    sentiment = 1 if probability >= 0.5 else 0

    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(
            output))
    else:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [31]:
# Try the trained model on a hand-written example sentence.
get_prediction("This movie was pretty interesting.")

Ouput of the model: [[0.9994223]]
Predicted sentiment: positive.
