In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "drive/MyDrive/Projects/BERT/BERT_TOKENIZER"
# !wget "cs.stanford.edu/people/alecmgo/trainingandtestdata.zip"
# !unzip trainingandtestdata.zip

/content/drive/MyDrive/Projects/BERT/BERT_TOKENIZER


In [None]:
import numpy as np
import pandas as pd
import re
import math
import time
from bs4 import BeautifulSoup
import random

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/18/d3/820ccaf55f1e24b5dd43583ac0da6d86c2d27bbdfffadbba69bafe73ca93/bert-for-tf2-0.14.7.tar.gz (41kB)
[K     |████████                        | 10kB 20.6MB/s eta 0:00:01[K     |████████████████                | 20kB 16.5MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 15.0MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 14.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 6.5MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub

import bert

In [None]:
# The CSV ships without a header row, so the column names are supplied here.
columns = ["sentiment","id","date","query","user","text"]

# Reader options gathered in one place; the file is latin1-encoded.
csv_options = {
    "header": None,
    "names": columns,
    "engine": "python",
    "encoding": "latin1",
}
data = pd.read_csv("train.csv", **csv_options)

In [None]:
# Keep only the label and the tweet text. Rebinding (instead of inplace=True)
# together with errors="ignore" makes the cell safe to re-run: a second run no
# longer raises KeyError because the columns are already gone.
data = data.drop(columns=["id","date","query","user"], errors="ignore")

In [None]:
def clean_tweet(tweet):
    """Strip markup, mentions, URLs and non-letter noise from a raw tweet.

    Args:
        tweet: raw tweet text, possibly containing HTML entities or tags.

    Returns:
        The cleaned text containing only ASCII letters, the punctuation
        characters . ! ? ' and single spaces as separators.
    """
    # Let BeautifulSoup decode HTML entities and drop any tags.
    tweet = BeautifulSoup(tweet,'lxml').get_text()
    # Remove @mentions. Handles may contain underscores; without "_" in the
    # class, "@john_doe" would leave "doe" behind as a spurious word.
    tweet = re.sub(r"@[A-Za-z0-9_]+",' ',tweet)
    # Remove whole URLs. Matching up to the next whitespace avoids leaving
    # fragments behind for URLs containing "-", "_", "?", "=", "%" etc.
    tweet = re.sub(r"https?://\S+"," ",tweet)
    # Everything that is not a letter or one of . ! ? ' becomes a space.
    tweet = re.sub(r"[^a-zA-Z.!?']"," ",tweet)
    # Collapse the runs of spaces created by the substitutions above.
    tweet = re.sub(r" +",' ',tweet)
    return tweet

In [None]:
# Clean every tweet in the "text" column, preserving row order.
data_clean = list(map(clean_tweet, data['text']))

In [None]:
# Work on a copy of the label column: `.values` may be a view of the
# DataFrame's data, so assigning into it would silently rewrite
# data["sentiment"] (and fails outright when pandas hands back a read-only
# array under copy-on-write).
data_labels = data["sentiment"].values.copy()
# Recode label 4 as 1; every other label value is left unchanged.
data_labels[data_labels == 4] = 1

In [None]:
# Load the BERT layer from TF Hub (frozen) purely to get at the assets that
# are bundled with it: the vocabulary file and the lower-casing flag.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=False)

bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()

# Build the tokenizer from those assets.
FullTokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentences(sentence):
    """Tokenize `sentence` with the module-level tokenizer and return the
    list of vocabulary ids for the resulting tokens."""
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

In [None]:
# Encode every cleaned tweet into its list of token ids, preserving order.
data_inputs = list(map(encode_sentences, data_clean))

In [None]:
data_with_len = [ [sent,data_labels[i],len(sent)] for i,sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x : x[2])
sorted_all = [ (sent_lab[0],sent_lab[1]) for sent_lab in data_with_len
              if sent_lab[2] > 2]

In [None]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,output_types=(tf.int32,tf.int32))

In [None]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(
                batch_size=BATCH_SIZE,
                padded_shapes=((None,),())
)

In [None]:
NB_BATCHES = math.ceil(len(sorted_all)/BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_data_set = all_batched.take(NB_BATCHES_TEST)
train_data_set = all_batched.skip(NB_BATCHES_TEST)

In [None]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier over sequences of token ids.

    Token ids are embedded and fed to three parallel 1-D convolutions with
    kernel sizes 2, 3 and 4. Each convolution output is max-pooled over the
    sequence axis; the three pooled vectors are concatenated and passed
    through a dense layer, dropout and the output layer.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_dim: width of the token embeddings.
        nb_filters: number of filters of each convolution.
        FFN_units: number of units of the hidden dense layer.
        nb_classes: 2 selects a single sigmoid output unit; any other value
            selects a softmax over `nb_classes` units.
        dropout_rate: rate of the dropout applied after the hidden dense layer.
        training: unused by the constructor; kept so existing callers that
            pass it keep working.
        name: name given to the Keras model.
    """

    # Widest convolution kernel below. With 'valid' padding a sequence must be
    # at least this long, so shorter inputs are right-padded in call().
    _MIN_SEQ_LEN = 4

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name='dcnn'):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,emb_dim)

        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding='valid',
                                    activation='relu')

        self.trigram =  layers.Conv1D(filters=nb_filters,
                                      kernel_size=3,
                                      padding='valid',
                                      activation='relu')

        self.fourgram =  layers.Conv1D(filters=nb_filters,
                                       kernel_size=4,
                                       padding='valid',
                                       activation='relu')

        # Stateless, so one instance is shared by the three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=FFN_units,
                                    activation='relu')
        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,activation='sigmoid')
        else:
            self.last_dense = layers.Dense(units=nb_classes,activation='softmax')

    def call(self,inputs,training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences; the code indexes axis 1 as
                the sequence axis, i.e. (batch_size, seq_len).
            training: forwarded to the dropout layer.

        Returns:
            The output of the final dense layer.
        """
        # Right-pad with id 0 (the value padded_batch pads with by default)
        # when the sequence is shorter than the widest kernel; sequences that
        # are already long enough get a zero-width pad and are unchanged.
        shortfall = tf.maximum(0, self._MIN_SEQ_LEN - tf.shape(inputs)[1])
        inputs = tf.pad(inputs, [[0, 0], [0, shortfall]])

        x = self.embedding(inputs)

        x_1 = self.bigram(x)
        x_1 = self.pool(x_1) # (batch_size,nb_filters)

        x_2 = self.trigram(x)
        x_2 = self.pool(x_2) # (batch_size,nb_filters)

        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3) # (batch_size,nb_filters)

        merged = layers.concatenate([x_1,x_2,x_3],axis=-1) # (batch_size,3*nb_filters)

        merged = self.dense_1(merged)
        merged = self.dropout(merged,training=training)

        output = self.last_dense(merged)

        return output

In [None]:
# --- Model architecture ---
# The embedding table needs one row per entry of the tokenizer vocabulary.
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM, NB_FILTERS, FFN_UNITS = 200, 100, 256
NB_CLASSES = 2

# --- Training ---
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

In [None]:
# Build the classifier from the hyperparameters defined in the previous cell.
model_config = {
    "vocab_size": VOCAB_SIZE,
    "emb_dim": EMB_DIM,
    "nb_filters": NB_FILTERS,
    "FFN_units": FFN_UNITS,
    "nb_classes": NB_CLASSES,
    "dropout_rate": DROPOUT_RATE,
}
Dcnn = DCNN(**model_config)

In [None]:
# Pick the loss/metric pair that matches the output layer: binary
# cross-entropy for the two-class case, sparse categorical otherwise.
if NB_CLASSES == 2:
    loss_name, metric_name = 'binary_crossentropy', 'accuracy'
else:
    loss_name, metric_name = ('sparse_categorical_crossentropy',
                              'sparse_categorical_accuracy')

Dcnn.compile(loss=loss_name,
             optimizer='adam',
             metrics=[metric_name])

In [None]:
# Directory (relative to the working directory) that holds the checkpoints.
checkpoint_path = "ckpt"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

# Only the most recent checkpoint is kept on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep=1)

# Resume from the most recent checkpoint when the manager reports one.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint has been restored!")

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint through the module-level
    `ckpt_manager` at the end of every epoch and reports where it went."""

    def on_epoch_end(self,epoch,logs=None):
        ckpt_manager.save()
        saved_message = f"Checkpoint saved at {checkpoint_path}"
        print(saved_message)

In [None]:
# Train on the training split, checkpointing after every epoch.
fit_callbacks = [MyCustomCallback()]
Dcnn.fit(train_data_set, epochs=NB_EPOCHS, callbacks=fit_callbacks)

Epoch 1/5
Checkpoint saved at ckpt
Epoch 2/5
Checkpoint saved at ckpt
Epoch 3/5
Checkpoint saved at ckpt
Epoch 4/5
Checkpoint saved at ckpt
Epoch 5/5
Checkpoint saved at ckpt


<tensorflow.python.keras.callbacks.History at 0x7fbb328cdeb8>

In [None]:
def predict(example_sent):
    """Classify one sentence with the trained model and print the result.

    The sentence is encoded with `encode_sentences`, wrapped into a batch of
    one and passed to `Dcnn.predict`; the first output value is read as the
    probability of the positive class.

    Args:
        example_sent: the sentence to classify, as a string.

    Prints:
        "positive" when the predicted probability is at least 0.5,
        otherwise "negative".
    """
    token_ids = encode_sentences(example_sent)
    batch = np.expand_dims(token_ids,axis=0)
    probability = Dcnn.predict(batch)[0,0]
    # Threshold directly at 0.5. The previous floor(p * 2) trick produced 2
    # (reported as "negative") whenever the sigmoid saturated to exactly 1.0.
    if probability >= 0.5:
        print("positive")
    else:
        print("negative")

In [None]:
# Sanity check: a clearly negative sentence.
negative_example = "I am so sad"
predict(negative_example)

negative


In [None]:
# Sanity check: a clearly positive sentence.
positive_example = "He is my best friend"
predict(positive_example)

positive
