In [52]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

import bert_tokenizer as tokenizer
from bert import tokenization
#from bert import bert_tokenization
from bert.tokenization import *


In [54]:

#import tensorflow as tf
# !pip install tensorflow==1.12.0
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert


In [66]:
#Data processing"
cols = ["sentiment","id","date","query","user","text"]
data = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None,
names=cols,
encoding="latin1")
data.head()



Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [67]:
data.drop(columns=["id","date","query","user"], inplace=True)

In [68]:
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [69]:
#CLEANING
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet,"lxml").get_text()
    # tweet = BeautifulSoup(tweet,'lxml')
    # tweet = tweet.get_text()
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ',tweet)
    tweet = re.sub(r"https?://A-Za-z0-9]+", ' ',tweet)
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ',tweet)
    tweet = re.sub(r" +", ' ',tweet)
    return tweet


DATASET CREATION

#We will create padded batches (so we pad sentences for each batch independetly),this way we add the minimum of padding tokens possible.For that,we sort sentences by lenghth ,apply padded_batches and then shuffle.

In [70]:
data_clean = [clean_tweet(tweet) for tweet in data.text]



In [71]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] =1

TOKENIZATION

In [72]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
#FullTokenizer = bert.tokenization.FullTokenizer(vocab_file)
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file,do_lower_case)

In [73]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [74]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

In [75]:
data_with_len = [[sent, data_labels[i],len (sent)]
                for i, sent in enumerate(data_inputs)]

random.shuffle(data_with_len)
data_with_len.sort(key=lambda x:x[2])
sorted_all = [(sent_lab[0],sent_lab[1])
for sent_lab in data_with_len if sent_lab[2] > 7]


In [76]:
all_dataset = tf.data.Dataset.from_generator(lambda:sorted_all,output_types=(tf.int32,tf.int32))



In [77]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE,padded_shapes=((None, ),()))

In [78]:
NB_BATCHES = math.ceil(len(sorted_all)/BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES//10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

MODEL BUILDING

In [95]:
class DCNN(tf.keras.Model):
    
    #vocab_size = 20000


    def __init__(self,vocab_size,emb_dim=128,nb_filters=50, FFN_units=512, nb_classes=2,dropout_rate=0.1,training= False,name = "dcnn"):
        
        super(DCNN,self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,emb_dim)
        self.bigram = layers.Convolution1D(filters=nb_filters,
                  kernel_size=2,
                  padding="valid",
                  activation="relu")
        self.trigram = layers.Convolution1D(filters=nb_filters,
                  kernel_size=3,
                  padding="valid",
                  activation="relu")
        self.fourgram = layers.Convolution1D(filters=nb_filters,
                  kernel_size=3,
                  padding="valid",
                  activation="relu")

        self.pool = layers.GlobalAvgPool1D()

        self.dense_1 = layers.Dense(units=FFN_units,activation="relu")

        self.dropout = layers.Dropout(rate= dropout_rate)

        if nb_classes ==2:

           self.last_dense = layers.Dense(units=1,activation="sigmoid")
        else:
           self.last_dense = layers.Dense(units=nb_classes,activation="softmax")


    def call(self,inputs,training):
        x= self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2= self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)  #(batch_size,nb_filters)

        merged = tf.concat([x_1,x_2,x_3],axis=-1) #batch_size, 3*nb_filters
        merged = self.dense_1(merged)
        merged = self.dropout(merged,training)
        output = self.last_dense(merged)

        return output




#TRAINING

In [96]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [97]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
        emb_dim=EMB_DM,
        nb_filters=NB_FILTERS,
        nb_classes=NB_CLASSES,
        dropout_rate=DROPOUT_RATE)

In [98]:
if NB_CLASSES ==2:
    Dcnn.compile(loss="binary_crossentropy",
    optimizer = "adam",
    metrics= ["accuracy"])

else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
    optimizer = "adam",
    metrics = ["sparse_categorical_accuracy"])

In [99]:

checkpoint_path = (r"C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok")

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep=1)


if ckpt_manager.latest_checkpoint:
    print ("in")
    ckpt_manager.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored")

In [100]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self,epoch,logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [101]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
Checkpoint saved at C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok.
Epoch 2/5
Checkpoint saved at C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok.
Epoch 3/5
Checkpoint saved at C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok.
Epoch 4/5
Checkpoint saved at C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok.
Epoch 5/5
Checkpoint saved at C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok.


<tensorflow.python.keras.callbacks.History at 0x1f91e813be0>

#EVALUATION

In [102]:
results = Dcnn.evaluate(test_dataset)
print(results)

[0.8800873160362244, 0.8241663575172424]


In [110]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens,0)

    output = Dcnn(inputs,training= False)

    sentiment = math.floor(output*2)

    if sentiment ==0:
        print("output of the model: {}\nPredicted sentiment: negative.".format(output))

    elif sentiment ==1:
        print("output of the model: {}\nPredicted sentiment: positive.".format(output))

In [111]:
print(get_prediction("this movie was pretty interesting"))



None


INPUTS 


We only use the first sentence for BERT inputs so we add the CLS token at the beginning and the SEP token at the end of each sentence

In [115]:
def encode_sentence(sent):
    return["[CLS]"] + tokenizer.tokenize(sent)+ ["[SEP]"]

In [113]:
data_inputs = [encode_sentence(sent) for sent in data_clean]




DATASET CREATION 

We will createvthe 3 different inputs for each sentence

In [114]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens,"[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id=0
    for tok in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_sig_id = 1-current_seg_id
    return seg_ids
    

We will create padded batches (so we pad sentences for each batch independently)this way we add the minimum of possible.For that,we get sentences by length,apply padded_batches and then shuffle