In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

import bert_tokenizer as tokenizer
from bert import tokenization
#from bert import bert_tokenization
from bert.tokenization import *







In [None]:


#import tensorflow as tf
# !pip install tensorflow==1.12.0
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert


In [None]:
# Data processing: read the raw tweet CSV. The file is read with header=None,
# so the column names are supplied explicitly.
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names=cols,
    header=None,
    encoding="latin1",
)
data.head()



In [None]:
# Discard the metadata columns in place; only "sentiment" and "text" remain.
data.drop(columns=["id", "date", "query", "user"], inplace=True)

In [None]:
# Preview the first five rows of the frame.
data.head(5)

In [None]:
#CLEANING
def clean_tweet(tweet):
    """Normalise a raw tweet into plain text ready for tokenization.

    Steps, in order:
      1. strip HTML markup and decode entities (BeautifulSoup, lxml parser);
      2. replace @mentions with a space;
      3. replace http/https URLs with a space;
      4. replace every character other than ASCII letters and . ! ? ' with a space;
      5. collapse runs of spaces into a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet as a string.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # The URL body must be a character class. Without the opening bracket the
    # pattern only matched the literal text "A-Za-z0-9]" and URLs were never
    # removed. "." and "/" are included so the host and path are consumed too.
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    tweet = re.sub(r" +", ' ', tweet)
    return tweet






In [None]:
DATASET CREATION

#We will create padded batches (so we pad sentences for each batch independently); this way we add the minimum number of padding tokens possible. For that, we sort sentences by length, apply padded_batch, and then shuffle.

In [None]:
# Run every tweet in the text column through clean_tweet.
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Take the sentiment column as an array and relabel every 4 as 1.
# NOTE(review): presumably the raw labels are 0/4, so this yields 0/1 targets — confirm against the dataset.
# NOTE(review): .values may share memory with `data`, in which case the frame's
# sentiment column is relabelled as well — verify if `data` is reused later.
data_labels = data.sentiment.values
data_labels[data_labels == 4] =1

In [None]:
# TOKENIZATION

In [None]:
# Load the frozen BERT layer from TF Hub and read the two assets needed to
# build a matching tokenizer: the vocabulary file and the lower-casing flag.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=False,
)
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
# Note: this rebinds the name `tokenizer`, which the import cell had bound to
# the bert_tokenizer module.
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def encode_sentence(sent):
    """Encode a sentence as vocabulary ids using the module-level ``tokenizer``.

    Args:
        sent: Text to encode.

    Returns:
        The result of ``tokenizer.convert_tokens_to_ids`` applied to the
        tokens produced by ``tokenizer.tokenize(sent)``.
    """
    word_pieces = tokenizer.tokenize(sent)
    return tokenizer.convert_tokens_to_ids(word_pieces)

In [None]:
# Encode every cleaned tweet into its sequence of token ids.
data_inputs = list(map(encode_sentence, data_clean))

In [None]:
# Pair every encoded sentence with its label and its token count.
data_with_len = [
    [token_ids, data_labels[idx], len(token_ids)]
    for idx, token_ids in enumerate(data_inputs)
]

# Shuffle first, then sort by length. list.sort is stable, so sentences of
# equal length keep the random relative order produced by the shuffle.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda entry: entry[2])

# Keep only sentences longer than 7 tokens, as (ids, label) pairs.
sorted_all = [
    (token_ids, label)
    for token_ids, label, n_tokens in data_with_len
    if n_tokens > 7
]


In [None]:
# Expose the length-sorted (ids, label) pairs as a tf.data dataset; both
# components are declared as int32.
all_dataset = tf.data.Dataset.from_generator(
    lambda: sorted_all,
    output_types=(tf.int32, tf.int32),
)






In [None]:
BATCH_SIZE = 32
# Batch with per-batch padding: the id sequence is padded along its single
# (variable-length) axis, while the label is a scalar and needs no padding.
all_batched = all_dataset.padded_batch(
    BATCH_SIZE,
    padded_shapes=((None,), ()),
)

In [None]:
# Number of batches in the whole dataset, and the tenth of them held out for testing.
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10

# Dataset.shuffle returns a NEW dataset, so its result must be kept. Discarding
# it left the batches in length order, making the test split nothing but the
# shortest sentences.
# reshuffle_each_iteration=False fixes the order across iterations, so take()
# and skip() below always see the same sequence and the two splits stay disjoint.
all_batched = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration=False)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [None]:
MODEL BUILDING

In [None]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier operating on sequences of token ids.

    Token ids are embedded, passed through three parallel 1-D convolutions
    (kernel sizes 2, 3 and 4), each globally average-pooled, concatenated,
    and fed through a dense layer, dropout and a final classification layer.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        """Create the layers of the network.

        Args:
            vocab_size: Size of the embedding vocabulary.
            emb_dim: Dimension of the token embeddings.
            nb_filters: Number of filters in each convolution.
            FFN_units: Units in the hidden dense layer.
            nb_classes: Number of target classes. With 2 classes the model
                ends in a single sigmoid unit, otherwise in a softmax over
                ``nb_classes`` units.
            dropout_rate: Dropout rate applied after the hidden dense layer.
            training: Unused; kept so existing callers keep working.
            name: Keras model name.
        """
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size, emb_dim)
        self.bigram = layers.Convolution1D(filters=nb_filters,
                                           kernel_size=2,
                                           padding="valid",
                                           activation="relu")
        self.trigram = layers.Convolution1D(filters=nb_filters,
                                            kernel_size=3,
                                            padding="valid",
                                            activation="relu")
        # Four-token window; it previously used kernel_size=3, which made it a
        # second trigram branch.
        self.fourgram = layers.Convolution1D(filters=nb_filters,
                                             kernel_size=4,
                                             padding="valid",
                                             activation="relu")

        # One pooling layer is shared by the three convolution branches.
        self.pool = layers.GlobalAvgPool1D()

        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")

        self.dropout = layers.Dropout(rate=dropout_rate)

        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences.
            training: Forwarded to the dropout layer to select training or
                inference behaviour.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)  # (batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis=-1)  # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output




In [None]:
#TRAINING

In [None]:
 VOCAB_SIZE = len(tokenizer.vocab)
EMB_DM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [None]:
# Build the model from the configured hyperparameters. FFN_UNITS is now passed
# through; it was defined but never used, so the hidden layer silently fell
# back to the constructor default of 512 units.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
# Pick the loss and metric that match the output layer: one sigmoid unit for
# the binary case, a softmax over NB_CLASSES units otherwise.
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",  # was "admin", which is not a Keras optimizer name
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [None]:
# Raw string: in a normal string literal "\U" starts a unicode escape, so the
# Windows path was a SyntaxError.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
checkpoint_path = r"C:\Users\F5390087\Downloads\Udemy Bert\ckpt_bert_tok"

# Track the model under the name it actually has (`Dcnn`; `Denn` was undefined).
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Restoring goes through the Checkpoint object; the manager only tracks which
# checkpoint is the latest.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored")

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint through ``ckpt_manager`` after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint and print the checkpoint path.

        ``epoch`` and ``logs`` are accepted to match the callback signature
        but are not used.
        """
        ckpt_manager.save()
        message = "Checkpoint saved at {}.".format(checkpoint_path)
        print(message)

In [None]:
# Train on the training split for NB_EPOCHS epochs, checkpointing after each
# epoch through the custom callback.
Dcnn.fit(
    train_dataset,
    callbacks=[MyCustomCallback()],
    epochs=NB_EPOCHS,
)

In [None]:
#EVALUATION

In [None]:
# Evaluate on the held-out batches and show the values returned by evaluate().
results = Dcnn.evaluate(test_dataset)
print(results)  # was print(resuts): NameError

In [None]:
def get_prediction(sentence):
    """Print the model output and the predicted sentiment for one sentence.

    The sentence is encoded with ``encode_sentence``, given a leading batch
    axis, and run through ``Dcnn`` in inference mode. A score of 0.5 or more
    is reported as positive, anything lower as negative.

    Args:
        sentence: Text to classify.

    Returns:
        None. The result is printed.
    """
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)  # add the batch axis

    output = Dcnn(inputs, training=False)

    # assumes the model is in its binary configuration (a single sigmoid
    # output for the one input sentence) — TODO confirm before using with
    # NB_CLASSES > 2.
    score = float(tf.squeeze(output).numpy())
    # Thresholding at 0.5 matches floor(score * 2) for scores below 1.0 and,
    # unlike it, still yields 1 when the score is exactly 1.0.
    sentiment = 1 if score >= 0.5 else 0

    if sentiment == 0:
        print("output of the model: {}\nPredicted sentiment: negative.".format(output))
    else:
        print("output of the model: {}\nPredicted sentiment: positive.".format(output))

In [None]:
# Try the prediction helper on a sample sentence.
get_prediction("this movie was pretty interesting")