# Stage 1: Importing dependencies

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l[K     |████████                        | 10 kB 24.4 MB/s eta 0:00:01[K     |████████████████                | 20 kB 9.1 MB/s eta 0:00:01[K     |███████████████████████▉        | 30 kB 6.3 MB/s eta 0:00:01[K     |███████████████████████████████▉| 40 kB 3.5 MB/s eta 0:00:01[K     |████████████████████████████████| 41 kB 141 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=adcc57143483ec2dfe3fff6a95c77ea8b833a01e9ec1607284a0f1362e772d42
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Buildin

In [None]:
try:
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [None]:
print(tf. __version__) 

2.7.0


# Stage 2: Data preprocessing

## Loading files

In this section the dataset is loaded into a pandas dataframe.

The **[Stock Market Sentiment Dataset](https://www.kaggle.com/yash612/stockmarket-sentiment-dataset)** is kindly provided on the Kaggle website by [Yash Chaudhary](https://www.kaggle.com/yash612). The dataset contains approximately 6,000 tweets regarding stocks, trading and economic forecasts from twitter. Each tweet was classified as having a **positive(1)** or **negative(0)** sentiment.

In [None]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#DATA - Stock Sentiment Analysis from Kaggle
data = pd.read_csv(
    "/content/drive/MyDrive/NLP/StockSentiment/stock_sentiment.csv",
    engine="python",
    encoding="latin1"
)

In [None]:
data.head(5)

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


## Preprocessing

### Cleaning

In order to understand the tweets, it needs to be cleaned, performing the following actions:


*   decoding the tweets from XML
*   removing mentions (starting with @-sign)
*   removing URL links
*   removing special characters
*   removing additional white spaces due to above functions






In [None]:
def clean_tweet(tweet):
    #decode tweets from XML
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Removing the URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keeping only letters
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

Use the function defined to clean the tweets in the dataset.  This will be the independent variable for the model. 

In [None]:
data_clean = [clean_tweet(tweet) for tweet in data.Text]

In [None]:
type(data_clean)

list

Create the data labels that classify the tweets as positive / negative.  These are the  targets / dependent variables to feed into the model.

In [None]:
data_labels = data.Sentiment.values

### Tokenization

Here we use the BERT tokeniser to generate tokens and token id's for each tweet.  We start by creating an instance of the class **FullTokenizer**.

Next we create a BERT layer to give us access to the information we need for the tokenizer.

Then we create the tokeniser from the class **FullTokenizer**. 

In [None]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
#call and create a BERT layer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)   #we don't want to fine tune right now

#Now get the information from the tokeniser           
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()  #access the vocab file
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()       #lower case the text
tokenizer = FullTokenizer(vocab_file, do_lower_case)                   #create tokeniser





































































































































































































































































A quick example to see how the tokeniser works

In [None]:
#some examples
tokenizer.tokenize('She has a very pleasant demeanor.')

['she', 'has', 'a', 'very', 'pleasant', 'demeanor', '.']

In [None]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('She has a very pleasant demeanor.'))

[2016, 2038, 1037, 2200, 8242, 21745, 1012]

Create a function that returns the token ids in a vector. 

In [None]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [None]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

In [None]:
type(data_inputs)

list

In [None]:
data_inputs[0]

[22652,
 2015,
 2006,
 2026,
 3422,
 9863,
 8418,
 3207,
 14841,
 2102,
 2061,
 4160,
 1052,
 8950,
 18133,
 2860,
 17531,
 2480,
 19128,
 3119,
 4118,
 2030,
 4118,
 2156,
 3653,
 2615,
 8466]

### Dataset creation

Next up we need to create batches of tweets that are padded to have the same lenght.  

To minimise the padding tokens, we will not padd all tweets to the same lenght.  Rather we will order tweets by lenght, then create batches and padd each batch to be as wide as the longest tweet in the batch.  

In [None]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]

random.shuffle(data_with_len)                               #shuffle data
data_with_len.sort(key=lambda x: x[2])                      #sort according to length
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 3] #keep only tweets 

Create a dataset using a generator (because the tweets are not all the same length)

In [None]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
next(iter(all_dataset))

(<tf.Tensor: shape=(4,), dtype=int32, numpy=array([5887, 2705, 2844, 2707], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [None]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()), drop_remainder=True)

In [None]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 4), dtype=int32, numpy=
 array([[ 5887,  2705,  2844,  2707],
        [ 9779,  2361,  2733,  2490],
        [ 1999, 16216,  6767,  1012],
        [ 1050,  4292,  2039, 17950],
        [ 9574,  2825, 25129,  2377],
        [ 4654,  2232,  2058,  1012],
        [10705,  2072,  2058,  1012],
        [ 2915,  2012,  2072,  1012],
        [10210,  2243,  3048,  2153],
        [27937,  2023,  2003,  2009],
        [18629,  2559,  3492,  9200],
        [20704,  2100,  2058,  1012],
        [ 3729,  2595,  2146,  1012],
        [ 1049,  1996, 25129,  7172],
        [ 9779, 10882,  5910,  2814],
        [14925,  3835,  2275,  2039],
        [ 2045,  3632, 27937,  1012],
        [ 2146,  9779,  2361,  1012],
        [ 1050,  2546,  2595,  9577],
        [ 2019,  2635,  2062,  2125],
        [ 8292,  2078,  4911,  2041],
        [10381,  2243,  2058,  1012],
        [ 1049,  8117,  5210,  1012],
        [22038,  2401,  2058,  1012],
        [ 4562,  5210,  1029,  2013],
  

Get the number of batches in the dataset.

Determine number of test- & train sets (10% / 90% for this example).

Shuffle the batched datasets.

Create the test- and train datasets.

In [None]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)    #nr of batches
NB_BATCHES_TEST = NB_BATCHES // 10                      #for test data
all_batched.shuffle(NB_BATCHES)                         #shuffle data
test_dataset = all_batched.take(NB_BATCHES_TEST)        #create test dataset using .take
train_dataset = all_batched.skip(NB_BATCHES_TEST)       #create training dataset using .skip

Extract from the test dataset the target values for later comparisons.  

In [None]:
y_test = np.concatenate([element[1] for element in test_dataset.as_numpy_iterator()])
y_test

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,

In [None]:
type(y_test)

numpy.ndarray

In [None]:
NB_BATCHES_TEST

18

In [None]:
test_dataset.take(1)

<TakeDataset shapes: ((32, None), (32,)), types: (tf.int32, tf.int32)>

In [None]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 4), dtype=int32, numpy=
 array([[ 5887,  2705,  2844,  2707],
        [ 9779,  2361,  2733,  2490],
        [ 1999, 16216,  6767,  1012],
        [ 1050,  4292,  2039, 17950],
        [ 9574,  2825, 25129,  2377],
        [ 4654,  2232,  2058,  1012],
        [10705,  2072,  2058,  1012],
        [ 2915,  2012,  2072,  1012],
        [10210,  2243,  3048,  2153],
        [27937,  2023,  2003,  2009],
        [18629,  2559,  3492,  9200],
        [20704,  2100,  2058,  1012],
        [ 3729,  2595,  2146,  1012],
        [ 1049,  1996, 25129,  7172],
        [ 9779, 10882,  5910,  2814],
        [14925,  3835,  2275,  2039],
        [ 2045,  3632, 27937,  1012],
        [ 2146,  9779,  2361,  1012],
        [ 1050,  2546,  2595,  9577],
        [ 2019,  2635,  2062,  2125],
        [ 8292,  2078,  4911,  2041],
        [10381,  2243,  2058,  1012],
        [ 1049,  8117,  5210,  1012],
        [22038,  2401,  2058,  1012],
        [ 4562,  5210,  1029,  2013],
  

# Stage 3: Model building

Create the model with three different convolutional filter sizes of two, three and four.

In [None]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,       #embedding size for the vectors
                 nb_filters=50,     #conv filters for each size 50x2, 50x3, 50x4
                 FFN_units=512,     #hidden units used in dense layers
                 nb_classes=2,      #nr of classes in our data - 1/0
                 dropout_rate=0.1,  #dropout
                 training=False,    #know if we are training/not to apply dropout/not
                 name="dcnn"):      #model name

        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)      #created for this model

        #bigram layer focusing on 2 consecutive words
        #1D because width of feature detector is the same as embedding dimension of vector
        self.bigram = layers.Conv1D(filters=nb_filters, 
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        
        #trigram layer focusing on 3 consecutive words
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        
        #fourgram layer focusing on 4 consecutive words
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        
        #takes the max of each output
        self.pool = layers.GlobalMaxPool1D()

        #Dense layer 1
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")

        #dropout shuts down some neurons to keep from overfitting
        self.dropout = layers.Dropout(rate=dropout_rate)

        #output layer 
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)

        x_1 = self.bigram(x)    # (batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1)    # (batch_size, nb_filters)

        x_2 = self.trigram(x)   # (batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2)    # (batch_size, nb_filters)

        x_3 = self.fourgram(x)  # (batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3)    # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

Hyperparameters & Other Info

In [None]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.5

NB_EPOCHS = 5

In [None]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [None]:
checkpoint_path = "./drive/My Drive/projects/BERT/ckpt_bert_tok/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

Latest checkpoint restored!!


In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [None]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
    160/Unknown - 4s 19ms/step - loss: 0.0025 - accuracy: 0.9994Checkpoint saved at ./drive/My Drive/projects/BERT/ckpt_bert_tok/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe5e5e1d410>

In [None]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_23 (Embedding)    multiple                  6104400   
                                                                 
 conv1d_69 (Conv1D)          multiple                  40100     
                                                                 
 conv1d_70 (Conv1D)          multiple                  60100     
                                                                 
 conv1d_71 (Conv1D)          multiple                  80100     
                                                                 
 global_max_pooling1d_23 (Gl  multiple                 0         
 obalMaxPooling1D)                                               
                                                                 
 dense_46 (Dense)            multiple                  77056     
                                                              

# Stage 5: Evaluation

In [None]:
results = Dcnn.evaluate(test_dataset)
print(results)

[0.9832169413566589, 0.859375]


In [None]:
results

[0.9780955910682678, 0.8524305820465088]

In [None]:
y_pred = Dcnn.predict(test_dataset)

In [None]:
type(y_pred)

numpy.ndarray

In [None]:
predictions = []
for i in list(y_pred):
    #print(i)
    #print(math.floor(i))
    #print(math.floor(i*2))
    if i <1.0:
        predictions.append(math.floor(i*2))
    else:
        predictions.append(math.floor(i))


In [None]:
type(predictions)

list

In [None]:
cm = confusion_matrix(y_test, predictions)
print(cm)

[[143  37]
 [ 44 352]]


In [None]:
acc = accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions, multi_class="ovr")
print("Accuracy:",  round(acc,2))
print("Auc:", round(auc,2))
print("Detail:")
print(classification_report(y_test, predictions))

Accuracy: 0.86
Auc: 0.84
Detail:
              precision    recall  f1-score   support

           0       0.76      0.79      0.78       180
           1       0.90      0.89      0.90       396

    accuracy                           0.86       576
   macro avg       0.83      0.84      0.84       576
weighted avg       0.86      0.86      0.86       576



# Stage 6: Predictions for individual tweets

This section create a function to make predictions with and feed some recent tweets into the model to evaluate how well the model interprets recent tweets.  

Understanding incorrect predictions are just as important as understanding correct predictions.  Obtaining more data to test predictions will be a next phase in understanding and interpreting the model's behaviour.  

In [None]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)   #simulate a batch (of 1)

    output = Dcnn(inputs, training=False)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Ouput of the model: {}\nPredicted sentiment: negative.".format(
            output))
    elif sentiment == 1:
        print("Ouput of the model: {}\nPredicted sentiment: positive.".format(
            output))

## Correct predictions 

In [None]:
get_prediction("The market size and demographics, as we wrote about here, leading into the IPO, are attractive.")

Ouput of the model: [[0.9898499]]
Predicted sentiment: positive.


In [None]:
get_prediction("DraftKings would still be losing $200 million a quarter,” Chanos said. “That is completely and totally insane.” He said has been short the stock for most of this year.")

Ouput of the model: [[2.3337441e-05]]
Predicted sentiment: negative.


In [None]:
get_prediction("Just a reminder that our debt bomb not going away. A year ago it was R2,9 trillion. At some stage this becomes a major problem...")

Ouput of the model: [[0.01473029]]
Predicted sentiment: negative.


In [None]:
get_prediction("Now track SA under the ANC and understand why R300bn has left the JSE bond and equity market this year.")

Ouput of the model: [[0.1265002]]
Predicted sentiment: negative.


## Incorrect predictions

In [None]:
get_prediction("I see Karoooo up 6.3% this morning. A closer look shows stock up on trades of 72 shares worth all of R41200. Serious. The gain though moves the market cap up R1bn. The fun you can have in illiquid markets!!")

Ouput of the model: [[0.20105565]]
Predicted sentiment: negative.


In [None]:
get_prediction("International investors in SA rail sector threaten to disinvest")

Ouput of the model: [[0.9941408]]
Predicted sentiment: positive.
