In [None]:
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras import Input, Model
import gensim.downloader as api
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time

#Download the NLTK resources used later in this notebook: the English
#stop-word list (stopwords.words) and the 'punkt' models used for tokenizing
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
#Download the pre-trained GloVe word embeddings through gensim's downloader
#vocabulary of ~400k words, each mapped to a 100-dimensional vector
#(the Embedding layer built from it reports 40,000,200 parameters)
word_vec = api.load('glove-wiki-gigaword-100')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
def get_word_encoder(word_vec):
    """Build the word <-> integer-id lookup tables for the embedding vocabulary.

    Ids 0 and 1 are reserved: 0 maps to '' (padding) and 1 maps to 'UNK'
    (out-of-vocabulary words). Vocabulary words are numbered from 2 upwards in
    the vocabulary's own iteration order, which is the same order
    get_embeddings uses for the rows of the embedding matrix.

    Args:
        word_vec: gensim word vectors. Both the gensim >= 4 layout
            (`key_to_index`) and the older layout (`vocab`, optionally reached
            through `.wv`) are accepted.

    Returns:
        (word_to_enc, enc_to_word): dict word -> id and dict id -> word.
    """
    # gensim 4 renamed `vocab` to `key_to_index`; only fall back to the
    # deprecated `.wv` indirection when the object itself carries neither.
    keyed = word_vec
    if not (hasattr(keyed, 'key_to_index') or hasattr(keyed, 'vocab')):
        keyed = word_vec.wv
    vocab = getattr(keyed, 'key_to_index', None)
    if vocab is None:
        vocab = keyed.vocab

    # Single pass over the vocabulary fills both directions of the mapping.
    word_to_enc, enc_to_word = {}, {}
    for enc, word in enumerate(vocab, start=2):
        word_to_enc[word] = enc
        enc_to_word[enc] = word

    #0 is used for padding and 1 is used for out of vocab keywords
    word_to_enc[''], word_to_enc['UNK'] = 0, 1
    enc_to_word[0], enc_to_word[1] = '', 'UNK'

    return word_to_enc, enc_to_word

In [None]:
def get_embeddings(word_vec):
    """Build the embedding matrix that matches the ids from get_word_encoder.

    Row 0 (padding) and row 1 (out-of-vocabulary) stay all-zero; row i+2 holds
    the vector of the i-th vocabulary word in the vocabulary's iteration order.

    Args:
        word_vec: gensim word vectors. Both the gensim >= 4 layout
            (`key_to_index`) and the older layout (`vocab`, optionally reached
            through `.wv`) are accepted.

    Returns:
        numpy array of shape (vocabulary size + 2, vector width).
    """
    # gensim 4 renamed `vocab` to `key_to_index`; only fall back to the
    # deprecated `.wv` indirection when the object itself carries neither.
    keyed = word_vec
    if not (hasattr(keyed, 'key_to_index') or hasattr(keyed, 'vocab')):
        keyed = word_vec.wv
    vocab = getattr(keyed, 'key_to_index', None)
    if vocab is None:
        vocab = keyed.vocab
    words = list(vocab)

    # Take the vector width from the vectors themselves instead of assuming
    # 100; 100 remains the fallback for an empty vocabulary with no metadata.
    width = getattr(keyed, 'vector_size', None)
    if width is None:
        width = len(keyed[words[0]]) if words else 100

    embeddings = np.zeros((len(words) + 2, width))
    for row, word in enumerate(words, start=2):
        embeddings[row] = keyed[word]

    return embeddings

def create_embedding_layer(word_vec, embeddings, output_dimensions, trainable=False):
    """Create a Keras Embedding layer initialised with pre-trained vectors.

    Args:
        word_vec: the word vectors the matrix was built from. Kept for
            backward compatibility; the layer size is now read from
            `embeddings` so the two can never disagree.
        embeddings: 2-D matrix with one row per token id (see get_embeddings).
        output_dimensions: width of each embedding vector; must equal the
            number of columns of `embeddings`.
        trainable: whether the vectors are updated during training.

    Returns:
        A built Embedding layer whose weights are `embeddings`.

    Raises:
        ValueError: if `output_dimensions` does not match the matrix width.
    """
    embeddings = np.asarray(embeddings)
    vocab_rows, vector_width = embeddings.shape
    if output_dimensions != vector_width:
        raise ValueError(
            f'output_dimensions={output_dimensions} does not match the '
            f'embedding matrix width {vector_width}'
        )

    #Create an Embedding Layer
    embedding_layer = Embedding(
        input_dim = vocab_rows,
        output_dim = output_dimensions,
        trainable = trainable
    )
    # Build first, then assign the matrix: unlike the `weights=` constructor
    # argument this does not depend on a deprecated keyword.
    embedding_layer.build((1,))
    embedding_layer.set_weights([embeddings])

    return embedding_layer

In [None]:
#Build the embedding matrix, the word/id lookup tables and the Embedding layer
#that model() reads as a module-level name
embeddings = get_embeddings(word_vec)
word_to_enc, enc_to_word = get_word_encoder(word_vec)
#Take the vector width from the matrix instead of repeating the literal 100
embedding_layer = create_embedding_layer(word_vec, embeddings, embeddings.shape[1])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  if sys.path[0] == '':


In [None]:
#Load the dataset, which is stored in a pickle file
#NOTE(review): absolute path ('/content/...' looks like a Colab location) - adjust when running elsewhere
#NOTE(review): pickle.load can execute arbitrary code while unpickling - only open files from a trusted source
filepath = '/content/twitter_data.pickle'
with open(filepath, 'rb') as file:
    data = pickle.load(file)

In [None]:
#Display the loaded table (the recorded output shows the columns Target and Tweet)
data

Unnamed: 0,Target,Tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1048570,4,My GrandMa is making Dinenr with my Mum
1048571,4,Mid-morning snack time... A bowl of cheese noo...
1048572,4,@ShaDeLa same here say it like from the Termi...
1048573,4,@DestinyHope92 im great thaanks wbuu?


In [None]:
#Drop the first 550,000 rows to balance the classes of the dataset
#(presumably these are all negative tweets, i.e. the data is ordered by Target - verify)
#NOTE(review): the slice rebinds `data`, so re-running this cell drops another 550,000 rows.
#NOTE(review): the outputs recorded further down (positive ratio ~0.237, 1,048,575 samples
#across the splits) look like they come from a run where this drop was not applied - confirm.
data = data[550000:]

In [None]:
#Features are the raw tweet strings; labels are the Target column divided by 4
#(the displayed rows show Target values 0 and 4, which become 0.0 and 1.0)
x = np.array(data['Tweet'])
y = np.array(data['Target'])/4

In [None]:
#Fraction of the labels that belong to the positive class (label == 1)
print(np.count_nonzero(y == 1) / len(y))

0.2370607729537706


In [None]:
#Data cleaning and vectorizing
def clean_data(texts):
    """Normalise raw tweets before tokenisation.

    For every text: each character that is not a letter, digit or underscore
    is replaced by a space, runs of spaces are collapsed into a single space,
    and the result is lower-cased.

    The input is left untouched; a new array is returned. (Earlier versions
    rewrote `texts` element by element, silently changing the caller's data.)

    Args:
        texts: iterable of strings (list, numpy array, ...).

    Returns:
        numpy array with the cleaned strings, in input order. An object-dtype
        input array yields an object-dtype result, as before.
    """
    cleaned = []
    for text in texts:
        #Remove the special symbols, multiple spaces should be replaced by a single space
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r' +', ' ', text)
        #lower the letters of the words
        cleaned.append(text.lower())

    # Keep object dtype for object arrays so large corpora are not expanded
    # into a fixed-width unicode array.
    keep_object = isinstance(texts, np.ndarray) and texts.dtype == object
    return np.array(cleaned, dtype=object if keep_object else None)

#Convert the sentences into numbers for the embedding layer
#texts is the tweets and max_length is the max_length of the output array per sentence
def vectorizer(texts, max_length=50, remove_stopwords=True):
    """Encode each text as a fixed-length row of vocabulary ids.

    Every text is tokenised with nltk's word_tokenize. Tokens are looked up in
    the module-level `word_to_enc` table; tokens missing from it get id 1.
    Rows are truncated to `max_length` ids and right-padded with 0.

    Args:
        texts: iterable of (already cleaned) strings.
        max_length: number of ids per output row.
        remove_stopwords: when True, English stop words are skipped and do not
            occupy a slot in the row.

    Returns:
        int64 numpy array of shape (number of texts, max_length).
    """
    width = max(max_length, 0)

    #Remove stopwords as they don't convey much meaning.
    #works well when max_length is not to be kept too high
    stop_words = set(stopwords.words('english')) if remove_stopwords else frozenset()

    rows = []
    for sent in texts:
        ids = [0] * width
        slot = 0
        for token in word_tokenize(sent):
            # Row is full: stop explicitly instead of relying on IndexError.
            if slot >= width:
                break
            if token in stop_words:
                continue
            # 1 is the id reserved for out-of-vocabulary words
            ids[slot] = word_to_enc.get(token, 1)
            slot += 1
        rows.append(ids)

    # The explicit reshape keeps the result 2-D even when `texts` is empty.
    return np.array(rows, dtype=np.int64).reshape(len(rows), width)


In [None]:
def model(input_sequence_length):
    """Assemble the stacked-LSTM binary classifier.

    The network takes `input_sequence_length` int64 token ids per sample, looks
    them up in the module-level `embedding_layer`, passes them through three
    LSTM layers (dropout of 0.8 after the first two) and a 32-unit ReLU dense
    layer, and ends in a single sigmoid unit.

    Args:
        input_sequence_length: number of token ids per input sample.

    Returns:
        An uncompiled Keras Model named 'Classifier'.
    """
    token_ids = Input(shape=(input_sequence_length, ), dtype='int64')

    hidden = embedding_layer(token_ids)

    #1st and 2nd LSTM layers keep the full sequence; each is followed by dropout
    for units in (32, 16):
        hidden = LSTM(units, return_sequences=True)(hidden)
        hidden = Dropout(0.8)(hidden)

    #3rd LSTM layer returns only its final output
    hidden = LSTM(16, activation='relu')(hidden)

    #Dense Layer
    hidden = Dense(32, activation='relu')(hidden)

    #Output Layer
    probability = Dense(1, activation='sigmoid')(hidden)

    return Model(token_ids, probability, name='Classifier')

In [None]:
#Instantiate the classifier for inputs of 20 token ids per tweet
#(the same length is passed to vectorizer as max_length further down) and list its layers
sent_model = model(20)
sent_model.summary()

Model: "Classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        [(None, 20)]              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 20, 100)           40000200  
_________________________________________________________________
lstm_38 (LSTM)               (None, 20, 32)            17024     
_________________________________________________________________
dropout_26 (Dropout)         (None, 20, 32)            0         
_________________________________________________________________
lstm_39 (LSTM)               (None, 20, 16)            3136      
_________________________________________________________________
dropout_27 (Dropout)         (None, 20, 16)            0         
_________________________________________________________________
lstm_40 (LSTM)               (None, 16)                2

In [None]:
#split dataset into train(90%)-dev(7%)-val(3%) sets
#A fixed random_state makes the split reproducible across kernel restarts;
#stratifying on the labels keeps the class ratio the same in every subset.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42, stratify=y)
x_dev, x_val, y_dev, y_val = train_test_split(x_test, y_test, test_size=0.3, random_state=42, stratify=y_test)

In [None]:
#Positive-class fraction of each split, followed by the feature/label shapes
for _labels in (y_train, y_dev, y_val):
    print(list(_labels).count(1)/len(_labels))
for _features, _labels in ((x_train, y_train), (x_dev, y_dev), (x_val, y_val)):
    print(_features.shape, _labels.shape)

0.23701491019023713
0.23811989100817438
0.2359654142030644
(943717,) (943717,)
(73400,) (73400,)
(31458,) (31458,)


In [None]:
#Clean and vectorize the data
#Each split becomes an integer array with one row of 20 token ids per tweet.
#NOTE(review): x_train/x_dev/x_val are rebound from raw text to id arrays here, so this
#cell should not be re-run without first re-running the split cell above.
x_train = vectorizer(clean_data(x_train), max_length=20)
x_dev = vectorizer(clean_data(x_dev), max_length=20)
x_val = vectorizer(clean_data(x_val), max_length=20)

print(x_train.shape, x_dev.shape, x_val.shape)

(943717, 20) (73400, 20) (31458, 20)


In [None]:
#Train with SGD on binary cross-entropy and track accuracy plus the confusion-matrix counts.
#NOTE(review): TruePositives is counted at a 0.7 threshold while the other three metrics
#are created without a thresholds argument, so the four counts are not measured at the
#same decision threshold - confirm this is intended.
sent_model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy',
                                                                         tf.keras.metrics.TruePositives(thresholds=0.7),
                                                                         tf.keras.metrics.TrueNegatives(),
                                                                         tf.keras.metrics.FalseNegatives(),
                                                                         tf.keras.metrics.FalsePositives()])

In [None]:
#Train for 10 epochs in shuffled batches of 128, using the dev split as validation data
#(the returned History object is not stored; it only shows up as the cell's output)
sent_model.fit(x_train, y_train, epochs=10, batch_size=128, validation_data=(x_dev, y_dev), shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4c07d3efd0>

In [None]:
#Evaluate on the val split, which was not used for training or validation
#NOTE(review): the next cell reads metrics[2] and metrics[3] as the true-positive and
#true-negative counts - presumably evaluate() returns [loss, accuracy, TP, TN, FN, FP]
#in the order given to compile(); verify.
metrics = sent_model.evaluate(x_val, y_val, batch_size=64)



In [None]:
#Class balance of the val split and the share of each class counted as correct
_val_labels = list(y_val)
_n_val = len(x_val)
_n_pos, _n_neg = _val_labels.count(1), _val_labels.count(0)
print("Total elements: ", _n_val)
print("Total positives: ", _n_pos / _n_val * 100, "%")
print("Total Negatives: ", _n_neg / _n_val * 100, "%")
print("True Positive: ", metrics[2] / _n_pos * 100, "%")
print("True Negative: ", metrics[3] / _n_neg * 100, "%")

Total elements:  31458
Total positives:  23.59654142030644 %
Total Negatives:  76.40345857969356 %
True Positive:  30.08217701737842 %
True Negative:  92.17391304347827 %


In [None]:
#Show the model's sigmoid outputs for the val split next to the true labels
sent_model.predict(x_val), y_val

(array([[0.32798773],
        [0.71607465],
        [0.12307593],
        ...,
        [0.57409924],
        [0.4033209 ],
        [0.8392886 ]], dtype=float32), array([0., 1., 0., ..., 1., 1., 0.]))

In [None]:
#Make a prediction on a single tweet
def predict(model, text):
    """Score one tweet with `model`.

    The text is cleaned with clean_data, encoded by vectorizer into 20 token
    ids and reshaped into a single-row batch before calling model.predict.

    Args:
        model: object exposing a Keras-style predict(batch) method.
        text: the raw tweet as a string.

    Returns:
        Whatever model.predict returns for the one-row batch.
    """
    encoded = vectorizer(clean_data([text]), max_length=20)
    single_batch = np.reshape(encoded, (1, -1))
    return model.predict(single_batch)

def multi_predict(model, tweets, show_time=True, pos_margin=0.7, neg_margin=0.3, return_values=False):
    """Score a batch of tweets and summarise them as positive/neutral/negative.

    A score >= pos_margin counts as positive, a score in
    [neg_margin, pos_margin) as neutral, anything else as negative.

    Args:
        model: object exposing a Keras-style predict(batch) method.
        tweets: sized collection of raw tweet strings (list, numpy array, ...).
            It is copied before cleaning, so the caller's data is not modified.
        show_time: print the elapsed wall-clock time when True.
        pos_margin: lower bound of the positive band.
        neg_margin: lower bound of the neutral band.
        return_values: return the raw predictions instead of the counts.

    Returns:
        The prediction array when `return_values` is True, otherwise a dict
        with the keys "Positive Tweets", "Negative Tweets", "Neutral Tweets".
        An empty `tweets` yields an empty (0, 1) prediction / all-zero counts.
    """
    start = time.time()
    n_tweets = len(tweets)  # works for lists as well as arrays

    if n_tweets:
        # Copy first: clean_data may rewrite the sequence it is given.
        batch = np.array(tweets, dtype=object)
        prediction = model.predict(vectorizer(clean_data(batch), max_length=20))
    else:
        # Nothing to score; skip the model call, which rejects empty input.
        prediction = np.zeros((0, 1), dtype=np.float32)

    pos = neg = neutral = 0
    if not return_values:
        # Vectorised equivalent of the per-row if/elif/else classification;
        # skipped entirely when the caller only wants the raw scores.
        scores = np.asarray(prediction).reshape(-1)
        pos = int(np.count_nonzero(scores >= pos_margin))
        neutral = int(np.count_nonzero((scores >= neg_margin) & (scores < pos_margin)))
        neg = int(scores.size) - pos - neutral

    if show_time:
        print(f'Total Time taken for {n_tweets}: {time.time()-start}')

    if return_values:
        return prediction

    return {"Positive Tweets":pos, "Negative Tweets":neg, "Neutral Tweets":neutral}

In [None]:
#Build an end to end model
#Wraps sent_model in a new Model that takes the same 20-id integer input.
#NOTE(review): the wrapper still expects already-vectorized token ids; cleaning and
#vectorizing stay outside the model (see predict / multi_predict).
#NOTE(review): compiled with 'adam' here whereas sent_model was compiled with 'sgd';
#presumably this only matters if the wrapper is trained further - confirm.

input_data = Input(shape=(20,), dtype='int64')
preds = sent_model(input_data)
end_to_end_model = Model(input_data, preds, name='end_to_end_classifier')
end_to_end_model.summary()
end_to_end_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


Model: "end_to_end_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
Classifier (Functional)      (None, 1)                 40023049  
Total params: 40,023,049
Trainable params: 22,849
Non-trainable params: 40,000,200
_________________________________________________________________


In [None]:
#Example inputs for predict(); un-comment a different line to try another tweet
#test_data = "I am a very huge fan of the performing arts. i am very good at live performance"
#test_data = "You motherfucker son of a bitch. We all hate you"
#test_data = "I am very bad at predicting the mood swings of people"
test_data = "This is a generic tweet."

#A batch of 100 tweets for multi_predict(); np.array copies them out of `data`
tweets = np.array(data[100:200]['Tweet'])
print("Total Tweets: ", tweets.shape)

#With return_values=True multi_predict returns the raw scores instead of the counts
tweet_scores = multi_predict(end_to_end_model, tweets, pos_margin=0.5, neg_margin=0.5, return_values=True)
predict(end_to_end_model, test_data)


Total Tweets:  (100,)
Total Time taken for 100: 0.08659529685974121
[0.87100685] 2
[0.80267394] 5
[0.94494665] 24
[0.83507645] 29
[0.79905856] 41
[0.7000853] 47
[0.80914414] 51
[0.8387081] 52
[0.8860167] 65
[0.8946146] 69
[0.9692976] 80
[0.7392249] 82
[0.88025963] 87
[0.9175091] 91


In [None]:
#Build a fresh classifier with the same architecture; this rebinds sent_model,
#so the model trained above is no longer reachable through that name
sent_model = model(20)

In [None]:
#Load previously saved weights into the freshly built classifier
#NOTE(review): absolute path - the .h5 file must already exist at this location
sent_model.load_weights('/content/sentiment_classifier_v1.20.h5')

In [None]:
from systemml.mllearn import Keras2DML

ModuleNotFoundError: ignored

In [None]:
from systemml.mllearn im