In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [3]:
columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

In [4]:
data = pd.read_csv('trainingandtestdata/training.1600000.processed.noemoticon.csv', 
                         header = None,
                         names = columns,
                         engine = 'python',
                         encoding = 'latin1')

In [5]:
zero =  data.loc[data['sentiment'] == 0].iloc[0:50000,:]

In [6]:
four =  data.loc[data['sentiment'] == 4].iloc[0:50000,:]

In [7]:
data = pd.concat([zero,four], ignore_index = True)

In [8]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [9]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [10]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [11]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [12]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

In [13]:
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [14]:
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [15]:
test_idx = np.random.randint(0, 50000, 2500)
test_idx = np.concatenate((test_idx, test_idx+50000))

In [16]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [43]:
class CNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="cnn"):
        super(CNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size, emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
        
        self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding="valid", activation="relu")
        
        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")
        
        self.pool = layers.GlobalMaxPool1D() 
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

In [44]:
VOCAB_SIZE = tokenizer.vocab_size

EMBEDDING_DIM = 200
NUM_FILTERS = 100
NUM_UNITS_FFN = 256
NUM_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

In [45]:
BATCH_SIZE = 32
EPOCHS = 5

In [46]:
cnn = CNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMBEDDING_DIM,
            nb_filters = NUM_FILTERS,
            FFN_units = NUM_UNITS_FFN,
            nb_classes = NUM_CLASSES,
            dropout_rate = DROPOUT_RATE)

In [47]:
if NUM_CLASSES == 2:
    cnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    cnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [24]:
cnn.fit(train_inputs,
         train_labels,
         batch_size = BATCH_SIZE,
         epochs = EPOCHS)

Train on 95120 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1398d23d0>

In [28]:
results = cnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE, verbose = 0)
# print(results)

In [29]:
results

[1.1356499167442322, 0.7632]

In [41]:
sentiment = cnn(np.array([tokenizer.encode("You are so funny")]), training=False).numpy()
if sentiment[0] >= 0.5:
    print('Positive ->', float(sentiment[0]))
else:
    print('Negative ->', float(sentiment[0]))

Positive -> 0.9997956156730652


In [42]:
sentiment = cnn(np.array([tokenizer.encode("You are so not funny, I hate it")]), training=False).numpy()
if sentiment[0] >= 0.5:
    print('Positive ->', float(sentiment[0]))
else:
    print('Negative ->', float(sentiment[0]))

Negative -> 0.003476840676739812
