# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!gdown 'https://drive.google.com/uc?id=1JffwGvZ-LAx_Cq4Bb4quyJAMsEf63vVz'

Downloading...
From: https://drive.google.com/uc?id=1JffwGvZ-LAx_Cq4Bb4quyJAMsEf63vVz
To: /content/trainingandtestdata.zip
81.4MB [00:01, 55.5MB/s]


In [3]:
!unzip trainingandtestdata.zip

Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


# Data Preparation

In [4]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
# tensorflow_datasets contains tokenizer function

In [5]:
cols = ["sentiment", "id", "date", "query", "user", "text"]

train_data = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None, names=cols, engine="python", encoding="latin1")
test_data = pd.read_csv("testdata.manual.2009.06.14.csv", header=None, names=cols, engine="python", encoding="latin1")

data = train_data

In [6]:
print(len(data))
data.head()

1600000


Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
data.drop(["id", "date", "query", "user"], axis=1, inplace=True)
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", " ", tweet)
  # get rid of all mentions in the tweet
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", " ", tweet)
  # get rid of all urls
  tweet = re.sub(r"[^a-zA-Z.!?']", " ", tweet)
  # to only keep letters and standard punctuation
  tweet = re.sub(r" +", " ", tweet)
  # replace multiple whitespaces by a single whitespace
  return tweet

In [9]:
data_clean = [clean_tweet(tweet) for tweet in data.text]
print(data_clean[0])

 Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D


In [10]:
data_labels = data.iloc[0:, 0:1]
data_labels = np.array(data_labels)
data_labels[data_labels == 4] = 1
# data_labels[data_labels == 2] = 1

In [11]:
print(np.unique(data_labels))
print(len(data_labels))
print(len(data_clean))

[0 1]
1600000
1600000


In [12]:
# converting words into numbers using tensorflow tokenizer

tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [13]:
print(len(data_inputs[0]))
print(data_inputs[0])

21
[65316, 1570, 113, 65323, 10, 6, 3553, 1, 135, 5262, 50, 1484, 38165, 16, 13337, 606, 2, 49, 33, 1, 65352]


In [14]:
# Add padding to make length of all sentences equal

MAX_LEN = max([len(sentence) for sentence in data_inputs])

# this makes all our sentences of length=MAX_LEN by adding 0s at the end
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [15]:
print(len(data_inputs[0]))
print(MAX_LEN)
print(data_inputs[0])

73
73
[65316  1570   113 65323    10     6  3553     1   135  5262    50  1484
 38165    16 13337   606     2    49    33     1 65352     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]


In [16]:
# splitting dataset into train and test set

test_idx = np.random.randint(0, 800000, 8000)
# 8000 random negative tweets stored in test_idx right now

test_idx = np.concatenate((test_idx, test_idx+800000))
# this combines 8000 random negative tweets and 8000 random positive tweets to give us total 16000 testing tweets

test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]

train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx, axis=0)

print(len(test_inputs))
print(len(test_labels))
print(len(train_inputs))
print(len(train_labels))

16000
16000
1584082
1584082


# Model

In [17]:
class DCNN(tf.keras.Model):
  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters=50,
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name="dcnn"):
    super(DCNN, self).__init__(name=name)

    # embedding layer
    self.embedding = layers.Embedding(vocab_size, emb_dim)

    # this create feature maps to find similarity between 2 simultaneous words(dimension of filter map=2xemb_dim)
    # we use 1d convolutions since width of feature maps is same as width of input matrix
    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
    self.pool_1 = layers.GlobalMaxPool1D()
    # this create feature maps to find similarity between 3 simultaneous words(dimension of filter map=3xemb_dim)
    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding="valid", activation="relu")
    self.pool_2 = layers.GlobalMaxPool1D()
    # this create feature maps to find similarity between 4 simultaneous words(dimension of filter map=4xemb_dim)
    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")
    self.pool_3 = layers.GlobalMaxPool1D()

    # full connection layers
    self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
    self.dropout = layers.Dropout(rate=dropout_rate)

    # output layer
    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1, activation="sigmoid")
    else:
      self.last_dense = layers.Dense(units=nb_units, activation="softmax")

  def call(self, inputs, training):
    x = self.embedding(inputs)

    x_1 = self.bigram(x)
    x_1 = self.pool_1(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_2(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool_3(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=-1)
    # size = (batch_size, 3*nb_filters)
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output

In [18]:
VOCAB_SIZE = tokenizer.vocab_size
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(np.unique(data_labels))
DROPOUT_RATE = 0.2
BATCH_SIZE = 32
NB_EPOCHS = 2

In [19]:
dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [20]:
if NB_CLASSES == 2:
  dcnn.compile(loss="binary_crossentropy",
               optimizer="adam",
               metrics=["accuracy"])
else:
  dcnn.compile(loss="sparse_categorical_crossentropy",
               optimizer="adam",
               metrics=["sparse_categorical_accuracy"])

In [21]:
checkpoint_path = "drive/My Drive/models/tweets-sentiment/"

ckpt = tf.train.Checkpoint(dcnn=dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=2)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest checkpoint restored!")

In [22]:
dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)

ckpt_manager.save()

Epoch 1/2
Epoch 2/2
 7652/49503 [===>..........................] - ETA: 1:28:58 - loss: 0.3269 - accuracy: 0.8592Buffered data was truncated after reaching the output size limit.

# Evaluation

In [23]:
results = dcnn.evaluate(test_inputs, 
                        test_labels, 
                        batch_size=BATCH_SIZE)

print(results)

[0.3684089481830597, 0.8408750295639038]


In [24]:
dcnn(np.array([tokenizer.encode('I love you')]), training=False).numpy()

array([[0.44495186]], dtype=float32)

In [25]:
dcnn(np.array([tokenizer.encode('I hate you')]), training=False).numpy()

array([[0.44495186]], dtype=float32)

In [26]:
dcnn(np.array([tokenizer.encode('I wish no one ever have to do that again')]), training=False).numpy()

array([[0.0143242]], dtype=float32)

In [27]:
dcnn(np.array([tokenizer.encode('You are so funny!')]), training=False).numpy()

array([[0.94882584]], dtype=float32)