## 1. Importing dependencies

In [None]:
import numpy as np 
import pandas as pd
import re
import math
from bs4 import BeautifulSoup

from google.colab import drive

In [None]:
try:
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

## 2. Loading Data


In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
cols = ["sentiment" , "id", "date", "query", "user", "text"]

train_data = pd.read_csv("/content/drive/MyDrive/training_twitter.csv", header = None, 
                         names = cols, engine = "python",
                         encoding = "latin1")


test_data = pd.read_csv("/content/drive/MyDrive/testdata_twitter.csv", header = None, 
                         names = cols, engine = "python",
                         encoding = "latin1")

In [None]:
train_data["sentiment"].value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [None]:
test_data["sentiment"].value_counts()

4    182
0    177
2    139
Name: sentiment, dtype: int64

In [None]:
train_data.head(3)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [None]:
print("The length of train_dataset is: " + str(len(train_data)))
print("The length of test_dataset is: " + str(len(test_data)))

The length of train_dataset is: 1600000
The length of test_dataset is: 498


## 3.Preprocessing the data



###### Dropping the useless columns


In [None]:
train_data.columns

Index(['sentiment', 'id', 'date', 'query', 'user', 'text'], dtype='object')

In [None]:
nr_cols = ["id" , "date" , "query" , "user"]

train_data1 = train_data.drop(nr_cols, axis = 1)
test_data1 = test_data.drop(nr_cols, axis = 1)

###### Using re and beautiful soup to clean my tweet_texts

In [None]:
def clean_tweet(tweet):
  '''
  Input: An uncleaned tweet of "lxml" format containg Twitter features like @user_name, https:// etc

  Output: A cleaned output
  '''

  #converting the format to text
  tweet = BeautifulSoup(tweet, "lxml").get_text()

  #using regular expressions
  tweet = re.sub(r"@[A-Za-z0-9]+" , " ", tweet)    #remove usernames
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", " ", tweet)  #remove hyperlinks
  tweet = re.sub(r"[^A-Za-z0-0.?!]", " ", tweet)     #any other non-significant character
  tweet = re.sub(r" +", " ", tweet)    #multiple continuous instances of white_space

  return tweet

In [None]:
data = train_data1
data_t = test_data1

In [None]:
data_clean = [clean_tweet(x) for x in data.text]

In [None]:
data_labels = data.sentiment.values #converting panda series to numpy array
data_labels[data_labels == 4] = 1

###### Tokenization

In [None]:
for i in range(10):
  print("raw data at index" + str(i) + ":  " + data.text[i])
  print("cleaned data at index" + str(i) + ":  " + data_clean[i])
  print("\n")

raw data at index0:  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
cleaned data at index0:   Awww that s a bummer. You shoulda got David Carr of Third Day to do it. D


raw data at index1:  is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
cleaned data at index1:  is upset that he can t update his Facebook by texting it... and might cry as a result School today also. Blah!


raw data at index2:  @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
cleaned data at index2:   I dived many times for the ball. Managed to save 0 The rest go out of bounds


raw data at index3:  my whole body feels itchy and like its on fire 
cleaned data at index3:  my whole body feels itchy and like its on fire 


raw data at index4:  @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there

In [None]:
vocab_size = 2**16

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size = vocab_size)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

###### Padding

In [None]:
print(data_inputs[0])
print("\n")
print(data_inputs[1])

[65168, 1550, 20, 13, 6, 3546, 1, 116, 5218, 50, 1406, 34706, 17, 13224, 593, 3, 49, 79, 1, 65204]


[12, 1077, 20, 96, 34, 16, 743, 194, 1807, 124, 2944, 79, 27, 9, 325, 800, 77, 6, 3614, 1736, 76, 3006, 1, 6358, 65169]


In [None]:
for i in range(10):
  print("Length of the input " + str(i) + " : " + str(len(data_inputs[i])))

Length of the input 0 : 20
Length of the input 1 : 25
Length of the input 2 : 21
Length of the input 3 : 10
Length of the input 4 : 28
Length of the input 5 : 5
Length of the input 6 : 3
Length of the input 7 : 25
Length of the input 8 : 8
Length of the input 9 : 6


In [None]:
#Thus every input has a different size and we need to make them of the same size. Thus, we need to add some kind of padding to make them of uniform shape

In [None]:
#finding the len of maximum input
MAX_LEN = max([len(x) for x in data_inputs])

In [None]:
#padding the data points of data_clean with 0, at the end of the sequence with max_len = MAX_LEN
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)



In [None]:
#post padding
for i in range(100):
  print("Length of the input " + str(i) + " : " + str(len(data_inputs[i])))

Length of the input 0 : 74
Length of the input 1 : 74
Length of the input 2 : 74
Length of the input 3 : 74
Length of the input 4 : 74
Length of the input 5 : 74
Length of the input 6 : 74
Length of the input 7 : 74
Length of the input 8 : 74
Length of the input 9 : 74
Length of the input 10 : 74
Length of the input 11 : 74
Length of the input 12 : 74
Length of the input 13 : 74
Length of the input 14 : 74
Length of the input 15 : 74
Length of the input 16 : 74
Length of the input 17 : 74
Length of the input 18 : 74
Length of the input 19 : 74
Length of the input 20 : 74
Length of the input 21 : 74
Length of the input 22 : 74
Length of the input 23 : 74
Length of the input 24 : 74
Length of the input 25 : 74
Length of the input 26 : 74
Length of the input 27 : 74
Length of the input 28 : 74
Length of the input 29 : 74
Length of the input 30 : 74
Length of the input 31 : 74
Length of the input 32 : 74
Length of the input 33 : 74
Length of the input 34 : 74
Length of the input 35 : 74
Le

##### Splitting into Training/Test dataset


In [None]:
#The data is in sorted manner. First 800000 datapoints with negative labels, and next 800000 with positive labels

test_idx = np.random.randint(0,800000, 8000)   #generate random int as index location between [0, 800000)
test_idx = np.concatenate((test_idx, test_idx +800000), axis = 0)    #add 800000 to each input in test_idx to get index location of positive labels

In [None]:
type(data_inputs)

numpy.ndarray

In [None]:
#Getting test data

test_inputs = data_inputs[test_idx]     #index location given by test_idx to data_labels array
test_labels = data_labels[test_idx]     #index location given by test_idx to data_labels array

In [None]:
#Getting train_data

train_inputs = np.delete(data_inputs, test_idx, axis=0)   ##np.delete deletes the data at the index location given here
train_labels = np.delete(data_labels, test_idx)           #np.delete deletes the data at the index location given here

## 4. Model Building

In [None]:
#Building a custom model


class DCNN(tf.keras.Model):


#voacb_size : to be given by user
#emb_dim :  dimension of the embeddings
#nb_filters : no. of instances of filters of each type
#FFN_units = no. of fully connected neurons in the FFN 
#nb_classes = no. of classes
#dropout_rate: For FFN for regularization
#training : (True/False) to indicate the status of model i.e. training or inference
#name  = name given 




  def __init__(self,
                vocab_size,
                emb_dim=128,
                nb_filters=50,
                FFN_units=512,
                nb_classes=2,
                dropout_rate=0.1,
                training=False,
                name="dcnn"):
    
    super(DCNN, self).__init__(name = name)
    #defining the various parts of the model

    self.embeddings = layers.Embedding(vocab_size, 
                                       emb_dim)
    self.bigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 2,
                                padding = "valid",
                                activation = "relu")
    self.trigram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 3,
                                padding = "valid",
                                activation = "relu")
    self.fourgram = layers.Conv1D(filters = nb_filters,
                                kernel_size = 4,
                                padding = "valid",
                                activation = "relu")
    
    self.pool = layers.GlobalMaxPooling1D()

    self.dense_1 = layers.Dense(units = FFN_units, activation = "relu")
    self.dropout = layers.Dropout(rate = dropout_rate)

    #no. of classes basis the classification task
    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1,
                                    activation="sigmoid")
    else:
      self.last_dense = layers.Dense(units=nb_classes,
                                    activation="softmax")


    #definition fo my custom call

  def call(self, inputs, training):    #trainng is a boolean here
    x = self.embeddings(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    #merging of the 3 outputs
    merged = tf.concat([x_1,x_2,x_3], axis = -1)      #Data format: (batch_size, 3 * nb_filters) thus, -1 ensures along the last axis
    merged = self.dense_1(merged)
    merged = self.dropout(merged, training)
    output = self.last_dense(merged)

    return output

###### Config

In [None]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2 #len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

#### Training Model


In [None]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [None]:
checkpoint_path = "./content/drive/MyDrive/NLP_CNN/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [None]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs = NB_EPOCHS)
ckpt_manager.save()

In [None]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)

[0.4493756592273712, 0.7833124995231628]


In [None]:
Dcnn(np.array([tokenizer.encode("bad teacher")]), training=False).numpy()

InvalidArgumentError: ignored

In [None]:
tokenizer.encode("bad")

[620]

In [None]:
tokenizer.encode("bad teacher")

[132, 8475]