<h1>Convolutional Neural Network (CNN) Implementation</h1>

In this notebook we create a CNN to make predictions on our dataset.

In [None]:
# Needed general imports
import os, re
import numpy as np

# Helper code
from Keras.helpers import load_data_and_labels, clean_files, submission, embedding_matrix

# Libraries from Keras to implement CNN
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Merge, Convolution1D, Dropout, AveragePooling1D
from keras.layers.core import Flatten
from keras import optimizers

## Creating all the variables

In [None]:
# Path to the pre-trained GloVe Twitter embeddings file
# (the file name indicates 200-dimensional vectors).
TWITTER_GLOVE_PATH='scripts/glove.twitter.27B.200d.txt'
# Vocabulary size: passed to the Tokenizer as num_words and used to size the embedding matrix.
nb_word = 20000
# Length of each embedding vector; used as the embedding layer's output dimension.
embedding_dim = 200

## Clean and Load Data

In [None]:
# Ordered (pattern, replacement) rules applied by clean_str.
# The order matters: later rules rely on the output of earlier ones
# (e.g. "can't" must be expanded before the generic "n't" rule runs).
# Bare-word patterns start with \b so they only fire on whole words; without it
# "im " also matches inside "him " and "is no " inside "this no ".
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Delete the user and url placeholders
        (r'<user>', ' '),
        (r'<url>', ' '),
        # Replace emoticons with their meaning
        (r':\)', ' happy '),
        (r':\(', ' sad '),
        (r':/', ' sarcasm '),
        (r'<\d', ' love '),
        (r'&', ' and '),
        # Expand contractions
        (r"\bwhat's ", "what is "),
        (r" 's ", " is "),
        (r" 've ", " have "),
        (r"\bcan't ", "cannot "),
        # Also turns "don't " into "do not ", so no separate "don't " rule is needed.
        (r"n't ", " not "),
        (r"\bi'm ", " i am "),
        (r"\bi've ", " i have "),
        (r"\byoure ", " you are "),
        (r"\bit's ", " it is "),
        (r"'re ", " are "),
        (r"'d ", " would "),
        (r"'ll ", " will "),
        (r"\bim ", " i am "),
        # Normalise negations to single tokens
        (r"\bdo no ", " dont "),
        (r"\bdoes no ", " dont "),
        (r"\bare no ", " arent "),
        (r"\bis no ", " isnt "),
        (r"\bam no ", " arent "),
        (r"\bits no ", " it isnt "),
        (r"\bdid no ", " didnt "),
        (r"\bi no ", " i arent "),
        (r"\bwill no ", " wont "),
        (r"\bhave no ", " havent "),
        (r"\bdon t ", " dont "),
        (r"\bdoesn t ", " dont "),
        (r"\baren t ", " arent "),
        (r"\bisn t ", " isnt "),
        (r"\bdidn t ", " didnt "),
        (r"\bwon t ", " wont "),
        (r"\bhaven t ", " havent "),
        # Punctuation. NOTE: "+-=" inside the class is a character range that also
        # lets , - . / digits : ; < = through; the ":" rule below depends on that.
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        # Every digit is removed here, so rules that need a digit ("5k" -> "5000",
        # " 9 11 ") could never match afterwards and are not included.
        (r"\d", " "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"-", " - "),
        (r"=", " = "),
        (r"'", " "),
        (r":", " : "),
        # Re-join a few abbreviations split by the rules above
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        (r"\be - mail\b", "email"),
        (r"\bj k\b", "jk"),
        # Collapse runs of whitespace
        (r"\s{2,}", " "),
    )
)


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The text is lower-cased first, so that the (lower-case) rules also apply to
    input such as "I'm", and then rewritten by every rule of _CLEAN_STR_RULES in
    order: placeholders are removed, emoticons and contractions are expanded,
    digits and most punctuation are stripped, and whitespace is collapsed.

    Parameters
    ----------
    string : str
        One raw tweet.

    Returns
    -------
    str
        The cleaned, lower-case tweet without leading/trailing whitespace.
    """
    string = string.lower()
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    return string.strip()

In [None]:
def clean_files():
    """Clean the raw tweet files and cache the result under processed/.

    Each line of the three raw files (positive training tweets, negative
    training tweets, test tweets) is stripped, passed through clean_str and
    written to the matching output file, one cleaned tweet per line.
    Returns nothing; the cleaned files are the only result.
    """
    file_pairs = (
        ('twitter-datasets/train_pos_full.txt', 'processed/train_pos_CNN_full.txt'),
        ('twitter-datasets/train_neg_full.txt', 'processed/train_neg_CNN_full.txt'),
        ('twitter-datasets/test_data.txt', 'processed/test_CNN.txt'),
    )

    # Make sure the output directory exists so the writes work on a fresh checkout.
    os.makedirs('processed', exist_ok=True)

    for raw_path, cleaned_path in file_pairs:
        # The raw file is opened first: a missing input fails before any output is created.
        with open(raw_path, "r", encoding="utf-8") as raw_file, \
                open(cleaned_path, 'w', encoding="utf-8") as cleaned_file:
            for line in raw_file:
                cleaned_file.write(clean_str(line.strip()) + '\n')
            
# Run the cleaning step whenever any of the cached cleaned files is missing.
# All three paths live under processed/, the directory the cleaned files are
# written to and later loaded from.
cleaned_paths = ('processed/train_pos_CNN_full.txt',
                 'processed/train_neg_CNN_full.txt',
                 'processed/test_CNN.txt')
if not all(os.path.exists(path) for path in cleaned_paths):
    print('Cleaned CNN files do not exist')
    clean_files()

In [None]:
def load_data_and_labels(positive_data_file, negative_data_file, test_data_file):
    """
    Loads the tweets from files and generates one-hot labels for the training set.

    Parameters
    ----------
    positive_data_file, negative_data_file, test_data_file : str
        Paths to UTF-8 text files holding one tweet per line.

    Returns
    -------
    list
        [train, labels, test] where
        - train is the list of positive tweets followed by the negative tweets,
        - labels is an integer array of shape (len(train), 2) holding [0, 1]
          for every positive tweet and [1, 0] for every negative tweet,
        - test is the list of test tweets.
        Every tweet is stripped of leading/trailing whitespace.
    """
    def _read_lines(path):
        # The context manager closes the file even if reading fails.
        with open(path, "r", encoding="utf-8") as handle:
            return [line.strip() for line in handle]

    positive_examples = _read_lines(positive_data_file)
    negative_examples = _read_lines(negative_data_file)
    test = _read_lines(test_data_file)

    train = positive_examples + negative_examples

    # Build the labels in the same order as train. The reshape keeps the array
    # two-dimensional even when one (or both) of the files is empty.
    label_rows = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
    labels = np.array(label_rows, dtype=int).reshape(-1, 2)
    return [train, labels, test]

# Read the cleaned training and test tweets back from the cached files
cleaned_inputs = ('processed/train_pos_CNN_full.txt',
                  'processed/train_neg_CNN_full.txt',
                  'processed/test_CNN.txt')
train, labels, test = load_data_and_labels(*cleaned_inputs)

## Tokenize Data
In this part, we tokenize the data: each word is replaced by an integer index.

In [None]:
# Fit the vocabulary on the training tweets; num_words caps the vocabulary at nb_word
tokenizer = Tokenizer(num_words=nb_word)
tokenizer.fit_on_texts(train)
# Encode the training and the test tweets as sequences of integer word indices
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train, test)
)
# word -> index mapping learned by the tokenizer (used to build the embedding matrix)
word_index = tokenizer.word_index

## Create the Embedding Matrix

In [None]:
def embedding_matrix(path_glove_twitter,word_index,nb_words,embedding_dim):
    """Build the embedding matrix for our vocabulary from a GloVe text file.

    Parameters
    ----------
    path_glove_twitter : str
        Path to a UTF-8 GloVe file: one word per line followed by its vector
        components, separated by whitespace.
    word_index : dict
        Mapping word -> integer index.
    nb_words : int
        Highest index to fill; words with a larger index are ignored.
    embedding_dim : int
        Number of components of each vector.

    Returns
    -------
    numpy.ndarray
        Array of shape (nb_words + 1, embedding_dim). Row i holds the vector of
        the word with index i; rows of words missing from the file stay zero.
    """
    matrix = np.zeros((nb_words + 1, embedding_dim))
    # Stream the file and convert only the vectors we actually need. This avoids
    # holding every vector of the file in memory, and a malformed line for a
    # word outside our vocabulary no longer aborts the load.
    with open(path_glove_twitter, "r", encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # blank line: nothing to read
                continue
            row = word_index.get(values[0])
            if row is None or row > nb_words:
                continue
            # If a word appears several times in the file, the last vector wins.
            matrix[row] = np.asarray(values[1:], dtype='float32')
    return matrix

# Build the matrix that will serve as the weights of the embedding layers
print('create embedding_matrix')
embedding_matrix_200 = embedding_matrix(
    path_glove_twitter=TWITTER_GLOVE_PATH,
    word_index=word_index,
    nb_words=nb_word,
    embedding_dim=embedding_dim,
)

Now we use `pad_sequences` to pad every tweet to the same length.

In [None]:
# Pad every encoded tweet to one common length: the number of tokens of the
# longest training tweet. The length is measured on the token sequences, not on
# the raw text, whose len() counts characters and would pad far too much.
sequence_length = max(len(sequence) for sequence in sequences_train)
# Alias with the previous (misspelled) name, kept for backward compatibility.
sequence_lenght = sequence_length
xtrain = pad_sequences(sequences_train, maxlen=sequence_length)
xtest = pad_sequences(sequences_test, maxlen=sequence_length)

Let us randomize and split the data for training and testing.

In [None]:
# Shuffle the padded tweets and their labels with one shared permutation so
# that row i of `train` still belongs to row i of `ytrain`.
# The fixed seed makes the shuffle (and therefore the split) reproducible
# across kernel restarts.
SHUFFLE_SEED = 42
num_row = len(labels)
indices = np.random.RandomState(SHUFFLE_SEED).permutation(num_row)
# NOTE: from here on `train` is the shuffled index array, no longer the list of texts.
train = xtrain[indices]
ytrain = labels[indices]

In [None]:
nb_tweets_train = 200000
# tweets held out of training to measure an external accuracy
nb_tweets_test = 10000
# number of validation tweets (20% of the training size)
validation_split = int(0.20 * nb_tweets_train)

# Boundaries of three consecutive, non-overlapping slices: train | validation | test
validation_end = nb_tweets_train + validation_split
test_end = validation_end + nb_tweets_test

# Features and labels are both taken from the shuffled arrays `train` / `ytrain`
# (built with the same permutation), so every tweet keeps its own label.
x_train = train[:nb_tweets_train]
y_train = ytrain[:nb_tweets_train]
x_validation = train[nb_tweets_train:validation_end]
y_validation = ytrain[nb_tweets_train:validation_end]
x_test = train[validation_end:test_end]
y_test = ytrain[validation_end:test_end]

## Design the CNN
In this part, we create the actual CNN as discussed in the report.

In [None]:
# Hyper-parameters
filters = [2,3,4]      # widths (in tokens) of the convolution windows
num_filters = 120      # number of feature maps per window width
drop = 0.6             # dropout rate after each hidden dense layer
nb_epoch = 3
batch_size = 60

# Length of the model input, read from the padded training data
# (pad_sequences returns an array of shape (number of tweets, padded length)).
sequence_length = x_train.shape[1]

convolutions = []
inputs = Input(shape=(sequence_length,), dtype='int32')

# Two embedding channels initialised with the same GloVe weights:
# one frozen, one fine-tuned during training.
# input_dim is nb_word + 1, the number of rows of embedding_matrix_200.
embedding = Embedding(input_dim=nb_word + 1,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix_200],
                            input_length=sequence_length,
                            trainable=False)(inputs)
embedding2 = Embedding(input_dim=nb_word + 1,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix_200],
                            input_length=sequence_length,
                            trainable=True)(inputs)

# One convolution -> average pooling -> flatten branch per (channel, window width).
# The frozen channel is processed first, then the trainable one.
for embedded in (embedding, embedding2):
    for filter_width in filters:
        conv = Convolution1D(num_filters, filter_width, activation='relu')(embedded)
        pooled = AveragePooling1D(3)(conv)
        convolutions.append(Flatten()(pooled))

# Concatenate all branches and classify with two hidden dense layers
merged_tensor = Merge(mode='concat', concat_axis=1)(convolutions)
dense0=Dense(120,init='uniform', activation='relu')(merged_tensor)
dropout0 = Dropout(drop)(dense0)
dense1=Dense(60,init='uniform', activation='relu')(dropout0)
dropout1 = Dropout(drop)(dense1)
# Two output units, matching the two-column one-hot labels
out = Dense(output_dim=2, init='uniform',activation='softmax')(dropout1)

model = Model(input=inputs, output=out)

Adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0005)

model.compile(loss='binary_crossentropy',
              optimizer=Adam,
              metrics=['accuracy'])
model.fit(x_train, y_train,
          epochs=nb_epoch,
          batch_size=batch_size,
          validation_data=(x_validation, y_validation))