## **Source**

In [0]:
# https://github.com/mk60991/DeepLearning-with-keras

## **Setting up environment**

In [0]:
!unzip datasets.zip

## **Imports**

In [0]:
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.datasets import imdb
from keras.engine import Layer, InputSpec
import tensorflow as tf
import xml.etree.ElementTree as ET

## **Loading Data from Different Datasets**
To train for a specific dataset, only run the cell for that dataset, then skip over to the next section.\
Don't forget to run the first cells in this section for the loading functions.\

In [0]:
def generate_vocab(sentences):
    vectorizer = CountVectorizer(input = u'content',
                                analyzer = "word",
                                tokenizer = None,
                                preprocessor = None,
                                stop_words = None,
                                token_pattern = r',|\b\w+\b',
                                max_features = 15448)
    vectorizer.fit(sentences)
    return vectorizer.vocabulary_

In [0]:
# sst-1 data
def load_vocab(paths):
    sentences = []
    for path in paths:
        with open(path, 'r', encoding='latin-1', errors='ignore') as f:
            r = f.read()
        for line in r.split('\n'):
            line = line.rstrip()
            text = line.split(' ')[1:]
            sentences.append(' '.join(text))
    vocab = generate_vocab(sentences)
    return vocab

def load_data(split_path, vocab):
    data = []
    labels = []
    with open(split_path, 'r', encoding='latin-1', errors='ignore') as f:
        r = f.read()
    for line in r.split('\n'):
        line = line.rstrip()
        text = line.split(' ')[1:]
        label = int(line.split(' ')[0])
        mapped_sentence = []
        for token in text:
            if token in vocab:
                mapped_sentence.append(str(vocab[token]))
            else:
                mapped_sentence.append(15448)
        labels.append(label)
        data.append(mapped_sentence)
    return np.array(data), to_categorical(np.array(labels)), len(set(labels))

vocab = load_vocab(['datasets/sst/Training_SST-1.txt', 'datasets/sst/Dev_SST-1.txt', 'datasets/sst/Test_SST-1.txt'])
X_train, y_train, num_classes = load_data('datasets/sst/Training_SST-1.txt', vocab)
X_valid, y_valid, _ = load_data('datasets/sst/Dev_SST-1.txt', vocab)
X_test, y_test, _ = load_data('datasets/sst/Test_SST-1.txt', vocab)

In [0]:
# sst-2 data
def load_vocab(paths):
    sentences = []
    for path in paths:
        with open(path, 'r', encoding='latin-1', errors='ignore') as f:
            r = f.read()
        for line in r.split('\n'):
            line = line.rstrip()
            text = line.split(' ')[1:]
            sentences.append(' '.join(text))
    vocab = generate_vocab(sentences)
    return vocab

def load_data(split_path, vocab):
    data = []
    labels = []
    with open(split_path, 'r', encoding='latin-1', errors='ignore') as f:
        r = f.read()
    for line in r.split('\n'):
        line = line.rstrip()
        text = line.split(' ')[1:]
        label = int(line.split(' ')[0])
        mapped_sentence = []
        for token in text:
            if token in vocab:
                mapped_sentence.append(str(vocab[token]))
            else:
                mapped_sentence.append(15448)
        labels.append(label)
        data.append(mapped_sentence)
    return np.array(data), to_categorical(np.array(labels)), len(set(labels))

vocab = load_vocab(['datasets/sst/Training_SST-2.txt', 'datasets/sst/Dev_SST-2.txt', 'datasets/sst/Test_SST-2.txt'])
X_train, y_train, num_classes = load_data('datasets/sst/Training_SST-2.txt', vocab)
X_valid, y_valid, _ = load_data('datasets/sst/Dev_SST-2.txt', vocab)
X_test, y_test, _ = load_data('datasets/sst/Test_SST-2.txt', vocab)

In [0]:
# sentihood data
def load_vocab(paths):
    sentences = []
    for path in paths:
        with open(path, 'r') as f:
            j = json.load(f)
        for i in range(len(j)):
            if len(j[i]['opinions']) > 0:
                text = j[i]['text'].strip()
                for idx in range(len(j[i]['opinions'])):
                    aspect = j[i]['opinions'][idx]['aspect']
                    entity = j[i]['opinions'][idx]['target_entity']
                    text = text.replace(entity, aspect)
                sentences.append(' '.join(text))
    vocab = generate_vocab(sentences)
    return vocab

def load_data(split_path, vocab):
    data = []
    labels = []
    with open(split_path, 'r') as f:
        j = json.load(f)
        for i in range(len(j)):
            if len(j[i]['opinions']) > 0:
                if j[i]['opinions'][0]['sentiment'] == 'Positive':
                    label = 1
                else:
                    label = 0
                text = j[i]['text'].strip()
                for idx in range(len(j[i]['opinions'])):
                    aspect = j[i]['opinions'][idx]['aspect']
                    entity = j[i]['opinions'][idx]['target_entity']
                    text = text.replace(entity, aspect)
                mapped_sentence = []
                for token in text:
                    if token in vocab:
                        mapped_sentence.append(str(vocab[token]))
                    else:
                        mapped_sentence.append(15448)
                labels.append(label)
                data.append(mapped_sentence)
    return np.array(data), to_categorical(np.array(labels)), len(set(labels))

vocab = load_vocab(['datasets/sentihood/sentihood-train.json', 'datasets/sentihood/sentihood-dev.json', 'datasets/sentihood/sentihood-test.json'])
X_train, y_train, num_classes = load_data('datasets/sentihood/sentihood-train.json', vocab)
X_valid, y_valid, _ = load_data('datasets/sentihood/sentihood-dev.json', vocab)
X_test, y_test, _ = load_data('datasets/sentihood/sentihood-test.json', vocab)

In [0]:
# semeval data
def load_vocab(paths):
    sentences = []
    for path in paths:
        tree = ET.parse(path)
        root = tree.getroot()
        for sentence in root.iter('sentence'):
            text = sentence.find('text').text.strip()
            sentences.append(text)
    vocab = generate_vocab(sentences)
    return vocab

def load_data(paths, vocab):
    data = []
    labels = []
    for split_path in paths:
        tree = ET.parse(split_path)
        root = tree.getroot()
        for sentence in root.iter('sentence'):
            text = sentence.find('text').text.strip()
            P = []
            aspects = sentence.find('aspectTerms')
            if aspects is not None:
                for aspect in aspects:
                    P.append(aspect.get('polarity'))
            if 'negative' in P and not 'positive' in P:
                polarity = 0
            elif 'positive' in P and not 'negative' in P:
                polarity = 1
            else:
                polarity = 2
            mapped_sentence = []
            for token in text:
                if token in vocab:
                    mapped_sentence.append(str(vocab[token]))
                else:
                    mapped_sentence.append(15448)
            labels.append(polarity)
            data.append(mapped_sentence)
    return np.array(data), to_categorical(np.array(labels)), len(set(labels))

vocab = load_vocab(['datasets/semeval/Laptops_train_v2.xml.txt', 'datasets/semeval/Restaurants_Train_v2.xml.txt'])
data, labels, num_classes = load_data(['datasets/semeval/Laptops_train_v2.xml.txt', 'datasets/semeval/Restaurants_Train_v2.xml.txt'], vocab)
X_train, X_valid, X_test = np.split(data, [int(.8*len(data)), int(.9*len(data))])
y_train, y_valid, y_test = np.split(labels, [int(.8*len(labels)), int(.9*len(labels))])

## **Training options and data**

In [0]:
num_words = 20000
max_len = 300
embedding_dim = 50

In [0]:
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_valid = sequence.pad_sequences(X_valid, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)
print(X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape)

## **Basic Dynamic CNN**

In [0]:
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        return (input_shape[0], (input_shape[1] * self.k))

    def call(self, inputs):
        
        # swap last two dimensions since top_k will be applied along the last dimension
        #shifted_input = tf.transpose(inputs, [0, 2, 1])
        
        # extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(inputs, k=self.k, sorted=True, name=None)[0]
        
        # return flattened output
        return top_k
  

def basic_dynamic_cnn(k = 5):
    model = Sequential()
    # Embedding each word
    model.add(Embedding(num_words, embedding_dim, input_length = max_len))
    # Wide convolution
    model.add(ZeroPadding1D(29))
    model.add(Conv1D(embedding_dim, 30, activation = 'relu'))
    # k-max pooling
    model.add(Permute((2, 1)))
    model.add(KMaxPooling(k))
    model.add(Reshape((k, -1)))
    model.add(Flatten())
    model.add(Dense(num_classes, activation = 'softmax'))
    
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [0]:
basic_dynamic_cnn = basic_dynamic_cnn()
callbacks = [ModelCheckpoint(filepath = 'best_model.hdf5', monitor='val_acc', verbose=1, save_best_only = True, mode='max')]
history = basic_dynamic_cnn.fit(X_train, y_train, callbacks = callbacks, epochs = 10, validation_data = (X_valid, y_valid), batch_size = 200)
results = basic_dynamic_cnn.evaluate(X_test, y_test)
print('Test accuracy: ', results[1])
print('=============================')
print('Training: ', round(history.history['acc'][-1]*100, 2), '%')
print('Validation: ', round(history.history['val_acc'][-1]*100, 2), '%')
print('Testing: ', round(results[1]*100, 2), '%')

## **Two Conv Dynamic CNN**

In [0]:
# two kinds of k's and kernel sizes for each operation
def two_conv_dynamic_cnn(k1 = 20, k2 = 10, ksize1 = 20, ksize2 = 30):
    inputs = Input(shape = (X_train.shape[-1],))
    embed = Embedding(num_words, embedding_dim, input_length = max_len)(inputs)
    padded = ZeroPadding1D(ksize1 - 1)(embed)
    conv1 = Conv1D(embedding_dim, ksize1, activation = 'relu')(padded)
    permuted = Permute((2,1))(conv1)
    kmaxpool1 = KMaxPooling(k1)(permuted)
    kmaxpool1 = Reshape((k1, -1))(kmaxpool1)
    padded = ZeroPadding1D(ksize2 -1)(kmaxpool1)
    conv2 = Conv1D(embedding_dim, ksize2, activation = 'relu')(padded)
    permuted = Permute((2,1))(conv2)
    kmaxpool2 = KMaxPooling(k2)(permuted)
    kmaxpool2 = Reshape((k2, -1))(kmaxpool2)
    flattened = Flatten()(kmaxpool2)
    outputs = Dense(num_classes, activation = 'softmax')(flattened)
    
    model = Model(inputs = inputs, outputs = outputs)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [0]:
two_conv_dynamic_cnn = two_conv_dynamic_cnn()
callbacks = [ModelCheckpoint(filepath = 'best_model.hdf5', monitor='val_acc', verbose=1, save_best_only = True, mode='max')]
history = two_conv_dynamic_cnn.fit(X_train, y_train, callbacks = callbacks, epochs = 10, validation_data = (X_valid, y_valid), batch_size = 200)
results = two_conv_dynamic_cnn.evaluate(X_test, y_test)
print('Test accuracy: ', results[1])
print('=============================')
print('Training: ', round(history.history['acc'][-1]*100, 2), '%')
print('Validation: ', round(history.history['val_acc'][-1]*100, 2), '%')
print('Testing: ', round(results[1]*100, 2), '%')

## **Two Feature Map Dynamic CNN**

In [0]:
def two_feature_map_dynamic_cnn(k1 = 20, k2 = 10, ksize1 = 20, ksize2 = 30):
    inputs = Input(shape = (X_train.shape[-1],))
    embed = Embedding(num_words, embedding_dim, input_length = max_len)(inputs)
    conv_results = []
    # two feature maps using for loop
    for i in range(2):
        padded = ZeroPadding1D(ksize1 - 1)(embed)
        conv1 = Conv1D(embedding_dim, ksize1, activation = 'relu')(padded)
        permuted = Permute((2,1))(conv1)
        kmaxpool1 = KMaxPooling(k1)(permuted)
        kmaxpool1 = Reshape((k1, -1))(kmaxpool1)
        padded = ZeroPadding1D(ksize2 -1)(kmaxpool1)
        conv2 = Conv1D(embedding_dim, ksize2, activation = 'relu')(padded)
        permuted = Permute((2,1))(conv2)
        kmaxpool2 = KMaxPooling(k2)(permuted)
        kmaxpool2 = Reshape((k2, -1))(kmaxpool2)
        flattened = Flatten()(kmaxpool2)
        conv_results.append(flattened)
    conv_result = concatenate(conv_results)
    outputs = Dense(num_classes, activation = 'softmax')(conv_result)
    
    model = Model(inputs = inputs, outputs = outputs)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [0]:
two_feature_map_dynamic_cnn = two_feature_map_dynamic_cnn()
callbacks = [ModelCheckpoint(filepath = 'best_model.hdf5', monitor='val_acc', verbose=1, save_best_only = True, mode='max')]
history = two_feature_map_dynamic_cnn.fit(X_train, y_train, callbacks = callbacks, epochs = 10, validation_data = (X_valid, y_valid), batch_size = 200)
results = two_feature_map_dynamic_cnn.evaluate(X_test, y_test)
print('Test accuracy: ', results[1])
print('=============================')
print('Training:', round(history.history['acc'][-1]*100, 2), '%')
print('Validation:', round(history.history['val_acc'][-1]*100, 2), '%')
print('Testing:', round(results[1]*100, 2), '%')

# **Results: Accuracy**


## **SST-1**: 5 classes

### Basic Dynamic CNN:
#### Training: 71.10%
#### Validation: 57.64%
#### Testing: 58.29%

## Two-Conv Dynamic CNN:
#### Training: 63.66%
#### Validation: 59.93%
#### Testing: 57.64%

## Two-Feature-Map Dynamic CNN:
#### Training: 65.18%
#### Validation: 53.69%
#### Testing: 56.48%


## **SST-2**: 2 classes

### Basic Dynamic CNN:
#### Training: 99.86%
#### Validation: 76.40%
#### Testing: 78.32%

## Two-Conv Dynamic CNN:
#### Training: 99.71%
#### Validation: 76.06%
#### Testing: 77.06%

## Two-Feature-Map Dynamic CNN:
#### Training: 99.81%
#### Validation: 78.01%
#### Testing: 77.66%

## **SentiHood**: 2 classes

### Basic Dynamic CNN:
#### Training: 76.94 %
#### Validation: 75.84 %
#### Testing: 73.78 %

## Two-Conv Dynamic CNN:
#### Training: 75.9 %
#### Validation: 75.25 %
#### Testing: 72.98 %

## Two-Feature-Map Dynamic CNN:
#### Training: 74.37 %
#### Validation: 74.65 %
#### Testing: 73.08 %

## **SemEval**: 3 classes

### Basic Dynamic CNN:
#### Training: 73.77 %
#### Validation: 60.43 %
#### Testing: 61.08 %

## Two-Conv Dynamic CNN:
#### Training: 64.11 %
#### Validation: 58.62 %
#### Testing: 57.96 %

## Two-Feature-Map Dynamic CNN:
#### Training: 62.82 %
#### Validation: 55.67 %
#### Testing: 56.81 %