## **Source**

In [0]:
# https://github.com/jojonki/cnn-for-sentence-classification

## **Setting up environment**

In [0]:
!mkdir datasets
# !wget -O datasets/rt-polaritydata.tar.gz https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
# !tar -C datasets -xvf datasets/rt-polaritydata.tar.gz
!unzip datasets.zip
!wget -O datasets/glove.6B.zip http://nlp.stanford.edu/data/glove.6B.zip
!unzip datasets/glove.6B.zip -d datasets

Archive:  datasets.zip
   creating: datasets/rt-polaritydata/
  inflating: datasets/rt-polaritydata/rt-polarity.pos  
  inflating: datasets/rt-polaritydata/rt-polarity.neg  
   creating: datasets/semeval/
  inflating: datasets/semeval/Restaurants_Train_v2.xml.txt  
  inflating: datasets/semeval/Laptops_test_PhaseB.xml.txt  
  inflating: datasets/semeval/Laptops_test_PhaseA.xml.txt  
  inflating: datasets/semeval/Restuarants_test_phaseA.xml.txt  
  inflating: datasets/semeval/Restuarants_test_phaseB.xml.txt  
  inflating: datasets/semeval/Laptops_train_v2.xml.txt  
  inflating: datasets/semeval/Restaurants_Train.xml.txt  
   creating: datasets/sentihood/
  inflating: datasets/sentihood/sentihood-test.json  
  inflating: datasets/sentihood/sentihood-dev.json  
  inflating: datasets/sentihood/sentihood-train.json  
   creating: datasets/sst/
  inflating: datasets/sst/Test_SST-1.txt  
  inflating: datasets/sst/Dev_SST-2.txt  
  inflating: datasets/sst/Training_SST-1.txt  
  inflating: data

## **Imports**

In [0]:
import tensorflow as tf
tf.__version__

'1.15.0'

In [0]:
import numpy as np
import codecs
import os
import random

from keras import backend as K
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Dense, Lambda, Permute, Dropout
from keras.layers import Conv2D, MaxPooling1D
from keras.optimizers import Adam
from keras.utils import to_categorical

Using TensorFlow backend.


## **Loading Data from Different Datasets**
To train for a specific dataset, only run the cell for that dataset, then skip over to the next section.\
\Don't forget to run the first cells in this section for the loading functions.\![alt text](https://)
Data format is as follows: [ ( [word1, word2, word3], label ), ... ]

In [0]:
def vectorize(data, sentence_maxlen, w2i):
    vec_data = []
    labels = []
    for d, label in data:
        vec = [w2i[w] for w in d if w in w2i]
        pad_len = max(0, sentence_maxlen - len(vec))
        vec += [0] * pad_len
        vec_data.append(vec)
        labels.append(label)
    vec_data = np.array(vec_data)
    labels = to_categorical(np.array(labels))
    return vec_data, labels

def setup_vocab(data):
    sentence_maxlen = max(map(len, (d for d, _ in data)))
    vocab = []
    for d, _ in data:
        for w in d:
            if w not in vocab: vocab.append(w)
    vocab = sorted(vocab)
    vocab_size = len(vocab)
    print('Sentence Max Length', sentence_maxlen)
    print('Vocab Examples:', vocab[:10])
    print('Vocab Size', len(vocab))
    w2i = {w:i for i,w in enumerate(vocab)}
    return sentence_maxlen, w2i, vocab_size

In [0]:
# # repo data
# def load_data(fpath, label):
#     data = []
#     with codecs.open(fpath, 'r', 'utf-8', errors='ignore') as f:
#         lines = f.readlines()
#         for l in lines:
#             l = l.rstrip()
#             data.append((l.split(' '), label))
#     return data

# pos = load_data('datasets/rt-polaritydata/rt-polarity.pos', 1)
# neg = load_data('datasets/rt-polaritydata/rt-polarity.neg', 0)
# data = pos + neg
# classes = sorted(set([i[1] for i in data]))
# num_classes = len(classes)
# print(f'Loaded {len(data)} data rows with {num_classes} classes.')
# print(f'Classes: {classes}')
# print('Example:')
# print(data[-1])

In [0]:
# sst-1 data
def load_data(paths):
    data = []
    for path in paths:
        with open(path, 'r', encoding='latin-1', errors='ignore') as f:
            r = f.read()
        for line in r.split('\n'):
            line = line.rstrip()
            text = line.split(' ')[1:]
            L = line.split(' ')[0]
            label = int(L)
            data.append((text, label))
    classes = sorted(set([i[1] for i in data]))
    num_classes = len(classes)
    return data, classes, num_classes

# load data
data_train, classes, num_classes = load_data(['datasets/sst/Training_SST-1.txt'])
data_valid, _, _ = load_data(['datasets/sst/Dev_SST-1.txt'])
data_test, _, _ = load_data(['datasets/sst/Test_SST-1.txt'])
# shuffle data
random.shuffle(data_train)
random.shuffle(data_valid)
random.shuffle(data_test)
# define vocab
all_data = data_train + data_valid + data_test
sentence_maxlen, w2i, vocab_size = setup_vocab(all_data)
# vectorize data
trainX, trainY = vectorize(data_train, sentence_maxlen, w2i)
validX, validY = vectorize(data_valid, sentence_maxlen, w2i)
testX, testY = vectorize(data_test, sentence_maxlen, w2i)
# print summary
print(f'Loaded {len(all_data)} data rows with {num_classes} classes.')
print(f'Training Data: {len(data_train)}\nValidation Data: {len(data_valid)}\nTest Data: {len(data_test)}')
print(f'Classes: {classes}')
print('Example:')
print(data_train[-1])

Sentence Max Length 56
Vocab Examples: ['!', '!?', '#', '$', '%', '&', "'", "''", "'30s", "'40s"]
Vocab Size 19536
Loaded 11858 data rows with 5 classes.
Training Data: 8545
Validation Data: 1102
Test Data: 2211
Classes: [0, 1, 2, 3, 4]
Example:
(['while', 'some', 'of', 'the', 'camera', 'work', 'is', 'interesting', ',', 'the', 'film', "'s", 'mid-to-low', 'budget', 'is', 'betrayed', 'by', 'the', 'surprisingly', 'shoddy', 'makeup', 'work', '.'], 1)


In [0]:
# sst-2 data
def load_data(paths):
    data = []
    for path in paths:
        with open(path, 'r', encoding='latin-1', errors='ignore') as f:
            r = f.read()
        for line in r.split('\n'):
            line = line.rstrip()
            text = line.split(' ')[1:]
            L = line.split(' ')[0]
            data.append((text, int(L)))
    classes = sorted(set([i[1] for i in data]))
    num_classes = len(classes)
    return data, classes, num_classes

# load data
data_train, classes, num_classes = load_data(['datasets/sst/Training_SST-2.txt'])
data_valid, _, _ = load_data(['datasets/sst/Dev_SST-2.txt'])
data_test, _, _ = load_data(['datasets/sst/Test_SST-2.txt'])
# shuffle data
random.shuffle(data_train)
random.shuffle(data_valid)
random.shuffle(data_test)
# define vocab
all_data = data_train + data_valid + data_test
sentence_maxlen, w2i, vocab_size = setup_vocab(all_data)
# vectorize data
trainX, trainY = vectorize(data_train, sentence_maxlen, w2i)
validX, validY = vectorize(data_valid, sentence_maxlen, w2i)
testX, testY = vectorize(data_test, sentence_maxlen, w2i)
# print summary
print(f'Loaded {len(all_data)} data rows with {num_classes} classes.')
print(f'Training Data: {len(data_train)}\nValidation Data: {len(data_valid)}\nTest Data: {len(data_test)}')
print(f'Classes: {classes}')
print('Example:')
print(data_train[-1])

Sentence Max Length 56
Vocab Examples: ['!', '!?', '#', '$', '%', '&', "'", "''", "'30s", "'40s"]
Vocab Size 17573
Loaded 9616 data rows with 2 classes.
Training Data: 6921
Validation Data: 873
Test Data: 1822
Classes: [0, 1]
Example:
(['i', 'liked', 'the', 'original', 'short', 'story', 'but', 'this', 'movie', ',', 'even', 'at', 'an', 'hour', 'and', 'twenty-some', 'minutes', ',', 'it', "'s", 'too', 'long', 'and', 'it', 'goes', 'nowhere', '.'], 0)


In [0]:
# sentihood data
import json
def load_data(paths):
    data = []
    for path in paths:
        with open(path, 'r') as f:
            j = json.load(f)
        for i in range(len(j)):
            if len(j[i]['opinions']) > 0:
                if j[i]['opinions'][0]['sentiment'] == 'Positive':
                    label = 1
                else:
                    label = 0
                text = j[i]['text'].strip()
                for idx in range(len(j[i]['opinions'])):
                    aspect = j[i]['opinions'][idx]['aspect']
                    entity = j[i]['opinions'][idx]['target_entity']
                    text = text.replace(entity, aspect)
                data.append((text.rstrip().split(' '), label))
    classes = sorted(set([i[1] for i in data]))
    num_classes = len(classes)
    return data, classes, num_classes

# load data
data_train, classes, num_classes = load_data(['datasets/sentihood/sentihood-train.json'])
data_valid, _, _ = load_data(['datasets/sentihood/sentihood-dev.json'])
data_test, _, _ = load_data(['datasets/sentihood/sentihood-test.json'])
# shuffle data
random.shuffle(data_train)
random.shuffle(data_valid)
random.shuffle(data_test)
# define vocab
all_data = data_train + data_valid + data_test
sentence_maxlen, w2i, vocab_size = setup_vocab(all_data)
# vectorize data
trainX, trainY = vectorize(data_train, sentence_maxlen, w2i)
validX, validY = vectorize(data_valid, sentence_maxlen, w2i)
testX, testY = vectorize(data_test, sentence_maxlen, w2i)
# print summary
print(f'Loaded {len(all_data)} data rows with {num_classes} classes.')
print(f'Training Data: {len(data_train)}\nValidation Data: {len(data_valid)}\nTest Data: {len(data_test)}')
print(f'Classes: {classes}')
print('Example:')
print(data_train[-1])

Sentence Max Length 115
Vocab Examples: ['', '"', '"Awarded', '"Tourist', '"convenient"', '"dangerous"', '"murder', '"nicer"', '"normal"', '"open"']
Vocab Size 4994
Loaded 3529 data rows with 2 classes.
Training Data: 2021
Validation Data: 505
Test Data: 1003
Classes: [0, 1]
Example:
(['Personally', 'I', 'would', 'rather', 'live', 'in', 'transit-location,', 'but', 'just', 'cus', 'its', 'more', 'central', 'then', 'transit-location'], 1)


In [0]:
# semeval data
import xml.etree.ElementTree as ET
def load_data(paths):
    data = []
    for path in paths:
        tree = ET.parse(path)
        root = tree.getroot()
        for sentence in root.iter('sentence'):
            text = sentence.find('text').text.strip()
            P = []
            aspects = sentence.find('aspectTerms')
            if aspects is not None:
                for aspect in aspects:
                    P.append(aspect.get('polarity'))
            if 'negative' in P and not 'positive' in P:
                polarity = 0
            elif 'positive' in P and not 'negative' in P:
                polarity = 1
            else:
                polarity = 2
            data.append((text.rstrip().split(' '), polarity))
    classes = sorted(set([i[1] for i in data]))
    num_classes = len(classes)
    return data, classes, num_classes

# load data
data, classes, num_classes = load_data(['datasets/semeval/Laptops_train_v2.xml.txt', 'datasets/semeval/Restaurants_Train_v2.xml.txt'])
# shuffle data
random.shuffle(data)
# define vocab
sentence_maxlen, w2i, vocab_size = setup_vocab(data)
# split dataset into train-valid-test (80-10-10)
data_train, data_valid, data_test = np.split(data, [int(.8*len(data)), int(.9*len(data))])
# create train-valid-test vectors
trainX, trainY = vectorize(data_train, sentence_maxlen, w2i)
validX, validY = vectorize(data_valid, sentence_maxlen, w2i)
testX, testY = vectorize(data_test, sentence_maxlen, w2i)
# print info
print(f'Loaded {len(data)} data rows with {num_classes} classes.')
print(f'Training Data: {len(trainX)}\nValidation Data: {len(validX)}\nTest Data: {len(testX)}')
print(f'Classes: {classes}')
print('Example:')
print(data[-1])

Sentence Max Length 78
Vocab Examples: ['', '!', '!!', '"', '"1764"', '"74%,', '">', '"Activity', '"BUILD"', '"Blue']
Vocab Size 12250
Loaded 6086 data rows with 3 classes.
Training Data: 4868
Validation Data: 609
Test Data: 609
Classes: [0, 1, 2]
Example:
(['I', 'took', 'it', 'back', 'for', 'an', 'Asus', 'and', 'same', 'thing-', 'blue', 'screen', 'which', 'required', 'me', 'to', 'remove', 'the', 'battery', 'to', 'reset.'], 2)


## **Load Embeddings**

In [0]:
def load_glove_weights(glove_dir, embd_dim, vocab_size, word_index):
    embeddings_index = {}
    f = open(os.path.join(glove_dir, 'glove.6B.' + str(embd_dim) + 'd.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings_index)) 
    embedding_matrix = np.zeros((vocab_size, embd_dim))
    print('embed_matrix.shape', embedding_matrix.shape)
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

embd_dim = 300
glove_embd_w = load_glove_weights('datasets', embd_dim, vocab_size, w2i)

Found 400000 word vectors.
embed_matrix.shape (12250, 300)


## **Create Model**

In [0]:
def Net(vocab_size, embd_size, sentence_maxlen, glove_embd_w, num_classes):
    sentence = Input((sentence_maxlen,), name='SentenceInput')
    
    # embedding
    embd_layer = Embedding(input_dim=vocab_size, 
                           output_dim=embd_size, 
                           weights=[glove_embd_w], 
                           trainable=False,
                           name='shared_embd')
    embd_sentence = embd_layer(sentence)
    embd_sentence = Permute((2,1))(embd_sentence)
    embd_sentence = Lambda(lambda x: K.expand_dims(x, -1))(embd_sentence)
    
    # cnn
    cnn = Conv2D(1, 
                 kernel_size=(3, sentence_maxlen),
                 activation='relu')(embd_sentence)
    cnn =  Lambda(lambda x: K.sum(x, axis=3))(cnn)
    cnn = MaxPooling1D(3)(cnn)
    cnn = Lambda(lambda x: K.sum(x, axis=2))(cnn)
    dense = Dense(64, activation='relu')(cnn)
    dropout = Dropout(0.2)(dense)
    dense = Dense(32, activation='relu')(dropout)
    out = Dense(num_classes, activation='softmax')(dense)
    adam = Adam(lr=0.0001,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=None,
                decay=0.0,
                amsgrad=True)
    model = Model(inputs=sentence, outputs=out, name='sentence_claccification')
    model.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['accuracy']) 
    return model

model = Net(vocab_size, embd_dim, sentence_maxlen, glove_embd_w, num_classes)
print(model.summary())

Model: "sentence_claccification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
SentenceInput (InputLayer)   (None, 78)                0         
_________________________________________________________________
shared_embd (Embedding)      (None, 78, 300)           3675000   
_________________________________________________________________
permute_5 (Permute)          (None, 300, 78)           0         
_________________________________________________________________
lambda_13 (Lambda)           (None, 300, 78, 1)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 298, 1, 1)         235       
_________________________________________________________________
lambda_14 (Lambda)           (None, 298, 1)            0         
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 99, 1) 

## **Train Model**

In [0]:
model.fit(trainX, trainY,
          batch_size=256,
          epochs=50,
          validation_data=(validX, validY))

Train on 4868 samples, validate on 609 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f888b52d320>

In [0]:
results = model.evaluate(testX, testY)
print('Test accuracy: ', results[1])

Test accuracy:  0.5763546798029556


# **Results: Accuracy**

## **SST-1**: 5 classes
#### Training: 39.31%
#### Validation: 36.30%
#### Testing: 37.22%

## **SST-2**: 2 classes
#### Training: 74.45%
#### Validation: 72.16%
#### Testing: 72.39%

## **SentiHood**: 2 classes
#### Training: 80.65%
#### Validation: 72.87%
#### Testing: 72.18%

## **SemEval**: 3 classes
#### Training: 65.55%
#### Validation: 63.05%
#### Testing: 57.64