### Loading Dataset

In [1]:
import os
import re
import numpy as np



MDSD_PATH = os.path.expanduser('./')

DOMAINS = ('books', 'dvd', 'electronics', 'kitchen')


def load_mdsd(domains, n_labeled=None):
    sorted_data_path = os.path.join(MDSD_PATH, 'sorted_data')
    print('loading data from {}'.format(sorted_data_path))
    texts = []
    s_labels = []
    d_labels = []
    sentiments = ('positive', 'negative')
    for d_id, d_name in enumerate(domains):
        for s_id, s_name in zip((1, 0, -1), sentiments):
            fpath = os.path.join(sorted_data_path, d_name, s_name + '.review')
            print(' - loading', d_name, s_name, end='')
            count = 0
            text = ''
            in_review_text = False
            with open(fpath, encoding='utf8', errors='ignore') as fr:
                for line in fr:
                    if '<review_text>' in line:
                        text = ''
                        in_review_text = True
                        continue
                    if '</review_text>' in line:
                        in_review_text = False
                        text = text.lower().replace('\n', ' ').strip()
                        text = re.sub(r'&[a-z]+;', '', text)
                        text = re.sub(r'\s+', ' ', text)
                        texts.append(text)
                        s_labels.append(s_id)
                        d_labels.append(d_id)
                        count += 1
                    if in_review_text:
                        text += line
                    if (s_id >= 0) and n_labeled and (count == n_labeled):
                        break
            print(': %d texts' % count)
    print('data loaded')
    s_labels = np.asarray(s_labels, dtype='int')
    d_labels = np.asarray(d_labels, dtype='int')
    print(' - texts:', len(texts))
    print(' - s_labels:', len(s_labels))
    print(' - d_labels:', len(d_labels))

    return texts, s_labels, d_labels



loading data from ./sorted_data
 - loading books positive: 1000 texts
 - loading books negative: 1000 texts
 - loading dvd positive: 1000 texts
 - loading dvd negative: 1000 texts
 - loading electronics positive: 1000 texts
 - loading electronics negative: 1000 texts
 - loading kitchen positive: 1000 texts
 - loading kitchen negative: 1000 texts
data loaded
 - texts: 8000
 - s_labels: 8000
 - d_labels: 8000


In [2]:
import os
import numpy as np
from tqdm import tqdm
from numpy.linalg import norm

from keras import layers
from keras.callbacks import Callback
from keras import backend as K

glove_path = os.path.expanduser('./glove/')
assert os.path.exists(glove_path)


def load_glove(path=glove_path, embedding_dim=300, corpus_size=6, desired=None, verbose=False):
    if embedding_dim != 300:
        assert embedding_dim in (50, 100, 200),
        fpath = os.path.join(path, 'glove.6B.{}d.txt'.format(embedding_dim))
    else:
        assert corpus_size in (6, 42, 840), 
        fpath = os.path.join(path, 'glove.{}B.300d.txt'.format(corpus_size))
    word2vec = {}
    print('loading glove from', fpath)
    f = open(fpath, 'r', encoding='utf8', errors='ignore')
    for line in tqdm(f, desc='glove') if verbose else f:
        values = line.split()
        word = values[0]  # the word
        if not desired or word in desired:
            coefs = np.asarray(values[1:], dtype="float32")
            word2vec[word] = coefs
    f.close()
    print('glove info: {} words, {} dims'.format(len(word2vec), embedding_dim))
    return word2vec


def get_embedding_mat(embeddings, word2index, embedding_dim, random_uniform_level=0.01, idx_from=2):
    # embedding_mat = np.zeros((n_words, embedding_dim))
    n_words = len(word2index)
    for idx in range(0, idx_from):
        if idx in word2index.values():
            n_words -= 1
    n_words += idx_from
    embedding_mat = np.random.uniform(low=-random_uniform_level, high=random_uniform_level, size=(n_words, embedding_dim))
    embedding_mat[0] = np.zeros(embedding_dim)
    for word, idx in word2index.items():
        if idx < idx_from:
            continue
        embedding_vec = embeddings.get(word)
        if embedding_vec is not None:  
            embedding_mat[idx] = embedding_vec
    return embedding_mat


def att_process(candidates, att, activation='tanh'):
    att_dim = K.int_shape(att)[-1]
    candidates2 = layers.TimeDistributed(
        layers.Dense(att_dim, activation=activation))(candidates)
    dotted = layers.dot([candidates2, att], axes=(2, 1), normalize=True)
    weights = layers.Activation('softmax')(dotted)  # (*, maxlen), sums up to 1
    weighted = layers.dot([candidates, weights], axes=(1, 1))
    return weighted, weights


class UpdateMonitor(Callback):
    def __init__(self):
        super(UpdateMonitor, self).__init__()
        self.weights = None

    @classmethod
    def _get_updates(cls, old_weights, new_weights):
        if not old_weights:
            old_weights = new_weights
        updates = []
        for old_layerwise_weights, new_layerwise_weights in zip(old_weights, new_weights):
            if len(old_layerwise_weights) == 0 or len(new_layerwise_weights) == 0:
                updates.append(None)
            else:
                w1, w2 = old_layerwise_weights[0], new_layerwise_weights[0]  # only check the first weight of a layer
                updates.append(norm(w2 - w1) / norm(w2))
        return updates

    def on_epoch_end(self, epoch, logs={}):
        new_weights = _get_weights(self.model)
        updates = self._get_updates(old_weights=self.weights, new_weights=new_weights)
        self.weights = new_weights  # update
        updates_info = ', '.join('{:.4f}'.format(1e3 * update) if update else '-' for update in updates)
        print('- updates: 1e-3 * [{}]'.format(updates_info))


def _get_weights(model):
    weights = []
    for layer in model.layers:
        # if no weights, return value is []
        weights.append(layer.get_weights())
    return weights

### Training & Validating Model

In [3]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from nltk import word_tokenize
from collections import Counter

import tensorflow as tf
from keras.models import Model
from keras import layers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical



np.random.seed(2333)
tf.random.set_seed(2333)



# domains
DOMAINS = ('books', 'dvd', 'electronics', 'kitchen')


class SharedData:
    """global data"""
    coef = 0.04  

    # model params
    embed_dim = 300  
    rnn_dim = 300  
    hidden_dim = 100  
    embed_dropout = 0.2 
    fc_dropout = 0.2  
    batch_size = 64  
    epochs = 20  #epochs
    activation = 'relu'  
    optimizer = 'adadelta'  
    RNN = layers.LSTM  

    
    lr_factor = 0.1  
    lr_patience = 1  
    stop_patience = 2  

    
    glove_corpus = 6 
    min_count = 1  
    max_words = None  
    n_words = None
    maxlen = None
    word2index = None
    wv_weights = None  


SD = SharedData()


def _tvt_split(_seqs, _slabels, splits=(7, 2, 1)):
    """train/val/test split for one single domain"""
    assert len(_seqs) == len(_slabels)
    splits = np.asarray(splits)
    splits = np.cumsum(splits / splits.sum())
    indices = [range(len(_seqs))]
    np.random.shuffle(indices)
    _seqs = _seqs[indices]
    _slabels = _slabels[indices]
    X_train, y_train, X_val, y_val, X_test, y_test = [], [], [], [], [], []
    for slabel in sorted(np.unique(_slabels)):
        seqs_ofs = _seqs[_slabels == slabel]
        slabels_ofs = _slabels[_slabels == slabel]
        split_ats = np.asarray(splits * len(seqs_ofs), dtype=int)
        X_train.extend(seqs_ofs[:split_ats[0]])
        X_val.extend(seqs_ofs[split_ats[0]:split_ats[1]])
        X_test.extend(seqs_ofs[split_ats[1]:])
        y_train.extend(slabels_ofs[:split_ats[0]])
        y_val.extend(slabels_ofs[split_ats[0]:split_ats[1]])
        y_test.extend(slabels_ofs[split_ats[1]:])
    X_train = np.asarray(X_train, dtype='int')
    X_val = np.asarray(X_val, dtype='int')
    X_test = np.asarray(X_test, dtype='int')
    y_train = np.asarray(y_train, dtype='int')
    y_val = np.asarray(y_val, dtype='int')
    y_test = np.asarray(y_test, dtype='int')
    print(' * X:', X_train.shape, X_val.shape, X_test.shape)
    print(' * y:', y_train.shape, y_val.shape, y_test.shape)
    return (X_train, X_val, X_test), (y_train, y_val, y_test)


def make_data():
    global SD

    print('loading data: Multi-Domain Sentiment Dataset v2')
    texts, s_labels, d_labels = load_mdsd(domains=DOMAINS)

    # build vocabulary 
    print('building vocabulary')
    texts_tokens = []
    lens = []
    for text in texts:
        words = word_tokenize(text)
        for idx, word in enumerate(words):
            if word.isdigit():
                words[idx] = '<NUM>'  
        texts_tokens.append(words)
        lens.append(len(words))
    maxlen = int(np.percentile(lens, 95))
    print('maxlen:', maxlen)
    counter = Counter()
    for words in texts_tokens:
        counter.update(words)
    word2index = {'<PAD>': 0, '<UNK>': 1}
    for idx, word_count in enumerate(counter.most_common(SD.max_words)):
        if word_count[1] >= SD.min_count:  
            word2index[word_count[0]] = idx + 2  
    n_words = len(word2index)
    print('n_words:', n_words)

    # data encode
    print('data encoding')
    seqs = []
    for words in texts_tokens:
        seqs.append([word2index.get(word, 1) for word in words])
    seqs_padded = pad_sequences(seqs, maxlen=maxlen, padding='post', truncating='post')
    s_labels = np.asarray(s_labels, dtype=int)
    d_labels = np.asarray(d_labels, dtype=int)

    # domain & train/val/test split
    print('labeled data: domain & train/val/test splitting')
    X_train, ys_train, yd_train = [], [], []
    X_val, ys_val, yd_val = [], [], []
    X_test_byd, ys_test_byd, yd_test_byd = {}, {}, {}
    for d_id, d_name in enumerate(DOMAINS):
        print(d_name, 'splitting')
        seqs_padded_ofd = seqs_padded[(d_labels == d_id) & (s_labels != -1)]
        slabels_ofd = s_labels[(d_labels == d_id) & (s_labels != -1)]
        print(' * all:', seqs_padded_ofd.shape, slabels_ofd.shape)
        (X_train_ofd, X_val_ofd, X_test_ofd), (y_train_ofd, y_val_ofd, y_test_ofd) = _tvt_split(seqs_padded_ofd, slabels_ofd)
        # train data (add this domain)
        X_train.extend(X_train_ofd)
        ys_train.extend(y_train_ofd)
        yd_train.extend([d_id] * len(X_train_ofd))
        # val data
        X_val.extend(X_val_ofd)
        ys_val.extend(y_val_ofd)
        yd_val.extend([d_id] * len(X_val_ofd))
        # test data
        X_test_byd[d_id] = X_test_ofd
        ys_test_byd[d_id] = to_categorical(y_test_ofd, num_classes=2)
        yd_test_byd[d_id] = to_categorical([d_id] * len(X_test_ofd), num_classes=len(DOMAINS))
    X_train = np.asarray(X_train, dtype='int')
    ys_train = to_categorical(ys_train, num_classes=2)
    yd_train = to_categorical(yd_train, num_classes=len(DOMAINS))
    X_val = np.asarray(X_val, dtype='int')
    ys_val = to_categorical(ys_val, num_classes=2)
    yd_val = to_categorical(yd_val, num_classes=len(DOMAINS))
    
    X_test = np.concatenate([X_test_byd[idx] for idx in range(len(DOMAINS))])
    ys_test = np.concatenate([ys_test_byd[idx] for idx in range(len(DOMAINS))])
    yd_test = np.concatenate([yd_test_byd[idx] for idx in range(len(DOMAINS))])

    indices = list(range(len(X_train)))
    np.random.shuffle(indices)
    X_train = X_train[indices]
    ys_train = ys_train[indices]
    yd_train = yd_train[indices]
    print('combined labeled data:')
    print('  - train:', X_train.shape, ys_train.shape, yd_train.shape)
    print('  - val:', X_val.shape, ys_val.shape, yd_val.shape)
    print('  - test:', X_test.shape, ys_test.shape, yd_test.shape)
    for d_id, d_name in enumerate(DOMAINS):
        print('  - test for {}:'.format(d_name[:3]), X_test_byd[d_id].shape, ys_test_byd[d_id].shape, yd_test_byd[d_id].shape)

    print('loading word embeddings from glove')
    embeddings = load_glove(embedding_dim=SD.embed_dim, desired=word2index.keys(), corpus_size=SD.glove_corpus)
    print('processing embedding matrix')
    embedding_mat = get_embedding_mat(embeddings, word2index, SD.embed_dim, idx_from=2)
    SD.wv_weights = [embedding_mat]

    SD.maxlen = maxlen
    SD.n_words = n_words
    SD.word2index = word2index
    SD.X_train, SD.ys_train, SD.yd_train = X_train, ys_train, yd_train
    SD.X_val, SD.ys_val, SD.yd_val = X_val, ys_val, yd_val
    SD.X_test, SD.ys_test, SD.yd_test = X_test, ys_test, yd_test
    SD.X_test_byd, SD.ys_test_byd, SD.yd_test_byd = X_test_byd, ys_test_byd, yd_test_byd


def get_model():
    global SD

    weights = SD.wv_weights

    # the model
    print('\nbuilding the model')
    inputs = layers.Input(shape=(SD.maxlen,))
    embeddings = layers.Embedding(
        input_dim=SD.n_words,
        output_dim=SD.embed_dim,
        input_length=SD.maxlen,
        weights=weights)(inputs)
    embeddings = layers.SpatialDropout1D(rate=SD.embed_dropout)(embeddings)

    # domain part
    d_repr = layers.Bidirectional(SD.RNN(
        units=SD.rnn_dim,
        return_sequences=False))(embeddings)
    d_repr = layers.Dense(SD.hidden_dim, activation=SD.activation)(d_repr)
    d_repr = layers.Dropout(SD.fc_dropout)(d_repr)
    d_pred = layers.Dense(len(DOMAINS), activation='softmax', name='d_pred')(d_repr)

    # sentiment part
    # use domain representation as attention
    episodes = layers.Bidirectional(SD.RNN(
        units=SD.rnn_dim,
        return_sequences=True))(embeddings)
    selected, _ = att_process(candidates=episodes, att=d_repr)
    s_repr = layers.Dense(SD.hidden_dim, activation=SD.activation)(selected)
    s_repr = layers.Dropout(SD.fc_dropout)(s_repr)
    s_pred = layers.Dense(2, activation='softmax', name='s_pred')(s_repr)

    # model
    model = Model(
        inputs=inputs,
        outputs=[s_pred, d_pred])
    model.compile(optimizer=SD.optimizer, metrics=['acc'], loss={
        's_pred': 'categorical_crossentropy',
        'd_pred': 'categorical_crossentropy'
    }, loss_weights={
        's_pred': 1,
        'd_pred': SD.coef
    })
    model.summary()
    return model


def train_and_test(model):
    global SD

    # training
    updater = UpdateMonitor()
    reducer = callbacks.ReduceLROnPlateau(factor=SD.lr_factor, patience=SD.lr_patience, verbose=1)
    stopper = callbacks.EarlyStopping(patience=SD.stop_patience, verbose=1)
    cbks = [updater, reducer, stopper]
    print('\ntraining model')
    model.fit(
        SD.X_train,
        [SD.ys_train, SD.yd_train],
        validation_data=(SD.X_val, [SD.ys_val, SD.yd_val]),
        shuffle=True, batch_size=SD.batch_size, epochs=SD.epochs, verbose=2,
        callbacks=cbks)

    # evaluation
    print('\nTest evaluation:')
    for d_id, d_name in enumerate(DOMAINS):
        scores = model.evaluate(
            SD.X_test_byd[d_id],
            [SD.ys_test_byd[d_id], SD.yd_test_byd[d_id]],
            batch_size=SD.batch_size, verbose=0)
        print('{} acc: {:.4f}'.format(d_name[:3], scores[-2]))


if __name__ == '__main__':
    make_data()

    # build & compile model
    model = get_model()

    # train and test
    train_and_test(model)

    print('\nprocess finished ~~~')

loading data: Multi-Domain Sentiment Dataset v2
loading data from ./sorted_data
 - loading books positive: 1000 texts
 - loading books negative: 1000 texts
 - loading dvd positive: 1000 texts
 - loading dvd negative: 1000 texts
 - loading electronics positive: 1000 texts
 - loading electronics negative: 1000 texts
 - loading kitchen positive: 1000 texts
 - loading kitchen negative: 1000 texts
data loaded
 - texts: 8000
 - s_labels: 8000
 - d_labels: 8000
building vocabulary
maxlen: 461
n_words: 45077
data encoding
labeled data: domain & train/val/test splitting
books splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
dvd splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
electronics splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: (1400,) (398,) (202,)
kitchen splitting
 * all: (2000, 461) (2000,)
 * X: (1400, 461) (398, 461) (202, 461)
 * y: 

  _seqs = _seqs[indices]
  _slabels = _slabels[indices]


glove info: 35122 words, 300 dims
processing embedding matrix

building the model
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 461)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 461, 300)     13523100    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 461, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 600)          1442400     spatial_dropout1d[0][0]          
____________

88/88 - 628s - loss: 0.7483 - s_pred_loss: 0.6930 - d_pred_loss: 1.3835 - s_pred_acc: 0.5013 - d_pred_acc: 0.2670 - val_loss: 0.7478 - val_s_pred_loss: 0.6926 - val_d_pred_loss: 1.3815 - val_s_pred_acc: 0.5031 - val_d_pred_acc: 0.2644
- updates: 1e-3 * [-, 0.0001, -, 0.0056, 0.0457, 0.0248, 0.0026, -, -, -, -, 0.0713, -, 0.0596, 0.0400]
Epoch 15/20
88/88 - 582s - loss: 0.7489 - s_pred_loss: 0.6936 - d_pred_loss: 1.3827 - s_pred_acc: 0.4946 - d_pred_acc: 0.2655 - val_loss: 0.7478 - val_s_pred_loss: 0.6925 - val_d_pred_loss: 1.3813 - val_s_pred_acc: 0.5057 - val_d_pred_acc: 0.2644
- updates: 1e-3 * [-, 0.0001, -, 0.0059, 0.0472, 0.0246, 0.0025, -, -, -, -, 0.0730, -, 0.0687, 0.0393]
Epoch 16/20
88/88 - 606s - loss: 0.7477 - s_pred_loss: 0.6924 - d_pred_loss: 1.3831 - s_pred_acc: 0.5030 - d_pred_acc: 0.2620 - val_loss: 0.7477 - val_s_pred_loss: 0.6924 - val_d_pred_loss: 1.3812 - val_s_pred_acc: 0.5044 - val_d_pred_acc: 0.2619
- updates: 1e-3 * [-, 0.0001, -, 0.0061, 0.0459, 0.0246, 0.0027

In [6]:
    print('\nTest evaluation:')
    for d_id, d_name in enumerate(DOMAINS):
        scores = model.evaluate(
            SD.X_test_byd[d_id],
            [SD.ys_test_byd[d_id], SD.yd_test_byd[d_id]],
            batch_size=SD.batch_size, verbose=0)
        print('{} acc: {:.4f}'.format(d_name[:3], scores[-2]))


Test evaluation:
boo acc: 0.5099
dvd acc: 0.5099
ele acc: 0.4950
kit acc: 0.5000


## This model is run only for 20 epochs to show the working on my local system so has lower metric/accuracy values (and very significant training time), I trained this on powerful Titan X GPU for 5 experiments (architectures) and have their metrics below(thanks to my friend Anup for the access to the GPU!)
### Note: The log files for all 5 experiments which has the entire run is included in the log folder, those log files have the metric values shown below.

## The metrics comparing all the experiments done are shown below after running all experiments and making graphs from them, the pdf report also contains Network Architectures for each experiment using Tensorboard.

#### The Experiments :

**EXPERIMENT No.**|**AIM**
:-----:|:-----:
1|Train the state of art model proposed by Yuan et al. and obtain the metrics for comparison.
2|Add a dense layer before the final layer of the sentiment module. The number of neurons is kept equal to the number of domains.
3|Use a combination of LSTM and GRU in the hidden layers. Use LSTM layer as the bidirectional RNN in the domain module and GRU in the sentiment module.
4|Use a combination of LSTM and GRU in the hidden layers. Use GRU layer as the bidirectional RNN in the domain module and LSTM in the sentiment module.
5|Use GRU layer as the bidirectional RNN in the domain module as well as in the sentiment module.

#### The Metrics:

**Metric**|**Definition**
:-----:|:-----:
d\_pred\_acc | The accuracy of  the model to predict the domain. Accuracy is the fraction of predictions our model got right.
d\_pred\_loss| The loss in the model to predict the domain. Loss is a number indicating how bad the model's prediction was on a single example. 
loss|The complete loss from both the domains. We use mean squared error for finding the loss in our models.
s\_pred\_acc | The accuracy of  the model to predict the sentiment. 
s\_pred\_loss| The loss in the model to predict the sentiment. 
val\_d\_pred\_acc | The accuracy of  the model to predict the domain during validation. Accuracy is the fraction of predictions our model got right.
val\_d\_pred\_loss| The loss in the model to predict the domain during validation. Loss is a number indicating how bad the model's prediction was on a single example. 
val\_loss|The complete loss from both the domains during validation. We use mean squared error for finding the loss in our models.
val\_s\_pred\_acc | The accuracy of  the model to predict the sentiment during validation. 
val\_s\_pred\_loss| The loss in the model to predict the sentiment during validation. 

### Note: These Images are added kept in the same directory as the Notebook for these to be displayed properly by Markdown.


![alt text](image5.png "Title")
![alt text](image6.png "Title")
![alt text](image13.png "Title")
![alt text](image15.png "Title")
![alt text](image16.png "Title")
![alt text](image14.png "Title")
![alt text](image10.png "Title")

## The comparison in training time of different Experiments/Architectures
![alt text](image3.png "Title")

## The final validation accuracy comparison
![alt text](image4.png "Title")