### 1- Check GPU type

In [0]:
# Run nvidia-smi in the shell to report which GPU (if any) this runtime exposes.
!nvidia-smi

### 2- Install SimpleRepresentations library

In [0]:
# Install the simplerepresentations package with pip from the shell.
# No version is pinned here, so whichever release pip resolves gets installed.
!pip install simplerepresentations

### 3- Download the Large Movie Review Dataset

In [0]:
# Download the dataset archive into the current working directory.
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# Unpack the archive; later cells read from paths under aclImdb/.
!tar xzf aclImdb_v1.tar.gz
# Delete the archive once it has been unpacked.
!rm aclImdb_v1.tar.gz

### 4- Load train and test datasets

In [0]:
import os
import random

# Directories of the four dataset splits (partition x sentiment), relative to
# the working directory.
TRAIN_POS_PATH, TRAIN_NEG_PATH = 'aclImdb/train/pos', 'aclImdb/train/neg'
TEST_POS_PATH, TEST_NEG_PATH = 'aclImdb/test/pos', 'aclImdb/test/neg'

In [0]:
# Report how many entries each split directory contains.
for split_label, split_dir in (
    ('Train/Positive', TRAIN_POS_PATH),
    ('Train/Negative', TRAIN_NEG_PATH),
    ('Test/Positive', TEST_POS_PATH),
    ('Test/Negative', TEST_NEG_PATH),
):
    print('Number of {} examples:'.format(split_label), len(os.listdir(split_dir)))

In [0]:
def load_files(dir_path, encoding='utf-8'):
    """Read the first line of every file in a directory.

    Args:
        dir_path: Directory whose entries are all readable text files.
        encoding: Text encoding used to decode each file.

    Returns:
        A list with one whitespace-stripped string per file, ordered by file
        name. A file with no content contributes an empty string.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible across runs and machines.
    for file_name in sorted(os.listdir(dir_path)):
        # An explicit encoding avoids depending on the platform default, and a
        # separate handle name keeps the loop variable from being shadowed.
        with open(os.path.join(dir_path, file_name), 'r', encoding=encoding) as handle:
            # readline() yields '' for an empty file, whereas indexing
            # readlines()[0] would raise IndexError.
            data.append(handle.readline().strip())
    return data

In [0]:
# Read every split into memory: one list of texts per directory.
train_pos_examples, train_neg_examples = (
    load_files(split_dir) for split_dir in (TRAIN_POS_PATH, TRAIN_NEG_PATH)
)
test_pos_examples, test_neg_examples = (
    load_files(split_dir) for split_dir in (TEST_POS_PATH, TEST_NEG_PATH)
)

In [0]:
# Fixed seed so the shuffled training order is identical on every run.
TRAIN_SHUFFLE_SEED = 42

# Positive examples are labelled 1 and negative examples 0.
train_examples = train_pos_examples + train_neg_examples
train_labels = ([1] * len(train_pos_examples)) + ([0] * len(train_neg_examples))

# Shuffle texts and labels as pairs so each text keeps its own label. A
# private Random instance leaves the global random state untouched.
train_data = list(zip(train_examples, train_labels))
random.Random(TRAIN_SHUFFLE_SEED).shuffle(train_data)

# Comprehensions (rather than unpacking zip(*train_data)) also work when
# there are no examples at all.
train_examples = [example for example, _ in train_data]
train_labels = [label for _, label in train_data]

In [0]:
# Fixed seed so the shuffled test order is identical on every run.
TEST_SHUFFLE_SEED = 42

# Positive examples are labelled 1 and negative examples 0.
test_examples = test_pos_examples + test_neg_examples
test_labels = ([1] * len(test_pos_examples)) + ([0] * len(test_neg_examples))

# Shuffle texts and labels as pairs so each text keeps its own label. A
# private Random instance leaves the global random state untouched.
test_data = list(zip(test_examples, test_labels))
random.Random(TEST_SHUFFLE_SEED).shuffle(test_data)

# Comprehensions (rather than unpacking zip(*test_data)) also work when
# there are no examples at all.
test_examples = [example for example, _ in test_data]
test_labels = [label for _, label in test_data]

### 5- Define SimpleRepresentations model

In [0]:
from simplerepresentations import RepresentationModel

model_type, model_name = 'roberta', 'roberta-base'

representation_model = RepresentationModel(
    model_name=model_name,
    model_type=model_type,
    # Sentences are truncated to at most 128 tokens.
    max_seq_length=128,
    batch_size=128,
    # Token representations are built by summing the last `last_hidden_to_use`
    # hidden states; with a value of 1 that is just the final hidden state.
    last_hidden_to_use=1,
    combination_method='sum',
    verbose=0,
)

In [0]:
# Encode a single sentence to inspect the shapes of both outputs.
all_sentences_representations, all_tokens_representations = representation_model(
    ['Simple Representations!']
)

# One shape per output line, in the order listed.
print(
    all_sentences_representations.shape,     # (1, 768)
    all_tokens_representations.shape,        # (1, 128, 768)
    all_sentences_representations[0].shape,  # (768,)
    all_tokens_representations[0].shape,     # (128, 768)
    sep='\n',
)

### 6- Define data generator class

In [0]:
import numpy as np
from keras.utils import Sequence

class DataGenerator(Sequence):
    """Keras Sequence that converts raw sentences into representation batches.

    Each batch is produced on demand by slicing the stored sentences and
    labels and passing the sentence slice through ``representation_model``;
    nothing is precomputed or cached by this class.
    """

    def __init__(self, representation_model, sentences, labels, batch_size, token_level=True):
        """Store the data and the batching options.

        Args:
            representation_model: Callable that takes a batch of sentences and
                returns a ``(sentence_representations, token_representations)``
                pair.
            sentences: Sliceable collection of input texts.
            labels: Sliceable collection of targets aligned with ``sentences``.
            batch_size: Maximum number of sentences per batch.
            token_level: When true, batches carry the token representations;
                otherwise they carry the sentence representations.
        """
        # Initialise the Keras base class; newer Keras releases expect
        # subclasses to do this in their constructor.
        super().__init__()
        self.representation_model = representation_model
        self.sentences = sentences
        self.labels = labels
        self.batch_size = batch_size
        self.token_level = token_level

    def __len__(self):
        """Return the number of batches; the last one may be smaller than ``batch_size``."""
        return int(np.ceil(len(self.sentences) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Build batch number ``idx``.

        Args:
            idx: Zero-based batch index.

        Returns:
            A ``(features, labels)`` pair, where ``features`` is the token-level
            or sentence-level output of ``representation_model`` for the batch
            (chosen by ``token_level``) and ``labels`` is a NumPy array.
        """
        # Compute the slice bounds once and reuse them for both collections.
        start = idx * self.batch_size
        stop = start + self.batch_size
        sentences_batch = np.array(self.sentences[start:stop])
        labels_batch = np.array(self.labels[start:stop])

        sentence_features, token_features = self.representation_model(sentences_batch)
        features = token_features if self.token_level else sentence_features

        # labels_batch is already an array, so it is returned without re-wrapping.
        return features, labels_batch

# Batch sources for training and testing; token_level is left at its default
# of True, so both yield token representations.
train_tok_generator = DataGenerator(
    representation_model=representation_model,
    sentences=train_examples,
    labels=train_labels,
    batch_size=128,
)
test_tok_generator = DataGenerator(
    representation_model=representation_model,
    sentences=test_examples,
    labels=test_labels,
    batch_size=128,
)

### 7- Define a token-level recurrent neural network for binary classification

In [0]:
from keras.models import Input, Model
from keras.layers import Dropout, Dense, LSTM, Bidirectional
from keras.optimizers import Adam

# Feature width, probed by encoding a dummy sentence and measuring its
# sentence-level vector. The 128 time steps are hard-coded here.
# NOTE(review): 128 presumably has to match the representation model's
# max_seq_length - confirm if either value changes.
feature_size = len(representation_model(['test'])[0][0])
model_input = Input(shape=(128, feature_size))

# Two stacked bidirectional LSTMs: the first keeps the full sequence so the
# second can consume it, the second returns only its final output.
hidden = Bidirectional(LSTM(units=128, dropout=0.3, return_sequences=True))(model_input)
hidden = Bidirectional(LSTM(units=128, dropout=0.3, return_sequences=False))(hidden)

# Dense head with dropout, ending in a single sigmoid unit for the binary label.
hidden = Dense(128, activation='relu')(hidden)
hidden = Dropout(0.3)(hidden)
model_output = Dense(1, activation='sigmoid')(hidden)

model = Model(model_input, model_output)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.001),
    metrics=['accuracy'],
)
model.summary()

### 8- Train the token-level model

In [0]:
# Fit on the token-level training batches for 10 epochs.
model.fit_generator(generator=train_tok_generator, epochs=10)

### 9- Evaluate the token-level model

In [0]:
# Evaluate on the token-level test batches and print whatever Keras returns.
test_scores = model.evaluate_generator(test_tok_generator)
print(test_scores)