In [1]:
import numpy as np
import matplotlib.pyplot as plt

# read data from text files
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

In [2]:
print(reviews[:1000])
print()
print(labels[:101])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

In [3]:
from string import punctuation

print(punctuation)

# Delete all punctuation
reviews = reviews.lower() # transform to lowercase
all_text = ''.join([c for c in reviews if c not in punctuation])

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [4]:
# Separate the reviews
reviews_split = all_text.split('\n')

# Join them to create one text
all_text = ' '.join(reviews_split)

In [5]:
# Create list of all words
words = all_text.split()

In [6]:
print(len(words))
words[:30]

6020196


['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such',
 'as',
 'teachers',
 'my',
 'years',
 'in',
 'the',
 'teaching',
 'profession',
 'lead',
 'me']

# Task 1
Encode the reviews

In [7]:
from collections import Counter
import tqdm

vocab_to_int = dict()

reviews_ints = list()


In [8]:
counter = Counter()

for word in words:
    counter[word] += 1
    
unique_words = list(counter)
print(len(unique_words)) # shoulde be more 74000

74072


In [9]:
vocab_to_int['\n'] = 0

for i, word in enumerate(unique_words):
    vocab_to_int[word] = i + 1 # 0 is reserved for "\n"
    
print(len(vocab_to_int))

74073


In [10]:
reviews_split[:2]

['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ',
 'story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  viol

In [11]:
for review in reviews_split:
    
    encoded_review = list()
    
    for word in review.split():
        
        idx = vocab_to_int[word]
        encoded_review.append(idx)
        
    reviews_ints.append(encoded_review)

In [12]:
print('Unique words: ', len((vocab_to_int)))  # should ~ 74000+
print()

print('Tokenized review: \n', reviews_ints[:1])

Unique words:  74073

Tokenized review: 
 [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 13, 21, 22, 23, 24, 10, 25, 26, 27, 28, 29, 30, 31, 1, 2, 32, 33, 3, 34, 35, 29, 36, 37, 3, 21, 10, 38, 29, 39, 40, 10, 41, 42, 43, 44, 45, 46, 47, 48, 49, 21, 50, 10, 51, 52, 10, 53, 54, 55, 56, 28, 52, 10, 57, 58, 59, 60, 48, 42, 61, 58, 62, 10, 63, 24, 64, 4, 65, 66, 67, 29, 68, 69, 10, 18, 58, 70, 71, 9, 2, 4, 72, 73, 74, 58, 75, 76, 29, 77, 78, 52, 79, 21, 65, 80, 29, 1, 2, 58, 81, 31, 82, 83, 52, 22, 84, 85, 31, 1, 2, 3, 86, 87, 88, 4, 89, 31, 7, 90, 91]]


# Task 2
Encode the target variable

In [13]:
encoded_labels = list()

In [14]:
for label in labels.split('\n'):
    
    encoded_labels.append(1 if label == 'positive' else 0)
    
encoded_labels[:10]

[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

# Task 3  
Delete outliers

In [15]:
# Save copies
orig_reviews_ints = reviews_ints.copy()
orig_encoded_labels = encoded_labels.copy()

In [16]:
f = lambda x: len(x)
lengths = list(map(f, reviews_ints))
lengths[:2]

[140, 114]

In [17]:
print('Mean length:', np.mean(lengths))

Mean length: 240.79820807167712


Delete all reviews that are shorter than 1% percentile, and longer than 99% percentile

In [18]:
perc_1 = np.percentile(lengths, 1)
perc_99 = np.percentile(lengths, 99)
print('1% percentile:', perc_1)
print('99% percentile:', perc_99)

1% percentile: 43.0
99% percentile: 935.0


In [19]:
print('Number of reviews before removing outliers: ', len(reviews_ints))
for i, review in enumerate(reviews_ints):
    
    if len(review) > perc_99 or len(review) < perc_1:
        del reviews_ints[i]
        del encoded_labels[i]
        
print('Number of reviews after removing outliers: ', len(reviews_ints))
print('Mean length:', np.mean(list(map(f, reviews_ints))))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  24532
Mean length: 234.63027881950106


# Task 4  
Padding and truncate

In [20]:
seq_length = 200

In [21]:
def left_pad(review, seq_length, token):
    
    pad = [token for i in range(seq_length - len(review))]
    # Add list of [token, ... , token] in the beginning
    review[:0] = pad
    
    return review
    
    
def pad_features(reviews_ints, seq_length, token):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    features = list()
    
    for review in reviews_ints:
        
        if len(review) < seq_length:
            review = left_pad(review, seq_length, token)
            
        else:
            review = review[:seq_length]
            
        features.append(review)
            
    return features

In [22]:
features = pad_features(reviews_ints, seq_length=seq_length, token = vocab_to_int['\n'])

assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

print(features[:10])

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 13, 21, 22, 23, 24, 10, 25, 26, 27, 28, 29, 30, 31, 1, 2, 32, 33, 3, 34, 35, 29, 36, 37, 3, 21, 10, 38, 29, 39, 40, 10, 41, 42, 43, 44, 45, 46, 47, 48, 49, 21, 50, 10, 51, 52, 10, 53, 54, 55, 56, 28, 52, 10, 57, 58, 59, 60, 48, 42, 61, 58, 62, 10, 63, 24, 64, 4, 65, 66, 67, 29, 68, 69, 10, 18, 58, 70, 71, 9, 2, 4, 72, 73, 74, 58, 75, 76, 29, 77, 78, 52, 79, 21, 65, 80, 29, 1, 2, 58, 81, 31, 82, 83, 52, 22, 84, 85, 31, 1, 2, 3, 86, 87, 88, 4, 89, 31, 7, 90, 91], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 92, 52, 4, 93, 43, 94,

In [23]:
features = np.array(features)
encoded_labels = np.array(encoded_labels)
print(f'Data shape: {features.shape}\nTarget shape: {encoded_labels.shape}')

Data shape: (24532, 200)
Target shape: (24532,)


# Task 5

In [24]:
split_frac = 0.8

train_len = int(features.shape[0] * split_frac)
valid_len = (features.shape[0] - train_len) // 2
test_len  = features.shape[0] - train_len - valid_len

train_features = features[:train_len]
train_labels = encoded_labels[:train_len]

valid_features = features[train_len : train_len + valid_len]
valid_labels = encoded_labels[train_len : train_len + valid_len]

test_features = features[train_len + valid_len :]
test_labels = encoded_labels[train_len + valid_len :] 

print(f'Training set shape: {train_features.shape}\nValidation set shape: {valid_features.shape}\nTest set shape: {test_features.shape}')

Training set shape: (19625, 200)
Validation set shape: (2453, 200)
Test set shape: (2454, 200)


In [25]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_features), torch.from_numpy(train_labels))
valid_data = TensorDataset(torch.from_numpy(valid_features), torch.from_numpy(valid_labels))
test_data = TensorDataset(torch.from_numpy(test_features), torch.from_numpy(test_labels))

# dataloaders
batch_size = 50

# Shuffle data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [26]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[  342,  4256,    94,  ...,    13,  1911, 20511],
        [    0,     0,     0,  ...,   696, 34770,   544],
        [  282,   524,   449,  ..., 54344,     3,  2936],
        ...,
        [    0,     0,     0,  ..., 39083,    32,   835],
        [    0,     0,     0,  ...,    62, 12759,   540],
        [    0,     0,     0,  ...,   944,    78,  3511]], dtype=torch.int32)

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
        0, 0], dtype=torch.int32)


In [29]:
gpu_use = torch.cuda.is_available()

if gpu_use is True:
    print(f'Using GPU ({torch.cuda.get_device_name()})')
else:
    print('Using CPU')

Using GPU GeForce MX250


In [None]:
import torch.nn as nn
from models import SentimentRNN