## URLs:
GitHub: [https://github.com/gabrielloye/LSTM_Sentiment-Analysis/blob/master/main.ipynb](https://github.com/gabrielloye/LSTM_Sentiment-Analysis/blob/master/main.ipynb)

Article: [https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/](https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/)

Data: [https://www.kaggle.com/bittlingmayer/amazonreviews](https://www.kaggle.com/bittlingmayer/amazonreviews)

In [1]:
# Script settings.
SMALLER_SAMPLE = False

In [2]:
import bz2
from collections import Counter
import re
import json
import nltk
import numpy as np
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ess/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Prep

In [3]:
train_file = bz2.BZ2File('./data/train.ft.txt.bz2')
test_file = bz2.BZ2File('./data/test.ft.txt.bz2')

In [4]:
train_file = train_file.readlines()
test_file = test_file.readlines()

In [5]:
# Load files.

print("Number of training reivews: " + str(len(train_file)))
print("Number of test reviews: " + str(len(test_file)))

Number of training reivews: 3600000
Number of test reviews: 400000


In [6]:
# Subset, parse from bytes.

if SMALLER_SAMPLE:
    num_train = 9600
    num_test = 2400
else:
    num_train = 800000  # We're training on the first 800,000 reviews in the dataset
    num_test = 200000  # Using 200,000 reviews from test set

train_file = [x.decode('utf-8') for x in train_file[:num_train]]
test_file = [x.decode('utf-8') for x in test_file[:num_test]]

In [7]:
print(train_file[0])

__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^



In [8]:
# -- Cleaning. --


# Extracting labels from sentences:

train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file]

    
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file]


# Some simple cleaning of data:

for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])

for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])


# Modify URLs to <url>:

for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

In [9]:
# Show, post clean and label extract.

print(train_labels[:1])
print(train_sentences[:1])

print(test_labels[:1])
print(test_sentences[:1])

[1]
['stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^']
[1]
['great cd: my lovely pat has one of the great voices of her generation. i have listened to this cd for years and i still love it. when i\'m in a good mood it makes me feel better. a bad mood just evaporates like sugar in the rain. this cd just oozes life. vocals are jusat stuunning and lyrics just kill. one of life\'s hidden gems. this is a desert isle cd in my book. why she never made it big is just beyond me. everytime i play this, no matter black, white, young, old, male, female everybody says one thing "who was that singing ?"']


In [10]:
# Remove file objects.
del train_file, test_file

In [11]:
# Extract vocabulary from training set.

words = Counter()  # Dictionary that will map a word to the number of times it appeared in all the training sentences

for i, sentence in enumerate(train_sentences):
    # The sentences will be stored as a list of words/tokens
    train_sentences[i] = []
    for word in nltk.word_tokenize(sentence):  # Tokenizing the words
        words.update([word.lower()])  # Converting all the words to lower case
        train_sentences[i].append(word)
    if i % 100000 == 0:
        print(str((i*100)/num_train) + "% done")

print("100% done")

0.0% done
12.5% done
25.0% done
37.5% done
50.0% done
62.5% done
75.0% done
87.5% done
100% done


In [12]:
# -- Construct vocabulary. --

# Removing the words that only appear once
words = {k:v for k,v in words.items() if v>1}
print("\nwords counter dict:\n")
print(list(words.items())[:5])

# Sorting the words according to the number of appearances, with the most common word being first
words_ = sorted(words, key=words.get, reverse=True)
# Adding padding and unknown to our vocabulary so that they will be assigned an index
words_ = ['_PAD','_UNK'] + words_
print("\nwords_ sorted list:\n")
print(words_[:5])

# Dictionaries to store the word to index mappings and vice versa.
# Note: '_PAD' --> 0, '_UNK' --> 1.
word2idx = {o:i for i,o in enumerate(words_)}
idx2word = {i:o for i,o in enumerate(words_)}

print("\nword2idx:\n")
print(list(word2idx.items())[:5])
print("\nidx2word:\n")
print(list(idx2word.items())[:5])

if SMALLER_SAMPLE:
    word2idx_fname = "./data/word2idx_small.json"
    idx2word_fname = "./data/idx2word_small.json"
else:
    word2idx_fname = "./data/word2idx.json"
    idx2word_fname = "./data/idx2word.json"

with open(word2idx_fname, 'w') as f:
    json.dump(word2idx, f)
with open(idx2word_fname, 'w') as f:
    json.dump(idx2word, f)


words counter dict:

[('stuning', 9), ('even', 113265), ('for', 669566), ('the', 3223249), ('non-gamer', 4)]

words_ sorted list:

['_PAD', '_UNK', '.', 'the', ',']

word2idx:

[('_PAD', 0), ('_UNK', 1), ('.', 2), ('the', 3), (',', 4)]

idx2word:

[(0, '_PAD'), (1, '_UNK'), (2, '.'), (3, 'the'), (4, ',')]


In [13]:
# Convert words to indices.

for i, sentence in enumerate(train_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    train_sentences[i] = [word2idx[word] if word in word2idx else 1 for word in sentence]  # Note: line with correction!

for i, sentence in enumerate(test_sentences):
    # For test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else 0 for word in nltk.word_tokenize(sentence)]

In [14]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [15]:
# Apply padding.

seq_len = 200  # The length that the sentences will be padded/shortened to

train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

In [16]:
# Converting our labels into numpy arrays.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [17]:
# Preview padded:
print("test_sentences[0]:")
print(test_sentences[0])
print("test_labels[0]:")
print(test_labels[0])

# Shapes:
print("\n train_sentences: {}\n train_labels: {}\n test_sentences: {}\n test_labels: {}".format(train_sentences.shape, train_labels.shape, test_sentences.shape, test_labels.shape))

test_sentences[0]:
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0    40    99    13    28  1445  4274    58    31    10     3
    40  1778    10    85  1727     2     5    27   904     8    11    99
    16   152     6     5   140    89     9     2    68     5   122    14
     7    42  1845     9   210    59   243   109     2     7   134  1845
    47 29399    38  2640    14     3  2378     2    11    99    47 18877
   160     2   932    30     0     0     6   557    47  1282     2    31
    10   160    21  2334  4156     2    11    12     7  3564 15134    99
    14    28    24     2   182  

In [18]:
# Split the test set into a validation and test sets.

split_frac = 0.5

split_id = int(split_frac * len(test_sentences))

val_sentences, test_sentences = test_sentences[:split_id], test_sentences[split_id:]
val_labels, test_labels = test_labels[:split_id], test_labels[split_id:]

In [19]:
# Save as numpy array files for reuse.

if SMALLER_SAMPLE:
    filename = "./data/processed_small.npz"
else:
    filename = "./data/processed.npz"

np.savez(filename, 
         train_sentences=train_sentences, train_labels=train_labels, 
         test_sentences=test_sentences, test_labels=test_labels,
         val_sentences=val_sentences, val_labels=val_labels)

## Sense check 

In [21]:
if not SMALLER_SAMPLE:
    filename = "./data/processed.npz"
    filename_check = "./repo_files/processed.npz"

    npzfile = np.load(filename)
    npzfile_check = np.load(filename_check)
    
    for name in npzfile.files:
        a = npzfile[name]
        b = npzfile_check[name]
        print("Checking equality of matrix {}: {}".format(name, np.all(a == b)))

Checking equality of matrix train_sentences: True
Checking equality of matrix train_labels: True
Checking equality of matrix test_sentences: True
Checking equality of matrix test_labels: True
Checking equality of matrix val_sentences: True
Checking equality of matrix val_labels: True
