## Data Preparation

In [2]:
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
## Define path to the data
path = "/home/isaac/UdacityDL/SentiAna/Trask/"

In [4]:
with open(path+"reviews.txt", "r+") as file:
    reviews = file.readlines() 
    ##use readlines as it will be separated by \n; read will read the whole thing as one big chunk of array
    file.close()

with open(path+"labels.txt", "r+") as file:
    labels = file.readlines()
    file.close()

In [5]:
reviews[0]
## Notice \n at the end of each review[i]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \n'

In [6]:
labels[0]
## Notice \n at the end of each label[i]

'positive\n'

In [7]:
reviews_clean = list(map(lambda x: x[:-1], reviews))
reviews_clean[0][:200]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  '

In [8]:
labels_clean = list(map(lambda x: x[:-1], labels))
labels_clean[0]

'positive'

## Quick Theory Validation

In [9]:
my_rand = np.random.randint(0,len(reviews_clean),10)
my_rand

array([ 7651,  6937,  1758,  6160,  3795,  2629, 13605, 14809,  2278, 22792])

In [10]:
for num in my_rand:
    print(labels_clean[num]+" : "+reviews_clean[num][:100]+"\n")
    print("-"*110)

negative : a demented scientist girlfriend is decapitated so he brings her head back to life . honest this is t

--------------------------------------------------------------------------------------------------------------
negative : be warned . this movie is such a mess . it  s a catastrophe . don  t waste your time with this one .

--------------------------------------------------------------------------------------------------------------
positive : good show  very entertaining . good marshal arts acting . good story plot . the entire main crew did

--------------------------------------------------------------------------------------------------------------
positive : horror omnibus films were popular in the seventies . i  m not very fond of them myself  but this one

--------------------------------------------------------------------------------------------------------------
negative : blows my mind how this movie got made . i watched it while i worked at home writing emails an

In [11]:
## Initialization
positive_count = Counter()
negative_count = Counter()
total_count = Counter()

In [12]:
for i in range(len(reviews)):
    if(labels_clean[i] == "positive"):
        for word in reviews[i].split(" "):
            positive_count[word] += 1
            total_count[word] += 1
    else:
        for word in reviews[i].split(" "):
            negative_count[word] += 1
            total_count[word] += 1 

In [13]:
positive_count.most_common()

[('', 537968),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937),
 ('but', 20822),
 ('movie', 19074),
 ('his', 17227),
 ('on', 17008),
 ('you', 16681),
 ('he', 16282),
 ('are', 14807),
 ('not', 14272),
 ('t', 13720),
 ('one', 13655),
 ('have', 12587),
 ('\n', 12500),
 ('be', 12416),
 ('by', 11997),
 ('all', 11942),
 ('who', 11464),
 ('an', 11294),
 ('at', 11234),
 ('from', 10767),
 ('her', 10474),
 ('they', 9895),
 ('has', 9186),
 ('so', 9154),
 ('like', 9038),
 ('about', 8313),
 ('very', 8305),
 ('out', 8134),
 ('there', 8057),
 ('she', 7779),
 ('what', 7737),
 ('or', 7732),
 ('good', 7720),
 ('more', 7521),
 ('when', 7456),
 ('some', 7441),
 ('if', 7285),
 ('just', 7152),
 ('can', 7001),
 ('story', 6780),
 ('time', 6515),
 ('

In [14]:
negative_count.most_common()

[('', 548962),
 ('.', 167538),
 ('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327),
 ('i', 46880),
 ('in', 43753),
 ('this', 40920),
 ('that', 37615),
 ('s', 31546),
 ('was', 26291),
 ('movie', 24965),
 ('for', 21927),
 ('but', 21781),
 ('with', 20878),
 ('as', 20625),
 ('t', 20361),
 ('film', 19218),
 ('you', 17549),
 ('on', 17192),
 ('not', 16354),
 ('have', 15144),
 ('are', 14623),
 ('be', 14541),
 ('he', 13856),
 ('one', 13134),
 ('they', 13011),
 ('\n', 12500),
 ('at', 12279),
 ('his', 12147),
 ('all', 12036),
 ('so', 11463),
 ('like', 11238),
 ('there', 10775),
 ('just', 10619),
 ('by', 10549),
 ('or', 10272),
 ('an', 10266),
 ('who', 9969),
 ('from', 9731),
 ('if', 9518),
 ('about', 9061),
 ('out', 8979),
 ('what', 8422),
 ('some', 8306),
 ('no', 8143),
 ('her', 7947),
 ('even', 7687),
 ('can', 7653),
 ('has', 7604),
 ('good', 7423),
 ('bad', 7401),
 ('would', 7036),
 ('up', 6970),
 ('only', 6781),
 ('m

In [15]:
pos_neg_ratios = Counter()

for term, count in list(total_count.most_common()):
    if(count > 100):
        pos_neg_ratio = positive_count[term] / (negative_count[term] + 1)
        pos_neg_ratios[term] = pos_neg_ratio

for word, ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = - np.log(1/(ratio+0.01))

In [16]:
pos_neg_ratios.most_common()[:30]

[('edie', 4.6913478822291435),
 ('paulie', 4.0775374439057197),
 ('felix', 3.1527360223636558),
 ('polanski', 2.8233610476132043),
 ('matthau', 2.8067217286092401),
 ('victoria', 2.6810215287142909),
 ('mildred', 2.6026896854443837),
 ('gandhi', 2.5389738710582761),
 ('flawless', 2.451005098112319),
 ('superbly', 2.2600254785752498),
 ('perfection', 2.1594842493533721),
 ('astaire', 2.1400661634962708),
 ('captures', 2.0386195471595809),
 ('voight', 2.0301704926730531),
 ('wonderfully', 2.0218960560332353),
 ('powell', 1.9783454248084671),
 ('brosnan', 1.9547990964725592),
 ('lily', 1.9203768470501485),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('refreshing', 1.8551812956655511),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('lemmon', 1.8458266904983307),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('andrews', 1.7764919970972666),
 ('homer', 1.7692866133759964),
 ('beautifully', 1.7626953362841438),
 ('socc

In [17]:
list(reversed(pos_neg_ratios.most_common()))[:30]

[('boll', -4.0778152602708904),
 ('uwe', -3.9218753018711578),
 ('seagal', -3.3202501058581921),
 ('unwatchable', -3.0269848170580955),
 ('stinker', -2.9876839403711624),
 ('mst', -2.7753833211707968),
 ('incoherent', -2.7641396677532537),
 ('unfunny', -2.5545257844967644),
 ('waste', -2.4907515123361046),
 ('blah', -2.4475792789485005),
 ('horrid', -2.3715779644809971),
 ('pointless', -2.3451073877136341),
 ('atrocious', -2.3187369339642556),
 ('redeeming', -2.2667790015910296),
 ('prom', -2.2601040980178784),
 ('drivel', -2.2476029585766928),
 ('lousy', -2.2118080125207054),
 ('worst', -2.1930856334332267),
 ('laughable', -2.172468615469592),
 ('awful', -2.1385076866397488),
 ('poorly', -2.1326133844207011),
 ('wasting', -2.1178155545614512),
 ('remotely', -2.111046881095167),
 ('existent', -2.0024805005437076),
 ('boredom', -1.9241486572738005),
 ('miserably', -1.9216610938019989),
 ('sucks', -1.9166645809588516),
 ('uninspired', -1.9131499212248517),
 ('lame', -1.9117232884159072),

## Convert Text into Numbers

In [18]:
vocab = set(total_count.keys())
vocab_size = len(vocab)
print(vocab_size)

74075


In [19]:
## Create empty vector
import numpy as np
layer_0 = np.zeros((1,vocab_size))
layer_0

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [20]:
# Create word to index
word2index = {}

for i, word in enumerate(vocab):
    word2index[word] = i
word2index

{'': 0,
 'rancor': 1,
 'tomato': 2,
 'kassar': 3,
 'hellman': 24401,
 'chambermaid': 4,
 'conquer': 5,
 'bhatti': 49373,
 'circulating': 9,
 'time': 10,
 'unyielding': 11,
 'recapitulates': 12,
 'arklie': 13,
 'treeline': 14,
 'uncomprehensible': 19,
 'rehashes': 17,
 'kridge': 18,
 'accepts': 20,
 'congregate': 57377,
 'panoply': 22,
 'boom': 23,
 'camelot': 24,
 'delarue': 25,
 'outflanking': 28,
 'maybee': 36802,
 'beauteous': 30,
 'tooled': 31,
 'avery': 32,
 'ghostwritten': 34,
 'cavities': 35,
 'molotov': 36,
 'ktla': 37,
 'permanently': 38,
 'listened': 40,
 'pauley': 42,
 'shiva': 71519,
 'priests': 44,
 'recession': 24408,
 'adoptee': 48,
 'warping': 50,
 'gingivitis': 6,
 'savant': 70069,
 'hodges': 51,
 'adalbert': 52,
 'kirshner': 53,
 'junior': 56,
 'cheesier': 57,
 'machinations': 36809,
 'seast': 58,
 'roman': 59,
 'fretful': 60,
 'strongbox': 61,
 'srbljanovic': 63,
 'confusion': 64,
 'posses': 65,
 'waaaaaaaaaaay': 70,
 'evaluate': 69,
 'atkins': 68158,
 'kitaparaporn'

In [21]:
def update_input_layer(review):
    global layer_0
    layer_0 *= 0
    for word in review.split(" "):
        layer_0[0][word2index[word]] += 1

update_input_layer(reviews_clean[0])
layer_0

array([[ 18.,   0.,   0., ...,   0.,   0.,   0.]])

In [22]:
def get_target_for_label(label):
    if(label == "positive"):
        return 1
    else:
        return 0
get_target_for_label(labels_clean[0])

1

## Build Neural Network

In [23]:
import time
import sys
import numpy as np

In [65]:
import time
import sys
import numpy as np

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
       
        # set our random number generator 
        np.random.seed(1)
    
        self.pre_process_data(reviews, labels)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
    
        
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] += 1
                
    def get_target_for_label(self,label):
        if(label == 'positive'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews, training_labels):
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # TODO: Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # TODO: Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # TODO: Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        
        # Input Layer
        self.update_input_layer(review.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] > 0.5):
            return "positive"
        else:
            return "negative"

In [49]:
mlp = SentimentNetwork(reviews_clean[:-1000], labels_clean[:-1000], learning_rate=0.1)

In [85]:
# evaluate our model before training (just to show how horrible it is)
mlp.test(reviews_clean[-1000:],labels_clean[-1000:])

Progress:99.9% Speed(reviews/sec):535.9% #Correct:500 #Tested:1000 Testing Accuracy:50.0%

In [60]:
mlp.train(reviews_clean[:-1000],labels_clean[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:1.87% Speed(reviews/sec):79.63 #Correct:225 #Trained:452 Training Accuracy:49.7%



Progress:10.4% Speed(reviews/sec):83.27 #Correct:1249 #Trained:2501 Training Accuracy:49.9%
Progress:14.2% Speed(reviews/sec):82.15 #Correct:1709 #Trained:3421 Training Accuracy:49.9%

KeyboardInterrupt: 

In [61]:
mlp = SentimentNetwork(reviews_clean[:-1000],labels_clean[:-1000], learning_rate=0.01)

In [62]:
# train the network
mlp.train(reviews_clean[:-1000],labels_clean[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):87.76 #Correct:1247 #Trained:2501 Training Accuracy:49.8%
Progress:20.8% Speed(reviews/sec):86.57 #Correct:2497 #Trained:5001 Training Accuracy:49.9%
Progress:21.1% Speed(reviews/sec):86.50 #Correct:2532 #Trained:5070 Training Accuracy:49.9%

KeyboardInterrupt: 

In [63]:
mlp = SentimentNetwork(reviews_clean[:-1000],labels_clean[:-1000], learning_rate=0.001)

In [64]:
# train the network
mlp.train(reviews_clean[:-1000],labels_clean[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):83.26 #Correct:1265 #Trained:2501 Training Accuracy:50.5%
Progress:20.8% Speed(reviews/sec):81.42 #Correct:2651 #Trained:5001 Training Accuracy:53.0%
Progress:23.0% Speed(reviews/sec):80.91 #Correct:2960 #Trained:5521 Training Accuracy:53.6%

KeyboardInterrupt: 

## By reducing the learning rate, the neural network is not improving the accuracy fast enough, so we need to change the structure to improve the model

### Analyze the signal vs. noise

## Understand Neural Noise

In [66]:
layer_0

array([[ 18.,   0.,   0., ...,   0.,   0.,   0.]])

In [70]:
list(vocab)[0]
# A noise here

''

In [71]:
reviews_clean[0].split(" ")
# A lot of empty, period

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 '.',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 '',
 'such',
 'as',
 '',
 'teachers',
 '',
 '.',
 'my',
 '',
 '',
 'years',
 'in',
 'the',
 'teaching',
 'profession',
 'lead',
 'me',
 'to',
 'believe',
 'that',
 'bromwell',
 'high',
 '',
 's',
 'satire',
 'is',
 'much',
 'closer',
 'to',
 'reality',
 'than',
 'is',
 '',
 'teachers',
 '',
 '.',
 'the',
 'scramble',
 'to',
 'survive',
 'financially',
 '',
 'the',
 'insightful',
 'students',
 'who',
 'can',
 'see',
 'right',
 'through',
 'their',
 'pathetic',
 'teachers',
 '',
 'pomp',
 '',
 'the',
 'pettiness',
 'of',
 'the',
 'whole',
 'situation',
 '',
 'all',
 'remind',
 'me',
 'of',
 'the',
 'schools',
 'i',
 'knew',
 'and',
 'their',
 'students',
 '.',
 'when',
 'i',
 'saw',
 'the',
 'episode',
 'in',
 'which',
 'a',
 'student',
 'repeatedly',
 'tried',
 'to',
 'burn',
 'down',
 'the',
 'school',
 '',

In [74]:
# Check how many period is in the review
review_counter = Counter()
for word in reviews_clean[0].split(" "):
    review_counter[word] += 1
review_counter.most_common()
# The result below shows the dominant word has nothing to do
# with the sentiment, the weighting has a dominant effect on the hidden layer
# the count weighs heavily on the noise

[('.', 27),
 ('', 18),
 ('the', 9),
 ('to', 6),
 ('high', 5),
 ('i', 5),
 ('that', 4),
 ('teachers', 4),
 ('a', 4),
 ('bromwell', 4),
 ('of', 4),
 ('is', 4),
 ('me', 2),
 ('at', 2),
 ('in', 2),
 ('student', 2),
 ('students', 2),
 ('school', 2),
 ('my', 2),
 ('as', 2),
 ('their', 2),
 ('it', 2),
 ('pathetic', 1),
 ('see', 1),
 ('inspector', 1),
 ('much', 1),
 ('fetched', 1),
 ('satire', 1),
 ('reality', 1),
 ('time', 1),
 ('lead', 1),
 ('same', 1),
 ('isn', 1),
 ('episode', 1),
 ('knew', 1),
 ('which', 1),
 ('immediately', 1),
 ('programs', 1),
 ('classic', 1),
 ('who', 1),
 ('cartoon', 1),
 ('think', 1),
 ('pity', 1),
 ('t', 1),
 ('expect', 1),
 ('other', 1),
 ('burn', 1),
 ('here', 1),
 ('believe', 1),
 ('ran', 1),
 ('and', 1),
 ('tried', 1),
 ('many', 1),
 ('profession', 1),
 ('your', 1),
 ('remind', 1),
 ('life', 1),
 ('welcome', 1),
 ('one', 1),
 ('when', 1),
 ('saw', 1),
 ('years', 1),
 ('pomp', 1),
 ('down', 1),
 ('some', 1),
 ('financially', 1),
 ('s', 1),
 ('far', 1),
 ('what',

## Reducing Noise in the Network

In [78]:
import time
import sys
import numpy as np

# Let's tweak our network from before to model these phenomena
class SentimentNetwork_2:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
       
        # set our random number generator 
        np.random.seed(1)
    
        self.pre_process_data(reviews, labels)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
    
        
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] = 1
        # While update layers, do not increment, change the counts to binary
        # eliminate neural noises
        
    def get_target_for_label(self,label):
        if(label == 'positive'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews, training_labels):
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer
            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)

            # Output layer
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # TODO: Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # TODO: Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # TODO: Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step

            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        
        # Input Layer
        self.update_input_layer(review.lower())

        # Hidden layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        # Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] > 0.5):
            return "positive"
        else:
            return "negative"

In [79]:
mlp_2 = SentimentNetwork_2(reviews_clean[:-1000], labels_clean[:-1000], learning_rate = 0.1)

In [82]:
mlp_2.train(reviews_clean[:-1000], labels_clean[:-1000])
#Significant improvement after removing noise
#But the training speed seems really slow a.k.a inefficient

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):85.14 #Correct:1811 #Trained:2501 Training Accuracy:72.4%
Progress:20.8% Speed(reviews/sec):83.53 #Correct:3799 #Trained:5001 Training Accuracy:75.9%
Progress:31.2% Speed(reviews/sec):82.87 #Correct:5884 #Trained:7501 Training Accuracy:78.4%
Progress:41.6% Speed(reviews/sec):86.69 #Correct:8029 #Trained:10001 Training Accuracy:80.2%
Progress:52.0% Speed(reviews/sec):91.05 #Correct:10164 #Trained:12501 Training Accuracy:81.3%
Progress:62.5% Speed(reviews/sec):94.44 #Correct:12284 #Trained:15001 Training Accuracy:81.8%
Progress:72.9% Speed(reviews/sec):98.31 #Correct:14414 #Trained:17501 Training Accuracy:82.3%
Progress:83.3% Speed(reviews/sec):101.3 #Correct:16589 #Trained:20001 Training Accuracy:82.9%
Progress:93.7% Speed(reviews/sec):103.9 #Correct:18775 #Trained:22501 Training Accuracy:83.4%
Progress:99.9% Speed(reviews/sec):105.2 #Correct:20091 #Trained:24000 Training 

In [84]:
mlp_2.test(reviews_clean[-1000:], labels_clean[-1000:])

Progress:99.9% Speed(reviews/sec):504.1% #Correct:854 #Tested:1000 Testing Accuracy:85.4%

## Inefficiency is caused by the large zeros input
1. Only care about the non-zero input calculation in the neural network
2. 1 multiplication is also wasting time

## Increase code efficiency

In [115]:
import time
import sys
import numpy as np

# Let's tweak our network from before to model these phenomena
class SentimentNetwork_3:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
       
        # set our random number generator 
        np.random.seed(1)
    
        self.pre_process_data(reviews, labels)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
        
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] = 1
        # While update layers, do not increment, change the counts to binary
        # eliminate neural noises
        
    def get_target_for_label(self,label):
        if(label == 'positive'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews_raw, training_labels):
        
        training_reviews = list()
        
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer
            # Can completely skip generating input layer
            # self.update_input_layer(review)

            # Hidden layer
            # layer_1 = self.layer_0.dot(self.weights_0_1)
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            
            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # TODO: Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # TODO: Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # TODO: Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            #self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate # update input-to-hidden weights with gradient descent step
            
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate
            
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        
        # Input Layer
        

        # Hidden layer
        self.layer_1 *= 0
        unique_indices = set()
        
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
            
        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] > 0.5):
            return "positive"
        else:
            return "negative"

In [116]:
mlp_3 = SentimentNetwork_3(reviews_clean[:-1000], labels_clean[:-1000], learning_rate=0.1)

In [117]:
mlp_3.train(reviews_clean[:-1000], labels_clean[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):767.0 #Correct:1819 #Trained:2501 Training Accuracy:72.7%
Progress:20.8% Speed(reviews/sec):566.4 #Correct:3800 #Trained:5001 Training Accuracy:75.9%
Progress:31.2% Speed(reviews/sec):557.6 #Correct:5882 #Trained:7501 Training Accuracy:78.4%
Progress:41.6% Speed(reviews/sec):543.2 #Correct:8018 #Trained:10001 Training Accuracy:80.1%
Progress:52.0% Speed(reviews/sec):529.9 #Correct:10162 #Trained:12501 Training Accuracy:81.2%
Progress:62.5% Speed(reviews/sec):506.8 #Correct:12298 #Trained:15001 Training Accuracy:81.9%
Progress:72.9% Speed(reviews/sec):498.1 #Correct:14420 #Trained:17501 Training Accuracy:82.3%
Progress:83.3% Speed(reviews/sec):492.0 #Correct:16606 #Trained:20001 Training Accuracy:83.0%
Progress:93.7% Speed(reviews/sec):484.7 #Correct:18796 #Trained:22501 Training Accuracy:83.5%
Progress:99.9% Speed(reviews/sec):477.8 #Correct:20120 #Trained:24000 Training 

In [120]:
mlp_3.test(reviews_clean[-1000:], labels_clean[-1000:])

Progress:99.9% Speed(reviews/sec):454.9% #Correct:500 #Tested:1000 Testing Accuracy:50.0%

In [111]:
import time
import sys

# Let's tweak our network from before to model these phenomena
class SentimentNetwork_3T:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
       
        np.random.seed(1)
    
        self.pre_process_data(reviews)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self,reviews):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self,label):
        if(label == 'positive'):
            return 1
        else:
            return 0
        
    def train(self, training_reviews_raw, training_labels):
        
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer

            # Hidden layer
#             layer_1 = self.layer_0.dot(self.weights_0_1)
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            
            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
        
    
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        
        # Input Layer


        # Hidden layer
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] > 0.5):
            return "positive"
        else:
            return "negative"

In [112]:
mlp_3_T = SentimentNetwork_3T(reviews_clean[:-1000], labels_clean[:-1000], learning_rate=0.1)

In [113]:
mlp_3_T.train(reviews_clean[:-1000], labels_clean[:-1000])

Progress:99.9% Speed(reviews/sec):765.3 #Correct:20120 #Trained:24000 Training Accuracy:83.8%

In [114]:
mlp_3_T.test(reviews_clean[-1000:], labels_clean[-1000:])

Progress:99.9% Speed(reviews/sec):991.1% #Correct:860 #Tested:1000 Testing Accuracy:86.0%