In [1]:
import project1 as p1
import utils
import numpy as np

### 1. Load and Explore data


In [2]:
# Load the train / validation / test splits, in that order.
train_data, val_data, test_data = [
    utils.load_data(path)
    for path in ('reviews_train.tsv', 'reviews_val.tsv', 'reviews_test.tsv')
]

# Each sample is indexed by 'text' and 'sentiment'; split every dataset into
# two parallel tuples: the review texts and their sentiment labels.
train_texts, train_labels = zip(*[(row['text'], row['sentiment']) for row in train_data])
val_texts, val_labels = zip(*[(row['text'], row['sentiment']) for row in val_data])
test_texts, test_labels = zip(*[(row['text'], row['sentiment']) for row in test_data])

In [13]:
# Take a look at the data:
# print the first 5 training reviews together with their labels.
for i in range(5):
    # The extra '\n' argument joined with sep='\n' leaves blank lines between samples.
    print(f'SAMPLE {i+1}:\nreview: {train_texts[i]},\nlabel: {train_labels[i]}', '\n', sep='\n')

SAMPLE 1:
review: The chips are okay Not near as flavorful as the regular blue chips. Nice size bag for a family.,
label: -1


SAMPLE 2:
review: I had high hopes for this, but it was bad.  Really bad.  The whole pan of cupcakes made from this had to be thrown out.  Very gritty and dense.,
label: -1


SAMPLE 3:
review: I guess it's only one can since there is nothing in the description about how many cans you get. I guess we can't all be intelligent.,
label: -1


SAMPLE 4:
review: "Oatmeal Squares" is in about the largest print you can fit on the Front of the Box.  When you read the ingredients, the second largest ingredient is WHEAT flour. So how can it rightfully be called a "Crunchy Oat Cereal".<br /><br />I wonder why Quaker, which is noted for Oatmeal has to be somewhat deceptive and not disclose upfront that this is not completely an Oat Cereal and then go on to make claims about Cholesterol Lowering when it is not ALL Oat Flour.<br /><br />Oatmeal is made from oats not wheat.  Th

### 2. Transform data into the form suitable for perceptron algorithm

For more information on how `bag_of_words` and `extract_bow_feature_vectors` work, see project1.py

In [13]:
# Build the word -> index dictionary from the training reviews only.
# NOTE(review): remove_stopword=True presumably drops stop words from the vocabulary — confirm in project1.
dictionary = p1.bag_of_words(train_texts,remove_stopword=True)


In [15]:
# `dictionary` maps each word appearing in the corpus of text to a unique integer
# index. Here, the corpus of text is train_texts.
print('LENGTH OF THE DICT: ', len(dictionary))

# Print the first 5 items in the dictionary (in the dict's iteration order).
print('\n')
print('FIRST 5 ITEMS')
for count, (word, idx) in enumerate(dictionary.items()):
    if count == 5:
        break
    print("Word: {}, Idx: {}".format(word, idx))


LENGTH OF THE DICT:  13108


FIRST 5 ITEMS
Word: chips, Idx: 0
Word: okay, Idx: 1
Word: near, Idx: 2
Word: flavorful, Idx: 3
Word: regular, Idx: 4


Use the dictionary to convert each sample in train_texts into a 1-D array whose length equals the size of the dictionary (13108).<br>
`extract_bow_feature_vectors` provides 2 modes: <br>
(a) `binarize=True`: the i-th entry of a train_bow_features sample is:
* 1 if the train sentence contains the i-th word of the dictionary;
* 0 otherwise <br>
(b) `binarize=False`: the i-th entry of a train_bow_features sample is the number of times the corresponding word occurs in the train sentence

In [32]:
# MODE 1: binarize=False
# Build the bag-of-words features for the train / validation / test texts,
# all against the same training dictionary.
train_bow_features, val_bow_features, test_bow_features = [
    p1.extract_bow_feature_vectors(texts, dictionary, binarize=False)
    for texts in (train_texts, val_texts, test_texts)
]

In [34]:
# MODE 2: binarize=True
# Build the bag-of-words features for the train / validation / test texts,
# all against the same training dictionary.
train_bow_features_bi, val_bow_features_bi, test_bow_features_bi = [
    p1.extract_bow_feature_vectors(texts, dictionary, binarize=True)
    for texts in (train_texts, val_texts, test_texts)
]

In [29]:
# Rows are samples, columns are dictionary entries.
n_samples, n_features = train_bow_features.shape[:2]
print('SHAPE OF TRAIN_BOW_FEATURES: ', train_bow_features.shape)
print('NUMBER OF SAMPLE: ', n_samples)
print('DIMENSION OF A TRAIN DATA: ', n_features)

SHAPE OF TRAIN_BOW_FEATURES:  (4000, 13108)
NUMBER OF SAMPLE:  4000
DIMENSION OF A TRAIN DATA:  13108


### 3. Use 3 different variations of the perceptron algorithm for training

We'll use 3 types of perceptron algorithm to classify sentiments.
1. Vanilla perceptron algorithm
2. Average perceptron algorithm
3. Pegasos perceptron algorithm

In [36]:
# Hyperparameters shared by the training runs.
# NOTE(review): T is passed to every classifier and L only to Pegasos; presumably
# the number of training passes and the regularization strength — confirm in project1.
T = 10
L = 0.01

print('MODE 1: binarize=False')

# Perceptron on the count features.
pct_train_accuracy, pct_val_accuracy = p1.classifier_accuracy(
    p1.perceptron,
    train_bow_features, val_bow_features,
    train_labels, val_labels,
    T=T,
)
print("{:35} {:.4f}".format("Training accuracy for perceptron:", pct_train_accuracy))
print("{:35} {:.4f}".format("Validation accuracy for perceptron:", pct_val_accuracy))

# Average perceptron on the count features.
avg_pct_train_accuracy, avg_pct_val_accuracy = p1.classifier_accuracy(
    p1.average_perceptron,
    train_bow_features, val_bow_features,
    train_labels, val_labels,
    T=T,
)
print("{:43} {:.4f}".format("Training accuracy for average perceptron:", avg_pct_train_accuracy))
print("{:43} {:.4f}".format("Validation accuracy for average perceptron:", avg_pct_val_accuracy))

# Pegasos on the count features (the only classifier that also takes L).
avg_peg_train_accuracy, avg_peg_val_accuracy = p1.classifier_accuracy(
    p1.pegasos,
    train_bow_features, val_bow_features,
    train_labels, val_labels,
    T=T, L=L,
)
print("{:50} {:.4f}".format("Training accuracy for Pegasos:", avg_peg_train_accuracy))
print("{:50} {:.4f}".format("Validation accuracy for Pegasos:", avg_peg_val_accuracy))

MODE 1: binarize=False
Training accuracy for perceptron:   0.7953
Validation accuracy for perceptron: 0.6900
Training accuracy for average perceptron:   0.9030
Validation accuracy for average perceptron: 0.7520
Training accuracy for Pegasos:                     0.8592
Validation accuracy for Pegasos:                   0.7480


In [35]:
# Same three classifiers, now trained on the binarized features (MODE 2).
# The banner previously said "MODE 1", which duplicated the count-mode label.
# NOTE: the accuracy variables below reuse the names from the count-mode run,
# so running this cell overwrites those earlier results.
print('MODE 2: BINARIZE=True')

# Perceptron on the binarized features.
pct_train_accuracy, pct_val_accuracy = p1.classifier_accuracy(
    p1.perceptron,
    train_bow_features_bi, val_bow_features_bi,
    train_labels, val_labels,
    T=T,
)
print("{:35} {:.4f}".format("Training accuracy for perceptron:", pct_train_accuracy))
print("{:35} {:.4f}".format("Validation accuracy for perceptron:", pct_val_accuracy))

# Average perceptron on the binarized features.
avg_pct_train_accuracy, avg_pct_val_accuracy = p1.classifier_accuracy(
    p1.average_perceptron,
    train_bow_features_bi, val_bow_features_bi,
    train_labels, val_labels,
    T=T,
)
print("{:43} {:.4f}".format("Training accuracy for average perceptron:", avg_pct_train_accuracy))
print("{:43} {:.4f}".format("Validation accuracy for average perceptron:", avg_pct_val_accuracy))

# Pegasos on the binarized features (the only classifier that also takes L).
avg_peg_train_accuracy, avg_peg_val_accuracy = p1.classifier_accuracy(
    p1.pegasos,
    train_bow_features_bi, val_bow_features_bi,
    train_labels, val_labels,
    T=T, L=L,
)
print("{:50} {:.4f}".format("Training accuracy for Pegasos:", avg_peg_train_accuracy))
print("{:50} {:.4f}".format("Validation accuracy for Pegasos:", avg_peg_val_accuracy))

MODE 1: BINARIZE=True
Training accuracy for perceptron:   0.9832
Validation accuracy for perceptron: 0.7600
Training accuracy for average perceptron:   0.9868
Validation accuracy for average perceptron: 0.7760
Training accuracy for Pegasos:                     0.9123
Validation accuracy for Pegasos:                   0.7860


### 4. Testing the performance of the model with test data

In [37]:
# Retrain Pegasos on the binarized training features: this setup has the highest
# validation accuracy (0.7860) among the runs recorded above.
# NOTE(review): best_theta is later indexed as best_theta[0] / best_theta[1] —
# presumably (theta, theta_0); confirm against project1.pegasos.
best_theta=p1.pegasos(train_bow_features_bi,train_labels,T=T,L=L)

In [41]:
# Classify the binarized *test* features with the trained Pegasos parameters.
sent_pred = p1.classify(test_bow_features_bi, best_theta[0], best_theta[1])
# The label previously said "TRAINING DATA", but these predictions are for the test set.
print('SHAPE OF PREDICTION ON TEST DATA: ', sent_pred.shape)

SHAPE OF PREDICTION ON TRAINING DATA:  (500,)


In [47]:
# Show five test reviews (samples 6 to 10) with their predicted and real labels.
for i in range(5, 10):
    report = (
        (f'SAMPLE {i+1}',),
        ('Review: ', test_texts[i]),
        ('Predicted sentiment: ', sent_pred[i]),
        ('Real sentiment: ', test_labels[i]),
        ('\n',),
    )
    # One print call per row, exactly as if each row were printed by hand.
    for fields in report:
        print(*fields)

SAMPLE 6
Review:  After doing numerous googles looking for Chipsters, one of my favorite childhood snacks, some reported that this product was very close to them. I disagree. About the closest flavor to the Chipsters so far would be Munchos, but still not the same. If you are conscious of diet, these will probably due.
Predicted sentiment:  -1
Real sentiment:  -1


SAMPLE 7
Review:  The texture of this chip is firm, solid, substantial, and minimally crispy. The taste is rich with favors of different seeds and grains. It seems that the inclusion of sesame seeds, in particular, makes this chip especially aromatic and pleasing to the taste.
Predicted sentiment:  -1
Real sentiment:  1


SAMPLE 8
Review:  I'm drinking oolong tea because Dr Oz said it would raise metabolism, the jury is still out.......really how would you know? I have lost weight but that's because I quit drinking coffee which I added too much cream to and started working out a lot more. It does not taste the all that great

### 5. Find the most explanatory words

In [49]:
# List the dictionary words ordered by their feature index.
wordlist = [word for _, word in sorted((idx, word) for word, idx in dictionary.items())]
# NOTE(review): the first ten entries are printed as the "most positive" words, so the
# result is presumably sorted from most positive to most negative weight — confirm in utils.
sorted_word_features = utils.most_explanatory_word(best_theta[0], wordlist)
print("Most Positive Explanatory Word Features")
print(sorted_word_features[:10])

Most Explanatory Word Features
['great', 'delicious', 'loves', '!', 'best', 'excellent', 'perfect', 'wonderful', 'favorite', 'tasty']


In [55]:
# The last ten entries of the ranking, printed from the very last one backwards.
print("Most Negative Explanatory Word Features")
print(sorted_word_features[:-11:-1])

Most Negative Explanatory Word Features
['disappointed', 'bad', 'however', 'ok', '?', 'disappointment', 'thought', 'unfortunately', 'stale', 'fine']
