# Sentiment Analysis Using Perceptron Model

In [None]:
import pandas as pd
from string import punctuation, digits
import numpy as np
import random

In [9]:
from sklearn.linear_model import Perceptron

## Loading the data

In [11]:
train_data=pd.read_csv("sentiment_analysis/reviews_train.tsv",sep="\t")
val_data = pd.read_csv('sentiment_analysis/reviews_val.tsv',sep="\t")
test_data = pd.read_csv('sentiment_analysis/reviews_test.tsv',sep="\t")

In [12]:
train_data

Unnamed: 0,sentiment,productId,userId,summary,text,helpfulY,helpfulN
0,-1,B000EQYQBO,A2JZVE0Y19VLL0,blue chips,The chips are okay Not near as flavorful as th...,0,0
1,-1,B000LKVHYC,A3NAKOMAS0I5L9,Bad even for 'healthy',"I had high hopes for this, but it was bad. Re...",0,0
2,-1,B003QRQRY2,ARBO3XW14MNGA,Alot of money for one can,I guess it's only one can since there is nothi...,1,1
3,-1,B008EG58V8,A1IQXGT4MJUYJ8,"The Box says ""OATMEAL SQUARES"" which I believe...","""Oatmeal Squares"" is in about the largest prin...",0,0
4,1,B004WZZY8M,A2TBL6WAZGXB9P,Delicious!,"I really enjoyed this flavor, this has a very ...",1,0
...,...,...,...,...,...,...,...
3995,-1,B0038B1DEU,A3MKRM2Q9F04UH,Great nutrititious drink for kids,I bought both the Berry and Chocolate drinks f...,4,2
3996,1,B0002CDZD0,A1ESH5GWEGT2ZX,SOOO GOOD FOR THE SKIN,"THIS LAVENDER IS SOOO GOOD.IT LOOKS,SMELLS,TAS...",1,0
3997,1,B006N3I2SK,A2S0YE8GUSX20A,Deep fabulous rich decaf coffee for the Keurig,Great great decaf. Made the Keurig worth it. ...,1,0
3998,1,B004TPUSU4,A16J5HGMGX5LWM,EXCELLENT American-made GF pasta!,Cooks up al-dente with great flavor. Doesn't f...,0,0


In [13]:
train_data.isnull().sum()

sentiment    0
productId    0
userId       0
summary      0
text         0
helpfulY     0
helpfulN     0
dtype: int64

In [14]:
train_texts=train_data["text"]
train_label=train_data["sentiment"]
val_texts=val_data["text"]
val_label=val_data["sentiment"]
test_texts=test_data["text"]
test_label=test_data["sentiment"]


In [15]:
train_texts.head()

0    The chips are okay Not near as flavorful as th...
1    I had high hopes for this, but it was bad.  Re...
2    I guess it's only one can since there is nothi...
3    "Oatmeal Squares" is in about the largest prin...
4    I really enjoyed this flavor, this has a very ...
Name: text, dtype: object

In [16]:
train_texts.shape

(4000,)

In [17]:
train_texts[0]

'The chips are okay Not near as flavorful as the regular blue chips. Nice size bag for a family.'

In [18]:
def extract_words(input_string):
    """
    Helper function for bag_of_words()
    Inputs a text string
    Returns a list of lowercase words in the string.
    Punctuation and digits are separated out into their own words.
    """
    for c in punctuation + digits:
        input_string = input_string.replace(c, ' ' + c + ' ')

    return input_string.lower().split()

In [19]:
def bag_of_words(texts):
    """
    Inputs a list of string reviews
    Returns a dictionary of unique unigrams occurring over the input
    """
    dictionary = {} # maps word to unique index
    for text in texts:
        word_list = extract_words(text)
        for word in word_list:
            if word not in dictionary:
                dictionary[word] = len(dictionary)
    return dictionary

In [20]:
def extract_bow_feature_vectors(reviews, dictionary):
    """
    Inputs a list of string reviews
    Inputs the dictionary of words as given by bag_of_words
    Returns the bag-of-words feature matrix representation of the data.
    The returned matrix is of shape (n, m), where n is the number of reviews
    and m the total number of entries in the dictionary.
    
    """

    num_reviews = len(reviews)
    feature_matrix = np.zeros([num_reviews, len(dictionary)])

    for i, text in enumerate(reviews):
        word_list = extract_words(text)
        for word in word_list:
            if word in dictionary:
                feature_matrix[i, dictionary[word]] = 1
    return feature_matrix

In [21]:
dictionary = bag_of_words(train_texts)
dictionary

{'the': 0,
 'chips': 1,
 'are': 2,
 'okay': 3,
 'not': 4,
 'near': 5,
 'as': 6,
 'flavorful': 7,
 'regular': 8,
 'blue': 9,
 '.': 10,
 'nice': 11,
 'size': 12,
 'bag': 13,
 'for': 14,
 'a': 15,
 'family': 16,
 'i': 17,
 'had': 18,
 'high': 19,
 'hopes': 20,
 'this': 21,
 ',': 22,
 'but': 23,
 'it': 24,
 'was': 25,
 'bad': 26,
 'really': 27,
 'whole': 28,
 'pan': 29,
 'of': 30,
 'cupcakes': 31,
 'made': 32,
 'from': 33,
 'to': 34,
 'be': 35,
 'thrown': 36,
 'out': 37,
 'very': 38,
 'gritty': 39,
 'and': 40,
 'dense': 41,
 'guess': 42,
 "'": 43,
 's': 44,
 'only': 45,
 'one': 46,
 'can': 47,
 'since': 48,
 'there': 49,
 'is': 50,
 'nothing': 51,
 'in': 52,
 'description': 53,
 'about': 54,
 'how': 55,
 'many': 56,
 'cans': 57,
 'you': 58,
 'get': 59,
 'we': 60,
 't': 61,
 'all': 62,
 'intelligent': 63,
 '"': 64,
 'oatmeal': 65,
 'squares': 66,
 'largest': 67,
 'print': 68,
 'fit': 69,
 'on': 70,
 'front': 71,
 'box': 72,
 'when': 73,
 'read': 74,
 'ingredients': 75,
 'second': 76,
 'ingr

In [22]:
train_feature_matrix =extract_bow_feature_vectors(train_texts, dictionary)
val_feature_matrix = extract_bow_feature_vectors(val_texts, dictionary)
test_feature_matrix = extract_bow_feature_vectors(test_texts, dictionary)

In [23]:
train_feature_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.]])

In [24]:
train_feature_matrix.shape

(4000, 13233)

In [25]:
len(dictionary)

13233

In [26]:
model=Perceptron()

In [27]:
model.fit(train_feature_matrix,train_label)

Perceptron()

In [28]:
val_predictions=model.predict(val_feature_matrix)

In [29]:
val_predictions[:10]

array([ 1, -1, -1, -1, -1, -1, -1, -1,  1, -1], dtype=int64)

In [30]:
correct=(val_label == val_predictions).sum()
incorrect=(val_label != val_predictions).sum()
total=len(val_predictions)

print(f"Results form model {type(model).__name__} on validation data")
print(f"Correct: {correct}")
print(f"Incorrect: {incorrect}")
print(f"Accuracy: {100*correct/total:.2f}%")

Results form model Perceptron on validation data
Correct: 394
Incorrect: 106
Accuracy: 78.80%


In [31]:
test_predictions=model.predict(test_feature_matrix)
test_predictions[:10]

array([ 1, -1,  1,  1, -1, -1,  1, -1, -1,  1], dtype=int64)

In [32]:
correct=(test_label == test_predictions).sum()
incorrect=(test_label != test_predictions).sum()
total=len(test_predictions)

print(f"Results form model {type(model).__name__} on test data")
print(f"Correct: {correct}")
print(f"Incorrect: {incorrect}")
print(f"Accuracy: {100*correct/total:.2f}%")

Results form model Perceptron on test data
Correct: 398
Incorrect: 102
Accuracy: 79.60%
