# Customer Sentiment Reviews on HP Products

In [None]:
import nltk
import re
from nltk.stem.porter import *
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def simplify(doc):
    """Normalize a raw document for word-level sentiment scoring.

    Removes every character that is not an ASCII letter or ASCII whitespace,
    lower-cases and strips the result, tokenizes it, and drops English
    stopwords.

    Args:
        doc: Raw text of a single review.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing them positionally only limited how many
    # characters were removed and never set any flag.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A).lower().strip()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    # Load the stopword list once per call (not once per token) and keep it
    # in a set so each membership test is O(1).
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
def get_words(sentence):
    """Return the list of Porter-stemmed words of *sentence*.

    The sentence is cleaned with ``simplify`` and split on whitespace, and
    every resulting word is reduced to its stem.
    """
    porter = PorterStemmer()  # Stemming maps morphological variants of a word to a common root
    stems = []
    for token in simplify(sentence).split():
        stems.append(porter.stem(token))
    return stems

## Training

In [None]:
# Training data: open the CSV for reading and skip past its header row
train = open('training_set.csv', mode='r')
train.readline()
# Dictionary that stores the sentiment weight of all words
word_sentiment = dict()

In [None]:
for data in train:
    if not data.strip():
        continue  # Skip blank lines (e.g. a trailing empty line at the end of the file)
    # Each row is "<sentiment>,<review text>". Split on the first comma only,
    # so commas inside the review text no longer break the unpacking.
    sentiment, line = data.split(',', 1)
    words = get_words(line)
    if not words:
        continue  # Nothing left after cleaning; there is no word to update
    # +1 for a positive review (sentiment == 1), -1 otherwise; computed once per row
    delta = 1 if int(sentiment) == 1 else -1
    for word in words:
        # Each entry is [net sentiment weight, number of occurrences]; the
        # occurrence count is used later to compute the weighted sum.
        entry = word_sentiment.setdefault(word, [0, 0])
        entry[0] += delta
        entry[1] += 1

In [None]:
# Weighted sentiment of each word: its net weight divided by the number of
# occurrences of the word in the training data
word_weighted_sentiment = {
    word: weight / count
    for word, (weight, count) in word_sentiment.items()
}

In [None]:
train.close() # Release the training file handle now that all rows have been read

## Testing

In [None]:
# Testing data: open the CSV for reading and skip past its header row
test = open('test_set.csv', mode='r')
test.readline()
# Output file (testing data with predictions); append mode keeps any
# content already present in the file
output = open('prediction_file.csv', mode='a')

In [None]:
for data in test:
    # Drop the line terminator so that writing the row back does not leave a
    # blank line after every record in the output file.
    review = data.rstrip('\n')
    # Sum of the weighted sentiments of the words in the review. Words that
    # appear in the testing data but not in the training data contribute 0.
    sentiment = sum(word_weighted_sentiment.get(word, 0) for word in get_words(review))
    # A total of exactly 0 is classified as positive
    label = '1' if sentiment >= 0 else '0'
    output.write(label + ',' + review + '\n')

In [None]:
# Close both files; closing the output file flushes any buffered predictions to disk
test.close()
output.close()

The predictions are written to `prediction_file.csv`: each line holds the predicted label (`1` for positive, `0` for negative), a comma, and the original review text.