# Sentiment Classification using Bag of Words

In [1]:
"""
This program is a BoW text classifier for sentiment analysis of the reviews.
It analyzes the reviews and classify them to three bins: positive, negative
and neutral.

Created by Ruoxi Jia.
"""

import pandas as pd
import numpy as np
from nltk import WordNetLemmatizer, word_tokenize
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import nltk

# Import the reviews.csv to dataframe
data = pd.read_csv('reviews.csv', sep=',')
data['RatingValue'].value_counts()

2    1465
1     297
0     158
Name: RatingValue, dtype: int64

## Pre-processing

In [2]:
# Random Drop Undersampling, since we have too many positive reviews
data_us = pd.concat([
    data.loc[data.RatingValue == 1],
    data.loc[data.RatingValue == 0],
    data.loc[data.RatingValue == 2].sample(frac=1/4),
]).reset_index().drop('index', axis=1)

data_us['RatingValue'].value_counts()

# Split the dataset into training and valid
training, valid = train_test_split(data_us, test_size=0.2, random_state=42)

# Export the training and valid dataset into csv file
training.to_csv("training.csv")
valid.to_csv("valid.csv")

# Load the training data
train = pd.read_csv("training.csv")

## Build model

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# create a tokenizer with lemmatizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        # Remove the stop words and lemmatize the rest.
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles) if
                t not in nltk.corpus.stopwords.words('english')]


# Build a SGD pipeline
text_clf = Pipeline([
    # Tokenize the text with ngram 1-3 and removing stopwords
    ('vect', CountVectorizer(tokenizer=LemmaTokenizer(),
                             ngram_range=(1, 3),
                             stop_words='english')),
    # TF-IDF transform
    ('tfidf', TfidfTransformer()),
    # SGD Classification model
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\collin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\collin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\collin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Train the model
text_clf.fit(train['Review'], train['RatingValue'])

# Import the validation data
validation = pd.read_csv("valid.csv")

# Implement the model
predicted = text_clf.predict(validation['Review'])

# calculate the accuracy
accuracy = round(np.mean(predicted == validation['RatingValue']), 2)
# calculate the F1 score
F1_score = round(metrics.f1_score(validation.RatingValue, predicted, average='weighted'), 2)
# print out the result
print("accuracy:", accuracy, "\n")
print("F1_score:", F1_score, "\n")
print("Confusion_matrix:")
# generate the confusion matrix
conf = pd.DataFrame(metrics.confusion_matrix(validation.RatingValue, predicted),
                    index=['negative', 'neutral', 'positive'],
                    columns=['negative', 'neutral', 'positive'])
print(conf)




accuracy: 0.62 

F1_score: 0.59 

Confusion_matrix:
          negative  neutral  positive
negative         8       19        11
neutral          2       36        19
positive         0       11        59
