# Sentiment Analysis on Amazon-Cell Dataset using Naive Bayes

In [106]:
# Basic Libraries
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt

# Text Preprocessing
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import RegexpTokenizer

# Model, Visualization, and Evalutation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akomand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
tokenizer = ToktokTokenizer()

# Stop words
stopword_list = stopwords.words('english')

In [108]:
def load_data1(dataset_str):
    # Dataset of 400,000 samples
    if dataset_str == 'amazon':
        with open('data/test.ft.txt', 'r') as f:
            labels, reviews = [], []
            for line in f:
                content = line.split()
                labels.append(content[0])
                reviews.append(' '.join(content[1:]))
        f.close()
        
        data = pd.DataFrame()
        data['review'] = reviews
        data['sentiment'] = labels 
        
    # Dataset of 50,000 samples
    if dataset_str == 'imdb':
        data = pd.read_csv('./data/IMDB_Dataset.csv')
        
    
    encoder = LabelEncoder()
    data['sentiment'] = encoder.fit_transform(data['sentiment'])
    
    return data

def load_data():
    data = pd.read_csv('./data/amazon_cells_labelled.txt', sep='\t', header=None)
    data.columns = ['reviews', 'sentiment']
    
    return data

In [109]:
def split_data(data):
    X_train, X_test, Y_train, Y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=5)
    
    return X_train, X_test, Y_train, Y_test

## Text Preprocessing

In [110]:
# Preprocessing Functions
# functions for removing html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# removing square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

# removing noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

# remove special characters
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

# Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

# Removing stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [111]:
def preprocessing(data):
    data['review'] = data['review'].apply(denoise_text)
    data['review'] = data['review'].apply(remove_special_characters)
    data['review'] = data['review'].apply(simple_stemmer)
    data['review'] = data['review'].apply(remove_stopwords)
    
    return data

## Load Data

In [112]:
data = load_data1('amazon')
data = data[:10000]

In [113]:
data = preprocessing(data)

In [114]:
X_train, X_test, Y_train, Y_test = split_data(data)

In [115]:
# sen_arr = data['reviews'].to_numpy()
# sen_arr = [sen_arr[i].split() for i in range(len(sen_arr))]
# w2v = Word2Vec(sen_arr, min_count=1)

In [116]:
# print(w2v)

In [117]:
# words = list(w2v.wv.vocab)
# print(w2v['mic', 'major', 'good'])
# embedding = [w2v[sen_arr[i]].mean() for i in range(len(sen_arr))]

In [118]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv=CountVectorizer(stop_words='english', ngram_range=(1,2))
cv_train_reviews=cv.fit_transform(X_train)
cv_train_reviews = cv_train_reviews.toarray()
cv_test_reviews=cv.transform(X_test)
cv_test_reviews.shape

(2000, 246282)

## Model

In [119]:
model = MultinomialNB()
model.fit(cv_train_reviews, Y_train)

MultinomialNB()

In [120]:
prediction=model.predict(cv_test_reviews.toarray())

## Testing & Evaluation Metrics

In [121]:
accuracy_score(Y_test,prediction)

0.837

In [122]:
test = pd.Series(u'This product literally saved my life!')

In [123]:
test = cv.transform(test)

In [124]:
pred = model.predict(test)
pred

array([1])

In [125]:
report=classification_report(Y_test.to_numpy(),prediction,target_names=['Negative','Positive'])

In [126]:
print(report)

              precision    recall  f1-score   support

    Negative       0.84      0.84      0.84      1014
    Positive       0.84      0.83      0.83       986

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



## Visualization

In [127]:
#word cloud for positive review words
plt.figure(figsize=(10,10))
spam_text=X_train[1]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
spam_words=WC.generate(spam_text)
plt.imshow(spam_words,interpolation='bilinear')
plt.show

KeyError: 1

<Figure size 720x720 with 0 Axes>