In [1]:
pip install textblob vaderSentiment tabulate


Note: you may need to restart the kernel to use updated packages.


In [2]:
## Data Preprocessing

In [3]:
import pandas as pd
import numpy as np
import re
import string
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.svm import SVC 
from sklearn.metrics import classification_report
from textblob import TextBlob 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  
from tabulate import tabulate 


# Loading the dataset
data = pd.read_csv("Reviews.csv")
data.drop(data.index[1000:], inplace=True)

# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values (if any)
data = data.dropna()

# Convert the timestamp to a readable format
data['Time'] = pd.to_datetime(data['Time'], unit='s')

# Display the first few rows of the dataset
print(data.head())


Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score       Time  \
0                     1                       1      5 2011-04-27   
1                     0                       0      1 2012-09-07   
2                     1                       1      4 2008-08-18   
3         

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
## Remove Punctuation

#defining the function to remove punctuations in the documents
def remove_punctuation(text):
    # Initialize an empty string to store the result
    punctuation_free = ""
    
    # Iterate over each character in the text
    for i in text:
        # Check if the character is not in the string.punctuation set
        if i not in string.punctuation:
            # If not, add the character to the result string
            punctuation_free += i
    return punctuation_free

data['clean_punctuation'] = data['Text'].apply(remove_punctuation)

# Display the first few rows of the dataset (without punctuation)
print(data.head())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score       Time  \
0                     1                       1      5 2011-04-27   
1                     0                       0      1 2012-09-07   
2                     1                       1      4 2008-08-18   
3                     3                       3      2 2011-06-13   
4                     0                       0      5 2012-10-21   

                 Summary                                               Text  \
0  Good Quality Dog Food  I have bought several of the Vitality canned d...   
1     

In [6]:
## to standardize the cases in the documents into lower case

data['clean_lower']= data['clean_punctuation'].str.lower()
data 

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,clean_punctuation,clean_lower
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,This is a confection that has been around a fe...,this is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,Great taffy at a great price There was a wide...,great taffy at a great price there was a wide...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,B006F2NYI2,A1D3F6UI1RTXO0,Swopes,1,1,5,2012-03-16,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...,BLACK MARKET HOT SAUCE IS WONDERFUL My husband...,black market hot sauce is wonderful my husband...
996,997,B006F2NYI2,AF50D40Y85TV3,Mike A.,1,1,5,2012-02-02,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i...",Man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...
997,998,B006F2NYI2,A3G313KLWDG3PW,kefka82,1,1,5,2011-12-19,this sauce is the shiznit,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...
998,999,B006F2NYI2,A3NIDDT7E7JIFW,V. B. Brookshaw,1,2,1,2012-05-04,Not Hot,Not hot at all. Like the other low star review...,Not hot at all Like the other low star reviewe...,not hot at all like the other low star reviewe...


In [7]:
## Remove numbers using re.sub ( ) in regular expression library

#import regular expression library
import re

#function to remove digit (\d) or hypens (-) from the documents with an empty string ''
def remove_numbers(text):
    return re.sub("[\d-]",'',text)

#applying the remove_numbers function to the 'clean_lower' column and storing the result in a new column 'clean_number'
data['clean_number'] = data['clean_lower'].apply(remove_numbers)
data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,clean_punctuation,clean_lower,clean_number
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,This is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,Great taffy at a great price There was a wide...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,B006F2NYI2,A1D3F6UI1RTXO0,Swopes,1,1,5,2012-03-16,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...,BLACK MARKET HOT SAUCE IS WONDERFUL My husband...,black market hot sauce is wonderful my husband...,black market hot sauce is wonderful my husband...
996,997,B006F2NYI2,AF50D40Y85TV3,Mike A.,1,1,5,2012-02-02,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i...",Man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...
997,998,B006F2NYI2,A3G313KLWDG3PW,kefka82,1,1,5,2011-12-19,this sauce is the shiznit,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...
998,999,B006F2NYI2,A3NIDDT7E7JIFW,V. B. Brookshaw,1,2,1,2012-05-04,Not Hot,Not hot at all. Like the other low star review...,Not hot at all Like the other low star reviewe...,not hot at all like the other low star reviewe...,not hot at all like the other low star reviewe...


In [8]:
## Tokenization

#import Natural Language Processing (NLP) library called 
#Natural Language Toolkit (NLTK)
import nltk
nltk.download('punkt')

# import the library for word tokenization
from nltk.tokenize import word_tokenize

#the word tokens in the document
data['token_data']= data['clean_number'].apply(word_tokenize)
data


[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,clean_punctuation,clean_lower,clean_number,token_data
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,"[i, have, bought, several, of, the, vitality, ..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,"[product, arrived, labeled, as, jumbo, salted,..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,This is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,"[this, is, a, confection, that, has, been, aro..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,"[if, you, are, looking, for, the, secret, ingr..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,Great taffy at a great price There was a wide...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...,"[great, taffy, at, a, great, price, there, was..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,B006F2NYI2,A1D3F6UI1RTXO0,Swopes,1,1,5,2012-03-16,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...,BLACK MARKET HOT SAUCE IS WONDERFUL My husband...,black market hot sauce is wonderful my husband...,black market hot sauce is wonderful my husband...,"[black, market, hot, sauce, is, wonderful, my,..."
996,997,B006F2NYI2,AF50D40Y85TV3,Mike A.,1,1,5,2012-02-02,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i...",Man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,"[man, what, can, i, say, this, salsa, is, the,..."
997,998,B006F2NYI2,A3G313KLWDG3PW,kefka82,1,1,5,2011-12-19,this sauce is the shiznit,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,"[this, sauce, is, so, good, with, just, about,..."
998,999,B006F2NYI2,A3NIDDT7E7JIFW,V. B. Brookshaw,1,2,1,2012-05-04,Not Hot,Not hot at all. Like the other low star review...,Not hot at all Like the other low star reviewe...,not hot at all like the other low star reviewe...,not hot at all like the other low star reviewe...,"[not, hot, at, all, like, the, other, low, sta..."


In [9]:
## Removing Stop Words

#download stopwords
nltk.download('stopwords')

#Get the list of English stop words present in the library 
stopwords = nltk.corpus.stopwords.words('english')

# Print the list of stopwords
print(stopwords)

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output = []
    for i in text:
        if i not in stopwords:
            output.append(i)
    return output

#Applying the remove_stopwords function to the 'token_data' column and storing the result in a new column 'clean_xstopwords'
data['clean_xstopwords'] = data['token_data'].apply(remove_stopwords)
data

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,"[i, have, bought, several, of, the, vitality, ...","[bought, several, vitality, canned, dog, food,..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,"[product, arrived, labeled, as, jumbo, salted,...","[product, arrived, labeled, jumbo, salted, pea..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,This is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,"[this, is, a, confection, that, has, been, aro...","[confection, around, centuries, light, pillowy..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,"[if, you, are, looking, for, the, secret, ingr...","[looking, secret, ingredient, robitussin, beli..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,Great taffy at a great price There was a wide...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...,"[great, taffy, at, a, great, price, there, was...","[great, taffy, great, price, wide, assortment,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,B006F2NYI2,A1D3F6UI1RTXO0,Swopes,1,1,5,2012-03-16,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...,BLACK MARKET HOT SAUCE IS WONDERFUL My husband...,black market hot sauce is wonderful my husband...,black market hot sauce is wonderful my husband...,"[black, market, hot, sauce, is, wonderful, my,...","[black, market, hot, sauce, wonderful, husband..."
996,997,B006F2NYI2,AF50D40Y85TV3,Mike A.,1,1,5,2012-02-02,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i...",Man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,"[man, what, can, i, say, this, salsa, is, the,...","[man, say, salsa, bomb, different, kinds, almo..."
997,998,B006F2NYI2,A3G313KLWDG3PW,kefka82,1,1,5,2011-12-19,this sauce is the shiznit,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,"[this, sauce, is, so, good, with, just, about,...","[sauce, good, anything, like, adding, asian, f..."
998,999,B006F2NYI2,A3NIDDT7E7JIFW,V. B. Brookshaw,1,2,1,2012-05-04,Not Hot,Not hot at all. Like the other low star review...,Not hot at all Like the other low star reviewe...,not hot at all like the other low star reviewe...,not hot at all like the other low star reviewe...,"[not, hot, at, all, like, the, other, low, sta...","[hot, like, low, star, reviewer, got, suckered..."


In [10]:
## Lemmatization

## Perform word lemmatization using WordNetLemmatizer( ) in nltk library

nltk.download('wordnet')

#importing the Lemmatizer function from nltk library
from nltk.stem import WordNetLemmatizer

#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = []
    for word in text:
        lemmatized_word = wordnet_lemmatizer.lemmatize(word)
        lemm_text.append(lemmatized_word)
    return lemm_text

# #applying the lemmatizer function to the 'clean_xstopwords' column and storing the result in a new column 'clean_lemmatized1'
data['clean_lemmatized']=data['clean_xstopwords'].apply(lemmatizer)

data

[nltk_data] Error loading wordnet: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords,clean_lemmatized
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,"[i, have, bought, several, of, the, vitality, ...","[bought, several, vitality, canned, dog, food,...","[bought, several, vitality, canned, dog, food,..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,"[product, arrived, labeled, as, jumbo, salted,...","[product, arrived, labeled, jumbo, salted, pea...","[product, arrived, labeled, jumbo, salted, pea..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,This is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,"[this, is, a, confection, that, has, been, aro...","[confection, around, centuries, light, pillowy...","[confection, around, century, light, pillowy, ..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,"[if, you, are, looking, for, the, secret, ingr...","[looking, secret, ingredient, robitussin, beli...","[looking, secret, ingredient, robitussin, beli..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,Great taffy at a great price There was a wide...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...,"[great, taffy, at, a, great, price, there, was...","[great, taffy, great, price, wide, assortment,...","[great, taffy, great, price, wide, assortment,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,B006F2NYI2,A1D3F6UI1RTXO0,Swopes,1,1,5,2012-03-16,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...,BLACK MARKET HOT SAUCE IS WONDERFUL My husband...,black market hot sauce is wonderful my husband...,black market hot sauce is wonderful my husband...,"[black, market, hot, sauce, is, wonderful, my,...","[black, market, hot, sauce, wonderful, husband...","[black, market, hot, sauce, wonderful, husband..."
996,997,B006F2NYI2,AF50D40Y85TV3,Mike A.,1,1,5,2012-02-02,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i...",Man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,"[man, what, can, i, say, this, salsa, is, the,...","[man, say, salsa, bomb, different, kinds, almo...","[man, say, salsa, bomb, different, kind, almos..."
997,998,B006F2NYI2,A3G313KLWDG3PW,kefka82,1,1,5,2011-12-19,this sauce is the shiznit,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,"[this, sauce, is, so, good, with, just, about,...","[sauce, good, anything, like, adding, asian, f...","[sauce, good, anything, like, adding, asian, f..."
998,999,B006F2NYI2,A3NIDDT7E7JIFW,V. B. Brookshaw,1,2,1,2012-05-04,Not Hot,Not hot at all. Like the other low star review...,Not hot at all Like the other low star reviewe...,not hot at all like the other low star reviewe...,not hot at all like the other low star reviewe...,"[not, hot, at, all, like, the, other, low, sta...","[hot, like, low, star, reviewer, got, suckered...","[hot, like, low, star, reviewer, got, suckered..."


In [11]:
# Model Selection (Machine_Learning)

In [12]:
# Create sentiment labels
def create_sentiment_label(score):
    if score in [4, 5]:
        return 'positive'
    elif score == 3:
        return 'neutral'
    else:
        return 'negative'

data['Sentiment'] = data['Score'].apply(create_sentiment_label)

In [13]:
X = data['clean_lemmatized']
y = data['Sentiment']

data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,clean_punctuation,clean_lower,clean_number,token_data,clean_xstopwords,clean_lemmatized,Sentiment
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,I have bought several of the Vitality canned d...,i have bought several of the vitality canned d...,i have bought several of the vitality canned d...,"[i, have, bought, several, of, the, vitality, ...","[bought, several, vitality, canned, dog, food,...","[bought, several, vitality, canned, dog, food,...",positive
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled as jumbo salted peanut...,product arrived labeled as jumbo salted peanut...,"[product, arrived, labeled, as, jumbo, salted,...","[product, arrived, labeled, jumbo, salted, pea...","[product, arrived, labeled, jumbo, salted, pea...",negative
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,This is a confection that has been around a fe...,this is a confection that has been around a fe...,this is a confection that has been around a fe...,"[this, is, a, confection, that, has, been, aro...","[confection, around, centuries, light, pillowy...","[confection, around, century, light, pillowy, ...",positive
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,If you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,if you are looking for the secret ingredient i...,"[if, you, are, looking, for, the, secret, ingr...","[looking, secret, ingredient, robitussin, beli...","[looking, secret, ingredient, robitussin, beli...",negative
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,Great taffy at a great price There was a wide...,great taffy at a great price there was a wide...,great taffy at a great price there was a wide...,"[great, taffy, at, a, great, price, there, was...","[great, taffy, great, price, wide, assortment,...","[great, taffy, great, price, wide, assortment,...",positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,B006F2NYI2,A1D3F6UI1RTXO0,Swopes,1,1,5,2012-03-16,Hot & Flavorful,BLACK MARKET HOT SAUCE IS WONDERFUL.... My hus...,BLACK MARKET HOT SAUCE IS WONDERFUL My husband...,black market hot sauce is wonderful my husband...,black market hot sauce is wonderful my husband...,"[black, market, hot, sauce, is, wonderful, my,...","[black, market, hot, sauce, wonderful, husband...","[black, market, hot, sauce, wonderful, husband...",positive
996,997,B006F2NYI2,AF50D40Y85TV3,Mike A.,1,1,5,2012-02-02,Great Hot Sauce and people who run it!,"Man what can i say, this salsa is the bomb!! i...",Man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,man what can i say this salsa is the bomb i ha...,"[man, what, can, i, say, this, salsa, is, the,...","[man, say, salsa, bomb, different, kinds, almo...","[man, say, salsa, bomb, different, kind, almos...",positive
997,998,B006F2NYI2,A3G313KLWDG3PW,kefka82,1,1,5,2011-12-19,this sauce is the shiznit,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,this sauce is so good with just about anything...,"[this, sauce, is, so, good, with, just, about,...","[sauce, good, anything, like, adding, asian, f...","[sauce, good, anything, like, adding, asian, f...",positive
998,999,B006F2NYI2,A3NIDDT7E7JIFW,V. B. Brookshaw,1,2,1,2012-05-04,Not Hot,Not hot at all. Like the other low star review...,Not hot at all Like the other low star reviewe...,not hot at all like the other low star reviewe...,not hot at all like the other low star reviewe...,"[not, hot, at, all, like, the, other, low, sta...","[hot, like, low, star, reviewer, got, suckered...","[hot, like, low, star, reviewer, got, suckered...",negative


In [14]:
# Split the Data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
## Feature Extraction 

In [16]:
# Flatten X_train and X_test

X_train = [' '.join(words) for words in X_train]
X_test = [' '.join(doc) if isinstance(doc, list) else doc for doc in X_test]


In [17]:
# Bag-of-Words
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

X_train_bow

<800x5110 sparse matrix of type '<class 'numpy.int64'>'
	with 24550 stored elements in Compressed Sparse Row format>

In [18]:
# Model Selection (Machine_Learning)

In [19]:
# Initialize and train classifiers (NB)

nb_bow = MultinomialNB() 
nb_bow.fit(X_train_bow, y_train)
#y_pred_bow = nb_bow.predict(X_test_bow)

In [20]:
# Initialize and train classifiers (SVM)

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_bow, y_train) 

In [21]:
# Predict sentiment using classifiers 
for text, actual_label in zip(X_test, y_test): 
    # Transform the text using the same vectorizer
    text_transformed = vectorizer.transform([text])
    
    # Predict sentiment using Naive Bayes 
    nb_prediction = nb_bow.predict(text_transformed)[0] 

    # Predict sentiment using SVM 
    svm_prediction = svm_classifier.predict(text_transformed)[0]

    print(f'Text: {text}')
    print(f'Actual Label: {actual_label}')
    print (f'Naive Bayes Prediction: {nb_prediction}')
    print (f'SVM Prediction: {svm_prediction}')
    print("\n")
    


Text: thing darn cheesy like lot flavor youll love otherwise overdose cheddary goodness
Actual Label: neutral
Naive Bayes Prediction: positive
SVM Prediction: positive


Text: love cherrybrook kitchen tried almost product excited try ready spread frosting usually keep needed ingredient hand box frosting picky eater especially item contains sugar horrible consistency absolutely nothing like regular canned frosting light fluffy gooey even imagine trying spread cake would rip piece thick gooey even try cake taste texture offputting seriously waste money
Actual Label: negative
Naive Bayes Prediction: negative
SVM Prediction: negative


Text: recently purchased sale local grocery store know salebr br fluffy creamy white frosting youd expect find opening jar wasnt expecting like betty crocker pretty bad like others said glueytacky mess would completely ruin cake tried spread reminds taffy hasnt quite hit right stage candy making process taste isnt great either sickeningly sweet even frosting

In [22]:
# Calculate classification report for Naive Bayes 
nb_predictions = nb_bow.predict(X_test_bow)
nb_classification_report = classification_report(y_test, nb_predictions, target_names=['negative', 'neutral', 'positive'])

# Calculate classification report for SVM 
svm_predictions = svm_classifier.predict(X_test_bow)
svm_classification_report = classification_report(y_test, svm_predictions, target_names=['negative', 'neutral', 'positive'])

# Print classification report for Naive Bayes 
print("\nClassification Report for Naive Bayes:") 
print(nb_classification_report) 

# Print classification report for SVM 
print("\nClassification Report for SVM:") 
print(svm_classification_report)


Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.80      0.16      0.27        25
     neutral       0.00      0.00      0.00        11
    positive       0.84      0.99      0.91       164

    accuracy                           0.83       200
   macro avg       0.55      0.38      0.39       200
weighted avg       0.78      0.83      0.78       200


Classification Report for SVM:
              precision    recall  f1-score   support

    negative       0.46      0.52      0.49        25
     neutral       0.50      0.18      0.27        11
    positive       0.89      0.91      0.90       164

    accuracy                           0.82       200
   macro avg       0.62      0.54      0.55       200
weighted avg       0.82      0.82      0.82       200



In [23]:
# Model Selection (Lexicon)

In [24]:
data['clean_lemmatized']

0      [bought, several, vitality, canned, dog, food,...
1      [product, arrived, labeled, jumbo, salted, pea...
2      [confection, around, century, light, pillowy, ...
3      [looking, secret, ingredient, robitussin, beli...
4      [great, taffy, great, price, wide, assortment,...
                             ...                        
995    [black, market, hot, sauce, wonderful, husband...
996    [man, say, salsa, bomb, different, kind, almos...
997    [sauce, good, anything, like, adding, asian, f...
998    [hot, like, low, star, reviewer, got, suckered...
999    [admit, sucker, large, quantity, oz, shopping,...
Name: clean_lemmatized, Length: 1000, dtype: object

In [25]:
table_data = [["Text", "Actual Label", "TextBlob Polarity", "TextBlob Sentiment", "VADER Compound", "VADER Sentiment"]]

In [26]:
for idx, row in data.iterrows():
    text = ' '.join(row['clean_lemmatized'])
    actual_label = row['Sentiment']
    
    # TextBlob Analysis
    blob = TextBlob(text) 
    tb_polarity = blob.sentiment.polarity 
    tb_label = 'positive' if tb_polarity > 0 else 'negative' if tb_polarity < 0 else 'neutral'

    # VADER Analysis
    analyzer = SentimentIntensityAnalyzer() 
    vs = analyzer.polarity_scores(text) 
    vader_compound = vs['compound'] 
    vader_label = 'positive' if vader_compound > 0.05 else 'negative' if vader_compound < -0.05 else 'neutral'

    table_data.append([text, actual_label, tb_polarity, tb_label, vader_compound, vader_label]) 

In [27]:
print(tabulate(table_data, headers="firstrow", tablefmt="plain"))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [28]:
# Calculate classification report for TextBlob 
tb_classification_report = classification_report(
    [row[1] for row in table_data[1:]],
    [row[3] for row in table_data[1:]],
    labels=['negative', 'neutral', 'positive'],
    target_names=['negative', 'neutral', 'positive'],
    zero_division=1
)

# Calculate classification report for VADER 
vader_classification_report = classification_report(
    [row[1] for row in table_data[1:]],
    [row[5] for row in table_data[1:]],
    labels=['negative', 'neutral', 'positive'],
    target_names=['negative', 'neutral', 'positive']
)

In [29]:
print("\nClassification Report for TextBlob:") 
print(tb_classification_report) 


Classification Report for TextBlob:
              precision    recall  f1-score   support

    negative       0.57      0.36      0.44       145
     neutral       0.11      0.04      0.06        75
    positive       0.83      0.94      0.88       780

    accuracy                           0.79      1000
   macro avg       0.50      0.45      0.46      1000
weighted avg       0.74      0.79      0.76      1000



In [30]:
print("\nClassification Report for Vader:") 
print(vader_classification_report) 


Classification Report for Vader:
              precision    recall  f1-score   support

    negative       0.57      0.34      0.43       145
     neutral       0.17      0.07      0.10        75
    positive       0.84      0.95      0.89       780

    accuracy                           0.80      1000
   macro avg       0.53      0.45      0.47      1000
weighted avg       0.75      0.80      0.76      1000



In [31]:
# Aisya Batrisyia BInti Azley SW01081523
# Nur Adilah Binti Zainal Abidin SW01081031

# The lexicon-based methods, TextBlob and VADER, achieved overall accuracies of 79% and 80%, respectively, 
# on a dataset of 1000 samples. Both exhibited strong performance in identifying positive sentiments but struggled 
# significantly with neutral sentiments, as indicated by low precision and recall scores. On the other hand, the 
# machine learning approaches, Naive Bayes and SVM, were tested on a smaller dataset of 200 samples and achieved
# accuracies of 83% and 82%, respectively. While Naive Bayes showed high precision for positive sentiments but poor 
# performance for neutral sentiments, SVM provided a more balanced performance across different sentiment categories.
# Despite the smaller dataset, the machine learning models demonstrated competitive accuracy, suggesting their robustness
# in sentiment classification tasks.

AttributeError: 'str' object has no attribute 'keys'