## Data Preparation

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from textblob import TextBlob
import csv
import re

In [None]:
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Appliances", trust_remote_code=True)
print(dataset["full"][0])
print(type(dataset))

In [None]:
file = pd.DataFrame(columns=['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id', 'helpful_vote', 'verified_purchase'])

In [None]:
file["rating"] = [i["rating"] for i in dataset["full"]]
file["title"] = [i["title"] for i in dataset["full"]]
file["text"] = [i["text"] for i in dataset["full"]]
file["asin"] = [i["asin"] for i in dataset["full"]]
file["parent_asin"] = [i["parent_asin"] for i in dataset["full"]]
file["user_id"] = [i["user_id"] for i in dataset["full"]]
file["helpful_vote"] = [i["helpful_vote"] for i in dataset["full"]]
file["verified_purchase"] = [i["verified_purchase"] for i in dataset["full"]]

In [None]:
file.head()

In [None]:
file.to_csv("Amazon_Reviews_2023.csv", index = False)

## Data Preprocessing

In [3]:
data = pd.read_csv("Amazon_Reviews_2023.csv")
data = data.loc[:10000, :]
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase
0,5.0,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True
1,5.0,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True
4,5.0,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True


In [5]:
def clean(text):
    text = re.sub('[^A-Za-z]+', ' ', str(text))
    return text

data['cleaned reviews'] = data['text'].apply(clean)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews
0,5.0,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month
1,5.0,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...
4,5.0,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...


In [7]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

data['POS tagged'] = data['cleaned reviews'].apply(token_stop_pos)
data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Anjana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews,POS tagged
0,5.0,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month,"[(work, n), (great, a), (use, v), (new, a), (o..."
1,5.0,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side,"[(Little, a), (thin, a), (side, n)]"
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue,"[(Quick, a), (delivery, n), (fixed, v), (issue..."
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...,"[(sure, a), (whether, None), (worth, a), (give..."
4,5.0,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...,"[(Easy, n), (install, v), (got, v), (product, ..."


In [9]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

data['Lemma'] = data['POS tagged'].apply(lemmatize)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews,POS tagged,Lemma
0,5.0,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month,"[(work, n), (great, a), (use, v), (new, a), (o...",work great use new one every month
1,5.0,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side,"[(Little, a), (thin, a), (side, n)]",Little thin side
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue,"[(Quick, a), (delivery, n), (fixed, v), (issue...",Quick delivery fix issue
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...,"[(sure, a), (whether, None), (worth, a), (give...",sure whether worth give cost compare origina...
4,5.0,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...,"[(Easy, n), (install, v), (got, v), (product, ...",Easy install get product expect receive


In [11]:
from textblob import TextBlob

def getSubjectivity(review):
    return TextBlob(review).sentiment.subjectivity
    
def getPolarity(review):
    return TextBlob(review).sentiment.polarity

def analysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'

In [15]:
data['Polarity'] = data['Lemma'].apply(getPolarity)
data['Analysis'] = data['Polarity'].apply(analysis)
data.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,cleaned reviews,POS tagged,Lemma,Polarity,Analysis
0,5.0,Work great,work great. use a new one every month,B01N0TQ0OH,B01N0TQ0OH,AGKHLEW2SOWHNMFQIJGBECAF7INQ,0,True,work great use a new one every month,"[(work, n), (great, a), (use, v), (new, a), (o...",work great use new one every month,0.468182,Positive
1,5.0,excellent product,Little on the thin side,B07DD2DMXB,B07DD37QPZ,AHWWLSPCJMALVHDDVSUGICL6RUCA,0,True,Little on the thin side,"[(Little, a), (thin, a), (side, n)]",Little thin side,-0.29375,Negative
2,5.0,Happy customer!,"Quick delivery, fixed the issue!",B082W3Z9YK,B082W3Z9YK,AHZIJGKEWRTAEOZ673G5B3SNXEGQ,0,True,Quick delivery fixed the issue,"[(Quick, a), (delivery, n), (fixed, v), (issue...",Quick delivery fix issue,0.333333,Positive
3,5.0,Amazing value,I wasn't sure whether these were worth it or n...,B078W2BJY8,B078W2BJY8,AFGUPTDFAWOHHL4LZDV27ERDNOYQ,0,True,I wasn t sure whether these were worth it or n...,"[(sure, a), (whether, None), (worth, a), (give...",sure whether worth give cost compare origina...,0.51875,Positive
4,5.0,Dryer parts,Easy to install got the product expected to re...,B08C9LPCQV,B08C9LPCQV,AELFJFAXQERUSMTXJQ6SYFFRDWMA,0,True,Easy to install got the product expected to re...,"[(Easy, n), (install, v), (got, v), (product, ...",Easy install get product expect receive,0.433333,Positive


In [17]:
data['Analysis'].value_counts()

Analysis
Positive    8148
Neutral     1021
Negative     832
Name: count, dtype: int64