**<h2>PROCESSING THE REVIEWS</h2>**

----
----

This data repository can be found [here](https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html).

<br>
A pair of product categories have been chosen for this project.

*   Home and Kitchen
*   Grocery and Gourmet Food

The datasets considered for this project are the **5-core** dataset and the **metadata** of each category.

<br>

**<h4>Business Objectives :</h4>**
*   Apply natural language understanding to convert large sets of text into more formal representations that are easier to manipulate.
*   Extract structured information from unstructured information.
*   Analyze the attitude or emotional state of the reviewers from their review texts.


<br>

**<h4> Project Approach :</h4>**
*   Loading the datasets from the url links
*   Using Natural Language Processing techniques to extract information from product descriptions in the metadata to fill in missing data in the review records.
*   Using the Opinion lexicon for identifying positive and negative words, and analyzing the sentiments of the reviews.
*   Storing the datasets for further use

# Libraries :

In [1]:
# Installing libraries - langdetect, pycountry
!pip install langdetect pycountry

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: langdetect, pycountry
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=bbc64869d697c7b7a972c667149e924b6eefb9a566a02efd500dcbce8b7464eb
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f

In [None]:
# Importing libraries to handle data sets
import csv                                              # For csv files
import json                                             # For json files
import gzip                                             # For compressed files

# Libraries to handle datetime data
import datetime as dt                                   # To manipulate dates

# Libraries for data manipulation and visualization
import ast                                              # Abstract Syntax Trees (ast) package
import numpy as np                                      # To manipulate arrays
import pandas as pd                                     # To manipulate dataframes
from scipy import sparse                                # To manipulate sparse matrices

import matplotlib.pyplot as plt                         # To create charts and graphs
from matplotlib.patches import ConnectionPatch          # For graphs
import seaborn as sns                                   # To visualize
from wordcloud import WordCloud,STOPWORDS               # To create word cloud graphs

# Libraries to process text data
import re                                               # To apply regular expressions over string data
import string
import pycountry                                        # To convert ISO language codes into language names

from langdetect import detect, DetectorFactory          # To detect language of the text
DetectorFactory.seed = 0                                # Setting seed value to enforce consistent results (Language detection is non-deterministic)

import spacy                                            # To create NLP objects
spacy.cli.download('en_core_web_lg')                    # Downloading the 'en_core_web_lg' English model (runs on every execution of this cell)
nlp=spacy.load('en_core_web_lg')                        # Loading the downloaded model into a spacy pipeline object

# Libraries to handle warnings
import warnings
warnings.filterwarnings('ignore')                       # Suppresses every warning for the rest of the session

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
# Spacy pipeline : listing the components of the loaded model before and after any change
print('Original pipeline : ',nlp.pipe_names)

# Placeholder for switching components off. The call is commented out, so nothing is
# disabled and the second print below shows the same list as the first.
#nlp.disable_pipes()                                 # type in function to disable in the pipeline

print('Amended pipeline :',nlp.pipe_names)

Original pipeline :  ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Amended pipeline : ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


# Loading the data :

## Functions used in loading and reading the datasets :

In [None]:
# Function to open zipped file
def parse(path):
  """Yield one record per line of a gzip-compressed file of Python literals.

  Each non-blank line is expected to hold a single Python literal (the
  dataset files store one dict per line).

  Parameters
  ----------
  path : str or path-like
      Location of the ``.gz`` file to read.

  Yields
  ------
  object
      The value described by each line, typically a ``dict``.

  Raises
  ------
  ValueError, SyntaxError
      If a line is not a valid Python literal.
  """
  # The context manager closes the file once the generator is exhausted or
  # discarded; the previous version left the handle open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      if not l.strip():                      # tolerate blank lines
        continue
      # NOTE: the files are downloaded from the internet, so lines are parsed
      # with ast.literal_eval (literals only) instead of eval, which would
      # execute arbitrary expressions embedded in the data.
      yield ast.literal_eval(l.decode('utf-8'))

# Function to create dataframe from zipped file
def getDF(path):
  """Return a DataFrame holding every record produced by ``parse(path)``.

  Records are keyed by their 0-based position in the file, and those keys
  become the row index of the resulting frame.
  """
  records = dict(enumerate(parse(path)))      # {0: first record, 1: second record, ...}
  return pd.DataFrame.from_dict(records, orient='index')

# Function to check the lists in 'helpful' column
def check(lists) :
  """Classify a 'helpful' entry of the form [positive votes, total votes].

  Returns
  -------
  str
      '-ve'   if either element is negative,
      'Yes'   if the 1st element is greater than the 2nd (an anomaly),
      'Equal' if both elements are equal,
      'No'    if the 1st element is smaller than the 2nd,
      '>2'    if the list does not hold exactly two elements (the label is
              kept for compatibility; it is also returned for shorter lists).
  """
  if len(lists)!=2 :                       # anything that is not a pair
    return '>2'

  first,second=lists
  # Negative values are tested first. Previously this test came after the
  # '>' and '==' comparisons, so inputs such as [5, -1] or [-1, -1] were
  # reported as 'Yes' / 'Equal' and the negative values went unnoticed.
  if first<0 or second<0 :
    return '-ve'
  if first>second :
    return 'Yes'
  if first==second :
    return 'Equal'
  return 'No'

# Function to get the helpfulness ratio
def helpful_ratio(list_num) :
  """Return list_num[0] / list_num[1], or None when the denominator is zero."""
  numerator,denominator=list_num[0],list_num[1]
  if denominator==0 :                       # ratio is undefined without any votes
    return None
  return numerator/denominator

## Home and Kitchen :

#### Reviews :

In [None]:
# 5 core review
!wget 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Home_and_Kitchen_5.json.gz'

# Dataframe
hk=getDF('reviews_Home_and_Kitchen_5.json.gz')
hk

--2023-08-22 08:27:45--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Home_and_Kitchen_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 138126598 (132M) [application/x-gzip]
Saving to: ‘reviews_Home_and_Kitchen_5.json.gz’


2023-08-22 08:27:54 (15.6 MB/s) - ‘reviews_Home_and_Kitchen_5.json.gz’ saved [138126598/138126598]



Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,APYOBQE6M18AA,0615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013"
1,A1JVQTAGHYOL7F,0615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014"
2,A3UPYGJKZ0XTU4,0615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013"
3,A2MHCTX43MIMDZ,0615391206,"M. Johnson ""Tea Lover""","[14, 18]",This book is a must have if you get a Zoku (wh...,5.0,Creative Combos,1312416000,"08 4, 2011"
4,AHAI85T5C2DH3,0615391206,PugLover,"[0, 0]",This cookbook is great. I have really enjoyed...,4.0,A must own if you own the Zoku maker...,1402099200,"06 7, 2014"
...,...,...,...,...,...,...,...,...,...
551677,A11J1FHCK5U06J,B00LBFUU12,Karinna Ball,"[0, 0]",These ice pop molds are awesome! Bright kid-ha...,5.0,Summer fun for everyone!,1404950400,"07 10, 2014"
551678,A537XC69FAD3J,B00LBFUU12,L Green,"[0, 0]",great popsicle molds - very nice quality - and...,5.0,Five Stars,1405382400,"07 15, 2014"
551679,AWHZOUIQ0VO7M,B00LBFUU12,Richard N,"[0, 0]",My kids and I are loving these - putting our c...,5.0,... these - putting our creativity to the test...,1405468800,"07 16, 2014"
551680,A1KQNP8MOJDJKC,B00LBFUU12,RS,"[1, 1]","I love these ice pop makers. First off, I love...",5.0,love them,1405209600,"07 13, 2014"


In [None]:
# Checking null values in the data
hk.isnull().sum()

reviewerID           0
asin                 0
reviewerName      4953
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

**<h2> Helpful column : </h2>**


*   Checking the elements in the list.
*   Dropping records classified as anomalies in the column.
*   Splitting the list and creating 2 more columns.
*   Getting the helpfulness ratio from the 2 columns.
*   Dropping the original helpful column from the dataframe.




In [None]:
# Checking 'helpful' column
# Working on a separate frame so the audit column '1>2' is not added to hk
x=pd.DataFrame(hk['helpful'].copy())

# Share of all reviews taken by each of the five most frequent 'helpful' values
display(hk['helpful'].value_counts()[0:5]/len(hk))
print('\n')

# Applying 'check' function : labels every list as 'Yes', 'Equal', 'No', '-ve' or '>2'
x['1>2']=x['helpful'].apply(check)

display(x.head())
print('\n')
# Number of records per label; any label other than 'Equal' / 'No' points to an anomaly
x['1>2'].value_counts()

[0, 0]    0.549316
[1, 1]    0.141375
[2, 2]    0.049438
[0, 1]    0.032211
[3, 3]    0.024306
Name: helpful, dtype: float64





Unnamed: 0,helpful,1>2
0,"[0, 0]",Equal
1,"[0, 0]",Equal
2,"[26, 27]",No
3,"[14, 18]",No
4,"[0, 0]",Equal






Equal    452819
No        98863
Name: 1>2, dtype: int64

In [None]:
# Creating 2 columns from the helpful column
# The frame is built directly from the list values instead of calling a lambda once per
# row (apply with axis=1), which is slow on a dataframe of this size.
# index=hk.index keeps the new columns aligned with the rows of hk.
hk[['Positive_feedback','Total_feedback']]=pd.DataFrame(hk['helpful'].tolist(),index=hk.index)

hk.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Positive_feedback,Total_feedback
0,APYOBQE6M18AA,615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013",0,0
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014",0,0
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013",26,27
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""","[14, 18]",This book is a must have if you get a Zoku (wh...,5.0,Creative Combos,1312416000,"08 4, 2011",14,18
4,AHAI85T5C2DH3,615391206,PugLover,"[0, 0]",This cookbook is great. I have really enjoyed...,4.0,A must own if you own the Zoku maker...,1402099200,"06 7, 2014",0,0


In [None]:
# Creating a column of helpfulness ratio
hk['helpfulness_ratio']=hk['helpful'].apply(helpful_ratio)

display(pd.concat([hk.head(3),hk.tail(3)],axis=0))

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio
0,APYOBQE6M18AA,0615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013",0,0,
1,A1JVQTAGHYOL7F,0615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014",0,0,
2,A3UPYGJKZ0XTU4,0615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013",26,27,0.962963
551679,AWHZOUIQ0VO7M,B00LBFUU12,Richard N,"[0, 0]",My kids and I are loving these - putting our c...,5.0,... these - putting our creativity to the test...,1405468800,"07 16, 2014",0,0,
551680,A1KQNP8MOJDJKC,B00LBFUU12,RS,"[1, 1]","I love these ice pop makers. First off, I love...",5.0,love them,1405209600,"07 13, 2014",1,1,1.0
551681,A3AHOSISKNBLZ6,B00LBFUU12,Tonya S,"[0, 0]",My kids love these ice pop makers!!! Since we...,5.0,Great popsicle molds!!! Perfect for making su...,1404777600,"07 8, 2014",0,0,


**<h2> Review columns : </h2>**


*   Combining the reviewText and summary columns
*   Dropping the summary column



In [None]:
# Concatenating the strings in reviewText and summary columns
hk['reviewText']=hk['reviewText']+' '+hk['summary']

hk.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio
0,APYOBQE6M18AA,615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013",0,0,
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014",0,0,
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013",26,27,0.962963
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""","[14, 18]",This book is a must have if you get a Zoku (wh...,5.0,Creative Combos,1312416000,"08 4, 2011",14,18,0.777778
4,AHAI85T5C2DH3,615391206,PugLover,"[0, 0]",This cookbook is great. I have really enjoyed...,4.0,A must own if you own the Zoku maker...,1402099200,"06 7, 2014",0,0,


In [None]:
hk['reviewText'][0]

'My daughter wanted this book and the price on Amazon was the best.  She has already tried one recipe a day after receiving the book.  She seems happy with it. Best Price'

**<h2> unixReviewTime  :</h2>**

- The unix time stamp is a way to track time as a running total of seconds. This count starts at the Unix Epoch on January 1st, 1970 at UTC.

- The unix time stamp is merely the number of seconds between a particular date and the Unix Epoch.

- This point in time technically does not change no matter where you are located on the globe, and so it's very useful to computer systems for tracking and sorting dated information.

- We'll be removing this column from the dataframe since the 'reviewTime' column is sufficient for this project.

In [None]:
# Dropping irrelevant columns
hk.drop(['summary','helpful','unixReviewTime'],axis=1,inplace=True)
hk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551682 entries, 0 to 551681
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   reviewerID         551682 non-null  object 
 1   asin               551682 non-null  object 
 2   reviewerName       546729 non-null  object 
 3   reviewText         551682 non-null  object 
 4   overall            551682 non-null  float64
 5   reviewTime         551682 non-null  object 
 6   Positive_feedback  551682 non-null  int64  
 7   Total_feedback     551682 non-null  int64  
 8   helpfulness_ratio  248634 non-null  float64
dtypes: float64(2), int64(2), object(5)
memory usage: 58.2+ MB


#### Metadata :

In [None]:
# Metadata
!wget 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Home_and_Kitchen.json.gz'

# Dataframe
hk_meta = getDF('meta_Home_and_Kitchen.json.gz')
hk_meta

--2023-08-22 20:13:15--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Home_and_Kitchen.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152367034 (145M) [application/x-gzip]
Saving to: ‘meta_Home_and_Kitchen.json.gz.4’


2023-08-22 20:13:39 (6.31 MB/s) - ‘meta_Home_and_Kitchen.json.gz.4’ saved [152367034/152367034]



Unnamed: 0,asin,salesRank,imUrl,categories,title,description,related,price,brand
0,0076144011,{'Books': 6285595},http://g-ecx.images-amazon.com/images/G/01/x-s...,[[Home & Kitchen]],"Ninjas, Piranhas, and Galileo",,,,
1,0130350591,{'Kitchen & Dining': 459680},http://ecx.images-amazon.com/images/I/21zcx6RC...,"[[Home & Kitchen, Kitchen & Dining, Dining & E...",Le Creuset Kiwi (Green) Butter Dish Stoneware,Each piece of Le Creuset dinnerware is crafted...,,,
2,0307394530,"{'Arts, Crafts & Sewing': 3597}",http://ecx.images-amazon.com/images/I/51A4FWuj...,"[[Home & Kitchen, Artwork, Posters & Prints]]",Martha Stewart's Wedding Cakes,Of all the decisions that go into planning a w...,"{'also_bought': ['144630163X', '1402717733', '...",14.99,Random House
3,0439903491,{'Software': 7065},http://ecx.images-amazon.com/images/I/61tVxcko...,"[[Home & Kitchen, Artwork, Posters & Prints]]",,Shiver me timbers! Solve I SPY pirate picture ...,"{'also_viewed': ['B000YFSZU8', 'B0016KWZB0', '...",29.99,
4,0578060604,,http://ecx.images-amazon.com/images/I/41aCELWJ...,"[[Home & Kitchen, Furniture, Kitchen & Dining ...",Build A Maloof Inspired Low Back Dining Chair ...,The Maloofinspired Low Back Dining Chair is no...,{'also_viewed': ['B004IO6RS8']},,
...,...,...,...,...,...,...,...,...,...
436983,B00LVJQDBY,,http://ecx.images-amazon.com/images/I/51x%2Btj...,"[[Home & Kitchen, Furniture, Living Room Furni...","Big Boy Recliner Chair, Brown, Hold up to 250 ...",The Northwest Territory Big Boy Recliner Chair...,{'also_viewed': ['B006UMM1Y0']},22.49,
436984,B00LXFN8MS,{'Home &amp; Kitchen': 515079},http://ecx.images-amazon.com/images/I/61e35I1u...,"[[Home & Kitchen, Bedding]]",6 pc Kids Bedding Monkey Twin Comforter set wi...,Alyssa HomeTM Kids twin comforter set Monkey T...,,27.96,
436985,B00LUVXMX0,,http://ecx.images-amazon.com/images/I/61FBUFmn...,"[[Home & Kitchen, Bedding, Quilts]]",3 Pc Cotton Filled Blue White Scroll Floral Qu...,Bedspread offers superior softness and warmth ...,"{'also_viewed': ['B00KD1JYD0', 'B007S7AVVQ', '...",38.98,
436986,B00M0U03EQ,,http://ecx.images-amazon.com/images/I/41CDwpGU...,"[[Home & Kitchen, Furniture, Home Office Furni...",Student Dorm Home Office Laptop Computer Moder...,This student desk is perfect for any home offi...,{'also_viewed': ['B00E7HNT3M']},,


In [None]:
# Converting objects in 'categories' columns from 2D to 1D
# Keeps only the first inner list of every entry.
# NOTE: this cell is not idempotent - running it a second time would take the first
# element of the already flattened list. Reload hk_meta before re-running.
hk_meta['categories']=hk_meta['categories'].apply(lambda x : x[0])

hk_meta.head()

Unnamed: 0,asin,salesRank,imUrl,categories,title,description,related,price,brand
0,76144011,{'Books': 6285595},http://g-ecx.images-amazon.com/images/G/01/x-s...,[Home & Kitchen],"Ninjas, Piranhas, and Galileo",,,,
1,130350591,{'Kitchen & Dining': 459680},http://ecx.images-amazon.com/images/I/21zcx6RC...,"[Home & Kitchen, Kitchen & Dining, Dining & En...",Le Creuset Kiwi (Green) Butter Dish Stoneware,Each piece of Le Creuset dinnerware is crafted...,,,
2,307394530,"{'Arts, Crafts & Sewing': 3597}",http://ecx.images-amazon.com/images/I/51A4FWuj...,"[Home & Kitchen, Artwork, Posters & Prints]",Martha Stewart's Wedding Cakes,Of all the decisions that go into planning a w...,"{'also_bought': ['144630163X', '1402717733', '...",14.99,Random House
3,439903491,{'Software': 7065},http://ecx.images-amazon.com/images/I/61tVxcko...,"[Home & Kitchen, Artwork, Posters & Prints]",,Shiver me timbers! Solve I SPY pirate picture ...,"{'also_viewed': ['B000YFSZU8', 'B0016KWZB0', '...",29.99,
4,578060604,,http://ecx.images-amazon.com/images/I/41aCELWJ...,"[Home & Kitchen, Furniture, Kitchen & Dining R...",Build A Maloof Inspired Low Back Dining Chair ...,The Maloofinspired Low Back Dining Chair is no...,{'also_viewed': ['B004IO6RS8']},,


#### Merging data :

Adding a new column in the reviews dataset by mapping the productID (asin) in reviews with the price of the product available in the meta dataset.

In [None]:
# Merging reviews and meta datasets
# Each review must receive at most one price. The lookup is therefore reduced to one row
# per asin, and validate='m:1' makes pandas verify that uniqueness (the previous 'm:m'
# setting performs no check at all), so the left join can never duplicate review rows.
df_hk=hk.merge(hk_meta[['asin','price']].drop_duplicates(subset='asin'),on='asin',how='left',validate='m:1')
df_hk

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price
0,APYOBQE6M18AA,0615391206,Martin Schwartz,My daughter wanted this book and the price on ...,5.0,"10 19, 2013",0,0,,17.29
1,A1JVQTAGHYOL7F,0615391206,Michelle Dinh,I bought this zoku quick pop for my daughterr ...,5.0,"06 18, 2014",0,0,,17.29
2,A3UPYGJKZ0XTU4,0615391206,mirasreviews,There is no shortage of pop recipes available ...,4.0,"05 5, 2013",26,27,0.962963,17.29
3,A2MHCTX43MIMDZ,0615391206,"M. Johnson ""Tea Lover""",This book is a must have if you get a Zoku (wh...,5.0,"08 4, 2011",14,18,0.777778,17.29
4,AHAI85T5C2DH3,0615391206,PugLover,This cookbook is great. I have really enjoyed...,4.0,"06 7, 2014",0,0,,17.29
...,...,...,...,...,...,...,...,...,...,...
551677,A11J1FHCK5U06J,B00LBFUU12,Karinna Ball,These ice pop molds are awesome! Bright kid-ha...,5.0,"07 10, 2014",0,0,,8.99
551678,A537XC69FAD3J,B00LBFUU12,L Green,great popsicle molds - very nice quality - and...,5.0,"07 15, 2014",0,0,,8.99
551679,AWHZOUIQ0VO7M,B00LBFUU12,Richard N,My kids and I are loving these - putting our c...,5.0,"07 16, 2014",0,0,,8.99
551680,A1KQNP8MOJDJKC,B00LBFUU12,RS,"I love these ice pop makers. First off, I love...",5.0,"07 13, 2014",1,1,1.000000,8.99


In [None]:
# Saving dataset on google drive
df_hk.to_csv('/content/drive/MyDrive/home_kitchen_merged.csv')

In [None]:
df_hk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551682 entries, 0 to 551681
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   reviewerID         551682 non-null  object 
 1   asin               551682 non-null  object 
 2   reviewerName       546729 non-null  object 
 3   reviewText         551682 non-null  object 
 4   overall            551682 non-null  float64
 5   reviewTime         551682 non-null  object 
 6   Positive_feedback  551682 non-null  int64  
 7   Total_feedback     551682 non-null  int64  
 8   helpfulness_ratio  248634 non-null  float64
 9   price              505620 non-null  float64
dtypes: float64(3), int64(2), object(5)
memory usage: 46.3+ MB


In [None]:
df_hk.isnull().sum()

reviewerID                0
asin                      0
reviewerName           4953
reviewText                0
overall                   0
reviewTime                0
Positive_feedback         0
Total_feedback            0
helpfulness_ratio    303048
price                 46062
dtype: int64

In [None]:
# Checking for empty reviews
print('There are %d empty reviews.' %df_hk['reviewText'].eq('').sum())

display(df_hk[df_hk['reviewText'].eq('')].head())

There are 0 empty reviews.


Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price


## Grocery and Gourmet Food :

#### Reviews :

In [None]:
# 5 core reviews
!wget 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Grocery_and_Gourmet_Food_5.json.gz'

# Dataframe
ggf = getDF('reviews_Grocery_and_Gourmet_Food_5.json.gz')
ggf

--2023-08-22 09:36:08--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Grocery_and_Gourmet_Food_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36760767 (35M) [application/x-gzip]
Saving to: ‘reviews_Grocery_and_Gourmet_Food_5.json.gz.1’


2023-08-22 09:36:13 (7.83 MB/s) - ‘reviews_Grocery_and_Gourmet_Food_5.json.gz.1’ saved [36760767/36760767]



Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1VEELTKS8NLZB,616719923X,Amazon Customer,"[0, 0]",Just another flavor of Kit Kat but the taste i...,4.0,Good Taste,1370044800,"06 1, 2013"
1,A14R9XMZVJ6INB,616719923X,amf0001,"[0, 1]",I bought this on impulse and it comes from Jap...,3.0,"3.5 stars, sadly not as wonderful as I had hoped",1400457600,"05 19, 2014"
2,A27IQHDZFQFNGG,616719923X,Caitlin,"[3, 4]",Really good. Great gift for any fan of green t...,4.0,Yum!,1381190400,"10 8, 2013"
3,A31QY5TASILE89,616719923X,DebraDownSth,"[0, 0]","I had never had it before, was curious to see ...",5.0,Unexpected flavor meld,1369008000,"05 20, 2013"
4,A2LWK003FFMCI5,616719923X,Diana X.,"[1, 2]",I've been looking forward to trying these afte...,4.0,"Not a very strong tea flavor, but still yummy ...",1369526400,"05 26, 2013"
...,...,...,...,...,...,...,...,...,...
151249,A2L6QS8SVHT9RG,B00KCJRVO2,"randomartco ""period film aficionado""","[0, 0]",Delicious gluten-free oatmeal: we tried both t...,4.0,Delicious gluten-free oatmeal 'quick' packs!,1405123200,"07 12, 2014"
151250,AFJFXN42RZ3G2,B00KCJRVO2,"R. DelParto ""Rose2""","[0, 0]",With the many selections of instant oatmeal ce...,4.0,Convenient and Instant,1404604800,"07 6, 2014"
151251,ASEBX8TBYWQWA,B00KCJRVO2,"Steven I. Ramm ""Steve Ramm &#34;Anything Phon...","[1, 1]","While I usually review CDs and DVDs, as well a...",5.0,Compares favorably in taste and texture with o...,1404172800,"07 1, 2014"
151252,ANKQGTXHREOI5,B00KCJRVO2,Titanium Lili,"[0, 1]",My son and I enjoyed these oatmeal packets. H...,4.0,Pretty good!,1404432000,"07 4, 2014"


In [None]:
# Checking null values
ggf.isnull().sum()

reviewerID           0
asin                 0
reviewerName      1493
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
dtype: int64

**<h2> Helpful column : </h2>**


*   Checking the elements in the list.
*   Dropping records with errors in the column.
*   Splitting the list and creating columns.
*   Getting the helpfulness ratio from the column.
*   Dropping the original 'helpful' column from the dataframe.




In [None]:
# Checking the 'helpful' column
x=pd.DataFrame(ggf['helpful'].copy())
display(ggf['helpful'].value_counts()[0:5]/len(ggf))

x['1>2']=x['helpful'].apply(check)

display(x.head())
print('\n')
display(x['1>2'].value_counts())
print('\n')
display(ggf.loc[x[x['1>2']=='Yes'].index[0],:])

[0, 0]    0.571244
[1, 1]    0.122430
[0, 1]    0.055979
[1, 2]    0.039087
[2, 2]    0.038366
Name: helpful, dtype: float64

Unnamed: 0,helpful,1>2
0,"[0, 0]",Equal
1,"[0, 1]",No
2,"[3, 4]",No
3,"[0, 0]",Equal
4,"[1, 2]",No






Equal    118736
No        32517
Yes           1
Name: 1>2, dtype: int64





reviewerID                                           A2V0I904FH7ABY
asin                                                     B001EQ55RW
reviewerName                                                   XNOR
helpful                                                      [3, 2]
reviewText        It was almost a 'love at first bite' - the per...
overall                                                         4.0
summary                Pure cocoa taste with crunchy almonds inside
unixReviewTime                                           1212883200
reviewTime                                               06 8, 2008
Name: 52803, dtype: object

There is one anomaly in the helpful column.

In [None]:
# Dropping every record where the number of positive votes is greater than the total number of votes
# The whole index of flagged rows is passed (not just its first entry), so all anomalies
# are removed, and the cell is a no-op instead of an IndexError when none are flagged.
ggf.drop(x[x['1>2']=='Yes'].index,axis=0,inplace=True)

In [None]:
# Creating 2 columns from the helpful column
# The frame is built directly from the list values instead of calling a lambda once per
# row (apply with axis=1), which is slow on a dataframe of this size.
# index=ggf.index keeps the new columns aligned with ggf, whose index has a gap after
# the anomalous record was dropped.
ggf[['Positive_feedback','Total_feedback']]=pd.DataFrame(ggf['helpful'].tolist(),index=ggf.index)

ggf.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Positive_feedback,Total_feedback
0,A1VEELTKS8NLZB,616719923X,Amazon Customer,"[0, 0]",Just another flavor of Kit Kat but the taste i...,4.0,Good Taste,1370044800,"06 1, 2013",0,0
1,A14R9XMZVJ6INB,616719923X,amf0001,"[0, 1]",I bought this on impulse and it comes from Jap...,3.0,"3.5 stars, sadly not as wonderful as I had hoped",1400457600,"05 19, 2014",0,1
2,A27IQHDZFQFNGG,616719923X,Caitlin,"[3, 4]",Really good. Great gift for any fan of green t...,4.0,Yum!,1381190400,"10 8, 2013",3,4
3,A31QY5TASILE89,616719923X,DebraDownSth,"[0, 0]","I had never had it before, was curious to see ...",5.0,Unexpected flavor meld,1369008000,"05 20, 2013",0,0
4,A2LWK003FFMCI5,616719923X,Diana X.,"[1, 2]",I've been looking forward to trying these afte...,4.0,"Not a very strong tea flavor, but still yummy ...",1369526400,"05 26, 2013",1,2


In [None]:
# Creating a column of helpfulness ratio
ggf['helpfulness_ratio']=ggf['helpful'].apply(helpful_ratio)

display(pd.concat([ggf.head(3),ggf.tail(3)],axis=0))

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio
0,A1VEELTKS8NLZB,616719923X,Amazon Customer,"[0, 0]",Just another flavor of Kit Kat but the taste i...,4.0,Good Taste,1370044800,"06 1, 2013",0,0,
1,A14R9XMZVJ6INB,616719923X,amf0001,"[0, 1]",I bought this on impulse and it comes from Jap...,3.0,"3.5 stars, sadly not as wonderful as I had hoped",1400457600,"05 19, 2014",0,1,0.0
2,A27IQHDZFQFNGG,616719923X,Caitlin,"[3, 4]",Really good. Great gift for any fan of green t...,4.0,Yum!,1381190400,"10 8, 2013",3,4,0.75
151251,ASEBX8TBYWQWA,B00KCJRVO2,"Steven I. Ramm ""Steve Ramm &#34;Anything Phon...","[1, 1]","While I usually review CDs and DVDs, as well a...",5.0,Compares favorably in taste and texture with o...,1404172800,"07 1, 2014",1,1,1.0
151252,ANKQGTXHREOI5,B00KCJRVO2,Titanium Lili,"[0, 1]",My son and I enjoyed these oatmeal packets. H...,4.0,Pretty good!,1404432000,"07 4, 2014",0,1,0.0
151253,A2CF66KIQ3RKX3,B00KCJRVO2,Vivian Deliz,"[0, 0]",I like to eat oatmeal i the mornings. I usuall...,4.0,I like to eat oatmeal i the mornings,1405036800,"07 11, 2014",0,0,


**<h2> Review columns : </h2>**


*   Combining the reviewText and summary columns
*   Dropping the summary column

In [None]:
# Concatenating the strings in reviewText and summary columns
ggf['reviewText']=ggf['reviewText']+' '+ggf['summary']

ggf.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio
0,A1VEELTKS8NLZB,616719923X,Amazon Customer,"[0, 0]",Just another flavor of Kit Kat but the taste i...,4.0,Good Taste,1370044800,"06 1, 2013",0,0,
1,A14R9XMZVJ6INB,616719923X,amf0001,"[0, 1]",I bought this on impulse and it comes from Jap...,3.0,"3.5 stars, sadly not as wonderful as I had hoped",1400457600,"05 19, 2014",0,1,0.0
2,A27IQHDZFQFNGG,616719923X,Caitlin,"[3, 4]",Really good. Great gift for any fan of green t...,4.0,Yum!,1381190400,"10 8, 2013",3,4,0.75
3,A31QY5TASILE89,616719923X,DebraDownSth,"[0, 0]","I had never had it before, was curious to see ...",5.0,Unexpected flavor meld,1369008000,"05 20, 2013",0,0,
4,A2LWK003FFMCI5,616719923X,Diana X.,"[1, 2]",I've been looking forward to trying these afte...,4.0,"Not a very strong tea flavor, but still yummy ...",1369526400,"05 26, 2013",1,2,0.5


In [None]:
ggf['reviewText'][0]

'Just another flavor of Kit Kat but the taste is unique and a bit different.  The only thing that is bothersome is the price.  I thought it was a bit expensive.... Good Taste'

**<h2> unixReviewTime  :</h2>**

- The unix time stamp is a way to track time as a running total of seconds. This count starts at the Unix Epoch on January 1st, 1970 at UTC.

- The unix time stamp is merely the number of seconds between a particular date and the Unix Epoch.

- This point in time technically does not change no matter where you are located on the globe, and so it's very useful to computer systems for tracking and sorting dated information.

- We'll be removing this column from the dataframe since the 'reviewTime' column is sufficient for this project.

In [None]:
# Dropping irrelevant columns
ggf.drop(['helpful','summary','unixReviewTime'],axis=1,inplace=True)
ggf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151253 entries, 0 to 151253
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   reviewerID         151253 non-null  object 
 1   asin               151253 non-null  object 
 2   reviewerName       149760 non-null  object 
 3   reviewText         151253 non-null  object 
 4   overall            151253 non-null  float64
 5   reviewTime         151253 non-null  object 
 6   Positive_feedback  151253 non-null  int64  
 7   Total_feedback     151253 non-null  int64  
 8   helpfulness_ratio  64850 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 11.5+ MB


#### Metadata :

In [None]:
# Metadata
!wget 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Grocery_and_Gourmet_Food.json.gz'

# Dataframe
ggf_meta = getDF('meta_Grocery_and_Gourmet_Food.json.gz')
ggf_meta

--2023-08-22 20:15:02--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Grocery_and_Gourmet_Food.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54490289 (52M) [application/x-gzip]
Saving to: ‘meta_Grocery_and_Gourmet_Food.json.gz.3’


2023-08-22 20:15:45 (1.22 MB/s) - ‘meta_Grocery_and_Gourmet_Food.json.gz.3’ saved [54490289/54490289]



Unnamed: 0,asin,description,title,imUrl,related,salesRank,categories,price,brand
0,0657745316,This is real vanilla extract made with only 3 ...,100 Percent All Natural Vanilla Extract,http://ecx.images-amazon.com/images/I/41gFi5h0...,{'also_viewed': ['B001GE8N4Y']},{'Grocery & Gourmet Food': 374004},[[Grocery & Gourmet Food]],,
1,0700026444,"Silverpot Tea, Pure Darjeeling, is an exquisit...",Pure Darjeeling Tea: Loose Leaf,http://ecx.images-amazon.com/images/I/51hs8sox...,,{'Grocery & Gourmet Food': 620307},[[Grocery & Gourmet Food]],,
2,1403796890,Must have for any WWE Fan\n \n \n \nFeaturing ...,WWE Kids Todler Velvet Slippers featuring John...,http://ecx.images-amazon.com/images/I/518SEST5...,,,[[Grocery & Gourmet Food]],3.99,
3,141278509X,Infused with Vitamins and Electrolytes Good So...,Archer Farms Strawberry Dragonfruit Drink Mix ...,http://ecx.images-amazon.com/images/I/51CFQIis...,{'also_viewed': ['B0051IETTY']},{'Grocery & Gourmet Food': 620322},[[Grocery & Gourmet Food]],,
4,1453060375,MiO Energy is your portable energy source givi...,Mio Energy Liquid Water Enhancer Black Cherry ...,http://ecx.images-amazon.com/images/I/51EUsMcn...,"{'also_viewed': ['B006MSEOJ2', 'B005VOOQLO', '...",{'Grocery & Gourmet Food': 268754},[[Grocery & Gourmet Food]],11.99,Mio
...,...,...,...,...,...,...,...,...,...
171755,B00LDXFI6Y,Nescafe Cafe Viet is extracted from the aromat...,Nescafe Cafe Viet Vietnamese Sweetened Instant...,http://ecx.images-amazon.com/images/I/51qAGS7j...,{'also_viewed': ['B000DN8EZW']},,[[Grocery & Gourmet Food]],17.99,
171756,B00LMMLRG6,Moon Cheese Snacks Moon Cheese High in protein...,"Moon Cheese, 2 Oz. Pack of Three (Assortment)",http://ecx.images-amazon.com/images/I/419FO438...,{'also_viewed': ['B000UPFWW6']},{'Grocery & Gourmet Food': 54090},[[Grocery & Gourmet Food]],16.95,
171757,B00LOXAZ1Q,Sour Punch candy is the brand of mouth waterin...,"Sour Punch Blue Raspberry Bite, 5 Ounce Bag --...",http://ecx.images-amazon.com/images/I/31Cj3cHD...,,{'Grocery & Gourmet Food': 133517},[[Grocery & Gourmet Food]],16.55,
171758,B00LOZ7F0S,"Our Vanilla Extract made from\nPremium, Organi...",Organic Mexican Vanilla,http://ecx.images-amazon.com/images/I/11iORwy7...,,,[[Grocery & Gourmet Food]],,


In [None]:
# Every 'categories' value is a list of lists; keeping only its first inner list
ggf_meta['categories']=ggf_meta['categories'].map(lambda nested : nested[0])

ggf_meta.head()

Unnamed: 0,asin,description,title,imUrl,related,salesRank,categories,price,brand
0,0657745316,This is real vanilla extract made with only 3 ...,100 Percent All Natural Vanilla Extract,http://ecx.images-amazon.com/images/I/41gFi5h0...,{'also_viewed': ['B001GE8N4Y']},{'Grocery & Gourmet Food': 374004},[Grocery & Gourmet Food],,
1,0700026444,"Silverpot Tea, Pure Darjeeling, is an exquisit...",Pure Darjeeling Tea: Loose Leaf,http://ecx.images-amazon.com/images/I/51hs8sox...,,{'Grocery & Gourmet Food': 620307},[Grocery & Gourmet Food],,
2,1403796890,Must have for any WWE Fan\n \n \n \nFeaturing ...,WWE Kids Todler Velvet Slippers featuring John...,http://ecx.images-amazon.com/images/I/518SEST5...,,,[Grocery & Gourmet Food],3.99,
3,141278509X,Infused with Vitamins and Electrolytes Good So...,Archer Farms Strawberry Dragonfruit Drink Mix ...,http://ecx.images-amazon.com/images/I/51CFQIis...,{'also_viewed': ['B0051IETTY']},{'Grocery & Gourmet Food': 620322},[Grocery & Gourmet Food],,
4,1453060375,MiO Energy is your portable energy source givi...,Mio Energy Liquid Water Enhancer Black Cherry ...,http://ecx.images-amazon.com/images/I/51EUsMcn...,"{'also_viewed': ['B006MSEOJ2', 'B005VOOQLO', '...",{'Grocery & Gourmet Food': 268754},[Grocery & Gourmet Food],11.99,Mio


#### Merging data :
Adding a new column in the reviews dataset by mapping the productID (asin) in reviews with the price of the product available in the meta dataset.

In [None]:
# Left-joining the product price from the metadata onto every review, keyed on 'asin'
# (validate='m:m' is accepted by pandas but performs no uniqueness check on either side)
price_lookup=ggf_meta[['asin','price']]
df_ggf=pd.merge(ggf,price_lookup,how='left',on='asin',validate='m:m')
df_ggf

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price
0,A1VEELTKS8NLZB,616719923X,Amazon Customer,Just another flavor of Kit Kat but the taste i...,4.0,"06 1, 2013",0,0,,
1,A14R9XMZVJ6INB,616719923X,amf0001,I bought this on impulse and it comes from Jap...,3.0,"05 19, 2014",0,1,0.00,
2,A27IQHDZFQFNGG,616719923X,Caitlin,Really good. Great gift for any fan of green t...,4.0,"10 8, 2013",3,4,0.75,
3,A31QY5TASILE89,616719923X,DebraDownSth,"I had never had it before, was curious to see ...",5.0,"05 20, 2013",0,0,,
4,A2LWK003FFMCI5,616719923X,Diana X.,I've been looking forward to trying these afte...,4.0,"05 26, 2013",1,2,0.50,
...,...,...,...,...,...,...,...,...,...,...
151248,A2L6QS8SVHT9RG,B00KCJRVO2,"randomartco ""period film aficionado""",Delicious gluten-free oatmeal: we tried both t...,4.0,"07 12, 2014",0,0,,13.0
151249,AFJFXN42RZ3G2,B00KCJRVO2,"R. DelParto ""Rose2""",With the many selections of instant oatmeal ce...,4.0,"07 6, 2014",0,0,,13.0
151250,ASEBX8TBYWQWA,B00KCJRVO2,"Steven I. Ramm ""Steve Ramm &#34;Anything Phon...","While I usually review CDs and DVDs, as well a...",5.0,"07 1, 2014",1,1,1.00,13.0
151251,ANKQGTXHREOI5,B00KCJRVO2,Titanium Lili,My son and I enjoyed these oatmeal packets. H...,4.0,"07 4, 2014",0,1,0.00,13.0


In [None]:
#df_ggf.to_csv('/content/drive/MyDrive/grocery_gourmet_merged.csv')

In [None]:
# Column dtypes and non-null counts of the merged reviews + price dataframe
df_ggf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151253 entries, 0 to 151252
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   reviewerID         151253 non-null  object 
 1   asin               151253 non-null  object 
 2   reviewerName       149760 non-null  object 
 3   reviewText         151253 non-null  object 
 4   overall            151253 non-null  float64
 5   reviewTime         151253 non-null  object 
 6   Positive_feedback  151253 non-null  int64  
 7   Total_feedback     151253 non-null  int64  
 8   helpfulness_ratio  64850 non-null   float64
 9   price              126733 non-null  float64
dtypes: float64(3), int64(2), object(5)
memory usage: 12.7+ MB


In [None]:
# Number of missing values in every column of the merged dataframe
df_ggf.isnull().sum()

reviewerID               0
asin                     0
reviewerName          1493
reviewText               0
overall                  0
reviewTime               0
Positive_feedback        0
Total_feedback           0
helpfulness_ratio    86403
price                24520
dtype: int64

In [None]:
# Reviews whose text is an empty string (these are not counted as nulls by isnull)
empty_review_mask=df_ggf['reviewText'].eq('')

print('There are %d empty reviews.' %empty_review_mask.sum())

display(df_ggf[empty_review_mask].head())

There are 0 empty reviews.


Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price


# Feature Engineering and Saving the datasets :

### Functions used to handle nulls and create new features :

In [None]:
# String of punctuations
# ' and ` are taken out of string.punctuation so that txtpreprocess, which strips every character found
# in this string, keeps apostrophes (it first rewrites ` as ' so that stop words can be detected)
puncts=re.sub("'|`",'',string.punctuation)                 # Removing ' and ` from a string of punctuations

# NLP stop words
# In-place union on spaCy's default stop words : the three custom words are added to the shared collection
nlp.Defaults.stop_words |= {'blah','etc','eg'}             # Adding custom words into nlp stop_words list (326 words per the original note)

# Plain assignment, no copy : this name refers to the same object as nlp.Defaults.stop_words
stop_words_list=nlp.Defaults.stop_words                    # Collection of nlp stop_words used by txtpreprocess

In [None]:
# Function to detect the language of the given text
def detect_lang(txt) :
  """Return the English name of the language detected in `txt`, or 'Unknown'.

  The text is lower-cased and passed to langdetect's `detect`; the returned ISO code is
  then looked up in pycountry by its alpha_2 code.

  'Unknown' is returned whenever any step fails : a non-string input, a text langdetect
  cannot classify, or a code pycountry cannot resolve to an object with a `.name`.
  """
  try :
    iso_code = detect(str.lower(txt))                             # Using detect module of langdetect to get iso_code from a text
    language = pycountry.languages.get(alpha_2=iso_code)          # pycountry record for that iso_code
    return language.name                                          # returning the name of the language based on the iso_code
  except Exception :                                              # best-effort lookup, but Ctrl-C / SystemExit still propagate
    return 'Unknown'

# Function to find the count of words in a text
def txt_len(txt) :
  """Return the number of pieces obtained by splitting `txt` on single spaces.

  Consecutive spaces produce empty pieces that are counted too, and an empty string
  counts as one piece. Anything that cannot be split this way (None, NaN, numbers,
  bytes) yields 0.
  """
  try :
    return len(txt.split(' '))                    # len() already returns an int, no conversion needed
  except (AttributeError,TypeError) :             # not a str : no .split, or .split rejects a str separator
    return 0

# Function for text pre processing
def txtpreprocess(old_txt,punctuation_string,nlp_obj=False) :
  """Clean a raw text and return either the cleaned string or its spaCy object.

  Cleaning steps, in order : web links are removed, ` is rewritten as ', every character
  found in `punctuation_string` or in string.digits is dropped and the rest is lower-cased,
  line breaks / tabs / pipes become a space, and runs such as 'xx' or 'okk' are removed.

  Parameters
    old_txt            : raw text
    punctuation_string : characters to strip from the text
    nlp_obj            : when truthy, the spaCy object built from the cleaned text is returned;
                         otherwise a space-joined string of lemmas is returned

  The lemma string keeps a token only if its lemma is not a stop word and not 'I', it is
  not whitespace, it is longer than 2 characters, it is in the model's vocabulary and it
  is not tagged as a PERSON entity.

  Returns '' when any step fails (for example when `old_txt` is not a string).
  """
  try :
    new_txt=re.sub(r'http\S+','',old_txt)                                                                      # removing any web links
    new_txt=new_txt.replace('`',"'")                                                                           # replacing ` with ' so that stop words can be detected
    new_txt=''.join(x.lower() for x in new_txt if x not in punctuation_string and x not in string.digits)      # removing punctuations and digits
    # Escape characters are replaced by a space, not deleted : deleting them glued the last word of a
    # line to the first word of the next one, and the glued token was then discarded as out-of-vocabulary
    new_txt=re.sub(r'[\n\t\r|]+',' ',new_txt)                                                                  # replacing escape characters (and pipes) with a space
    new_txt=re.sub(r'x{2,}|X{2,}|ok{2,}','',new_txt)                                                           # removing repeat words

    txt_obj=nlp(new_txt)                                                                                       # NLP object based on cleaned text

    if nlp_obj :                                                                                       # Caller asked for the NLP object itself
      return txt_obj

    token_list=[token.lemma_ for token in txt_obj if token.lemma_ not in stop_words_list and
                token.lemma_ !='I' and not token.is_space and len(token)>2 and
                not token.is_oov and token.ent_type_!='PERSON']                                                # list of lemmatized tokens satisfying given conditions
    return ' '.join(token_list)                                                                                # returns this list as a string (cleaned text)

  except Exception :                                                                                   # best-effort cleaning : any failure yields an empty text
    return ''

# Function to get polarity score from cleaned text
def get_polarity(clean_txt,positives,negatives) :
  """Score a cleaned text against positive and negative word lexicons.

  Parameters
    clean_txt : cleaned text, words separated by single spaces
    positives : object whose `.values` holds the positive words (e.g. a pandas Series)
    negatives : object whose `.values` holds the negative words

  Returns a 6-tuple
    (positive words found, negative words found, number of positive words,
     number of negative words, total number of words, polarity score)
  where score = (positives - negatives) / total words, rounded to 2 decimals.
  Words are counted with repetition and in text order.

  Returns pd.NaT when the inputs cannot be processed (e.g. `clean_txt` is not a string).
  """
  try :
    review=clean_txt.split(' ')

    # Building each lexicon as a set once : membership is then O(1) per word instead of
    # scanning the whole `.values` array for every word of the review
    positive_words=set(np.asarray(positives.values).ravel().tolist())
    negative_words=set(np.asarray(negatives.values).ravel().tolist())

    pos=[p for p in review if p in positive_words]
    neg=[p for p in review if p in negative_words]

    score=round((len(pos)-len(neg))/len(review),2)             # split(' ') never returns an empty list, so no division by zero

    return pos,neg,len(pos),len(neg),len(review),score
  except Exception :
    return pd.NaT

# Function to get sentiment of a text from the polarity score
def get_sentiment(score):
  """Map a polarity score to a sentiment label.

    score < 0          -> 'Negative'
    score > 0.15       -> 'Positive'
    anything else      -> 'Neutral'   (0 <= score <= 0.15, and also NaN since both comparisons are False)

  Returns pd.NaT when the score cannot be compared with a number (None, a string, an array).
  """
  try :
    if score < 0 :
      return 'Negative'
    if score > 0.15 :
      return "Positive"
    return "Neutral"
  except (TypeError,ValueError) :           # non-numeric score, or an array whose truth value is ambiguous
    return pd.NaT

# Function to apply all above defined functions
def all_functions(lang,txt,punctuations,pos_txt,neg_txt) :
  """Run cleaning, polarity scoring and sentiment labelling on an English text.

  Parameters
    lang         : language label of the text; only 'English' is processed
    txt          : raw text
    punctuations : characters that txtpreprocess strips from the text
    pos_txt      : positive lexicon handed to get_polarity
    neg_txt      : negative lexicon handed to get_polarity

  Returns None for any non-English text, otherwise the 8-tuple
    (cleaned text, positive words, negative words, number of positives,
     number of negatives, total words, polarity score, sentiment).

  get_polarity returns pd.NaT instead of a tuple when it fails; the unpacking below
  then raises.
  """
  # Texts that are NOT in english are skipped
  if lang!='English' :
    return None

  # cleaning the text
  cleantxt=txtpreprocess(old_txt=txt,punctuation_string=punctuations)

  # lists of positive and negative words, their quantities, the word total and the polarity score
  polarity_parts=get_polarity(clean_txt=cleantxt,positives=pos_txt,negatives=neg_txt)
  pos_words,neg_words,no_pos,no_neg,total_words,polarity=polarity_parts

  # sentiment label derived from the polarity score
  sentiment=get_sentiment(score=polarity)

  return (cleantxt,pos_words,neg_words,no_pos,no_neg,total_words,polarity,sentiment)

In [None]:
# Function to filter nulls
def nulls(x) :
  """Return True only when `x` is exactly of type str; None, NaN, numbers and lists give False."""
  return type(x)==str

# Function to get number of elements in a list
def related_number(col) :
  """Return len(col), or 0 when `col` has no length (e.g. NaN or None in an empty field)."""
  try :
    return len(col)
  except TypeError :                    # raised by len() for objects without a length, such as float NaN
    return 0

# Function to concat strings
def concat_txt(*args) :
  """Join all string arguments with a single space, ignoring everything that is not a string.

  Nulls (None / NaN) and any other non-string values are skipped; '' is returned when no
  string is left.

  The string check is done inline instead of through the notebook-level name `nulls` : that
  name is re-assigned to a dataframe by a later cell, after which the former
  filter(nulls,args) raised inside the try block and every call silently returned ''.
  """
  return ' '.join(arg for arg in args if isinstance(arg,str))

# Function to detect entities for brand column from given strings
def get_entities(*args,return_dict=False) :
  """Find ORG, PRODUCT and PERSON entities in the joined input strings.

  The arguments are joined by concat_txt and run through the spaCy pipeline. An entity is
  kept only when its text has fewer than 3 space-separated words (i.e. 1 or 2 words).

  Parameters
    *args       : strings to search (for example description and title)
    return_dict : when False, return a single entity as a string; otherwise return the
                  dictionary {'ORG':[...],'PRODUCT':[...],'PERSON':[...]} of entity spans

  In single-entity mode the first ORG is returned if there is one, else the first PRODUCT,
  else the first PERSON, else 'Unknown'.
  """
  wanted_labels=('ORG','PRODUCT','PERSON')                  # also the priority order used for the single result
  entity={label:[] for label in wanted_labels}              # dictionary to hold the entities found per label

  doc=nlp(concat_txt(*args))                                # nlp object of joined string filtered from nulls

  for span in doc.ents :                                    # looping over entities in nlp object
    is_short=len(str(span.text).split(' '))<3               # entity made of 1 or 2 words
    if span.label_ in entity and is_short :                 # any other entity label is skipped
      entity[span.label_].append(span)

  if return_dict!=False :                                   # the whole dictionary of entities is requested
    return entity

  for label in wanted_labels :                              # first non-empty list wins : ORG, then PRODUCT, then PERSON
    if len(entity[label])>0 :
      return str(entity[label][0])

  return 'Unknown'                                          # no entity of any wanted label was found

## Meta datasets :

In [None]:
# Stacking the Home & Kitchen metadata on top of the Grocery & Gourmet Food metadata with a fresh 0..n-1 index
meta_data=pd.concat([hk_meta,ggf_meta],ignore_index=True)

# First two and last two rows of the combined metadata
meta_edges=[meta_data.head(2),meta_data.tail(2)]
display(pd.concat(meta_edges))

Unnamed: 0,asin,salesRank,imUrl,categories,title,description,related,price,brand
0,0076144011,{'Books': 6285595},http://g-ecx.images-amazon.com/images/G/01/x-s...,[Home & Kitchen],"Ninjas, Piranhas, and Galileo",,,,
1,0130350591,{'Kitchen & Dining': 459680},http://ecx.images-amazon.com/images/I/21zcx6RC...,"[Home & Kitchen, Kitchen & Dining, Dining & En...",Le Creuset Kiwi (Green) Butter Dish Stoneware,Each piece of Le Creuset dinnerware is crafted...,,,
608746,B00LOZ7F0S,,http://ecx.images-amazon.com/images/I/11iORwy7...,[Grocery & Gourmet Food],Organic Mexican Vanilla,"Our Vanilla Extract made from\nPremium, Organi...",,,
608747,B00LQWKDBM,,http://ecx.images-amazon.com/images/I/31aDrMe4...,[Grocery & Gourmet Food],Mrs. Grass Onion Soup and Dip Mix,,"{'also_viewed': ['B00LAOSVPG', 'B001CAT22O']}",,


In [None]:
# Null count and null share of every column in the metadata
# NOTE(review): the assignment below rebinds the notebook-level name `nulls`, which until this cell
# referred to the nulls() string-filter function defined earlier - confirm nothing needs that function afterwards
null_counts=meta_data.isnull().sum()
nulls=pd.concat([null_counts,null_counts/len(meta_data)],axis=1)
nulls.columns=['null count','null %']
nulls

Unnamed: 0,null count,null %
asin,0,0.0
categories,0,0.0
title,1898,0.003118
description,59438,0.09764
price,210428,0.345673
brand,394932,0.648761


#### <h2> Brand column : </h2>

*   Adding existing brand names in spacy's entity recognition model
*   Getting brand names from title and description of products using NER technique
*   Replacing the empty brands with the extracted brand names.



In [None]:
# Cross-counting missing brands against the presence of a description
has_description=meta_data['description'].notna()
brand_missing=meta_data['brand'].isnull()

print('Number of potential brand names from description column : ',(has_description & brand_missing).sum())
print('Number of known brands where description is absent : ',(~has_description & ~brand_missing).sum())
print('Number of missing brands where description is also absent : ',(~has_description & brand_missing).sum())

Number of potential brand names from description column :  347151
Number of known brands where description is absent :  11657
Number of missing brands where description is also absent :  47781


In [None]:
# Brand names that will be fed to spacy's pipeline for Entity recognition
brands=meta_data['brand'].dropna().unique()                                     # every distinct brand name present in the dataset

# Brand names collected by hand during random trials of the get_entities function.
some_brands=[
  'Pfaltzgraff',"All-Clad's",'Mugzie','ArchStone','Cambridge Silversmiths','Paderno','Lipton',
  'OJIA','Maloofinspired',"Trader Joe's",'Bosch','Cuisinart','Mr. Nature','Johnson Bros.',
  'Pop Shots','Yohay Bakery','Square Dot','Better Bowls','Bakery Street','Miam.Miam',
  'Coffee Masters','Sympathy','Kipling','Dobby Dot','JAMOCHA','Sympathy Silks','Vinaroz','Disney',
]

print('There are %d unique not null brands in the dataset.' %len(brands))

There are 24476 unique not null brands in the dataset.


In [None]:
# Keeping only the brand names made exclusively of letters and/or spaces
proper_names=[b for b in brands if all(ch.isalpha() or ch.isspace() for ch in b)]

# Stacking filtered brand names with manually found brand names
brand_list=list(np.hstack([proper_names,some_brands]))

# Custom patterns for spacy's NER : the entity ruler takes a list of dictionaries to identify
# entity patterns in strings; every brand name is labelled 'ORG' and stored as a string
patterns=[{'label':'ORG','pattern':str(brand)} for brand in brand_list]

In [None]:
# Original spacy pipeline
print('Original pipeline : ')
display(nlp.pipe_names)

# Adding entity ruler in the pipeline before NER
# add_pipe refuses a component name that is already in the pipeline, so the existing ruler is
# reused when this cell is run again in the same session
if 'entity_ruler' in nlp.pipe_names :
  ruler=nlp.get_pipe('entity_ruler')
else :
  ruler=nlp.add_pipe('entity_ruler',before='ner')

# Amended spacy pipeline
print('\nAmended pipeline : ')
display(nlp.pipe_names)

Original pipeline : 


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


Amended pipeline : 


['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler',
 'ner']

In [None]:
# Adding the custom patterns in entity ruler to be used in entity recognition
# (`patterns` is the list of {'label':'ORG','pattern':<brand name>} dictionaries built above)
ruler.add_patterns(patterns)

In [None]:
# Sample run of the get_entities function's outputs
# .copy() makes the sample independent of meta_data : adding columns to a bare slice triggers pandas'
# SettingWithCopyWarning and leaves it unclear which dataframe is being modified
data_slice=meta_data[['description','title','brand']][0:20].copy()              # 1st few records in the metadata set

# Where the output is a dictionary
data_slice[['ORG','PRODUCT','PERSON']]=data_slice.apply(lambda x : get_entities(x['description'],x['title'],return_dict=True),
                                                        axis=1,result_type='expand')                                            # 3 new columns for the entities found

# Where the output is a single entity
data_slice['extracted_brand']=data_slice.apply(lambda x : get_entities(x['description'],x['title'],return_dict=False),axis=1)   # Column of entity extracted

display(data_slice.head())

# Filling nulls in the original 'brand' column with 'extracted' brand names
# (assigned back instead of fillna(inplace=True) on the selected column, which pandas flags as
# chained assignment and which does not update the dataframe under copy-on-write)
data_slice['brand']=data_slice['brand'].fillna(data_slice['extracted_brand'])
print('\n\n')
display(data_slice.head())

Unnamed: 0,description,title,brand,ORG,PRODUCT,PERSON,extracted_brand
0,,"Ninjas, Piranhas, and Galileo",,[],[(Galileo)],[],Galileo
1,Each piece of Le Creuset dinnerware is crafted...,Le Creuset Kiwi (Green) Butter Dish Stoneware,,"[(Le, Creuset), (Le, Creuset), (Le, Creuset), ...",[],[],Le Creuset
2,Of all the decisions that go into planning a w...,Martha Stewart's Wedding Cakes,Random House,"[(Martha, Stewart), (Martha, Stewart)]",[],"[(Wendy, Kromer), (forMartha), (Stewart, Weddi...",Martha Stewart
3,Shiver me timbers! Solve I SPY pirate picture ...,,,[],[],"[(Hunt), (Hunt)]",Hunt
4,The Maloofinspired Low Back Dining Chair is no...,Build A Maloof Inspired Low Back Dining Chair ...,,"[(Maloofinspired), (Just), (Instructional, Bun...",[],"[(Charles, Brock, 's), (Charles, Brock)]",Maloofinspired







Unnamed: 0,description,title,brand,ORG,PRODUCT,PERSON,extracted_brand
0,,"Ninjas, Piranhas, and Galileo",Galileo,[],[(Galileo)],[],Galileo
1,Each piece of Le Creuset dinnerware is crafted...,Le Creuset Kiwi (Green) Butter Dish Stoneware,Le Creuset,"[(Le, Creuset), (Le, Creuset), (Le, Creuset), ...",[],[],Le Creuset
2,Of all the decisions that go into planning a w...,Martha Stewart's Wedding Cakes,Random House,"[(Martha, Stewart), (Martha, Stewart)]",[],"[(Wendy, Kromer), (forMartha), (Stewart, Weddi...",Martha Stewart
3,Shiver me timbers! Solve I SPY pirate picture ...,,Hunt,[],[],"[(Hunt), (Hunt)]",Hunt
4,The Maloofinspired Low Back Dining Chair is no...,Build A Maloof Inspired Low Back Dining Chair ...,Maloofinspired,"[(Maloofinspired), (Just), (Instructional, Bun...",[],"[(Charles, Brock, 's), (Charles, Brock)]",Maloofinspired


In [None]:
# Extracting brands from the title and description of products
meta_data['extracted_brand']=meta_data.apply(lambda x : get_entities(x['description'],x['title'],return_dict=False),axis=1)

# Filling the empty brands (about 65% of the records) with the extracted brands
# (assigned back instead of fillna(inplace=True) on the selected column, which pandas flags as
# chained assignment and which does not update the dataframe under copy-on-write)
meta_data['brand']=meta_data['brand'].fillna(meta_data['extracted_brand'])

display(meta_data.sample(n=7))

Unnamed: 0,asin,categories,title,description,price,brand,extracted_brand
180153,B003QP3IDM,"[Home & Kitchen, Kitchen & Dining, Bakeware, C...",CK Products 4-1/4-Inch Hockey Goalie Sucker Ch...,"CK Products chocolate molds are first quality,...",1.99,CK Products,CK Products
245384,B005BSNHB0,"[Home & Kitchen, Artwork, Posters & Prints]","11x14 Picture / Poster Frame, Painted Wood Gra...",This brand new picture frame is manufactured i...,15.99,Poster,Poster
280324,B006WOR53I,"[Home & Kitchen, Kitchen & Dining, Dining & En...",Carson Home Accents The Original Rednek Rita G...,After the unbelievable success of the Orignal ...,10.25,Carson,Carson Home
587216,B00CGDDNSU,[Grocery & Gourmet Food],"Nature Valley, Protein, Salted Caramel Nut Che...","Nature Valley, Protein, Salted Caramel Nut Che...",27.75,Nature Valley,Nature Valley
411786,B00FKD27PI,"[Home & Kitchen, Heating, Cooling & Air Qualit...",Dyson AM01 Air Multiplier 12-Inch Table Fan in...,[if gte mso 9]><xml> <w:WordDocument> <w:View>...,,Zoom,Zoom
587223,B00CGF1S76,"[Grocery & Gourmet Food, Baby Foods, Baby Form...",Baby / Child Similac Expert Care Alimentum Hyp...,"Similac Alimentum Hypoallergenic, Powder, Just...",439.5,Similac Alimentum,Similac Alimentum
196941,B00448KSME,"[Home & Kitchen, Storage & Organization, Trash...","Rubbermaid 32-Gallon Round Wheeled Trash Can, ...","From a brand you can trust, Rubbermaid, our bl...",79.99,United Solutions,Rubbermaid


#### <h2> Related column : </h2>


*   Creating a dataframe by normalizing the json format in the column
*   Getting the count of list elements in every field.



In [None]:
# Flattening the nested 'related' json into one column per relation type and
# placing the 'asin' column in front of the normalised columns
meta_related=pd.concat([meta_data['asin'],pd.json_normalize(meta_data['related'])],axis=1)

# First three and last three rows of the result
related_edges=[meta_related.head(3),meta_related.tail(3)]
display(pd.concat(related_edges,axis=0))

Unnamed: 0,asin,also_bought,also_viewed,bought_together,buy_after_viewing
0,0076144011,,,,
1,0130350591,,,,
2,0307394530,"[144630163X, 1402717733, 0789327333, 144630285...","[030795434X, 076455719X, 1600611680, 144630163...","[0789327333, 144630163X]",
3,0439903491,,"[B000YFSZU8, B0016KWZB0, B0014YGPM8, B0019I3KE...",[B000GCBOR0],"[B000A40W4A, B000R4OGZE, B001BFRPVU, B000YFSZU8]"
4,0578060604,,[B004IO6RS8],,
...,...,...,...,...,...
608743,B00LDXFI6Y,,[B000DN8EZW],,
608744,B00LMMLRG6,,[B000UPFWW6],,
608745,B00LOXAZ1Q,,,,
608746,B00LOZ7F0S,,,,


In [None]:
# One 'no_<relation>' column per relation type, holding the number of elements in each list (0 for empty fields)
for relation in ['also_bought','also_viewed','bought_together','buy_after_viewing'] :
  meta_related['no_'+relation]=meta_related[relation].apply(related_number)

meta_related.head()

Unnamed: 0,asin,also_bought,also_viewed,bought_together,buy_after_viewing,no_also_bought,no_also_viewed,no_bought_together,no_buy_after_viewing
0,76144011,,,,,0,0,0,0
1,130350591,,,,,0,0,0,0
2,307394530,"[144630163X, 1402717733, 0789327333, 144630285...","[030795434X, 076455719X, 1600611680, 144630163...","[0789327333, 144630163X]",,100,60,2,0
3,439903491,,"[B000YFSZU8, B0016KWZB0, B0014YGPM8, B0019I3KE...",[B000GCBOR0],"[B000A40W4A, B000R4OGZE, B001BFRPVU, B000YFSZU8]",0,9,1,4
4,578060604,,[B004IO6RS8],,,0,1,0,0


#### <h2> Categories column :</h2>


*   Using Multilabel Binarizer to get unique categories



In [None]:
# Creating an instance of MultiLabelBinarizer
mlb=MultiLabelBinarizer()

# Creating a new dataframe with categories expanded into columns
# fit_transform turns each record's list of categories into one 0/1 indicator row;
# mlb.classes_ supplies the matching column names (one per distinct category)
catgs=pd.DataFrame(mlb.fit_transform(meta_data['categories']),columns=mlb.classes_)

catgs.insert(0,'asin',meta_data['asin'])          # Inserting asin column in the dataframe as the first column

# First three and last three rows of the indicator dataframe
pd.concat([catgs.head(3),catgs.tail(3)],axis=0)

Unnamed: 0,asin,AC Adapters,Accent Plates,Accessories,Accessories & Supplies,Acid Reflux Pillows,Acrylic Paintings,Active Dry Yeasts,Adjustable Chairs,Adobo Sauce,...,Woks & Stir-Fry Pans,Woodcut Prints,Worcestershire Sauce,Worktables & Workstations,Wort Chillers,Wrapping & Packaging,Yellow Mustard,Yogurt Makers,Zesters & Reamers,Ziti
0,0076144011,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0130350591,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0307394530,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608745,B00LOXAZ1Q,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608746,B00LOZ7F0S,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608747,B00LQWKDBM,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**There are 1595 categories in the complete metadata for both 'Home and Kitchen' and 'Grocery and Gourmet Food' combined.**

In [None]:
# Names of all category columns (everything after the leading 'asin' column)
catg_list=catgs.columns[1:].tolist()

# Saving the list as a single-column csv
catg_series=pd.Series(catg_list)
catg_series.to_csv('/content/drive/MyDrive/catg_list.csv',index=False)

In [None]:
# Reading the saved category list back; the csv has a single column whose header is '0'
catg_list2=pd.read_csv('/content/drive/MyDrive/catg_list.csv')

catg_list=catg_list2['0'].tolist()
catg_list

#### <h2>Saving the datasets :</h2>

In [None]:
# Removing the columns that are not stored with the metadata (done in place on meta_data)
meta_data.drop(columns=['salesRank','imUrl','related'],inplace=True)

In [None]:
# Writing the meta dataframe in chunks : CSV : META DATA
# Row ranges are taken with iloc instead of np.array_split : splitting a dataframe through numpy
# goes through DataFrame.swapaxes, which pandas has deprecated
meta_csv_path='/content/drive/MyDrive/capstone_project_meta_data.csv'
meta_rows_per_chunk=max(1,-(-len(meta_data)//20))                               # ceil(len/20) rows -> at most 20 chunks

for start in range(0,max(len(meta_data),1),meta_rows_per_chunk) :               # max(...,1) : an empty dataframe still writes its header
  first_chunk=start==0
  # the first chunk creates the file and writes the header, the others are appended (mode='a') without it
  meta_data.iloc[start:start+meta_rows_per_chunk].to_csv(meta_csv_path,header=first_chunk,
                                                         mode='w' if first_chunk else 'a',index=False)

In [None]:
# Writing the 'related' dataframe in chunks : CSV : META_RELATED
# Row ranges are taken with iloc instead of np.array_split : splitting a dataframe through numpy
# goes through DataFrame.swapaxes, which pandas has deprecated
related_csv_path='/content/drive/MyDrive/capstone_project_meta_related.csv'
related_rows_per_chunk=max(1,-(-len(meta_related)//20))                         # ceil(len/20) rows -> at most 20 chunks

for start in range(0,max(len(meta_related),1),related_rows_per_chunk) :         # max(...,1) : an empty dataframe still writes its header
  first_chunk=start==0
  # the first chunk creates the file and writes the header, the others are appended (mode='a') without it
  meta_related.iloc[start:start+related_rows_per_chunk].to_csv(related_csv_path,header=first_chunk,
                                                               mode='w' if first_chunk else 'a',index=False)

## Review datasets :

In [None]:
# Stacking the merged Home and Kitchen reviews on top of the Groceries and Gourmet Food reviews with a fresh 0..n-1 index
data=pd.concat([df_hk,df_ggf],ignore_index=True)

# Parsing 'reviewTime' strings such as '06 1, 2013' into dates
data['reviewTime']=data['reviewTime'].apply(date_time_column,format_date='%m %d, %Y')

# First three and last three rows of the combined reviews
edge_rows=[0,1,2]+[len(data)-k for k in (3,2,1)]
display(data.loc[edge_rows,:])

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price
0,APYOBQE6M18AA,0615391206,Martin Schwartz,My daughter wanted this book and the price on ...,5.0,2013-10-19,0,0,,17.29
1,A1JVQTAGHYOL7F,0615391206,Michelle Dinh,I bought this zoku quick pop for my daughterr ...,5.0,2014-06-18,0,0,,17.29
2,A3UPYGJKZ0XTU4,0615391206,mirasreviews,There is no shortage of pop recipes available ...,4.0,2013-05-05,26,27,0.962963,17.29
702932,ASEBX8TBYWQWA,B00KCJRVO2,"Steven I. Ramm ""Steve Ramm &#34;Anything Phon...","While I usually review CDs and DVDs, as well a...",5.0,2014-07-01,1,1,1.0,13.0
702933,ANKQGTXHREOI5,B00KCJRVO2,Titanium Lili,My son and I enjoyed these oatmeal packets. H...,4.0,2014-07-04,0,1,0.0,13.0
702934,A2CF66KIQ3RKX3,B00KCJRVO2,Vivian Deliz,I like to eat oatmeal i the mornings. I usuall...,4.0,2014-07-11,0,0,,13.0


#### **<h2> Language of review texts : </h2>**


*   Checking the language of the reviewText.
*   Confirming the results.
*   Re-assigning language label to mislabelled records


In [None]:
# Creating a language column
# langdetect is non-deterministic by default, so short or ambiguous reviews can receive a different
# label on every run; fixing the seed makes the language counts below reproducible
from langdetect import DetectorFactory
DetectorFactory.seed=0

data['language']=data['reviewText'].apply(detect_lang)

In [None]:
# Number of reviews per detected language, most frequent first
language_list=data['language'].value_counts()
language_list

English       702425
Spanish           77
Danish            57
Afrikaans         52
Norwegian         43
Romanian          36
Swedish           35
French            34
Italian           27
Catalan           27
Welsh             20
Slovenian         14
Albanian          14
Somali            10
Estonian          10
Dutch              9
Croatian           9
Slovak             6
Portuguese         6
Unknown            5
Czech              5
Indonesian         3
Polish             2
Tagalog            2
German             2
Turkish            2
Hungarian          1
Finnish            1
Latvian            1
Name: language, dtype: int64

In [None]:
# Reviews for which detect_lang fell back to 'Unknown'
data[data['language'].eq('Unknown')]

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price,language
45699,A2WCS6IE2K0LK9,B00008CM67,Lee ok hee,&#51665;&#50640;&#49436; &#52280; &#51096; &#5...,5.0,2013-04-06,0,1,0.0,84.99,Unknown
46184,A2WCS6IE2K0LK9,B00008CM6B,Lee ok hee,&#47956;&#44032; &#47566;&#51060; &#54644; &#4...,4.0,2013-04-06,1,3,0.333333,50.2,Unknown
186035,A2WCS6IE2K0LK9,B000ND3QN2,Lee ok hee,&#50508;&#47336;&#48120;&#45700; &#51228;&#544...,4.0,2013-04-06,1,2,0.5,49.99,Unknown
241677,A2WCS6IE2K0LK9,B0017IFSIS,Lee ok hee,&#46321;&#49328;&#44040; &#46412; &#51453;&#51...,4.0,2013-04-06,0,1,0.0,23.15,Unknown
417444,A2WCS6IE2K0LK9,B004RCCTRY,Lee ok hee,&#52280; &#51089;&#51008; &#49324;&#51060;&#51...,4.0,2013-04-06,0,3,0.0,12.75,Unknown


In [None]:
# Reviews labelled as Spanish, displayed so that the label can be checked by eye
data[data['language'].eq('Spanish')]

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price,language
4537,A12M2CMU3Z5MVU,B00004OCNJ,Edgar alexis martnez blanco,HOLA LE DOY 3 ESTRELLAS A ESTE PRODUCTO PORQUE NO TIENE LA FLEXIBILIDAD NECESARIA PARA RECOGER LOS VEGETALES PICADOS DE LA TABLA. DE RESTO NO ME QUEJO EDGAR MARTINEZ DESDE VENEZUELA,3.0,2013-03-08,2,7,0.285714,11.91,Spanish
5835,A2201YLB3SGLD8,B00004OCOF,"Marcos Miquilena ""tecnico optico""","definitivamnete funciona, no es hermoso ni es una obra de arte, pero hace muy bien el trabajo,de hecho ahora si esta el equipo completo de espatulas y cucharas oxo en mi cocina son robustos,de gran tama&ntilde;o y muy buen grip.lo recomiendo, sumelo a su set y tendra un gran performance en su cocina,eso si solo para aluminio (teflon )nunca,pero nunca con ollas de hierro (logde) ya que se derriten en una fraccione de segundo. excelente",5.0,2011-06-02,0,0,,8.99,Spanish
8723,APSCLGI2RUG4I,B00004RFQ4,"Eduardo Hidalgo ""Bolon 50""","Su precio vale lo que pesa. Es un producto de excelente calidad, lo recomiendo porque esta hecho de un material resistente y con un acabado de primera. Excelente Producto",5.0,2012-11-09,0,0,,21.68,Spanish
9502,APSCLGI2RUG4I,B00004S1AS,"Eduardo Hidalgo ""Bolon 50""",Un producto hecho de un material de primera.Si desea tener una manga para hacer sus galletas con una terminacion perfecta pues compre este producto italiano de excelente calidad Excelente producto,5.0,2012-11-09,0,4,0.000000,42.20,Spanish
20773,A2OW4JFB2SNTEV,B00004WKI3,"Olga Dorronsoro ""olga""",Magnifico y con 2 cuchillas diferentes. El estuche muy bueno pues esta siempre a mano en la cocina. Comprelo sin ninguna duda Buen cuchillo,5.0,2013-02-22,0,1,0.000000,39.95,Spanish
...,...,...,...,...,...,...,...,...,...,...,...
636716,AQPV11MJERSZ4,B003PWC346,Kyle,Delicious TRUE cinnamon!!!!!!!!!! TRUE Cinnamon,5.0,2014-07-01,0,0,,5.91,Spanish
641942,A2OW4JFB2SNTEV,B004165MWA,"Olga Dorronsoro ""olga""","Muy linda repisacon todas las hierbas y ali&ntilde;os que necesitaba. Muy contenta con esta compra, y se ve linda en la cocina Una belleza",5.0,2013-02-22,5,7,0.714286,71.84,Spanish
654340,A2OW4JFB2SNTEV,B004LWJFFY,"Olga Dorronsoro ""olga""","Los mejores higos que me he comido, los recomiendo, puedecomprarlos sin ninguna duda, son deliciosos y muy bien empacados. Para unos fig scones deliciosos Deliciosos",5.0,2013-02-28,0,1,0.000000,13.72,Spanish
656142,A2OW4JFB2SNTEV,B004PEKF2K,"Olga Dorronsoro ""olga""",Lo compre para preparar pan y veo que hay infinidad de formas de prepararlo. Estoy experimentando y me ha parecido delicioso Deli,5.0,2013-01-27,0,0,,40.11,Spanish


In [None]:
# Checking languages : showing the reviews of every language label that occurs fewer than 70 times
review_columns=['reviewText','overall','language']
rare_languages=language_list[language_list<70].index

# Temporarily lifting the row limit and the column width limit.
# option_context is used instead of set_option/reset_option because reset_option accepts a
# single option pattern, so both options cannot be reset in one call; the context manager
# puts both options back to their previous values on exit, even if display() raises.
with pd.option_context('display.max_rows',None,'display.max_colwidth',None) :
  for lang in rare_languages :
    display(data.loc[data['language'].eq(lang),review_columns])

Unnamed: 0,reviewText,overall,language
702,"Good solid design, time proven dependability. Five Stars",5.0,Danish
61756,SAVES FINGERTIP BURNS. Five Stars,5.0,Danish
83605,I would like haver more pattern options Four Stars,4.0,Danish
92763,Surprised got several just needed 1. Will keep rest for future. Fits perfect in blender for Oster blender. Fits right for Oster blender,5.0,Danish
95079,Great for Belts,5.0,Danish
99011,Best knife ever!! Five Stars,5.0,Danish
141710,nice gift set. Five Stars,5.0,Danish
193131,good for beginners Five Stars,5.0,Danish
202081,A must have for Tea drinkers,5.0,Danish
202450,Broke after 5 uses. One Star,1.0,Danish


Unnamed: 0,reviewText,overall,language
2057,Works very good Five Stars,5.0,Afrikaans
18647,Works well. Works well!,5.0,Afrikaans
91423,It works. Works fine,4.0,Afrikaans
133280,Like very much. Good pan,5.0,Afrikaans
134638,Sleek design. Easy to use.. Easy to use,5.0,Afrikaans
139244,Doesn't get hot enough to suit me. It needs more design watts. Not hot enough,2.0,Afrikaans
142330,Works great! Works great!,5.0,Afrikaans
145939,Works well with OXO SteeL Soap Dispensing Dish Brush and OXO Good Grips Big Button Dish Soap Dispenser. Minimalist design but very sturdy. Wonderful,5.0,Afrikaans
150421,Work great Work great,5.0,Afrikaans
169432,"I sleep better, neck doesn't hurt as much, seems like I am getting better sleep. Wish it was a little bigger. It also stays very cool! AWESOME",5.0,Afrikaans


Unnamed: 0,reviewText,overall,language
4123,Great I love it,5.0,Norwegian
45972,perfect for frying eggs Five Stars,5.0,Norwegian
46711,Unstable on gas range,2.0,Norwegian
79612,Pretty Five Stars,5.0,Norwegian
118371,great set Five Stars,5.0,Norwegian
121893,pretty reliable Five Stars,5.0,Norwegian
123959,"Tall things tip over, better for smaller utensils. better for smaller utensils",4.0,Norwegian
124146,"Kind of expensive, but I like it. but I like it.",5.0,Norwegian
140312,Great filter. Great filter.,5.0,Norwegian
181526,takes forever to sift One Star,1.0,Norwegian


Unnamed: 0,reviewText,overall,language
27567,Great Grinder Great Price,5.0,Romanian
50157,useful Five Stars,5.0,Romanian
64768,Great pie crust bag Five Stars,5.0,Romanian
73333,cute Five Stars,5.0,Romanian
117887,Great value for a powerful vacuum.,4.0,Romanian
193686,perfect perfect,5.0,Romanian
208427,great deal! Five Stars,5.0,Romanian
258076,Perfect for tea!,5.0,Romanian
288947,Poor product 1star,1.0,Romanian
302551,nice piece Five Stars,5.0,Romanian


Unnamed: 0,reviewText,overall,language
9120,gift Five Stars,5.0,Swedish
10117,k Four Stars,4.0,Swedish
40276,great Five Stars,5.0,Swedish
56277,ok Four Stars,4.0,Swedish
60928,Perfect jamming jars! Five Stars,5.0,Swedish
66897,great Five Stars,5.0,Swedish
95661,given as gift Five Stars,5.0,Swedish
133147,Excellent filters Five Stars,5.0,Swedish
133155,Fits my older style Jura F7. Five Stars,5.0,Swedish
155623,OK Four Stars,4.0,Swedish


Unnamed: 0,reviewText,overall,language
1612,excellent! Five Stars,5.0,French
11227,Excellent!! Five Stars,5.0,French
23069,Beautiful semi-unique canning jars. Great for jellies and/or apple butter. The one piece lids are a nice touch so you don't need to use separate rings. Beautiful semi-unique,5.0,French
36609,Excellent project. Five Stars,5.0,French
51124,excellent Five Stars,5.0,French
113996,Excellent product Five Stars,5.0,French
137970,Just fine Four Stars,4.0,French
158265,excellent Five Stars,5.0,French
160222,"It's a fan, dude!",5.0,French
204455,"Very Quite for a Pole fan Lasko 2520 16"" Pole Fan",5.0,French


Unnamed: 0,reviewText,overall,language
11095,not impressed,2.0,Italian
24855,amazing Five Stars,5.0,Italian
40838,crap One Star,1.0,Italian
75995,nice stamp! Five Stars,5.0,Italian
106515,so convenient Five Stars,5.0,Italian
127865,Caution: Not a universal fit,2.0,Italian
149937,Super Five Stars,5.0,Italian
167448,MAGIC Five Stars,5.0,Italian
192927,"Great pizza pan. If you do not own a pizza stone, buy this one. Does not rust. Bakes pizza perfect. Perfect size. Oneida pizza pan",5.0,Italian
212533,Super storage. I love it,5.0,Italian


Unnamed: 0,reviewText,overall,language
23954,Excellent gift Excellent,5.0,Catalan
25979,to small of items sent backLllllllll l l l l l l l l l l l l l l l no,1.0,Catalan
28664,Excellent Gift Excellent,5.0,Catalan
54741,my favorite jar lids. PERFECT,5.0,Catalan
58101,Excellent Quality! Five Stars,5.0,Catalan
59988,Excellent Grinder; Last one I'll buy.,5.0,Catalan
109051,I have returned it. It did not fit. X X X X X X X X X X X X HEPA filter,3.0,Catalan
114344,"Heavy, excellent quality!!! excellent quality!",5.0,Catalan
138674,Powerful. Excellent quality. Excellent.,5.0,Catalan
189753,Brilliant!,5.0,Catalan


Unnamed: 0,reviewText,overall,language
92602,exactly what I wanted!!dddddd dddd ddd ddd ddd ddd ddd dd dd dd dd dd dd dd dd dd great,5.0,Welsh
166828,Good and handy hanger Good,5.0,Welsh
240312,I'm a fan!,5.0,Welsh
274881,good enough quality and nice looking and good price nothing to judge nice product ddddddddddddddddddd d d dd d d d d d d nice cabinet,5.0,Welsh
397564,All good fgG g h h h y g h h h h h h h h h h Good,5.0,Welsh
401092,GOOD Stuffff GOOD STUFF,5.0,Welsh
481082,Very well made Well Made,5.0,Welsh
537966,Well made Well Made,5.0,Welsh
554956,good stuff - good stuff,5.0,Welsh
585318,rgvr vr v tg rv g vreefvrfv r v tr gv r v gtr grf frvrf f rr f rf v crf v rtg bh yu hnyunhyun yj hny h h nhhy n hy hyn y superb!,5.0,Welsh


Unnamed: 0,reviewText,overall,language
1648,Love it! I love it,5.0,Slovenian
61557,Love Love,5.0,Slovenian
114815,Non stick even no oil used. Love it! Love,5.0,Slovenian
159118,Love it Love it,5.0,Slovenian
202277,Love it. Does a good job. Five Stars,5.0,Slovenian
207294,love Five Stars,5.0,Slovenian
292042,jhfhdfjsf kkhkh kkjkjk jkjkjkjkjk jkjkj kjkj kjkj kjkj kjk kjkj kjkj kjkj kjkj kjk kjkj kjkj kjkj kjkj kjk kjkj Love it,5.0,Slovenian
363417,Love it!,5.0,Slovenian
458477,love love love them Five Stars,5.0,Slovenian
458917,11111111111111 I love it,5.0,Slovenian


Unnamed: 0,reviewText,overall,language
14867,Love it. Live it.,5.0,Albanian
64864,I like it. Five Stars,5.0,Albanian
73089,saves time Five Stars,5.0,Albanian
97574,I like them Five Stars,5.0,Albanian
207384,I like this a lot Five Stars,5.0,Albanian
243144,Love it Five Stars,5.0,Albanian
290726,i love it Five Stars,5.0,Albanian
456746,i love it!!! Five Stars,5.0,Albanian
463136,Great size Five Stars,5.0,Albanian
507515,i don't like it One Star,1.0,Albanian


Unnamed: 0,reviewText,overall,language
38069,does what it says it will doaaaa aa aaa aaaa aa aaa ok ok ok ok ok ok ok good,5.0,Somali
173210,Good quality 5*,5.0,Somali
225482,My husband loves it.It is too hard for me.aa aa aa aa aa aa aa aa aa aa aa nice,5.0,Somali
256479,"So far, so good. so good.",4.0,Somali
318636,good,4.0,Somali
338347,prefer the metal rod. xx xxx xx xxx xx xxx xxx xx xx xxxx xxx xx xxx xxx xx xx rod,4.0,Somali
434419,Good basket. Good,5.0,Somali
541261,Good tool! Good tool,5.0,Somali
609499,"fresh, good buy good",5.0,Somali
662175,SOOOOOOOOO GOOD!!!!! Amazing!,5.0,Somali


Unnamed: 0,reviewText,overall,language
16863,Makes Me Smile,5.0,Estonian
66767,neat jar i like it for snacks its kinda big it looks like a grandma jar too anyways gotta go neat,5.0,Estonian
74571,Just like Bushia used to make :-) Five Stars,5.0,Estonian
201657,Just as adverised Sleep tight,5.0,Estonian
249596,"Yeh, seems like a good idea, but I didn't like it. uhhhhh! no",2.0,Estonian
340329,Not as fragile as it looks...,5.0,Estonian
564502,tastes terrible One Star,1.0,Estonian
596452,taste good I like it,4.0,Estonian
642991,just ok Three Stars,3.0,Estonian
673082,Mmmmm good Makes a great snake,5.0,Estonian


Unnamed: 0,reviewText,overall,language
37979,I have henckel knives dont buy. It does not work . It does not even help a little bit. Dont get dont buy,1.0,Dutch
96582,ITS MY DREAM MIXER. GREAT GREAT COLOUR. A MUST HAVEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE.I LOVE IT.VIBRANT MACHINE WELL BUILT. I AM LOVING ITTTTTTTTTTTTTTTTTTTTTTTT.A KEEPER OH MY GOD,5.0,Dutch
131676,doesnt fit my mini bissel One Star,1.0,Dutch
230189,"Covers well....good fit, made well. Weber Genesis Grill Cover",5.0,Dutch
257458,gift sheet pan,5.0,Dutch
354694,very nice indeed,5.0,Dutch
378932,Gets every last drop Genius,5.0,Dutch
483563,Looks sleek in my kitchen Five Stars,5.0,Dutch
630294,Decent decaf coffee Decaf coffee,4.0,Dutch


Unnamed: 0,reviewText,overall,language
39241,good value Five Stars,5.0,Croatian
40731,Good Five Stars,5.0,Croatian
45851,nice pot Five Stars,5.0,Croatian
48395,good stuff I love it,5.0,Croatian
119977,good value Five Stars,5.0,Croatian
163978,Amazing bake pan!! Five Stars,5.0,Croatian
169191,Good I love it,5.0,Croatian
252970,AMAZING!!!!!! One Star,5.0,Croatian
432070,good size & price Five Stars,5.0,Croatian


Unnamed: 0,reviewText,overall,language
40216,"OK, ready to POP!",5.0,Slovak
58980,Yay! I love pyrex . Love Pyrex,5.0,Slovak
509603,Like um very much Like um very much,4.0,Slovak
516434,ilove you Five Stars,5.0,Slovak
586527,Love stove top So easy to make,5.0,Slovak
588258,Ok Ok,3.0,Slovak


Unnamed: 0,reviewText,overall,language
56440,Esses jogos da Corelle so super pr&aacute;ticos para o dia a dia.Vo a m&aacute;quina de lavar louas e so muito dur&aacute;veis.Recomendo. Tenho v&aacute;rios,5.0,Portuguese
125750,A &uacute;nica coisa que eu penso que poderia ser mudado &eacute; que as facas praticamente no tem corte.O resto &eacute; perfeito. These are very good,4.0,Portuguese
224070,Does a nice job. Five Stars,5.0,Portuguese
361035,No sei porque no inventaram esse sacos h&aacute; mais tempo.Perfeitos para economizar espao nas malas na hora da viagem.Recomendo. Uau,5.0,Portuguese
392664,"Esse &eacute; meu segundo aspirador igual. Perfeito para uma limpeza r&aacute;pida.Pratico e chega a lugares que os maiores no chegam, e o melhor &eacute; que no precisa de sacos.Recomendo. Very Good",5.0,Portuguese
409468,"Esse &eacute; meu segundo jogo de panelas T-fal.Elas so simplismente maravilhosas. De uma qualidade que nunca vi antes em outras panelas.Super recomendo. Very, very good.",5.0,Portuguese


Unnamed: 0,reviewText,overall,language
45699,&#51665;&#50640;&#49436; &#52280; &#51096; &#50416;&#44256; &#51080;&#50612;&#50836;~ &#49373;&#44033;&#48372;&#45796; &#51201;&#45817;&#54620; &#49324;&#51060;&#51592;&#51060;&#44592; &#46412;&#47928;&#50640; &#47588;&#51068; &#49324;&#50857;&#54616;&#45716; &#45252;&#48708;&#46308;&#51060;&#44592;&#46020; &#54633;&#45768;&#45796;.&#44536;&#47532;&#44256; &#44256;&#44032;&#51032; &#47749;&#54408;&#45252;&#48708;&#50752;&#46020; &#46244;&#51648;&#51648; &#50506;&#51012; &#47564;&#53372; &#49464;&#47144;&#48120;&#47484; &#44054;&#52628;&#44256; &#51080;&#50612;&#50836; &#47588;&#51068; &#49324;&#50857;&#54616;&#45716; &#49885;&#44592;&#47196; &#51339;&#50500;&#50836;,5.0,Unknown
46184,&#47956;&#44032; &#47566;&#51060; &#54644; &#47673;&#51012; &#46412; &#50976;&#50857;&#54620; &#45252;&#48708;&#51077;&#45768;&#45796;. &#52376;&#51020;&#50640; &#50668;&#44592; &#51228;&#54408;&#51012; &#51060; &#44163;&#51004;&#47196; &#50416;&#44256; &#51060;&#54980;&#50640; &#45252;&#48708;&#49464;&#53944;&#47484; &#49884;&#53420; &#51221;&#46020;&#47196; &#47564;&#51313;&#54664;&#45912; &#51228;&#54408;&#51077;&#45768;&#45796;. &#51200;&#47156;&#54620; &#44032;&#44201;&#50640; &#50756;&#49457;&#46020;&#44032; &#45458;&#50500; &#49324;&#50857;&#54616;&#44592; &#51339;&#49845;&#45768;&#45796;. &#49324;&#50857;&#54616;&#44592; &#51339;&#50500;&#50836;~,4.0,Unknown
186035,"&#50508;&#47336;&#48120;&#45700; &#51228;&#54408;&#51012; &#49344;&#45796;&#44032; &#48320;&#49353;&#46104;&#50612; &#49828;&#53580;&#51064;&#47112;&#49828;&#47196; &#49344;&#45716;&#45936;, &#45208;&#47492; &#47564;&#51313;&#54616;&#47732;&#49436; &#51096; &#50416;&#44256; &#51080;&#49845;&#45768;&#45796;.&#54616;&#51648;&#47564; &#50508;&#47336;&#48120;&#45700; &#51228;&#54408;&#48372;&#45796; &#52964;&#54588;&#47484; &#47564;&#46308; &#46412;&#51032; &#50756;&#49457;&#46020;&#45716; &#51312;&#44552; &#46504;&#50612;&#51648;&#45716; &#45712;&#45196;&#51060;&#51648;&#47564; &#50025; &#51339;&#51008; &#54200;&#51077;&#45768;&#45796;. &#48320;&#49353;&#46104;&#51648; &#50506;&#50500; &#50416;&#44592; &#51339;&#50500;&#50836;",4.0,Unknown
241677,"&#46321;&#49328;&#44040; &#46412; &#51453;&#51012; &#45812;&#50500;&#44032;&#47140;&#44256; &#49344;&#45716;&#45936;, &#50976;&#50857;&#54616;&#44172; &#51096; &#50044;&#49845;&#45768;&#45796;. &#48372;&#50728;&#54952;&#44284;&#44032; 4&#49884;&#44036;&#51221;&#46020;&#51064; &#44163; &#44057;&#50500;&#50836;.&#49695;&#44032;&#46973;&#51060; &#50504;&#50640; &#46308;&#50612;&#51080;&#50612;&#49436; &#52280; &#54200;&#47532;&#54633;&#45768;&#45796;. &#45796;&#47480; &#51333;&#47448;&#47196;&#46020; &#49324; &#48380;&#47140;&#44396;&#50836; &#47691;&#51652; &#48372;&#50728;&#53685;&#51077;&#45768;&#45796;.",4.0,Unknown
417444,"&#52280; &#51089;&#51008; &#49324;&#51060;&#51592;&#51060;&#44256;, &#46385; &#46020;&#49884;&#46973;&#47564; &#46308;&#50612;&#44036;&#45796;&#44256; &#48372;&#49884;&#47732; &#46121;&#45768;&#45796;. &#46020;&#49884;&#46973; &#53685;&#44284; &#47932;&#53685; 1&#44060;&#51221;&#46020;..&#44032;&#48333;&#44172; &#46308;&#44256; &#45796;&#45768;&#44592; &#51339;&#44256; &#50521;&#51613;&#47582;&#51008; &#49324;&#51060;&#51592;&#51077;&#45768;&#45796;. &#47956;&#44032; &#45812;&#51012; &#44172; &#47566;&#45796;&#47732; &#51060;&#44144;&#48372;&#45796; &#53360; &#49324;&#51060;&#51592;&#47484; &#52628;&#52380;&#54633;&#45768;&#45796;. &#52376;&#51020; &#49324; &#48420;&#45716;&#45936; &#51339;&#45348;&#50836;",4.0,Unknown


Unnamed: 0,reviewText,overall,language
58111,"Very, Very nice!!!! Very nice!!",5.0,Czech
353703,very nice I love it,5.0,Czech
436965,Very nice nice,5.0,Czech
538677,VERY NICE! I love it,5.0,Czech
590487,"If you like beef jerky, you'll love Mr. Z Sweet & Hot jerky. Buy plenty. It's addictive! Delicious!",5.0,Czech


Unnamed: 0,reviewText,overall,language
99571,perfect size and thickness Love it. More words needed ... blah blah blah blah blah blah blah blah blah blah. perfect,5.0,Indonesian
238310,What a piece of freakin joke this is - worth about 12 cents at BEST! La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah La de dah Joke - Right?,1.0,Indonesian
699594,"Yuk, just yuk. Yuk, just yuk.",1.0,Indonesian


Unnamed: 0,reviewText,overall,language
56362,NICE! NICE!,5.0,Polish
336117,I just don't know.. Two Stars,2.0,Polish


Unnamed: 0,reviewText,overall,language
185999,Big! Big!,4.0,Tagalog
690780,tastes good! It's okay,3.0,Tagalog


Unnamed: 0,reviewText,overall,language
236959,"I've only had this tool for my KitchenAid mixer for a couple of weeks, but I've used it about a dozen times. No more having to switch off the machine and stick a spatula inside the bowl to incorporate all the material. I appreciate this tool and its design, plus the time savings it affords.Here's what I didn't know when I bought it. The main body of the mixing paddle is plastic, so far it seems durable. I don't think it will last as long as the metal one that came with my KitchenAid.Also, there is something I didn't realize about the design of the new mixing paddle with the rubber edge blades, the paddles have a slight swirl to them. NOW THAT I KNOW! I don't run it on high. With the swirl in the paddle blades it sucks batter off the bottom for remixing. It's brilliant as ideas go, it's just I didn't pay attention and I had Belgian waffle batter all over the counter! My bad.So now I use my mixer at lower speed.I would have rated this KitchenAid attachment a 5 star if was made of a more durable material. I'll be happy with this plastic one - TILL IT BREAKS.****Ich hatte nur dieses Tool f&uuml;r meine KitchenAid Mixer f&uuml;r ein paar Wochen, aber ich habe es &uuml;ber ein Dutzend Mal verwendet. Nicht mehr erforderlich, die Maschine ausschalten und kleben einem Spatel in die Sch&uuml;ssel, das gesamte Material zu &uuml;bernehmen. Ich sch&auml;tze dieses Tool und seine Gestaltung sowie die Zeitersparnis bietet.Hier ist, was ich nicht wusste, als ich es gekauft. Der Hauptteil des R&uuml;hrquirl ist aus Kunststoff, so weit scheint es dauerhaft. Ich glaube nicht, dass es so lange dauern wie das Metall eine, die mit meiner KitchenAid kam.Au&szlig;erdem gibt es etwas, das ich nicht &uuml;ber das Design des neuen R&uuml;hrquirl mit Gummisicke Klingen haben zu realisieren, haben die Schaufeln einer leichten Drall zu ihnen. Jetzt wo ich wei&szlig;! Ich glaube nicht, f&uuml;hren Sie sie auf hoch. 
Mit dem Wirbel in der Paddelbl&auml;ttern es saugt Teig vom Boden zum Remixen. Es ist brillant wie Ideen gehen, es ist nur habe ich nicht aufgepasst, und ich hatte belgischen Waffelteig alle &uuml;ber den Ladentisch! Mein schlechtes.So, jetzt benutze ich meine Mischer mit niedriger Geschwindigkeit.Ich w&uuml;rde bewerteten diese KitchenAid Anlage haben ein 5 Sterne, wenn einer mehr haltbaren Material hergestellt wurde. Ich werde gerne mit diesem Kunststoff ein - bis es bricht. Works really good, makes the task easier - but! (sehr gut, aber!)",4.0,German
517499,Unboxing,4.0,German


Unnamed: 0,reviewText,overall,language
568313,Yummy! Very Tastey!,5.0,Turkish
571911,Yummy Yummy,5.0,Turkish


Unnamed: 0,reviewText,overall,language
113427,A a a a a a a a a a a aa a a a a a a a a a a a a . Gmdnd cg j Snazzy!,4.0,Hungarian


Unnamed: 0,reviewText,overall,language
626348,so tasty mmmmmmm,5.0,Finnish


Unnamed: 0,reviewText,overall,language
699631,Not bad Not bad,4.0,Latvian


In [None]:
# Reassigning languages after the manual check of the rarely detected labels above
# Labels that are kept as they are; every other label is treated as mis-labelled English
languages_exists=['German','Spanish','Unknown','Portuguese']

def _keep_or_english(label) :
  # Returns the label unchanged when it is in languages_exists, otherwise 'English'
  if label in languages_exists :
    return label
  return 'English'

data['language']=data['language'].apply(_keep_or_english)

# Number of reviews per language after the correction
data['language'].value_counts()

English       702845
Spanish           77
Portuguese         6
Unknown            5
German             2
Name: language, dtype: int64

#### **<h2> Sentiment of review texts : </h2>**

*   Applying text preprocessing techniques to clean the reviews.
*   Extracting positive and negative words based on the 'Opinion Lexicon' datasets
*   Calculating the sentiment scores only on reviews in 'English' language.


In [None]:
# List of positive words
# Reading inside a `with` block so the file handle is closed as soon as the text is loaded
with open('/content/positive-words.txt') as pos_file :
  pos=pos_file.read()
pos_words=pd.Series(pos.split('\n'))               # one entry per line of the file

print('List of positive words :')
display(pos_words.iloc[[0,1,2,-3,-2,-1]])          # first three and last three entries

# List of negative words
with open('/content/negative-words.txt') as neg_file :
  neg=neg_file.read()
neg_words=pd.Series(neg.split('\n'))               # one entry per line of the file

print('\n\nList of negative words :')
display(neg_words.iloc[[0,1,2,-3,-2,-1]])          # first three and last three entries

List of positive words :


0            a+
1        abound
2       abounds
2003     zenith
2004       zest
2005      zippy
dtype: object



List of negative words :


0         2-faced
1         2-faces
2        abnormal
4780      zealous
4781    zealously
4782       zombie
dtype: object

In [None]:
# Applying all text related functions on the reviewText column and creating 8 new columns
sentiment_columns=['cleanText','pos_words','neg_words','no_pos',
                   'no_neg','total_words','polarity','sentiment']

def _process_review(row) :
  # Passes one review's language and text to all_functions together with the
  # punctuation list and the positive / negative word lists
  return all_functions(lang = row['language'], txt = row['reviewText'],
                       punctuations = puncts,pos_list=pos_words,neg_list=neg_words)

# result_type='expand' spreads the values returned for each row over separate columns
text_features=data.apply(_process_review,axis=1,result_type='expand')
data[sentiment_columns]=text_features

# First three and last three records
display(pd.concat([data.head(3),data.tail(3)]))

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price,language,cleanText,pos_words,neg_words,no_pos,no_neg,total_words,polarity,sentiment
0,APYOBQE6M18AA,0615391206,Martin Schwartz,My daughter wanted this book and the price on ...,5.0,2013-10-19,0,0,,17.29,English,daughter want book price amazon good try recip...,"[good, happy, good]",[],3.0,0.0,14.0,0.21,Positive
1,A1JVQTAGHYOL7F,0615391206,Michelle Dinh,I bought this zoku quick pop for my daughterr ...,5.0,2014-06-18,0,0,,17.29,English,buy quick pop quick maker love fun ice cream,"[love, fun]",[],2.0,0.0,9.0,0.22,Positive
2,A3UPYGJKZ0XTU4,0615391206,mirasreviews,There is no shortage of pop recipes available ...,4.0,2013-05-05,26,27,0.962963,17.29,English,shortage pop recipe available free web purchas...,"[available, free, good, sweet, fresh, useful, ...","[shortage, scream, freeze, limited, gimmicky, ...",21.0,8.0,200.0,0.07,Positive
702932,ASEBX8TBYWQWA,B00KCJRVO2,"Steven I. Ramm ""Steve Ramm &#34;Anything Phon...","While I usually review CDs and DVDs, as well a...",5.0,2014-07-01,1,1,1.0,13.0,English,usually review cd dvds entertainment relate bo...,"[hot, free, honest, variety, variety, variety,...","[allergic, intolerance, concerned]",12.0,3.0,158.0,0.06,Positive
702933,ANKQGTXHREOI5,B00KCJRVO2,Titanium Lili,My son and I enjoyed these oatmeal packets. H...,4.0,2014-07-04,0,1,0.0,13.0,English,son enjoy oatmeal packet fond maple brown suga...,"[enjoy, fond, variety, like, variety, variety,...",[junk],10.0,1.0,48.0,0.19,Positive
702934,A2CF66KIQ3RKX3,B00KCJRVO2,Vivian Deliz,I like to eat oatmeal i the mornings. I usuall...,4.0,2014-07-11,0,0,,13.0,English,like eat oatmeal morning usually buy quaker oa...,"[like, good, free, recommend, like]","[suspect, cheap]",5.0,2.0,42.0,0.07,Positive


#### <h2>Weekday from reviewTime :</h2>

In [None]:
# Creating a 'weekday' column (day name such as 'Saturday') from reviewTime
# reviewTime is a datetime64[ns] column, so the vectorised .dt accessor is used
# instead of calling day_name() on every row through apply
data['weekday']=data['reviewTime'].dt.day_name()

data.head()

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price,language,cleanText,pos_words,neg_words,no_pos,no_neg,total_words,polarity,sentiment,weekday
0,APYOBQE6M18AA,615391206,Martin Schwartz,My daughter wanted this book and the price on ...,5.0,2013-10-19,0,0,,17.29,English,daughter want book price amazon good try recip...,"[good, happy, good]",[],3.0,0.0,14.0,0.21,Positive,Saturday
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,I bought this zoku quick pop for my daughterr ...,5.0,2014-06-18,0,0,,17.29,English,buy quick pop quick maker love fun ice cream,"[love, fun]",[],2.0,0.0,9.0,0.22,Positive,Wednesday
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,There is no shortage of pop recipes available ...,4.0,2013-05-05,26,27,0.962963,17.29,English,shortage pop recipe available free web purchas...,"[available, free, good, sweet, fresh, useful, ...","[shortage, scream, freeze, limited, gimmicky, ...",21.0,8.0,200.0,0.07,Positive,Sunday
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""",This book is a must have if you get a Zoku (wh...,5.0,2011-08-04,14,18,0.777778,17.29,English,book highly recommend large variety recipe sim...,"[recommend, variety, fancy, creative, beautifu...",[],7.0,0.0,23.0,0.3,Positive,Thursday
4,AHAI85T5C2DH3,615391206,PugLover,This cookbook is great. I have really enjoyed...,4.0,2014-06-07,0,0,,17.29,English,cookbook great enjoy review recipe sure diffic...,"[great, enjoy, nice, variety, worth, favorite,...",[difficult],8.0,1.0,35.0,0.2,Positive,Saturday


#### <h2>Brands from meta data :</h2>

In [None]:
# Merging info from meta data to get brands of products
# Only the join key 'asin' and the 'brand' column are taken from the meta data
brand_lookup=meta_data[['asin','brand']]

# how='left' keeps every review record, matched on 'asin'
# NOTE(review): validate='m:m' is not expected to enforce any key uniqueness - confirm against the pandas merge docs
data=pd.merge(data,brand_lookup,on='asin',how='left',validate='m:m')

display(data.head())

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price,...,cleanText,pos_words,neg_words,no_pos,no_neg,total_words,polarity,sentiment,weekday,brand
0,APYOBQE6M18AA,615391206,Martin Schwartz,My daughter wanted this book and the price on ...,5.0,2013-10-19,0,0,0.0,17.29,...,daughter want book price amazon good try recip...,"[good, happy, good]",[],3.0,0.0,14.0,0.21,Positive,Saturday,Zoku
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,I bought this zoku quick pop for my daughterr ...,5.0,2014-06-18,0,0,0.0,17.29,...,buy quick pop quick maker love fun ice cream,"[love, fun]",[],2.0,0.0,9.0,0.22,Positive,Wednesday,Zoku
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,There is no shortage of pop recipes available ...,4.0,2013-05-05,26,27,0.962963,17.29,...,shortage pop recipe available free web purchas...,"[available, free, good, sweet, fresh, useful, ...","[shortage, scream, freeze, limited, gimmicky, ...",21.0,8.0,200.0,0.07,Neutral,Sunday,Zoku
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""",This book is a must have if you get a Zoku (wh...,5.0,2011-08-04,14,18,0.777778,17.29,...,book highly recommend large variety recipe sim...,"[recommend, variety, fancy, creative, beautifu...",[],7.0,0.0,23.0,0.3,Positive,Thursday,Zoku
4,AHAI85T5C2DH3,615391206,PugLover,This cookbook is great. I have really enjoyed...,4.0,2014-06-07,0,0,0.0,17.29,...,cookbook great enjoy review recipe sure diffic...,"[great, enjoy, nice, variety, worth, favorite,...",[difficult],8.0,1.0,35.0,0.2,Positive,Saturday,Zoku


#### <h2>Categories :</h2>

In [None]:
# Merging info from meta data to get categories of products
# Only the join key 'asin' and the 'categories' column are taken from the meta data
category_lookup=meta_data[['asin','categories']]

# how='left' keeps every review record, matched on 'asin'
data=pd.merge(data,category_lookup,on='asin',how='left',validate='m:m')

def _dataset_tag(categories) :
  # 'hk' when 'Home & Kitchen' is among the product's categories, otherwise 'ggf'
  if 'Home & Kitchen' in categories :
    return 'hk'
  return 'ggf'

data['set']=data['categories'].apply(_dataset_tag)

display(data.head())

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,reviewTime,Positive_feedback,Total_feedback,helpfulness_ratio,price,...,neg_words,no_pos,no_neg,total_words,polarity,sentiment,weekday,brand,categories,set
0,APYOBQE6M18AA,615391206,Martin Schwartz,My daughter wanted this book and the price on ...,5.0,2013-10-19,0,0,0.0,17.29,...,[],3.0,0.0,14.0,0.21,Positive,Saturday,Zoku,"[Home & Kitchen, Kitchen & Dining, Kitchen Ute...",hk
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,I bought this zoku quick pop for my daughterr ...,5.0,2014-06-18,0,0,0.0,17.29,...,[],2.0,0.0,9.0,0.22,Positive,Wednesday,Zoku,"[Home & Kitchen, Kitchen & Dining, Kitchen Ute...",hk
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,There is no shortage of pop recipes available ...,4.0,2013-05-05,26,27,0.962963,17.29,...,"[shortage, scream, freeze, limited, gimmicky, ...",21.0,8.0,200.0,0.07,Neutral,Sunday,Zoku,"[Home & Kitchen, Kitchen & Dining, Kitchen Ute...",hk
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""",This book is a must have if you get a Zoku (wh...,5.0,2011-08-04,14,18,0.777778,17.29,...,[],7.0,0.0,23.0,0.3,Positive,Thursday,Zoku,"[Home & Kitchen, Kitchen & Dining, Kitchen Ute...",hk
4,AHAI85T5C2DH3,615391206,PugLover,This cookbook is great. I have really enjoyed...,4.0,2014-06-07,0,0,0.0,17.29,...,[difficult],8.0,1.0,35.0,0.2,Positive,Saturday,Zoku,"[Home & Kitchen, Kitchen & Dining, Kitchen Ute...",hk


#### **<h2>Saving the dataset : </h2>**

In [None]:
# Summary of the dataframe before saving : column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 702935 entries, 0 to 702934
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   reviewerID         702935 non-null  object        
 1   asin               702935 non-null  object        
 2   reviewerName       696448 non-null  object        
 3   reviewText         702935 non-null  object        
 4   overall            702935 non-null  float64       
 5   reviewTime         702935 non-null  datetime64[ns]
 6   Positive_feedback  702935 non-null  int64         
 7   Total_feedback     702935 non-null  int64         
 8   helpfulness_ratio  702935 non-null  float64       
 9   price              632353 non-null  float64       
 10  language           702935 non-null  object        
 11  cleanText          702840 non-null  object        
 12  pos_words          702840 non-null  object        
 13  neg_words          702840 non-null  object  

In [None]:
# Number of missing values in each column
data.isna().sum()

reviewerID               0
asin                     0
reviewerName          6487
reviewText               0
overall                  0
reviewTime               0
Positive_feedback        0
Total_feedback           0
helpfulness_ratio        0
price                70582
language                 0
cleanText               95
pos_words               95
neg_words               95
no_pos                  95
no_neg                  95
total_words             95
polarity                95
sentiment                0
weekday                  0
brand                    0
categories               0
set                      0
dtype: int64

In [None]:
# Writing the dataframe in chunks : CSV
# to_csv writes the rows in batches of `chunksize` by itself, so the dataframe does not have
# to be split with np.array_split (newer pandas versions warn when it is applied to a
# DataFrame) and the file does not have to be reopened in append mode for every chunk.
output_path='/content/drive/MyDrive/capstone_project_data.csv'

# About 20 batches, as before; ceil division, and at least 1 row so an empty dataframe still works
rows_per_chunk=max(1,-(-len(data)//20))

# The header is written once and the index is left out
data.to_csv(output_path,index=False,chunksize=rows_per_chunk)

In [None]:
# Deleting declared variables (same names, same order, one pair per statement)
del hk,hk_meta
del ggf,ggf_meta
del df_hk,df_ggf