In [1]:
import os
wd = os.getcwd() 
print(wd)

import sqlalchemy
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
import plotly.express as px
import datetime
import numpy as np
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from pathlib import Path
from collections import Counter

/Users/Arta/_Sentiment_Analysis


In [2]:
###Create database connection and read tables###
engine = create_engine('postgresql://localhost/')

#Read table sustainability_words from Database
sustainability_words = pd.read_sql_table(
    "sustainability_words",
    con=engine
)

#Read table reviews_food_beverages_tobacco from Database
reviews_food_beverages_tobacco = pd.read_sql_table(
    "reviews_food_beverages_tobacco",
    con=engine
)

#Read table companies_list from Database
#companies_list = pd.read_sql_table(
#    "list_companies",
#    con=engine
#)

In [3]:
reviews_food_beverages_tobacco["total_reviews"] = reviews_food_beverages_tobacco["total_reviews"].str.replace(",","").astype(float)

#Set column as numeric
reviews_food_beverages_tobacco['total_reviews'] = pd.to_numeric(reviews_food_beverages_tobacco['total_reviews'])
reviews_food_beverages_tobacco['general_rating'] = pd.to_numeric(reviews_food_beverages_tobacco['general_rating'])
reviews_food_beverages_tobacco['review_rating'] = pd.to_numeric(reviews_food_beverages_tobacco['review_rating'])


In [4]:
reviews_food_beverages_tobacco['review_text'].replace('', np.nan, inplace=True)
reviews_food_beverages_tobacco.dropna(subset=['review_text'], inplace=True)
reviews_food_beverages_tobacco['review_text']= reviews_food_beverages_tobacco['review_text'].astype(str)


In [5]:
#Reviews data is sampled in order to speed up computations.
reviews_food_beverages_tobacco = reviews_food_beverages_tobacco.sample(frac = 0.01, replace = False, random_state=42)
reviews_food_beverages_tobacco

Unnamed: 0,company,category,total_reviews,general_rating,review_rating,review_text
2425506,Freshly,Online Food Ordering Service,9363.0,4.8,4,I have never in a long time had such superb se...
1727125,Top Chef Meals,Online Food Ordering Service,4082.0,4.5,5,Great customer service changed my delivery dat...
1419024,Vital Choice Wild Seafood & Organics,Food Products Supplier,22991.0,4.8,5,So far ordered twice and fish was excellent. J...
187209,ezCater,Online Food Ordering Service,13408.0,4.7,5,About once a month I sit down with my full wor...
214162,eCigs-Direct,Vaporizer Store,3178.0,4.9,5,"Great service and speedy delivery, as always!!"
...,...,...,...,...,...,...
1858121,eCigs-Direct,Vaporizer Store,3178.0,4.9,5,Prompt delivery considering how difficult trad...
2851605,Splash Wines,Wine Store,19147.0,4.5,5,"Nice assortment of wines, I've been enjoying t..."
884096,KandyPens,Vaporizer Store,46.0,1.7,1,These people are scammers. They say the Oura K...
2996927,PuffItUp!,Vaporizer Store,5507.0,4.8,5,"Fast service, fun candies included in the pack..."


In [6]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    reviews_food_beverages_tobacco['review_text'] = reviews_food_beverages_tobacco['review_text'].str.replace(char, ' ')
    
reviews_food_beverages_tobacco['review_text'] = reviews_food_beverages_tobacco['review_text'].str.split().str.join(" ")

reviews_food_beverages_tobacco.info()    

  reviews_food_beverages_tobacco['review_text'] = reviews_food_beverages_tobacco['review_text'].str.replace(char, ' ')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 29660 entries, 2425506 to 102408
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   company         29660 non-null  object 
 1   category        29660 non-null  object 
 2   total_reviews   29660 non-null  float64
 3   general_rating  29660 non-null  float64
 4   review_rating   29660 non-null  int64  
 5   review_text     29660 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 1.6+ MB


In [7]:
def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    #text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
reviews_food_beverages_tobacco["review_clean"] = reviews_food_beverages_tobacco["review_text"].apply(lambda x: clean_text(x))


In [8]:
from collections import Counter
all_names = reviews_food_beverages_tobacco['review_clean'].unique()
names_freq = Counter()
for name in all_names:
    names_freq.update(str(name).split(" "))
key_words = [word for (word,_) in names_freq.most_common(50)]
print(key_words)

['great', 'order', 'service', 'good', 'time', 'customer', 'food', 'love', 'delivery', 'would', 'shipping', 'meals', 'easy', 'get', 'one', 'fast', 'ordered', 'product', 'always', 'like', 'quality', 'company', 'products', 'excellent', 'fresh', 'received', 'never', 'use', 'delicious', 'best', 'well', 'really', 'also', 'got', 'recommend', 'thank', 'even', 'price', 'meal', 'day', 'delivered', 'first', 'arrived', 'prices', 'everything', 'ordering', 'week', 'much', 'days', 'experience']


In [9]:
# add sentiment anaylsis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
reviews_food_beverages_tobacco["sentiments"] = reviews_food_beverages_tobacco["review_text"].apply(lambda x: sid.polarity_scores(x))
reviews_food_beverages_tobacco = pd.concat([reviews_food_beverages_tobacco.drop(['sentiments'], axis=1), reviews_food_beverages_tobacco['sentiments'].apply(pd.Series)], axis=1)



In [10]:
from afinn import Afinn
afinn = Afinn(language='en')

In [11]:
reviews_food_beverages_tobacco.head()

Unnamed: 0,company,category,total_reviews,general_rating,review_rating,review_text,review_clean,neg,neu,pos,compound
2425506,Freshly,Online Food Ordering Service,9363.0,4.8,4,I have never in a long time had such superb se...,never long time superb service customer servic...,0.0,0.741,0.259,0.8883
1727125,Top Chef Meals,Online Food Ordering Service,4082.0,4.5,5,Great customer service changed my delivery dat...,great customer service changed delivery date a...,0.0,0.575,0.425,0.8126
1419024,Vital Choice Wild Seafood & Organics,Food Products Supplier,22991.0,4.8,5,So far ordered twice and fish was excellent Ju...,far ordered twice fish excellent promised,0.0,0.592,0.408,0.7351
187209,ezCater,Online Food Ordering Service,13408.0,4.7,5,About once a month I sit down with my full wor...,month sit full work calendar schedule every si...,0.039,0.84,0.121,0.7213
214162,eCigs-Direct,Vaporizer Store,3178.0,4.9,5,Great service and speedy delivery as always,great service speedy delivery always,0.0,0.594,0.406,0.6249


In [12]:
reviews_food_beverages_tobacco['afinn_score'] = reviews_food_beverages_tobacco['review_clean'].apply(afinn.score)


In [13]:
reviews_food_beverages_tobacco['afinn_score'].describe()

count    29660.000000
mean         5.159171
std          5.509459
min        -46.000000
25%          2.000000
50%          5.000000
75%          8.000000
max         84.000000
Name: afinn_score, dtype: float64

In [14]:
columns_to_display = reviews_food_beverages_tobacco[['review_clean', 'afinn_score']]

columns_to_display.sort_values(by='afinn_score').head(10)

Unnamed: 0,review_clean,afinn_score
2086874,update vapewild sells clones would recommend b...,-46.0
2267346,placed orders gifttree problems one day placed...,-37.0
897768,deceived smokewire employees pretending satisf...,-36.0
1773381,ordered elderly mother alternative narcotic op...,-33.0
3074968,tried buy vape da vinci website emailed purcha...,-28.0
28638,foodler byfar worst service ever used service ...,-25.0
339416,wineexpress com exclusive wine shop partner wi...,-24.0
300717,buyer beware signed hellofresh received promot...,-24.0
2009025,briefly spoke spencer called asked interested ...,-23.0
2133445,service midland texas sucks given chances sinc...,-22.0


In [15]:
columns_to_display.sort_values(by='afinn_score').tail(10)


Unnamed: 0,review_clean,afinn_score
755644,canned salmon absolutely delicious it’s perfec...,42.0
393118,wish give stars food superb quality freshness ...,43.0
743407,hello freshly know two different people acct h...,43.0
2389848,purchased vital choice wild alaskan sockeye sa...,44.0
910216,good morning everyone work burger king custome...,45.0
716184,making cappuccinos wife years different machin...,46.0
8657,repeat customer bake wish last years however s...,47.0
2327248,received voucher months ago bought bottles del...,49.0
2873123,began naked wines first september year took ch...,72.0
3271866,live busy life yadda yadda yadda yes kiddos ac...,84.0


In [16]:
def word_count(text_string):
    '''Calculate the number of words in a string'''
    return len(text_string.split())

In [17]:
reviews_food_beverages_tobacco['word_count'] = reviews_food_beverages_tobacco['review_clean'].apply(word_count)


In [18]:
reviews_food_beverages_tobacco['word_count'].describe()

count    29660.000000
mean        19.318678
std         21.313584
min          0.000000
25%          7.000000
50%         13.000000
75%         24.000000
max        547.000000
Name: word_count, dtype: float64

In [19]:
reviews_food_beverages_tobacco['afinn_adjusted'] = reviews_food_beverages_tobacco['afinn_score'] / reviews_food_beverages_tobacco['word_count'] * 100


In [20]:
reviews_food_beverages_tobacco['afinn_adjusted'].describe()

count    29653.000000
mean        42.945624
std         43.427761
min       -200.000000
25%         13.043478
50%         37.142857
75%         66.666667
max        400.000000
Name: afinn_adjusted, dtype: float64

In [21]:
def count_occurences(text, word_list):
    '''Count occurences of words from a list in a text string.'''
    text_list = text_to_words(text)

    intersection = [w for w in text_list if w in word_list]

    return len(intersection)


In [22]:
def text_to_words(text):
    '''Transform a string to a list of words,
    removing all punctuation.'''
    text = text.lower()

    p = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    text = ''.join([ch for ch in text if ch not in p])

    return text.split()

In [23]:
social_words_list = sustainability_words['social_sustainability'].values

reviews_food_beverages_tobacco['social_sustainability_words'] = reviews_food_beverages_tobacco['review_clean'].apply(count_occurences, args=(social_words_list, ))



In [25]:
environmental_words_list = sustainability_words['environmental_sustainability'].values

reviews_food_beverages_tobacco['environmental_sustainability_words'] = reviews_food_beverages_tobacco['review_clean'].apply(count_occurences, args=(environmental_words_list, ))


In [26]:
economic_words_list = sustainability_words['economic_sustainability'].values

reviews_food_beverages_tobacco['economic_sustainability_words'] = reviews_food_beverages_tobacco['review_clean'].apply(count_occurences, args=(economic_words_list, ))


In [40]:
reviews_food_beverages_tobacco

Unnamed: 0,company,category,total_reviews,general_rating,review_rating,review_text,review_clean,neg,neu,pos,compound,afinn_score,word_count,afinn_adjusted,social_sustainability_words,environmental_sustainability_words,economic_sustainability_words
2425506,Freshly,Online Food Ordering Service,9363.0,4.8,4,I have never in a long time had such superb service from a customer service department Aljun was exceptional both courteous and knowledgeable in ever way I am a new customer and I am extremely satisfied with the service,never long time superb service customer service department aljun exceptional courteous knowledgeable ever way new customer extremely satisfied service,0.000,0.741,0.259,0.8883,9.0,19,47.368421,0,0,0
1727125,Top Chef Meals,Online Food Ordering Service,4082.0,4.5,5,Great customer service changed my delivery date when I asked Pleasant and accommodating,great customer service changed delivery date asked pleasant accommodating,0.000,0.575,0.425,0.8126,6.0,9,66.666667,0,0,0
1419024,Vital Choice Wild Seafood & Organics,Food Products Supplier,22991.0,4.8,5,So far ordered twice and fish was excellent Just as promised,far ordered twice fish excellent promised,0.000,0.592,0.408,0.7351,4.0,6,66.666667,0,0,0
187209,ezCater,Online Food Ordering Service,13408.0,4.7,5,About once a month I sit down with my full work calendar and schedule every single lunch or breakfast appointment that I have with EZ Cater I check off each appointment once booked and literally don t worry about it again because I know it will be handled from there Always on time easy access to receipts and very convenient to re order Makes a busy work schedule that much easier to manage,month sit full work calendar schedule every single lunch breakfast appointment ez cater check appointment booked literally worry know handled always time easy access receipts convenient order makes busy work schedule much easier manage,0.039,0.840,0.121,0.7213,-2.0,34,-5.882353,0,0,0
214162,eCigs-Direct,Vaporizer Store,3178.0,4.9,5,Great service and speedy delivery as always,great service speedy delivery always,0.000,0.594,0.406,0.6249,3.0,5,60.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1858121,eCigs-Direct,Vaporizer Store,3178.0,4.9,5,Prompt delivery considering how difficult trading is at the moment Everything ordered was supplied Impressed and thank you,prompt delivery considering difficult trading moment everything ordered supplied impressed thank,0.108,0.649,0.242,0.4767,4.0,11,36.363636,0,0,0
2851605,Splash Wines,Wine Store,19147.0,4.5,5,Nice assortment of wines I ve been enjoying them and will definitely purchase again,nice assortment wines enjoying definitely purchase,0.000,0.529,0.471,0.8360,5.0,6,83.333333,0,0,0
884096,KandyPens,Vaporizer Store,46.0,1.7,1,These people are scammers They say the Oura Kandypens has a lifetime warranty but my atomizer broke and the manager said he could not help me I am shocked I spent So much money on this and it is now completely broken,people scammers say oura kandypens lifetime warranty atomizer broke manager said could help shocked spent much money completely broken,0.288,0.712,0.000,-0.9340,-2.0,19,-10.526316,0,0,0
2996927,PuffItUp!,Vaporizer Store,5507.0,4.8,5,Fast service fun candies included in the package would order again,fast service fun candies included package would order,0.000,0.752,0.248,0.5106,4.0,8,50.000000,0,0,0


In [27]:
sustainability_words = ['environmental_sustainability_words', 'economic_sustainability_words', 'social_sustainability_words']
reviews_food_beverages_tobacco[sustainability_words].describe()


Unnamed: 0,environmental_sustainability_words,economic_sustainability_words,social_sustainability_words
count,29660.0,29660.0,29660.0
mean,0.002428,0.011902,0.010924
std,0.052525,0.115374,0.116199
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,2.0,3.0,3.0


In [33]:
reviews_food_beverages_tobacco['social_sustainability_words'].value_counts()


0    29370
1      262
2       22
3        6
Name: social_sustainability_words, dtype: int64

In [34]:
reviews_food_beverages_tobacco['economic_sustainability_words'].value_counts()

0    29327
1      316
2       14
3        3
Name: economic_sustainability_words, dtype: int64

In [35]:
reviews_food_beverages_tobacco['environmental_sustainability_words'].value_counts()

0    29593
1       62
2        5
Name: environmental_sustainability_words, dtype: int64

In [43]:
#reviews_food_beverages_tobacco['social_value'] = DATA['social_sustainability_words'] - DATA['COLUMN3']


#Avg price per 45-65 years for M, D and N and round to 2 decimanls
if reviews_food_beverages_tobacco['social_sustainability_words'] >=1:
    reviews_food_beverages_tobacco['social_value'] = reviews_food_beverages_tobacco['afinn_adjusted']

def social_values(values):
    values = reviews_food_beverages_tobacco['social_sustainability_words']
    if values >= 1:
        for social_value in values:
            social_value = reviews_food_beverages_tobacco['afinn_adjusted'] * reviews_food_beverages_tobacco['social_sustainability_words']
            return social_value

def text_to_words(text):
    '''Transform a string to a list of words,
    removing all punctuation.'''
    text = text.lower()

    p = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    text = ''.join([ch for ch in text if ch not in p])

    return text.split()

    def parse(self, response):
        urls_1 = response.css('div.styles_businessUnitCardsContainer__1ggaO > a::attr(href)').extract()
        if urls_1 != []:
            for url_1 in urls_1:
                url_1 = response.urljoin(url_1)
                yield scrapy.Request(url=url_1, callback=self.parse_details)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [38]:
'''
def index(index_env):
    index_env = reviews_food_beverages_tobacco.environmental_sustainability_words
    if index_env > 0:
        for index_1 in index_env:
            index_1 = index_env * reviews_food_beverages_tobacco.afinn_adjusted
            return index_1
'''       

social_sustainability_index = reviews_food_beverages_tobacco[['social_sustainability_words','company', 'category', 'general_rating','total_reviews', 'afinn_adjusted']]
social_sustainability_index = social_sustainability_index.groupby(['CategoricalVariable']).agg({'NumericVariable': ['mean', 'min', 'max']}) #Groups the data per category and calculates the mean, the minimal and the maximal value.



#GroupedData = DATA[['CategoricalVariable', 'NumericVariable']] #Selects a categorical and a numerical column from the dataset DATA (as mentioned above)
#GroupedData = GroupedData.groupby(['CategoricalVariable']).mean() #Groups the data per value from the categorical value and calculates the mean per category.

#GroupedData = GroupedData.groupby(['CategoricalVariable']).agg({'NumericVariable': ['mean', 'min', 'max']}) #Groups the data per category and calculates the mean, the minimal and the maximal value.



ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().