In [1]:
import os
# Show the directory the notebook kernel is running from.
wd = os.getcwd() 
print(wd)

import sqlalchemy
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
import plotly.express as px
import datetime
import numpy as np
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from pathlib import Path
from collections import Counter
from fuzzywuzzy import fuzz

/Users/Arta/_Sentiment_Analysis


In [2]:
# Connect to the local PostgreSQL server and load the two source tables.
engine = create_engine('postgresql://localhost/')

# Reviews come from the "reviews_food_beverages_tobacco" table.
reviews_food_beverages_tobacco = pd.read_sql_table(
    table_name="reviews_food_beverages_tobacco", con=engine
)

# The company list is stored in the database under the name "list_companies".
companies_list = pd.read_sql_table(table_name="list_companies", con=engine)


In [3]:
# Count the missing values in each column of the company table.
companies_list.isna().sum()

symbol              0
company             0
subsidiaries      322
sector              0
price               0
price_earnings      2
dividend_yield      0
earnings_share      0
market_cap          0
ebitda              0
price_sales         0
price_book          8
sec_filings         0
dtype: int64

In [4]:
# Keep only the columns used further down. The .copy() makes this an
# independent frame, so later cells that add or overwrite columns do not write
# into a slice of the table read from the database (SettingWithCopyWarning).
companies_list = companies_list[['company', 'subsidiaries', 'price']].copy()

In [5]:
# Work on a 1% random sample of the reviews to keep the computations fast.
# The fixed random_state makes the sample reproducible. Re-running this cell
# samples the already-sampled frame again.
reviews_food_beverages_tobacco = reviews_food_beverages_tobacco.sample(
    frac=0.01,
    replace=False,
    random_state=42,
)


In [6]:
# Add the two matching keys (first 10 and last 20 characters of the company
# name) and drop reviews that have no text.
reviews_food_beverages_tobacco = (
    reviews_food_beverages_tobacco
    .assign(
        first_10_letters=lambda reviews: reviews['company'].str[:10],
        last_20_letters=lambda reviews: reviews['company'].str[-20:],
    )
    .dropna(subset=['review_text'])
)

In [7]:
# Give every subsidiary its own row:
#   1. split the comma-separated subsidiaries string into a list,
#   2. explode the lists so each subsidiary becomes a row,
#   3. renumber the rows and normalise missing values to NaN.
# The former `companies_list.set_index('company')` call was removed: its result
# was never assigned, so it had no effect (and the index is reset here anyway).
# .assign() builds a new frame instead of writing a column into a slice.
companies_list = (
    companies_list
    .assign(subsidiaries=companies_list['subsidiaries'].str.split(', '))
    .explode('subsidiaries')
    .reset_index(drop=True)
    .fillna(value=np.nan)
)
companies_list

Unnamed: 0,company,subsidiaries,price
0,3M Company,,222.89
1,A.O. Smith Corp,,60.24
2,Abbott Laboratories,,56.27
3,AbbVie Inc.,,108.48
4,Accenture plc,,150.51
...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71
3015,Zions Bancorp,Vectra Bank Colorado,50.71
3016,Zions Bancorp,Zions Bank,50.71
3017,Zions Bancorp,Zions Direct,50.71


In [8]:
# Start all_names from the subsidiary name. The original statement ended in a
# dangling `+`, which is a SyntaxError; the saved output shows all_names equal
# to subsidiaries (NaN where a company has none), so nothing is appended here.
companies_list['all_names'] = companies_list['subsidiaries']
companies_list

Unnamed: 0,company,subsidiaries,price,all_names
0,3M Company,,222.89,
1,A.O. Smith Corp,,60.24,
2,Abbott Laboratories,,56.27,
3,AbbVie Inc.,,108.48,
4,Accenture plc,,150.51,
...,...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71,Nevada State Bank
3015,Zions Bancorp,Vectra Bank Colorado,50.71,Vectra Bank Colorado
3016,Zions Bancorp,Zions Bank,50.71,Zions Bank
3017,Zions Bancorp,Zions Direct,50.71,Zions Direct


In [9]:
# Rows without a subsidiary fall back to the parent company name.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `companies_list.all_names` acts on an intermediate Series and does not
# update the frame when pandas Copy-on-Write is active.
companies_list['all_names'] = companies_list['all_names'].fillna(companies_list['company'])
companies_list

Unnamed: 0,company,subsidiaries,price,all_names
0,3M Company,,222.89,3M Company
1,A.O. Smith Corp,,60.24,A.O. Smith Corp
2,Abbott Laboratories,,56.27,Abbott Laboratories
3,AbbVie Inc.,,108.48,AbbVie Inc.
4,Accenture plc,,150.51,Accenture plc
...,...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71,Nevada State Bank
3015,Zions Bancorp,Vectra Bank Colorado,50.71,Vectra Bank Colorado
3016,Zions Bancorp,Zions Bank,50.71,Zions Bank
3017,Zions Bancorp,Zions Direct,50.71,Zions Direct


In [10]:
# Build the same matching keys as on the review side, from all_names:
# the first 10 and the last 20 characters.
companies_list = companies_list.assign(
    first_10_letters=lambda companies: companies['all_names'].str[:10],
    last_20_letters=lambda companies: companies['all_names'].str[-20:],
)
companies_list

Unnamed: 0,company,subsidiaries,price,all_names,first_10_letters,last_20_letters
0,3M Company,,222.89,3M Company,3M Company,3M Company
1,A.O. Smith Corp,,60.24,A.O. Smith Corp,A.O. Smith,A.O. Smith Corp
2,Abbott Laboratories,,56.27,Abbott Laboratories,Abbott Lab,Abbott Laboratories
3,AbbVie Inc.,,108.48,AbbVie Inc.,AbbVie Inc,AbbVie Inc.
4,Accenture plc,,150.51,Accenture plc,Accenture,Accenture plc
...,...,...,...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71,Nevada State Bank,Nevada Sta,Nevada State Bank
3015,Zions Bancorp,Vectra Bank Colorado,50.71,Vectra Bank Colorado,Vectra Ban,Vectra Bank Colorado
3016,Zions Bancorp,Zions Bank,50.71,Zions Bank,Zions Bank,Zions Bank
3017,Zions Bancorp,Zions Direct,50.71,Zions Direct,Zions Dire,Zions Direct


In [30]:
# Preview only: one row per first_10_letters value, keeping the last occurrence.
# The result is not assigned, so companies_list itself is unchanged.
companies_list.drop_duplicates(subset='first_10_letters', keep="last")

Unnamed: 0,company,subsidiaries,price,all_names,first_10_letters,last_20_letters
0,3M Company,,222.89,3M Company,3M Company,3M Company
1,A.O. Smith Corp,,60.24,A.O. Smith Corp,A.O. Smith,A.O. Smith Corp
2,Abbott Laboratories,,56.27,Abbott Laboratories,Abbott Lab,Abbott Laboratories
3,AbbVie Inc.,,108.48,AbbVie Inc.,AbbVie Inc,AbbVie Inc.
4,Accenture plc,,150.51,Accenture plc,Accenture,Accenture plc
...,...,...,...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71,Nevada State Bank,Nevada Sta,Nevada State Bank
3015,Zions Bancorp,Vectra Bank Colorado,50.71,Vectra Bank Colorado,Vectra Ban,Vectra Bank Colorado
3016,Zions Bancorp,Zions Bank,50.71,Zions Bank,Zions Bank,Zions Bank
3017,Zions Bancorp,Zions Direct,50.71,Zions Direct,Zions Dire,Zions Direct


In [11]:
# Distinct matching keys on each side: "_2_" lists come from the company
# table, "_1_" lists from the reviews.
match_2_10 = companies_list['first_10_letters'].unique().tolist()
match_1_10 = reviews_food_beverages_tobacco['first_10_letters'].unique().tolist()

# NOTE: despite the "_15" suffix these lists hold the last-20-character keys.
match_2_15 = companies_list['last_20_letters'].unique().tolist()
match_1_15 = reviews_food_beverages_tobacco['last_20_letters'].unique().tolist()


In [12]:
def match_names(name, list_names, min_score=0):
    """Find the entry of ``list_names`` most similar to ``name``.

    Similarity is measured with ``fuzz.ratio``. A candidate only counts when
    its score is strictly greater than ``min_score``; when several candidates
    share the best score, the first one encountered is kept.

    Returns a ``(best_name, best_score)`` tuple, or ``("", -1)`` when no
    candidate exceeds the threshold.
    """
    best_name, best_score = "", -1
    for candidate in list_names:
        score = fuzz.ratio(name, candidate)
        # Skip candidates at or below the threshold or the current best.
        if score <= min_score or score <= best_score:
            continue
        best_name, best_score = candidate, score
    return (best_name, best_score)

In [13]:
# Fuzzy-match each review-side 10-character key against the company-side keys
# and keep the pairs scoring at least 90. Keys get a leading "(" and values a
# trailing ")"; both are stripped again when the lookup table is built.
names = []
for review_key in match_1_10:
    best_name, best_score = match_names(review_key, match_2_10, 90)
    if best_score < 90:
        continue
    names.append(('(' + str(review_key), str(best_name) + ')'))
name_dict_first = dict(names)

name_dict_first


{'(Pizza Hut': 'Pizza Hut)',
 "(McDonald's": "McDonald's)",
 '(Shipt': 'Shipt)',
 '(KFC': 'KFC)',
 '(California': 'California)',
 '(Olive Gard': 'Olive Gard)',
 '(Starbucks': 'Starbucks)',
 "(See's Cand": "See's Cand)",
 '(Full Circl': 'Full Circl)'}

In [14]:
# Same matching as for the first-10 keys, but on the last-20-character keys
# (held in the "_15" lists). Pairs scoring at least 90 are kept, with a
# leading "(" on the key and a trailing ")" on the value.
names_2 = []
for review_key in match_1_15:
    best_name, best_score = match_names(review_key, match_2_15, 90)
    if best_score < 90:
        continue
    names_2.append(('(' + str(review_key), str(best_name) + ')'))
name_dict_last = dict(names_2)

name_dict_last

{'(Pizza Hut': 'Pizza Hut)',
 '(Shipt': 'Shipt)',
 '(KFC': 'KFC)',
 '(Olive Garden': 'Olive Garden)',
 '(Starbucks': 'Starbucks)',
 "(See's Candies": "See's Candies)",
 '(Full Circle': 'Full Circle)'}

In [15]:
# Turn the matched pairs into a two-column lookup table. The column names are
# passed to the constructor so the cell also works when nothing matched
# (assigning .columns on an empty, column-less frame raises).
first_10_letters_df = pd.DataFrame(
    list(name_dict_first.items()),
    columns=['review_company_abbrev', 'exchange_company_abbrev'],
)
# Strip the "(" prefix / ")" suffix that the dictionary keys and values carry.
first_10_letters_df['review_company_abbrev'] = first_10_letters_df['review_company_abbrev'].str[1:]
first_10_letters_df['exchange_company_abbrev'] = first_10_letters_df['exchange_company_abbrev'].str[:-1]
first_10_letters_df



Unnamed: 0,review_company_abbrev,exchange_company_abbrev
0,Pizza Hut,Pizza Hut
1,McDonald's,McDonald's
2,Shipt,Shipt
3,KFC,KFC
4,California,California
5,Olive Gard,Olive Gard
6,Starbucks,Starbucks
7,See's Cand,See's Cand
8,Full Circl,Full Circl


In [16]:
# Turn the matched pairs into a two-column lookup table. The column names are
# passed to the constructor so the cell also works when nothing matched
# (assigning .columns on an empty, column-less frame raises).
last_20_letters_df = pd.DataFrame(
    list(name_dict_last.items()),
    columns=['review_company_abbrev', 'exchange_company_abbrev'],
)
# Strip the "(" prefix / ")" suffix that the dictionary keys and values carry.
last_20_letters_df['review_company_abbrev'] = last_20_letters_df['review_company_abbrev'].str[1:]
last_20_letters_df['exchange_company_abbrev'] = last_20_letters_df['exchange_company_abbrev'].str[:-1]
last_20_letters_df

Unnamed: 0,review_company_abbrev,exchange_company_abbrev
0,Pizza Hut,Pizza Hut
1,Shipt,Shipt
2,KFC,KFC
3,Olive Garden,Olive Garden
4,Starbucks,Starbucks
5,See's Candies,See's Candies
6,Full Circle,Full Circle


In [26]:
# Attach the reviews to each lookup table through the matching key.
first_10_letters_reviews = pd.merge(
    first_10_letters_df,
    reviews_food_beverages_tobacco,
    left_on='review_company_abbrev',
    right_on='first_10_letters',
    how='inner',
)
last_20_letters_reviews = pd.merge(
    last_20_letters_df,
    reviews_food_beverages_tobacco,
    left_on='review_company_abbrev',
    right_on='last_20_letters',
    how='inner',
)
# Stack both results. pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0. A review matched through both keys appears twice here.
Merged_fuzzy_reviews = pd.concat(
    [first_10_letters_reviews, last_20_letters_reviews],
    ignore_index=True,
)
Merged_fuzzy_reviews


Unnamed: 0,review_company_abbrev,exchange_company_abbrev,company,category,total_reviews,general_rating,review_rating,review_text,first_10_letters,last_20_letters
0,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,1,Hair in the pizza. Greasy food. Overpriced. Ba...,Pizza Hut,Pizza Hut
1,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,4,"Really friendly staff, great pizzas! Our serve...",Pizza Hut,Pizza Hut
2,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,3,Good atmosphere but dry and tastyless pizza do...,Pizza Hut,Pizza Hut
3,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,2,ordered a pizza online for delivery pizza took...,Pizza Hut,Pizza Hut
4,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,1,I would give it a big fat zero if I could. I p...,Pizza Hut,Pizza Hut
...,...,...,...,...,...,...,...,...,...,...
1222,See's Candies,See's Candies,See's Candies,Chocolate Factory,133,1.8,1,"I ordered a $200, five pound custom-mix box of...",See's Cand,See's Candies
1223,See's Candies,See's Candies,See's Candies,Chocolate Factory,133,1.8,1,I placed Mother’s Day on April 29th. Delivery ...,See's Cand,See's Candies
1224,See's Candies,See's Candies,See's Candies,Chocolate Factory,133,1.8,1,I placed an order on Dec 9 and was told my ite...,See's Cand,See's Candies
1225,Full Circle,Full Circle,Full Circle,Online Food Ordering Service,15,2.8,3,"It's a nice, small shop. The staff was friendl...",Full Circl,Full Circle


In [22]:
# For each review keep only the words that have not appeared in any earlier
# review (in row order), then drop rows whose filtered text is empty.
# NOTE(review): this also removes rows whose words were all seen before in
# other reviews, not only exact repeats of a review - confirm this is intended.
seen = set()
out = []
for review in Merged_fuzzy_reviews['review_text']:
    tokens = review.split()
    unseen_tokens = [token for token in tokens if token not in seen]
    out.append(' '.join(unseen_tokens))
    seen |= set(tokens)

Merged_fuzzy_reviews['Final_review_text'] = out
has_new_words = Merged_fuzzy_reviews['Final_review_text'].ne('')
Merged_fuzzy_reviews = Merged_fuzzy_reviews[has_new_words]
Merged_fuzzy_reviews


Unnamed: 0,review_company_abbrev,exchange_company_abbrev,company,category,total_reviews,general_rating,review_rating,review_text,first_10_letters,last_20_letters,Final_review_text
0,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,1,Hair in the pizza. Greasy food. Overpriced. Ba...,Pizza Hut,Pizza Hut,Hair in the pizza. Greasy food. Overpriced. Ba...
1,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,4,"Really friendly staff, great pizzas! Our serve...",Pizza Hut,Pizza Hut,"Really friendly staff, great pizzas! Our serve..."
2,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,3,Good atmosphere but dry and tastyless pizza do...,Pizza Hut,Pizza Hut,Good atmosphere but dry tastyless doughy base....
3,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,2,ordered a pizza online for delivery pizza took...,Pizza Hut,Pizza Hut,ordered online delivery took 2 hours come esti...
4,Pizza Hut,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,1,I would give it a big fat zero if I could. I p...,Pizza Hut,Pizza Hut,would big fat zero could. phoned up gluten fre...
...,...,...,...,...,...,...,...,...,...,...,...
639,See's Cand,See's Cand,See's Candies,Chocolate Factory,133,1.8,5,"On Feb 19Th, I gave See's Candie a one star re...",See's Cand,See's Candies,"19Th, Candie rectify-improve Showing star. Qua..."
640,See's Cand,See's Cand,See's Candies,Chocolate Factory,133,1.8,1,"I ordered a $200, five pound custom-mix box of...",See's Cand,See's Candies,"$200, custom-mix Dec Dec 17-21 present expensi..."
641,See's Cand,See's Cand,See's Candies,Chocolate Factory,133,1.8,1,I placed Mother’s Day on April 29th. Delivery ...,See's Cand,See's Candies,"Mother’s 29th. promised May 6, 7th, 8th,9,10, ..."
642,See's Cand,See's Cand,See's Candies,Chocolate Factory,133,1.8,1,I placed an order on Dec 9 and was told my ite...,See's Cand,See's Candies,shipped 12-18. Tracking label created LA ship....


In [28]:
companies_list
# 3019 rows were noted when this cell was written.
# NOTE(review): the index in the saved output ends at 3017 - confirm the count.

Unnamed: 0,company,subsidiaries,price,all_names,first_10_letters,last_20_letters
0,3M Company,,222.89,3M Company,3M Company,3M Company
1,A.O. Smith Corp,,60.24,A.O. Smith Corp,A.O. Smith,A.O. Smith Corp
2,Abbott Laboratories,,56.27,Abbott Laboratories,Abbott Lab,Abbott Laboratories
3,AbbVie Inc.,,108.48,AbbVie Inc.,AbbVie Inc,AbbVie Inc.
4,Accenture plc,,150.51,Accenture plc,Accenture,Accenture plc
...,...,...,...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71,Nevada State Bank,Nevada Sta,Nevada State Bank
3015,Zions Bancorp,Vectra Bank Colorado,50.71,Vectra Bank Colorado,Vectra Ban,Vectra Bank Colorado
3016,Zions Bancorp,Zions Bank,50.71,Zions Bank,Zions Bank,Zions Bank
3017,Zions Bancorp,Zions Direct,50.71,Zions Direct,Zions Dire,Zions Direct


In [29]:
# Preview only: one row per first_10_letters value, keeping the last occurrence.
# The result is not assigned, so companies_list itself is unchanged.
companies_list.drop_duplicates(subset='first_10_letters', keep="last")


Unnamed: 0,company,subsidiaries,price,all_names,first_10_letters,last_20_letters
0,3M Company,,222.89,3M Company,3M Company,3M Company
1,A.O. Smith Corp,,60.24,A.O. Smith Corp,A.O. Smith,A.O. Smith Corp
2,Abbott Laboratories,,56.27,Abbott Laboratories,Abbott Lab,Abbott Laboratories
3,AbbVie Inc.,,108.48,AbbVie Inc.,AbbVie Inc,AbbVie Inc.
4,Accenture plc,,150.51,Accenture plc,Accenture,Accenture plc
...,...,...,...,...,...,...
3014,Zions Bancorp,Nevada State Bank,50.71,Nevada State Bank,Nevada Sta,Nevada State Bank
3015,Zions Bancorp,Vectra Bank Colorado,50.71,Vectra Bank Colorado,Vectra Ban,Vectra Bank Colorado
3016,Zions Bancorp,Zions Bank,50.71,Zions Bank,Zions Bank,Zions Bank
3017,Zions Bancorp,Zions Direct,50.71,Zions Direct,Zions Dire,Zions Direct


In [24]:
# Join the matched reviews with the company table on both name keys. Columns
# present in both frames get the default _x (reviews) / _y (companies) suffix.
bigdata1 = pd.merge(
    Merged_fuzzy_reviews,
    companies_list,
    how='inner',
    on=['first_10_letters', 'last_20_letters'],
)

# Keep the analysis columns. The .copy() makes bigdata an independent frame,
# so the column assignments in later cells do not write into a slice of
# bigdata1 (SettingWithCopyWarning).
bigdata = bigdata1[['company_x', 'subsidiaries', 'category', 'total_reviews', 'general_rating',
                    'review_rating', 'review_text', 'first_10_letters', 'last_20_letters',
                    'company_y', 'price']].copy()


In [25]:
# Display the merged review / company table.
bigdata

Unnamed: 0,company_x,subsidiaries,category,total_reviews,general_rating,review_rating,review_text,first_10_letters,last_20_letters,company_y,price
0,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,1,Hair in the pizza. Greasy food. Overpriced. Ba...,Pizza Hut,Pizza Hut,Yum! Brands Inc,76.3
1,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,4,"Really friendly staff, great pizzas! Our serve...",Pizza Hut,Pizza Hut,Yum! Brands Inc,76.3
2,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,3,Good atmosphere but dry and tastyless pizza do...,Pizza Hut,Pizza Hut,Yum! Brands Inc,76.3
3,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,2,ordered a pizza online for delivery pizza took...,Pizza Hut,Pizza Hut,Yum! Brands Inc,76.3
4,Pizza Hut,Pizza Hut,Food Products Supplier,1373,2.2,1,I would give it a big fat zero if I could. I p...,Pizza Hut,Pizza Hut,Yum! Brands Inc,76.3
...,...,...,...,...,...,...,...,...,...,...,...
535,See's Candies,See's Candies,Chocolate Factory,133,1.8,5,"On Feb 19Th, I gave See's Candie a one star re...",See's Cand,See's Candies,Berkshire Hathaway,191.42
536,See's Candies,See's Candies,Chocolate Factory,133,1.8,1,"I ordered a $200, five pound custom-mix box of...",See's Cand,See's Candies,Berkshire Hathaway,191.42
537,See's Candies,See's Candies,Chocolate Factory,133,1.8,1,I placed Mother’s Day on April 29th. Delivery ...,See's Cand,See's Candies,Berkshire Hathaway,191.42
538,See's Candies,See's Candies,Chocolate Factory,133,1.8,1,I placed an order on Dec 9 and was told my ite...,See's Cand,See's Candies,Berkshire Hathaway,191.42


In [None]:
# Convert total_reviews to float. Thousands separators (",") are only stripped
# while the column still holds text, so the cell can be re-run after the
# conversion: the .str accessor raises on a numeric column.
total_reviews = bigdata['total_reviews']
if total_reviews.dtype == object:
    total_reviews = total_reviews.str.replace(',', '', regex=False)
bigdata['total_reviews'] = pd.to_numeric(total_reviews).astype(float)

# Make sure the rating columns are numeric as well.
bigdata['general_rating'] = pd.to_numeric(bigdata['general_rating'])
bigdata['review_rating'] = pd.to_numeric(bigdata['review_rating'])


In [None]:
# Treat empty review texts as missing, drop those rows and make sure the
# remaining texts are strings. Each step returns a new frame: calling
# replace(inplace=True) on `bigdata['review_text']` acts on an intermediate
# Series and does not update the frame when pandas Copy-on-Write is active.
# (numpy is already imported as np in the first cell.)
bigdata = (
    bigdata
    .assign(review_text=bigdata['review_text'].replace('', np.nan))
    .dropna(subset=['review_text'])
    .astype({'review_text': str})
)



In [None]:
# Characters that are replaced by a space before tokenising.
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    # regex=False forces a literal replacement. Many of these characters are
    # regex metacharacters and the default of `regex` differs between pandas
    # versions.
    bigdata['review_text'] = bigdata['review_text'].str.replace(char, ' ', regex=False)

# Collapse the runs of whitespace left behind into single spaces.
bigdata['review_text'] = bigdata['review_text'].str.split().str.join(" ")


In [None]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, fetching them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise a review text for word-level analysis.

    The text is lower-cased and split on single spaces. Punctuation is stripped
    from both ends of every token, and a token is then dropped when it contains
    a digit, is a stop word, or has fewer than two characters.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop words, which are
        fetched on first use and reused by later calls.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    tokens = [word.strip(string.punctuation) for word in text.lower().split(" ")]
    # NOTE: the part-of-speech tagging that used to run here was removed. Its
    # result only fed a commented-out lemmatisation step, which relied on a
    # get_wordnet_pos helper that is not defined in the visible notebook.
    kept = [
        token for token in tokens
        if len(token) > 1
        and token not in stop_words
        and not any(ch.isdigit() for ch in token)
    ]
    return " ".join(kept)

# Store the normalised version of every review in a new column.
bigdata["review_clean"] = bigdata["review_text"].apply(clean_text)



In [None]:
from collections import Counter
# Count token frequencies over the distinct cleaned reviews and print the
# 30 most frequent tokens.
all_names = bigdata['review_clean'].unique()
names_freq = Counter(
    token
    for review in all_names
    for token in str(review).split(" ")
)
key_words = [word for word, _count in names_freq.most_common(30)]
print(key_words)


In [None]:
# Add sentiment analysis columns: score every cleaned review with NLTK's
# SentimentIntensityAnalyzer and expand the returned score dictionaries into
# one column per key.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
bigdata["sentiments"] = bigdata["review_clean"].map(sid.polarity_scores)
sentiment_scores = bigdata["sentiments"].apply(pd.Series)
bigdata = pd.concat([bigdata.drop(columns=["sentiments"]), sentiment_scores], axis=1)


In [None]:
# Create the Afinn scorer configured for English (language='en').
from afinn import Afinn
afinn = Afinn(language='en')

In [None]:
# Show the first rows of the table, including the sentiment columns added so far.
bigdata.head()

In [None]:
# Score every cleaned review with the Afinn scorer.
bigdata['afinn_score'] = bigdata['review_clean'].map(afinn.score)



In [None]:
# Summary statistics of the Afinn scores.
bigdata['afinn_score'].describe()

In [None]:
# The ten reviews with the lowest Afinn score.
columns_to_display = bigdata.loc[:, ['review_clean', 'afinn_score']]

columns_to_display.sort_values(by='afinn_score', ascending=True).head(10)

In [None]:
# The ten reviews with the highest Afinn score (end of the ascending sort).
columns_to_display.sort_values(by='afinn_score').tail(10)

In [None]:
def word_count(text_string):
    '''Return how many whitespace-separated words a string contains.'''
    words = text_string.split()
    return len(words)

# Number of words in each cleaned review, followed by summary statistics.
bigdata['word_count'] = bigdata['review_clean'].map(word_count)

bigdata['word_count'].describe()

In [None]:
# Afinn score per 100 words, so long and short reviews are comparable.
bigdata['afinn_adjusted'] = bigdata['afinn_score'].div(bigdata['word_count']).mul(100)
bigdata['afinn_adjusted'].describe()



In [None]:
def count_occurences(text, word_list):
    '''Count how many words of a text appear in a word list.

    The text is split into words with ``text_to_words``. Every word that is
    contained in ``word_list`` counts, so a word repeated in the text is
    counted each time it occurs.

    Parameters
    ----------
    text : str
        Text to scan.
    word_list : iterable of hashable
        Words to look for, e.g. a list or a one-dimensional array.

    Returns
    -------
    int
        Number of words in ``text`` that are members of ``word_list``.
    '''
    # Build the lookup once: testing membership against a list or array for
    # every word rescans the whole word list each time.
    vocabulary = set(word_list)
    return sum(1 for word in text_to_words(text) if word in vocabulary)

def text_to_words(text):
    '''Lower-case a string, delete every punctuation character and
    return the remaining whitespace-separated words as a list.'''
    p = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # Translation table that maps each punctuation character to "delete".
    without_punctuation = text.lower().translate(str.maketrans('', '', p))
    return without_punctuation.split()

In [None]:
# Load the sustainability vocabulary from the "sustainability_words" table.
sustainability_words = pd.read_sql_table(table_name="sustainability_words", con=engine)

In [None]:
# Number of social-sustainability vocabulary words in each cleaned review.
social_words_list = sustainability_words['social_sustainability'].values

bigdata['social_sustainability_words'] = bigdata['review_clean'].map(
    lambda review: count_occurences(review, social_words_list)
)


In [None]:
# Number of environmental-sustainability vocabulary words in each cleaned review.
environmental_words_list = sustainability_words['environmental_sustainability'].values

bigdata['environmental_sustainability_words'] = bigdata['review_clean'].map(
    lambda review: count_occurences(review, environmental_words_list)
)



In [None]:
# Number of economic-sustainability vocabulary words in each cleaned review.
economic_words_list = sustainability_words['economic_sustainability'].values

bigdata['economic_sustainability_words'] = bigdata['review_clean'].map(
    lambda review: count_occurences(review, economic_words_list)
)



In [None]:
# Summary statistics of the three per-review word counts.
# The list has its own name: rebinding `sustainability_words` (the vocabulary
# table read from the database) to this list made the word-count cells above
# fail when re-run after this cell.
sustainability_count_columns = ['environmental_sustainability_words', 'economic_sustainability_words', 'social_sustainability_words']
bigdata[sustainability_count_columns].describe()



In [None]:
# How many reviews contain 0, 1, 2, ... social-sustainability words.
bigdata['social_sustainability_words'].value_counts()

In [None]:
# How many reviews contain 0, 1, 2, ... economic-sustainability words.
bigdata['economic_sustainability_words'].value_counts()

In [None]:
# How many reviews contain 0, 1, 2, ... environmental-sustainability words.
bigdata['environmental_sustainability_words'].value_counts()

In [None]:
# Keep the reviews that mention at least one sustainability word of any kind.
mentions_sustainability = (
    (bigdata['social_sustainability_words'] >= 1)
    | (bigdata['environmental_sustainability_words'] >= 1)
    | (bigdata['economic_sustainability_words'] >= 1)
)
# .copy() makes the selection an independent frame, so the columns added in
# later cells are not written into a slice of bigdata (SettingWithCopyWarning).
filter_bigdata = bigdata[mentions_sustainability].copy()
filter_bigdata

In [None]:

# Per-company totals, repeated on every row of the company:
#   total_sust_reviews        - number of rows (reviews) of the company
#   total_<kind>_reviews      - sum of the <kind> word counts of the company
by_company = filter_bigdata.groupby('company_x')
filter_bigdata = filter_bigdata.assign(
    total_sust_reviews=by_company['company_x'].transform('count'),
    total_environmental_reviews=by_company['environmental_sustainability_words'].transform('sum'),
    total_social_reviews=by_company['social_sustainability_words'].transform('sum'),
    total_economic_reviews=by_company['economic_sustainability_words'].transform('sum'),
)




In [None]:
def divide_two_cols(df_sub):
    """Return a copy of ``df_sub`` with a 'Sustainability Index' column.

    The index is ``total_sust_reviews / total_reviews * 100``, computed row by
    row. The input frame is left untouched: functions passed to
    ``groupby(...).apply`` must not mutate the frame they receive.
    """
    sustainability_index = df_sub['total_sust_reviews'] / df_sub['total_reviews'] * 100
    return df_sub.assign(**{'Sustainability Index': sustainability_index})


# The index is a row-wise ratio, so the function is applied to the whole frame.
# Routing it through groupby('company_x').apply(...) gives the same values, but
# the shape of that result depends on the pandas version (group keys added to
# the index, grouping column withheld from the function).
filter_bigdata = divide_two_cols(filter_bigdata)


In [None]:
def ratio_division_social(df_sub):
    """Return a copy of ``df_sub`` with a 'Social Index' column.

    The index is ``total_social_reviews / total_sust_reviews * 100``, computed
    row by row. The input frame is left untouched: functions passed to
    ``groupby(...).apply`` must not mutate the frame they receive.
    """
    social_index = df_sub['total_social_reviews'] / df_sub['total_sust_reviews'] * 100
    return df_sub.assign(**{'Social Index': social_index})

# The index is a row-wise ratio, so the function is applied to the whole frame.
# Routing it through groupby('company_x').apply(...) gives the same values, but
# the shape of that result depends on the pandas version (group keys added to
# the index, grouping column withheld from the function).
filter_bigdata = ratio_division_social(filter_bigdata)



In [None]:
def ratio_division_environmental(df_sub):
    """Return a copy of ``df_sub`` with an 'Environmental Index' column.

    The index is ``total_environmental_reviews / total_sust_reviews * 100``,
    computed row by row. The input frame is left untouched: functions passed
    to ``groupby(...).apply`` must not mutate the frame they receive.
    """
    environmental_index = df_sub['total_environmental_reviews'] / df_sub['total_sust_reviews'] * 100
    return df_sub.assign(**{'Environmental Index': environmental_index})

# The index is a row-wise ratio, so the function is applied to the whole frame.
# Routing it through groupby('company_x').apply(...) gives the same values, but
# the shape of that result depends on the pandas version (group keys added to
# the index, grouping column withheld from the function).
filter_bigdata = ratio_division_environmental(filter_bigdata)


In [None]:
def ratio_division_economic(df_sub):
    """Return a copy of ``df_sub`` with an 'Economic Index' column.

    The index is ``total_economic_reviews / total_sust_reviews * 100``,
    computed row by row. The input frame is left untouched: functions passed
    to ``groupby(...).apply`` must not mutate the frame they receive.
    """
    economic_index = df_sub['total_economic_reviews'] / df_sub['total_sust_reviews'] * 100
    return df_sub.assign(**{'Economic Index': economic_index})
# The index is a row-wise ratio, so the function is applied to the whole frame.
# Routing it through groupby('company_x').apply(...) gives the same values, but
# the shape of that result depends on the pandas version (group keys added to
# the index, grouping column withheld from the function).
filter_bigdata = ratio_division_economic(filter_bigdata)

In [None]:
# One summary row per company: select the reporting columns and keep the last
# row of each company_x value.
final_columns = [
    'company_x', 'subsidiaries', 'category', 'total_reviews',
    'total_sust_reviews', 'Sustainability Index',
    'Social Index', 'Environmental Index', 'Economic Index', 'price',
]
final_df = filter_bigdata[final_columns].drop_duplicates(subset='company_x', keep="last")

final_df