# 🗣️ Sentiment Analysis: Aspect-Based Opinion Mining

Applying topic modelling and sentiment analysis techniques in order to extract text aspects and the sentiment expressed towards such aspects.

Installing necessary libraries

In [None]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 25.3 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 28.6 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 20.2 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 16.9 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 7.5 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 8.8 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 8.0 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 9.0 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 9.7 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 7.4 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 7.4 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 7.4 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 7.4 M

Importing necessary libraries

In [None]:
import json
import pandas as pd
import numpy as np
import re
import sys
import nltk

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

from keras.models import load_model

nltk.download('punkt')
from nltk.corpus import stopwords, sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import collections
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#from stop_words_list import stop_words_list
from wn_affect import wn_affect 
import matplotlib.pyplot as plt
from wordcloud import WordCloud



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data was extracted from [Cell Phones Reviews Data](http://jmcauley.ucsd.edu/data/amazon/links.html)

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the review
# dataset is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import gzip

def parse(path):
  """Yield one decoded record per line of a gzip-compressed file.

  Each line is decoded as strict JSON first. Lines that are Python literals
  rather than JSON (for example single-quoted keys) fall back to
  ``ast.literal_eval``. Unlike ``eval``, neither decoder can execute code
  embedded in the data file.

  Args:
    path: Path to the gzip file holding one record per line.

  Yields:
    The decoded object for every non-blank line, in file order.

  Raises:
    ValueError or SyntaxError: if a line is neither valid JSON nor a valid
      Python literal.
  """
  import ast
  import json

  # The context manager closes the file even if the caller stops iterating early.
  with gzip.open(path, 'rb') as g:
    for l in g:
      if not l.strip():
        continue  # blank lines carry no record
      try:
        record = json.loads(l)
      except ValueError:
        record = ast.literal_eval(l.decode('utf-8').strip())
      yield record

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file, so the frame has
  one row per record and an integer index counting up from zero.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Read the review dump from the mounted Drive and keep only the free-text column.
df = getDF('/content/drive/MyDrive/reviews_Cell_Phones_and_Accessories_5.json.gz')[['reviewText']]
display(df.head())

Unnamed: 0,reviewText
0,They look good and stick good! I just don't li...
1,These stickers work like the review says they ...
2,These are awesome and make my phone look so st...
3,Item arrived in great time and was in perfect ...
4,"awesome! stays on, and looks great. can be use..."


## Preprocessing Data

In [None]:
# Cast text to lowercase, drop apostrophes, turn punctuation into spaces,
# remove digit runs that follow a space, then collapse repeated spaces and
# trim both ends of the string.
#
# The patterns are raw strings passed with an explicit regex=True: since
# pandas 2.0, Series.str.replace treats the pattern as a literal by default,
# which would silently leave punctuation and numbers in place.

df['remove_lower_punct'] = (
    df['reviewText']
    .fillna('')                                  # later steps expect str, never NaN
    .str.lower()
    .str.replace("'", '', regex=False)           # "don't" -> "dont"
    .str.replace(r'[^\w\s]', ' ', regex=True)    # punctuation -> space
    .str.replace(r' \d+', ' ', regex=True)       # digit runs preceded by a space
    .str.replace(r' +', ' ', regex=True)         # collapse runs of spaces
    .str.strip()
)

display(df.head(10))

Unnamed: 0,reviewText,remove_lower_punct
0,They look good and stick good! I just don't li...,they look good and stick good i just dont like...
1,These stickers work like the review says they ...,these stickers work like the review says they ...
2,These are awesome and make my phone look so st...,these are awesome and make my phone look so st...
3,Item arrived in great time and was in perfect ...,item arrived in great time and was in perfect ...
4,"awesome! stays on, and looks great. can be use...",awesome stays on and looks great can be used o...
5,These make using the home button easy. My daug...,these make using the home button easy my daugh...
6,Came just as described.. It doesn't come unstu...,came just as described it doesnt come unstuck ...
7,it worked for the first week then it only char...,it worked for the first week then it only char...
8,"Good case, solid build. Protects phone all aro...",good case solid build protects phone all aroun...
9,This is a fantastic case. Very stylish and pro...,this is a fantastic case very stylish and prot...


In [None]:
# apply sentiment analysis
analyser = SentimentIntensityAnalyzer()


def _label_from_compound(compound):
    """Map a compound sentiment score to 'Positive', 'Negative' or 'Neutral'.

    Scores of at least 0.05 are positive, scores of at most -0.05 are
    negative, and anything in between is neutral.
    """
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'


# Score every review once. Labels are derived from the stored scores, so the
# two lists always have the same length and stay aligned with the rows of df.
sentiment_score_list = [
    analyser.polarity_scores(text)['compound']
    for text in df['remove_lower_punct'].values.tolist()
]
sentiment_label_list = [_label_from_compound(score) for score in sentiment_score_list]

df['sentiment'] = sentiment_label_list
df['sentiment score'] = sentiment_score_list

display(df.head(10))

Unnamed: 0,reviewText,remove_lower_punct,sentiment,sentiment score
0,They look good and stick good! I just don't li...,they look good and stick good i just dont like...,Positive,0.4922
1,These stickers work like the review says they ...,these stickers work like the review says they ...,Positive,0.9136
2,These are awesome and make my phone look so st...,these are awesome and make my phone look so st...,Positive,0.8481
3,Item arrived in great time and was in perfect ...,item arrived in great time and was in perfect ...,Positive,0.9584
4,"awesome! stays on, and looks great. can be use...",awesome stays on and looks great can be used o...,Positive,0.8957
5,These make using the home button easy. My daug...,these make using the home button easy my daugh...,Positive,0.8126
6,Came just as described.. It doesn't come unstu...,came just as described it doesnt come unstuck ...,Positive,0.7964
7,it worked for the first week then it only char...,it worked for the first week then it only char...,Negative,-0.4215
8,"Good case, solid build. Protects phone all aro...",good case solid build protects phone all aroun...,Positive,0.8555
9,This is a fantastic case. Very stylish and pro...,this is a fantastic case very stylish and prot...,Positive,0.9572


In [None]:
# tokenise string
#
# The cleaned text is selected by column name. The previous row[1] lookup
# depended on column order, and integer-position access on a labelled Series
# is deprecated in pandas.

df['tokenise'] = df['remove_lower_punct'].apply(nltk.word_tokenize)

display(df.head(10))

Unnamed: 0,reviewText,remove_lower_punct,sentiment,sentiment score,tokenise
0,They look good and stick good! I just don't li...,they look good and stick good i just dont like...,Positive,0.4922,"[they, look, good, and, stick, good, i, just, ..."
1,These stickers work like the review says they ...,these stickers work like the review says they ...,Positive,0.9136,"[these, stickers, work, like, the, review, say..."
2,These are awesome and make my phone look so st...,these are awesome and make my phone look so st...,Positive,0.8481,"[these, are, awesome, and, make, my, phone, lo..."
3,Item arrived in great time and was in perfect ...,item arrived in great time and was in perfect ...,Positive,0.9584,"[item, arrived, in, great, time, and, was, in,..."
4,"awesome! stays on, and looks great. can be use...",awesome stays on and looks great can be used o...,Positive,0.8957,"[awesome, stays, on, and, looks, great, can, b..."
5,These make using the home button easy. My daug...,these make using the home button easy my daugh...,Positive,0.8126,"[these, make, using, the, home, button, easy, ..."
6,Came just as described.. It doesn't come unstu...,came just as described it doesnt come unstuck ...,Positive,0.7964,"[came, just, as, described, it, doesnt, come, ..."
7,it worked for the first week then it only char...,it worked for the first week then it only char...,Negative,-0.4215,"[it, worked, for, the, first, week, then, it, ..."
8,"Good case, solid build. Protects phone all aro...",good case solid build protects phone all aroun...,Positive,0.8555,"[good, case, solid, build, protects, phone, al..."
9,This is a fantastic case. Very stylish and pro...,this is a fantastic case very stylish and prot...,Positive,0.9572,"[this, is, a, fantastic, case, very, stylish, ..."


In [None]:
# Fetch the NLTK data packages for this notebook; presumably these back the
# tokenising, stop-word removal and lemmatising steps respectively.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# initiate stopwords from nltk

stop_words = stopwords.words('english')

# add additional missing terms

#stop_words.extend(stop_words_list) 

# remove stopwords
#
# Membership is tested against a set built once from the list, so each token
# costs a hash lookup instead of a scan through every stop word.

stop_word_set = set(stop_words)

df['remove_stopwords'] = df['tokenise'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

display(df.head(10))

Unnamed: 0,reviewText,remove_lower_punct,sentiment,sentiment score,tokenise,remove_stopwords
0,They look good and stick good! I just don't li...,they look good and stick good i just dont like...,Positive,0.4922,"[they, look, good, and, stick, good, i, just, ...","[look, good, stick, good, dont, like, rounded,..."
1,These stickers work like the review says they ...,these stickers work like the review says they ...,Positive,0.9136,"[these, stickers, work, like, the, review, say...","[stickers, work, like, review, says, stick, gr..."
2,These are awesome and make my phone look so st...,these are awesome and make my phone look so st...,Positive,0.8481,"[these, are, awesome, and, make, my, phone, lo...","[awesome, make, phone, look, stylish, used, on..."
3,Item arrived in great time and was in perfect ...,item arrived in great time and was in perfect ...,Positive,0.9584,"[item, arrived, in, great, time, and, was, in,...","[item, arrived, great, time, perfect, conditio..."
4,"awesome! stays on, and looks great. can be use...",awesome stays on and looks great can be used o...,Positive,0.8957,"[awesome, stays, on, and, looks, great, can, b...","[awesome, stays, looks, great, used, multiple,..."
5,These make using the home button easy. My daug...,these make using the home button easy my daugh...,Positive,0.8126,"[these, make, using, the, home, button, easy, ...","[make, using, home, button, easy, daughter, li..."
6,Came just as described.. It doesn't come unstu...,came just as described it doesnt come unstuck ...,Positive,0.7964,"[came, just, as, described, it, doesnt, come, ...","[came, described, doesnt, come, unstuck, cute,..."
7,it worked for the first week then it only char...,it worked for the first week then it only char...,Negative,-0.4215,"[it, worked, for, the, first, week, then, it, ...","[worked, first, week, charge, phone, waste, mo..."
8,"Good case, solid build. Protects phone all aro...",good case solid build protects phone all aroun...,Positive,0.8555,"[good, case, solid, build, protects, phone, al...","[good, case, solid, build, protects, phone, ar..."
9,This is a fantastic case. Very stylish and pro...,this is a fantastic case very stylish and prot...,Positive,0.9572,"[this, is, a, fantastic, case, very, stylish, ...","[fantastic, case, stylish, protects, phone, ea..."


In [None]:
# One lemmatiser instance is shared by every row.
wordnet_lemmatizer = WordNetLemmatizer()


def _lemmatise_tokens(tokens):
    """Return a new list with each token replaced by its lemmatised form."""
    return list(map(wordnet_lemmatizer.lemmatize, tokens))


# Lemmatise the stop-word-filtered tokens of each review.
df['lemmatise'] = df['remove_stopwords'].apply(_lemmatise_tokens)

display(df.head(10))

Unnamed: 0,reviewText,remove_lower_punct,sentiment,sentiment score,tokenise,remove_stopwords,lemmatise
0,They look good and stick good! I just don't li...,they look good and stick good i just dont like...,Positive,0.4922,"[they, look, good, and, stick, good, i, just, ...","[look, good, stick, good, dont, like, rounded,...","[look, good, stick, good, dont, like, rounded,..."
1,These stickers work like the review says they ...,these stickers work like the review says they ...,Positive,0.9136,"[these, stickers, work, like, the, review, say...","[stickers, work, like, review, says, stick, gr...","[sticker, work, like, review, say, stick, grea..."
2,These are awesome and make my phone look so st...,these are awesome and make my phone look so st...,Positive,0.8481,"[these, are, awesome, and, make, my, phone, lo...","[awesome, make, phone, look, stylish, used, on...","[awesome, make, phone, look, stylish, used, on..."
3,Item arrived in great time and was in perfect ...,item arrived in great time and was in perfect ...,Positive,0.9584,"[item, arrived, in, great, time, and, was, in,...","[item, arrived, great, time, perfect, conditio...","[item, arrived, great, time, perfect, conditio..."
4,"awesome! stays on, and looks great. can be use...",awesome stays on and looks great can be used o...,Positive,0.8957,"[awesome, stays, on, and, looks, great, can, b...","[awesome, stays, looks, great, used, multiple,...","[awesome, stay, look, great, used, multiple, a..."
5,These make using the home button easy. My daug...,these make using the home button easy my daugh...,Positive,0.8126,"[these, make, using, the, home, button, easy, ...","[make, using, home, button, easy, daughter, li...","[make, using, home, button, easy, daughter, li..."
6,Came just as described.. It doesn't come unstu...,came just as described it doesnt come unstuck ...,Positive,0.7964,"[came, just, as, described, it, doesnt, come, ...","[came, described, doesnt, come, unstuck, cute,...","[came, described, doesnt, come, unstuck, cute,..."
7,it worked for the first week then it only char...,it worked for the first week then it only char...,Negative,-0.4215,"[it, worked, for, the, first, week, then, it, ...","[worked, first, week, charge, phone, waste, mo...","[worked, first, week, charge, phone, waste, mo..."
8,"Good case, solid build. Protects phone all aro...",good case solid build protects phone all aroun...,Positive,0.8555,"[good, case, solid, build, protects, phone, al...","[good, case, solid, build, protects, phone, ar...","[good, case, solid, build, protects, phone, ar..."
9,This is a fantastic case. Very stylish and pro...,this is a fantastic case very stylish and prot...,Positive,0.9572,"[this, is, a, fantastic, case, very, stylish, ...","[fantastic, case, stylish, protects, phone, ea...","[fantastic, case, stylish, protects, phone, ea..."


In [None]:
# initialise the count vectorizer (word bigrams only)

vectorizer = CountVectorizer(analyzer = 'word', ngram_range = (2, 2))

# join the processed data to be vectorised
#
# The lemmatised tokens are selected by column name and joined per review.
# This avoids the slow row-by-row iterrows() loop and the positional row[6]
# lookup, which breaks as soon as a column is added or reordered.

vectors = df['lemmatise'].apply(", ".join).tolist()

vectorised = vectorizer.fit_transform(vectors)

print(vectorised)

  (0, 1086472)	1
  (0, 792816)	1
  (0, 1823634)	1
  (0, 790481)	1
  (0, 532393)	1
  (0, 1057936)	1
  (0, 1612601)	1
  (0, 1690586)	1
  (0, 60709)	1
  (0, 242638)	1
  (0, 1727136)	1
  (0, 995490)	1
  (0, 1418365)	1
  (0, 968370)	1
  (0, 2143315)	1
  (0, 250048)	1
  (0, 1464261)	1
  (1, 1825024)	1
  (1, 2147702)	1
  (1, 1057851)	1
  (1, 1594582)	1
  (1, 1639612)	1
  (1, 1823640)	1
  (1, 807398)	1
  (1, 1819459)	1
  :	:
  (194438, 2159330)	1
  (194438, 1057019)	1
  (194438, 314130)	1
  (194438, 1404817)	1
  (194438, 1463149)	1
  (194438, 1097433)	1
  (194438, 1606087)	1
  (194438, 2146462)	1
  (194438, 973515)	1
  (194438, 1033663)	1
  (194438, 414712)	1
  (194438, 1373918)	1
  (194438, 1727718)	1
  (194438, 517036)	1
  (194438, 1369228)	1
  (194438, 413684)	1
  (194438, 250641)	1
  (194438, 784606)	1
  (194438, 2094992)	1
  (194438, 774571)	1
  (194438, 1988000)	1
  (194438, 1185916)	1
  (194438, 413450)	1
  (194438, 482485)	1
  (194438, 2057114)	1


In [None]:
# initialise LDA Model

lda_model = LatentDirichletAllocation(n_components = 6, # number of topics
                                  random_state = 10,          # random state
                                  evaluate_every = -1,      # compute perplexity every n iters, default: Don't
                                  n_jobs = 1,              # one worker (set to -1 to use all available CPUs)
                                 )

lda_output = lda_model.fit_transform(vectorised)

# column names

topic_names = ["Topic" + str(i) for i in range(1, lda_model.n_components + 1)]

# make the pandas dataframe

df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns = topic_names)

# get dominant topic for each document (1-based, matching the TopicN column names)

dominant_topic = (np.argmax(df_document_topic.values, axis=1)+1)
df_document_topic['Dominant_topic'] = dominant_topic

# join to original dataframes
#
# Topic columns left behind by an earlier run of this cell are dropped first.
# Otherwise a re-run would merge them a second time and pandas would rename
# the clashing columns with _x/_y suffixes, breaking later look-ups of
# 'Dominant_topic'.

df = df.drop(columns = df_document_topic.columns, errors = 'ignore')
df = pd.merge(df, df_document_topic, left_index = True, right_index = True, how = 'outer')
display(df.head(10))

In [None]:
# index names
docnames = ['Doc' + str(i) for i in range(len(df['reviewText']))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topic_names, index=docnames)

# Get dominant topic for each document
# NOTE(review): this value is a 0-based column position, whereas
# df['Dominant_topic'] and the index of df_topic_keywords below are 1-based.
# Confirm which convention is intended before relying on it.
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(lda_model.components_)

# Assign Column and Index
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    df_topic_keywords.columns = vectorizer.get_feature_names_out()
else:
    df_topic_keywords.columns = vectorizer.get_feature_names()

# Rows are labelled with the 1-based topic number parsed from "TopicN".
topics = [int(topic_name.replace('Topic', '')) for topic_name in topic_names]

df_topic_keywords.index = topics

# For every bigram (column): the topic with the largest weight, and that weight.
df_topic_no = pd.DataFrame(df_topic_keywords.idxmax())
df_scores = pd.DataFrame(df_topic_keywords.max())

tmp = pd.merge(df_topic_no, df_scores, left_index=True, right_index=True)
tmp.columns = ['topic', 'relevance_score']

display(tmp)

In [None]:
def _top_bigram_for_topic(topic_id):
    """Return [topic id, [highest-scoring bigram]] for one topic in `tmp`."""
    topic_rows = tmp.loc[tmp['topic'] == topic_id].reset_index()
    best_row = topic_rows.sort_values('relevance_score', ascending=False).head(1)
    return [best_row['topic'].unique()[0], list(best_row['index'].unique())]


# One entry per topic, in the order the topics first appear in `tmp`.
all_topics = pd.DataFrame(
    [_top_bigram_for_topic(topic_id) for topic_id in tmp['topic'].unique()],
    columns=['Dominant_topic', 'topic_name'],
)
display(all_topics)

In [None]:
# Count reviews per (topic, sentiment) pair and attach each topic's name.
results = (
    df.groupby(['Dominant_topic', 'sentiment'])
      .count()
      .reset_index()
      .merge(all_topics, on='Dominant_topic')
)
# topic_name holds a list of bigrams; flatten it to a single label.
results['topic_name'] = results['topic_name'].apply(', '.join)

# Reshape to one row per topic and one column per sentiment label; the
# 'sentiment score' column carries the per-group row count.
graph_results = (
    results[['topic_name', 'sentiment', 'sentiment score']]
    .pivot(index='topic_name', columns='sentiment', values='sentiment score')
    .reset_index()
    .set_index('topic_name')
)

display(graph_results)

In [None]:
# Bar chart of review counts per topic, grouped by sentiment label, saved as a PNG.
fig = graph_results.plot(kind='bar', rot=90, figsize=(10,10))
fig.figure.savefig('sentiment_analysis.png', bbox_inches='tight')

In [None]:
# Fetch the NLTK data packages presumably needed by the part-of-speech tagging
# (nltk.pos_tag) and SentiWordNet look-ups (swn.senti_synset) used later on.
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a treebank-style POS tag into the matching WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to the adjective, verb, noun
    and adverb constants respectively; any other tag yields None.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None

positive_words = []
negative_words = []

# Only reviews whose dominant topic is this one are mined for opinion words.
TOPIC_OF_INTEREST = 1

topic_reviews = df.loc[df['Dominant_topic'] == TOPIC_OF_INTEREST]

for tokens in topic_reviews['tokenise'].values.tolist():
    for word, treebank_tag in nltk.pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        # Skip tags with no WordNet equivalent and words outside the wn_affect
        # lexicon; neither can end up in the positive or negative list.
        if not isinstance(wordnet_pos, str) or word not in wn_affect:
            continue

        try:
            # Look up the first sense ('.01') of the word for this part of speech.
            synset = swn.senti_synset(word + '.' + wordnet_pos + '.01')
        except Exception:
            # No usable SentiWordNet entry for this word. Catching Exception
            # rather than using a bare except lets a kernel interrupt stop
            # this long-running loop.
            continue
        if synset is None:
            continue

        # Ignore senses that are scored as mostly objective.
        if synset.obj_score() > 0.49:
            continue

        if synset.pos_score() > synset.neg_score():
            positive_words.append(word)
        elif synset.neg_score() > synset.pos_score():
            negative_words.append(word)

In [None]:
# Tally every word in a single pass. Calling list.count() once per distinct
# word rescans the whole list each time.
count_positive_words = [[word, total] for word, total in collections.Counter(positive_words).items()]
count_negative_words = [[word, total] for word, total in collections.Counter(negative_words).items()]

unique_positive_words = [word for word, _ in count_positive_words]
unique_negative_words = [word for word, _ in count_negative_words]

# NOTE: positive_words / negative_words are rebound here from lists of words
# to DataFrames of (word, score). This cell can therefore only be re-run
# after re-running the cell that builds the lists.
positive_words = (
    pd.DataFrame(count_positive_words, columns = ['word', 'score'])
    .sort_values('score', ascending=False)
)
negative_words = (
    pd.DataFrame(count_negative_words, columns = ['word', 'score'])
    .sort_values('score', ascending=False)
)


In [None]:
# Preview the first rows of the positive-word table (sorted by score, descending).
positive_words.head()

In [None]:
# Map each positive word to its score and render the result as a word cloud.
word_dict = {word: score for word, score in positive_words.values}

wordcloud = WordCloud()
wordcloud.generate_from_frequencies(frequencies=word_dict)

plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('positive_words.png')
plt.show()

In [None]:
# Map each negative word to its score and redraw the existing WordCloud object
# with these frequencies.
word_dict = {word: score for word, score in negative_words.values}

wordcloud.generate_from_frequencies(frequencies=word_dict)

plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('negative_words.png')
plt.show()

Saving the model


In [None]:
import pickle

# Context managers make sure the file is flushed and closed before it is read
# back; the bare open() calls left both handles unclosed.
with open('lda_model.pk', 'wb') as model_file:
    pickle.dump(lda_model, model_file)

# then reload it with
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from the file.
# NOTE(review): the fitted `vectorizer` is not persisted here, yet it is what
# turns new text into the features the model was trained on. Consider saving
# it alongside the model.
with open('lda_model.pk', 'rb') as model_file:
    lda_model = pickle.load(model_file)



# Predicting with Jumia


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math

In [None]:

# Ask for the product page to analyse; whitespace picked up by copy/paste is dropped.
jumia_product_url = input('Please enter the cellphone product URL').strip()

# The timeout stops the cell from hanging indefinitely if the site never answers.
response = requests.get(jumia_product_url, timeout=30)
# Check successful response
if response.status_code != 200:
    raise Exception('Failed to load page {}'.format(jumia_product_url))
# Parse using BeautifulSoup
product_doc = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Locate the link from the product page to its reviews page.
# NOTE(review): '-plxs' is a site-specific CSS class, assumed to identify the
# reviews anchor. Confirm against the current page markup.
review_page = product_doc.find('a',{'class':'-plxs'})
if review_page is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise Exception('No reviews link found on page {}'.format(jumia_product_url))

base_url = 'https://www.jumia.co.ke'
review_url = base_url + review_page.get('href','')

# The timeout stops the cell from hanging indefinitely if the site never answers.
response = requests.get(review_url, timeout=30)
# Check successful response
if response.status_code != 200:
    raise Exception('Failed to load page {}'.format(review_url))
# Parse using BeautifulSoup
review_doc = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Work out how many reviews the product has and how many pages they span.

total_counts = review_doc.find_all('h2',{'class':'-fs14'})
review_count = total_counts[1].text

# The third whitespace-separated token carries the number; any parentheses
# around it are removed before conversion.
count_token = review_count.split()[2]
final_count = int(count_token.translate(str.maketrans('', '', '()')))

# presumably the site lists 10 reviews per page; confirm
total_review_pages = math.ceil(final_count / 10)


In [None]:
# Build the URL of every review page, numbered from 1 to total_review_pages.
reviews_base = base_url + review_page.get('href','')
review_url = [
    reviews_base + '?page=' + str(page_number)
    for page_number in range(1, total_review_pages + 1)
]

In [None]:

descs = []
for url in review_url:
  # The timeout stops the loop from hanging on an unresponsive page.
  response = requests.get(url, timeout=30)
  # Same check as for the product and review pages: an error page must not be
  # parsed as if it were a page of reviews.
  if response.status_code != 200:
    raise Exception('Failed to load page {}'.format(url))
  review_doc = BeautifulSoup(response.text, 'html.parser')
  desc_tags = review_doc.find_all('p',{'class':'-pvs'})
  descs.extend(tag.text for tag in desc_tags)

print('Extracted number of reviews is ' + str(len(descs)))
jumia_df = pd.DataFrame(descs,columns =['reviewText'])
jumia_df

In [None]:
# LatentDirichletAllocation has no predict(); per-document topic weights come
# from transform(), which expects the same bigram-count features the model was
# fitted on. The scraped reviews are therefore cleaned, tokenised, stripped of
# stop words and lemmatised like the training reviews, then vectorised with
# the already-fitted `vectorizer` before being scored.
if jumia_df.empty:
    # transform() rejects an empty input, so return an empty result directly.
    jumia_topics = jumia_df.assign(Dominant_topic=np.array([], dtype=int))
else:
    jumia_clean = (
        jumia_df['reviewText']
        .fillna('')
        .str.lower()
        .str.replace("'", '', regex=False)
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r' \d+', ' ', regex=True)
        .str.replace(r' +', ' ', regex=True)
        .str.strip()
    )
    jumia_stop_words = set(stop_words)
    jumia_docs = [
        ", ".join(
            wordnet_lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(text)
            if token not in jumia_stop_words
        )
        for text in jumia_clean
    ]
    jumia_topic_dist = lda_model.transform(vectorizer.transform(jumia_docs))
    # 1-based topic number taken from the rounded weights, mirroring how
    # 'Dominant_topic' is derived for the training reviews.
    jumia_topics = jumia_df.assign(
        Dominant_topic=np.argmax(np.round(jumia_topic_dist, 2), axis=1) + 1
    )

jumia_topics