<a href="https://colab.research.google.com/github/Chandanr77/Topic_Analysis_of_Review_Data/blob/main/Topic_Analysis_of_Review_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

### **1. Read the .csv file using Pandas. Take a look at the top few records**

In [4]:
reviews0 = pd.read_csv('K8 Reviews v0.2.csv')
reviews0.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


### **2. Normalize casings for the review text and extract the text into a list for easier manipulation.**

In [5]:
reviews0.shape

(14675, 2)

In [6]:
reviews0['review'] = reviews0['review'].str.lower()

In [7]:
reviews0['review'].head()

0               good but need updates and improvements
1    worst mobile i have bought ever, battery is dr...
2    when i will get my 10% cash back.... its alrea...
3                                                 good
4    the worst phone everthey have changed the last...
Name: review, dtype: object

In [8]:
reviews0['review'][0]

'good but need updates and improvements'

In [9]:
reviews_lower = [sent.lower() for sent in reviews0.review.values]
reviews_lower[0]

'good but need updates and improvements'

### **3. Tokenize the reviews using NLTKs word_tokenize function**

In [10]:
token = [word_tokenize(sent) for sent in reviews_lower]
token[0]

['good', 'but', 'need', 'updates', 'and', 'improvements']

In [11]:
#nltk.download('brown')
#nltk.download('averaged_perceptron_tagger')

### **4. Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.**



In [12]:
nltk.pos_tag(token[0])

[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [13]:
pos_tag = [nltk.pos_tag(wtoken) for wtoken in token]

In [14]:
pd.DataFrame(pos_tag).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817
0,"(good, JJ)","(but, CC)","(need, VBP)","(updates, NNS)","(and, CC)","(improvements, NNS)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,"(worst, JJS)","(mobile, NN)","(i, NN)","(have, VBP)","(bought, VBN)","(ever, RB)","(,, ,)","(battery, NN)","(is, VBZ)","(draining, VBG)","(like, IN)","(hell, NN)","(,, ,)","(backup, NN)","(is, VBZ)","(only, RB)","(6, CD)","(to, TO)","(7, CD)","(hours, NNS)","(with, IN)","(internet, JJ)","(uses, NNS)","(,, ,)","(even, RB)","(if, IN)","(i, JJ)","(put, VBP)","(mobile, JJ)","(idle, NN)","(its, PRP$)","(getting, VBG)","(discharged.this, NN)","(is, VBZ)","(biggest, JJS)","(lie, NN)","(from, IN)","(amazon, NN)","(&, CC)","(lenove, NN)",...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"(when, WRB)","(i, NN)","(will, MD)","(get, VB)","(my, PRP$)","(10, CD)","(%, NN)","(cash, NN)","(back, RB)","(..., :)","(., .)","(its, PRP$)","(already, RB)","(15, CD)","(january.., NN)",,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,"(good, JJ)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,"(the, DT)","(worst, JJS)","(phone, NN)","(everthey, NN)","(have, VBP)","(changed, VBN)","(the, DT)","(last, JJ)","(phone, NN)","(but, CC)","(the, DT)","(problem, NN)","(is, VBZ)","(still, RB)","(same, JJ)","(and, CC)","(the, DT)","(amazon, NN)","(is, VBZ)","(not, RB)","(returning, VBG)","(the, DT)","(phone, NN)","(.highly, RB)","(disappointing, JJ)","(of, IN)","(amazon, NN)",,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### **5. For the topic model, we should  want to include only nouns.**

a. Find out all the POS tags that correspond to nouns.

b. Limit the data to only terms with these tags.

In [15]:
import re
reviews_noun=[]
for sent in pos_tag:
 reviews_noun.append([token for token in sent if re.search("NN.*",
token[1])])

In [16]:
reviews_noun[0:3]

[[('updates', 'NNS'), ('improvements', 'NNS')],
 [('mobile', 'NN'),
  ('i', 'NN'),
  ('battery', 'NN'),
  ('hell', 'NN'),
  ('backup', 'NN'),
  ('hours', 'NNS'),
  ('uses', 'NNS'),
  ('idle', 'NN'),
  ('discharged.this', 'NN'),
  ('lie', 'NN'),
  ('amazon', 'NN'),
  ('lenove', 'NN'),
  ('battery', 'NN'),
  ('charger', 'NN'),
  ('hours', 'NNS'),
  ('don', 'NN')],
 [('i', 'NN'), ('%', 'NN'), ('cash', 'NN'), ('january..', 'NN')]]

### **6.Lemmatize.** 

a. Different forms of the terms need to be treated as one.

b. No need to provide POS tag to lemmatizer for now

In [17]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [18]:
reviews_lemmatized=[]
for sent in reviews_noun:
 reviews_lemmatized.append([wnl.lemmatize(word[0]) for word in sent])

In [19]:
reviews_lemmatized[0],reviews_lemmatized[1],reviews_lemmatized[2]


(['update', 'improvement'],
 ['mobile',
  'i',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'discharged.this',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour',
  'don'],
 ['i', '%', 'cash', 'january..'])

### **7.Remove stopwords and punctuation**(If any)

---



In [20]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [21]:
from string import punctuation
stop_words = stopwords.words("english")+ list(punctuation)

In [22]:
creviews = []
for sent in reviews_lemmatized:
 creviews.append([term for term in sent if term not in stop_words])

In [23]:
creviews[0:4]

[['update', 'improvement'],
 ['mobile',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'discharged.this',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour'],
 ['cash', 'january..'],
 []]

### **8. Create a topic model using LDA on the cleaned-up data with 12 topics.**

a. Print out the top terms for each topic.

b. What is the coherence of the model with the c_v metric?

In [24]:
!pip install --upgrade gensim

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)


In [25]:
import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.parsing.preprocessing import preprocess_documents
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.models.coherencemodel import CoherenceModel






In [26]:
id2word = corpora.Dictionary(creviews)
texts = creviews


In [27]:
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[200])

[(427, 1), (428, 1), (429, 1), (430, 1)]


In [28]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=12, random_state=42, passes=10, per_word_topics=True)


In [29]:
import pprint
lda_model.print_topics()

[(0,
  '0.329*"mobile" + 0.094*"charger" + 0.073*"heat" + 0.030*"box" + 0.022*"system" + 0.022*"worth" + 0.020*"turbo" + 0.019*"phone.." + 0.015*"class" + 0.013*"plz"'),
 (1,
  '0.226*"camera" + 0.097*"quality" + 0.037*"battery" + 0.030*"performance" + 0.020*"display" + 0.017*"price" + 0.017*"mode" + 0.014*"feature" + 0.013*"ram" + 0.013*"sound"'),
 (2,
  '0.157*"issue" + 0.101*"service" + 0.086*"money" + 0.070*"month" + 0.046*"value" + 0.043*"handset" + 0.029*"center" + 0.025*"replacement" + 0.020*"super" + 0.019*"amazon"'),
 (3,
  '0.093*"battery" + 0.044*"day" + 0.043*"time" + 0.037*"phone" + 0.034*"issue" + 0.029*"hour" + 0.025*"update" + 0.021*"device" + 0.018*"charge" + 0.018*"software"'),
 (4,
  '0.255*"battery" + 0.121*"backup" + 0.065*"waste" + 0.046*"superb" + 0.043*"everything" + 0.039*"money" + 0.027*"please" + 0.017*"awesome" + 0.014*"bill" + 0.011*"bit"'),
 (5,
  '0.114*"note" + 0.060*"k8" + 0.058*"lenovo" + 0.038*"amazon" + 0.036*"network" + 0.021*"customer" + 0.020*"cal

### **9. Analyze the topics through the business lens.**
a. Determine which of the topics can be combined.

In [30]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=creviews,dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()


In [31]:
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5330747959468586


### **10. Create a topic model using LDA with what you think is the optimal number of topics**

a. What is the coherence of the model?

In [32]:
lda_model8 = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=8, random_state=42, passes=10, per_word_topics=True)


In [33]:
coherence_model_lda = CoherenceModel(model=lda_model8, texts=creviews,dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4758110134966591


### **11. The business should be able to interpret the topics.**

a. Name each of the identified topics.

b. Create a table with the topic name and the top 10 terms in each to present to the business.

In [34]:
x = lda_model8.show_topics(formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
for topic,words in topics_words:
 print(str(topic)+ "::"+ str(words))
print()

0::['mobile', 'battery', 'heat', 'backup', 'charger', '..', 'box', 'issue', 'turbo', 'piece']
1::['camera', 'quality', 'phone', 'battery', 'performance', 'sound', 'mode', 'display', 'speaker', 'price']
2::['delivery', 'service', 'hai', 'handset', 'amazon', 'super', 'thanks', 'set', 'experience', 'hi']
3::['phone', 'battery', 'day', 'time', 'issue', 'hour', 'charge', 'feature', 'update', 'use']
4::['money', 'waste', 'everything', 'value', 'superb', 'smartphone', 'worth', 'please', 'awesome', 'date']
5::['note', 'phone', 'k8', 'issue', 'lenovo', 'call', 'network', 'amazon', 'service', 'sim']
6::['problem', 'heating', 'device', 'cast', 'issue', 'network', 'screen', 'month', 'phone..', 'bill']
7::['product', 'price', 'feature', 'phone', 'range', 'lenovo', 'buy', 'earphone', 'ok', 'cost']



**Topic - Business Name**

Topic0 Product Accessories

Topic1 options to be considered for shopping

Topic2 Performance of the eshopping platform

Topic3 Phone Performance

Topic4 feedbacks of the product

Topic5 Related to Communication ChannelChannels 

Topic6 Features on which pricing depends 

Topic8 Overall General Phone Features
