In [1]:
import warnings

In [2]:
# Step 1: load the review CSV into a pandas DataFrame and preview the first five rows.
import pandas as pd

reviews_v0 = pd.read_csv(filepath_or_buffer="K8 Reviews v0.2.csv")
reviews_v0.head(n=5)

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [3]:
# Step 2: swap every period for a space, lowercase the text, and gather the
# normalized reviews in a plain Python list for easier manipulation.
import re

_period_re = re.compile(r'\.')
reviews_lower = []
for raw_review in reviews_v0.review.values:
    reviews_lower.append(_period_re.sub(' ', raw_review).lower())
reviews_lower[0]

'good but need updates and improvements'

In [4]:
# Step 3: split each normalized review into word tokens with NLTK's word_tokenize.
import nltk
from nltk.tokenize import word_tokenize

reviews_token = list(map(word_tokenize, reviews_lower))
reviews_token[0]

['good', 'but', 'need', 'updates', 'and', 'improvements']

In [5]:
# Step 4: part-of-speech tag every tokenized review with the NLTK POS tagger.
# pos_tag_sents tags the whole batch of reviews in a single call, so the tagger
# is obtained once instead of once per review as with repeated pos_tag calls;
# the resulting (token, tag) pairs are the same.
from nltk import pos_tag, pos_tag_sents

reviews_pos = pos_tag_sents(reviews_token)
reviews_pos[0]

[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [6]:
'''
5. For the topic model, we should  want to include only nouns.
    1. Find out all the POS tags that correspond to nouns.
    2. Limit the data to only terms with these tags.
'''
import re

# For each tagged review keep only the words whose tag matches the "NN.*"
# pattern; reviews without such a word become empty lists.
reviews_noun = [
    [word for word, tag in tagged_review if re.search("NN.*", tag)]
    for tagged_review in reviews_pos
]
reviews_noun[0]

['updates', 'improvements']

In [7]:
'''
6. Lemmatize. 
    1. Different forms of the terms need to be treated as one.
    2. No need to provide POS tag to lemmatizer for now.
'''
from nltk.stem import WordNetLemmatizer

wnlem = WordNetLemmatizer()
# Lemmatize every retained word; no POS argument is passed, so the
# lemmatizer's default is used.
reviews_lem = [list(map(wnlem.lemmatize, nouns)) for nouns in reviews_noun]
reviews_lem[:3]

[['update', 'improvement'],
 ['mobile',
  'i',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour',
  'don'],
 ['i', '%', 'cash']]

In [8]:
# Step 7: remove stopwords and punctuation (if there are any).
from string import punctuation
from nltk.corpus import stopwords

# English stopwords + punctuation characters + corpus-specific noise tokens.
stopword = stopwords.words('english')+list(punctuation)+['😒😒☹️', '☺️🙂', 'u', '😪', 'o', 'r', '👌👌', 'com', 'mo', '😎😎', 'c', '👌👌👌', 'hai', 'h', 'k', 'ho', 'please', 'plz', 'ok', 'k8']
# Testing membership against the list scans it for every single token; a set
# gives the same answers with hash lookups. The list itself is kept unchanged.
stopword_set = set(stopword)
reviews_remstop = [
    [word for word in text if word not in stopword_set]
    for text in reviews_lem
]
reviews_remstop[0:5]

[['update', 'improvement'],
 ['mobile',
  'battery',
  'hell',
  'backup',
  'hour',
  'us',
  'idle',
  'lie',
  'amazon',
  'lenove',
  'battery',
  'charger',
  'hour'],
 ['cash'],
 [],
 ['phone', 'everthey', 'phone', 'problem', 'amazon', 'phone', 'amazon']]

In [9]:
'''8. Create a topic model using LDA on the cleaned-up data with 12 topics.

    1. Print out the top terms for each topic.

'''
from pprint import pprint

import gensim
from gensim import corpora
from gensim.models import CoherenceModel, ldamodel

# Shallow copy of the cleaned reviews, used to build the term dictionary.
new_token = reviews_remstop.copy()
id2word = corpora.Dictionary(new_token)
# Convert every cleaned review into its bag-of-words form: (term id, count) pairs.
corpus = list(map(id2word.doc2bow, reviews_remstop))
corpus[:5]

[[(0, 1), (1, 1)],
 [(2, 1),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 1),
  (7, 2),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1)],
 [(13, 1)],
 [],
 [(2, 2), (14, 1), (15, 3), (16, 1)]]

In [10]:
# Fit the 12-topic LDA model on the bag-of-words corpus; random_state is fixed
# so the fit can be repeated.
lda_model = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=12,
    passes=10,
    random_state=42,
    per_word_topics=True,
)

In [11]:
# Pretty-print the formatted "weight*term" summary of each topic of the 12-topic model.
topic_summaries_12 = lda_model.print_topics()
pprint(topic_summaries_12)

[(0,
  '0.227*"battery" + 0.083*"phone" + 0.042*"backup" + 0.034*"hour" + '
  '0.034*"issue" + 0.031*"day" + 0.031*"camera" + 0.030*"problem" + '
  '0.024*"life" + 0.023*"time"'),
 (1,
  '0.075*"camera" + 0.033*"device" + 0.029*"processor" + 0.028*"mode" + '
  '0.022*"screen" + 0.021*"music" + 0.020*"ram" + 0.018*"glass" + '
  '0.017*"video" + 0.016*"depth"'),
 (2,
  '0.415*"phone" + 0.034*"issue" + 0.026*"price" + 0.026*"lenovo" + '
  '0.014*"time" + 0.014*"update" + 0.014*"budget" + 0.012*"software" + '
  '0.011*"month" + 0.009*"lot"'),
 (3,
  '0.086*"handset" + 0.047*"star" + 0.042*"worth" + 0.041*"look" + 0.036*"set" '
  '+ 0.036*"speed" + 0.034*"touch" + 0.031*"k4" + 0.025*"light" + '
  '0.022*"rate"'),
 (4,
  '0.089*"service" + 0.079*"amazon" + 0.052*"waste" + 0.049*"day" + '
  '0.045*"delivery" + 0.043*"money" + 0.036*"time" + 0.034*"experience" + '
  '0.033*"replacement" + 0.033*"return"'),
 (5,
  '0.101*"problem" + 0.080*"network" + 0.070*"call" + 0.038*"issue" + '
  '0.034*"o

In [12]:
'''
    2. What is the coherence of the model with the c_v metric?
'''
# Score the 12-topic model with the c_v coherence measure, computed from the
# cleaned token lists together with the corpus and dictionary.
cm = CoherenceModel(
    model=lda_model,
    texts=reviews_remstop,
    corpus=corpus,
    dictionary=id2word,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.536935569123695

In [13]:
'''
9. Analyze the topics through the business lens.

    1. Determine which of the topics can be combined.
'''
import pyLDAvis
from pyLDAvis import enable_notebook
from pyLDAvis.gensim_models import prepare

# Turn on inline notebook rendering, then build the visualization of the
# 12-topic model from the model, corpus and dictionary.
enable_notebook()
LDAvis = prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
LDAvis

In [21]:
'''
10. Create topic model using LDA with what you think is the optimal number of topics

    1. What is the coherence of the model?
'''
# Silence DeprecationWarning output before fitting.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# Same settings as the 12-topic fit, but with three topics.
lda_model3 = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    random_state=42,
    per_word_topics=True,
)
# Formatted "weight*term" summary per topic, reused by the cells below.
top_10 = lda_model3.print_topics()

In [22]:
# Pretty-print the formatted top terms of each topic in the 3-topic model.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
pprint(top_10)

[(0,
  '0.106*"battery" + 0.097*"camera" + 0.049*"mobile" + 0.035*"quality" + '
  '0.029*"performance" + 0.019*"backup" + 0.016*"hour" + 0.016*"issue" + '
  '0.015*"charger" + 0.014*"day"'),
 (1,
  '0.074*"product" + 0.051*"problem" + 0.033*"issue" + 0.020*"time" + '
  '0.019*"lenovo" + 0.019*"amazon" + 0.018*"network" + 0.016*"service" + '
  '0.016*"call" + 0.016*"screen"'),
 (2,
  '0.240*"phone" + 0.032*"price" + 0.028*"feature" + 0.023*"note" + '
  '0.022*"money" + 0.015*"camera" + 0.013*"range" + 0.011*"lenovo" + '
  '0.010*"waste" + 0.010*"quality"')]


In [23]:
# c_v coherence of the 3-topic model, computed with the same inputs as for the
# 12-topic model.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
cm = CoherenceModel(
    model=lda_model3,
    texts=reviews_remstop,
    corpus=corpus,
    dictionary=id2word,
    coherence='c_v',
)
coherence = cm.get_coherence()
coherence

0.6662164320292238

In [24]:
'''
11. The business should  be able to interpret the topics.

    1. Name each of the identified topics.

'''
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# Build and display the visualization for the 3-topic model.
lda_vis3 = prepare(topic_model=lda_model3, corpus=corpus, dictionary=id2word)
lda_vis3

In [25]:
'''
    2. Create a table with the topic name and the top 10 terms in each to present to the  business.
'''
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Take the terms directly from the fitted model instead of parsing the
# formatted print_topics() strings: splitting those strings on '+' left every
# term padded with spaces, and the weight-stripping regex only worked while
# weights were printed as d.ddd.
# One column per topic ('index0', 'index1', ...), one row per term, ordered as
# returned by show_topic.
Table = pd.DataFrame({
    'index'+str(i): [term for term, _weight in lda_model3.show_topic(i, topn=10)]
    for i in range(lda_model3.num_topics)
})

In [26]:
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# Replace the positional column labels with the chosen topic names
# (in topic order 0, 1, 2).
Table.columns = [
    'mobile_issue',
    'product_service',
    'phone_price',
]

In [27]:
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# Display the finished topic table: one named column per topic.
Table

Unnamed: 0,mobile_issue,product_service,phone_price
0,battery,product,phone
1,camera,problem,price
2,mobile,issue,feature
3,quality,time,note
4,performance,lenovo,money
5,backup,amazon,camera
6,hour,network,range
7,issue,service,lenovo
8,charger,call,waste
9,day,screen,quality
