In [1]:
import pandas as pd
import numpy as np


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag


import string


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


from pprint import pprint


### Read the .csv file using Pandas. Take a look at the top few records.

In [2]:
df = pd.read_csv('K8 Reviews v0.2.csv')
df.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [3]:
df.shape

(14675, 2)

### Normalize casings for the review text and extract the text into a list for easier manipulation.

In [4]:
reviews_values = list(df['review'].values)
# to change it to lower case
reviews_list = [review.lower() for review in reviews_values]
len(reviews_list)

14675

In [5]:
reviews_list[0]

'good but need updates and improvements'

### Tokenize the reviews using NLTKs word_tokenize function

In [6]:
reviews_tokens = [word_tokenize(review) for review in reviews_list]
reviews_tokens[0]

['good', 'but', 'need', 'updates', 'and', 'improvements']

### Perform parts-of-speech tagging on each sentence using the NLTK POS tagger.

In [7]:
reviews_pos_tags = [pos_tag(review) for review in reviews_tokens]
reviews_pos_tags[0]

[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

### For the topic model, we should  want to include only nouns.

1- Find out all the POS tags that correspond to nouns.

2- Limit the data to only terms with these tags.

In [8]:
# 1- Find out all the POS tags that correspond to nouns.
nltk.download('tagsets')
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/ahmadbasha/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [9]:
# 2- Limit the data to only terms with these tags.
nouns = []
for single_review in reviews_pos_tags:
    nouns.append([(word, tag) for word, tag in single_review if tag.startswith('NN')])

nouns[0]

[('updates', 'NNS'), ('improvements', 'NNS')]

### Lemmatize. 

1-Different forms of the terms need to be treated as one.

2-No need to provide POS tag to lemmatizer for now.

In [10]:
## Lemmatization
wnl = WordNetLemmatizer()
wnl_reviews =[]

for review in nouns:
    wnl_reviews.append([wnl.lemmatize(word) for word, tag in review])
    
wnl_reviews[0]

['update', 'improvement']

### Remove stopwords and punctuation (if there are any). 

In [11]:
words = []
for word in wnl_reviews:
    words.append([single_word for single_word in word if single_word not in (stopwords.words('English') + list(string.punctuation))])

words[1]

['mobile',
 'battery',
 'hell',
 'backup',
 'hour',
 'us',
 'idle',
 'discharged.this',
 'lie',
 'amazon',
 'lenove',
 'battery',
 'charger',
 'hour']

### Create a topic model using LDA on the cleaned-up data with 12 topics.

1-Print out the top terms for each topic.

2-What is the coherence of the model with the c_v metric?

In [12]:
word_cor = corpora.Dictionary(words)
print(word_cor)

Dictionary(9263 unique tokens: ['improvement', 'update', 'amazon', 'backup', 'battery']...)


In [13]:
# The term frequency of the document
corpus = [word_cor.doc2bow(word) for word in words]
print(corpus[0])

[(0, 1), (1, 1)]


In [14]:
# Bbuilding LDA model with 12 topics

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=word_cor,
                                       num_topics=12, 
                                       random_state=0,
                                       per_word_topics=True)

In [15]:
coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=words, 
                                     dictionary=word_cor, 
                                     coherence='c_v')

In [16]:
# Coherence Score
coherence_lda = coherence_model_lda.get_coherence()
coherence_lda

0.5376462357958322

# Analyze the topics through the business lens.

Determine which of the topics can be combined.

In [17]:
# In 12 topics showing the keywords 
pprint(lda_model.print_topics())

[(0,
  '0.284*"phone" + 0.084*".." + 0.042*"issue" + 0.022*"price" + '
  '0.019*"network" + 0.018*"amazon" + 0.017*"time" + 0.017*"day" + '
  '0.015*"month" + 0.010*"lenovo"'),
 (1,
  '0.223*"battery" + 0.054*"backup" + 0.053*"camera" + 0.042*"charger" + '
  '0.033*"life" + 0.031*"issue" + 0.030*"phone" + 0.025*"h" + 0.024*"hour" + '
  '0.022*"charge"'),
 (2,
  '0.099*"camera" + 0.096*"performance" + 0.059*"phone" + 0.044*"price" + '
  '0.033*"quality" + 0.033*"processor" + 0.028*"ram" + 0.021*"range" + '
  '0.017*"photo" + 0.016*"usage"'),
 (3,
  '0.095*"camera" + 0.074*"feature" + 0.070*"phone" + 0.041*"device" + '
  '0.025*"update" + 0.021*"heat" + 0.016*"android" + 0.015*"music" + '
  '0.014*"stock" + 0.013*"battery"'),
 (4,
  '0.186*"problem" + 0.057*"heating" + 0.035*"handset" + 0.035*"speaker" + '
  '0.022*"battery" + 0.021*"call" + 0.019*"piece" + 0.017*"waste" + '
  '0.015*"replacement" + 0.015*"set"'),
 (5,
  '0.060*"camera" + 0.048*"mode" + 0.041*"sim" + 0.030*"phone" + '
  

### The Possible Topics 
(1,3)The phone price
(2,4-7)Device feature
(8-10)Pros and Cons 
(11-12)product delivery & service



# Create topic model using LDA with what you think is the optimal number of topics

What is the coherence of the model?

In [18]:
# Build LDA model
current = 0
number_of_topics = 0
for n_topic in range(1,20):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=word_cor,
                                                num_topics=n_topic, 
                                                random_state=0,
                                                per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, 
                                                texts=words, 
                                                dictionary=word_cor, 
                                                coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    if coherence_lda>current :
        current = coherence_lda
        number_of_topics = n_topic
    
    

In [19]:
current

0.5735257813727284

In [20]:
number_of_topics

9

In [21]:
# Bbuilding LDA model with 12 topics

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=word_cor,
                                       num_topics=9, 
                                       random_state=0,
                                       per_word_topics=True)

coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=words, 
                                     dictionary=word_cor, 
                                     coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()
coherence_lda

0.5735257813727284

In [22]:
pprint(lda_model.print_topics())

[(0,
  '0.194*"phone" + 0.073*".." + 0.037*"product" + 0.030*"money" + '
  '0.027*"amazon" + 0.026*"service" + 0.024*"issue" + 0.018*"day" + '
  '0.015*"time" + 0.014*"waste"'),
 (1,
  '0.193*"battery" + 0.074*"camera" + 0.060*"quality" + 0.052*"backup" + '
  '0.036*"phone" + 0.027*"life" + 0.024*"issue" + 0.023*"h" + '
  '0.016*"everything" + 0.016*"drain"'),
 (2,
  '0.119*"camera" + 0.088*"phone" + 0.058*"price" + 0.057*"performance" + '
  '0.028*"quality" + 0.019*"charger" + 0.018*"mode" + 0.018*"battery" + '
  '0.017*"processor" + 0.017*"ram"'),
 (3,
  '0.148*"product" + 0.054*"camera" + 0.053*"feature" + 0.039*"phone" + '
  '0.026*"device" + 0.022*"superb" + 0.020*"heat" + 0.017*"battery" + '
  '0.014*"android" + 0.012*"clarity"'),
 (4,
  '0.096*"problem" + 0.045*"heating" + 0.032*"handset" + 0.029*"hai" + '
  '0.020*"glass" + 0.019*"piece" + 0.016*"screen" + 0.015*"super" + 0.014*"ho" '
  '+ 0.013*"experience"'),
 (5,
  '0.066*"network" + 0.064*"issue" + 0.040*"sim" + 0.025*"phon

# The business should  be able to interpret the topics.

1-Name each of the identified topics.

2-Create a table with the topic name and the top 10 terms in each to present to the  business.

In [23]:
names = ['Phone issues', 
         'Battery Quality', 
         'Camera Performance', 
         'Phone Features' ,
         'Phone Cons' ,
         'network issues' , 
         'Phone Performance', 
         'Phone Review',
         'phone option'
        ]
topic_id = [1,2,3,4,5,6,7,8,9]

In [24]:
topics = lda_model.show_topics(formatted=False)
top = []
for topic in topics:
    top.append([term[0] for term in topic[1]])

topics = pd.DataFrame({'ID': topic_id, 'Top Terms':top})

In [25]:
topics.style.hide_index()

ID,Top Terms
1,"['phone', '..', 'product', 'money', 'amazon', 'service', 'issue', 'day', 'time', 'waste']"
2,"['battery', 'camera', 'quality', 'backup', 'phone', 'life', 'issue', 'h', 'everything', 'drain']"
3,"['camera', 'phone', 'price', 'performance', 'quality', 'charger', 'mode', 'battery', 'processor', 'ram']"
4,"['product', 'camera', 'feature', 'phone', 'device', 'superb', 'heat', 'battery', 'android', 'clarity']"
5,"['problem', 'heating', 'handset', 'hai', 'glass', 'piece', 'screen', 'super', 'ho', 'experience']"
6,"['network', 'issue', 'sim', 'phone', 'update', 'jio', 'volta', 'mobile', 'call', 'message']"
7,"['phone', 'note', 'camera', 'battery', 'feature', 'screen', 'lenovo', 'call', 'k8', 'issue']"
8,"['mobile', 'problem', 'phone', 'battery', 'time', 'day', 'hr', 'charging', 'camera', 'heat']"
9,"['....', 'note', 'k8', 'phone', 'camera', 'smartphone', 'option', 'quality', 'return', 'display']"
