# Reference: [Topic Modelling](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)

In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('insta data/comment_noemo_score.xlsx', encoding='latin-1')
df.head(10)

Unnamed: 0,comments,Categories,google_score,textblob_score,avg_score
0,thanks,Nice content,0.3,0.2,0.25
1,Nice Designs,Praise on product,0.8,0.6,0.7
2,How much?,Query about price or oder or product,0.0,0.2,0.1
3,Looks good! Price?,Query about price or oder or product,0.3,0.875,0.5875
4,Fantastic,Nice content,0.9,0.4,0.65
5,Lovely shoes. It looks like a hole in the welt...,Praise on product,0.1,0.25,0.175
6,Beautiful,Nice content,0.8,0.85,0.825
7,How much for this piece?,Query about price or oder or product,-0.1,0.2,0.05
8,kindly DM what size available,Query about price or oder or product,0.0,0.5,0.25
9,Beautiful colour,Nice content,0.8,0.85,0.825


In [3]:
len(df['comments'].isna())

328

In [4]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from autocorrect import spell

In [5]:
stop = stopwords.words('english')
#Text Lemmatization
#As we are using wordnet Lemmatizer and the the standard NLTK pos tags are treebank tags, we need to convert the treebank tag
#to wordnet tags. 
from nltk.stem import WordNetLemmatizer
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
    
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatization(sentence):
    words = nltk.word_tokenize(sentence)
    tagged_words = nltk.pos_tag(words)
    lemmatized_wordlist=[]
    for w in tagged_words:
        wordnettag=get_wordnet_pos(w[1])
        if wordnettag == '':
            lemmatizedword = wordnet_lemmatizer.lemmatize(w[0].lower())
        else:
            lemmatizedword = wordnet_lemmatizer.lemmatize(w[0].lower(),pos=wordnettag)
        if w[0].istitle():
            lemmatizedword = lemmatizedword.capitalize()
        elif w[0].upper()==w[0]:
            lemmatizedword = lemmatizedword.upper()
        else:
            lemmatizedword = lemmatizedword
        lemmatized_wordlist.append(lemmatizedword)
            
    return lemmatized_wordlist


In [6]:
remove_list = ['hi', 'seamlessbespoke', "'ll", "'re",  "'d", 'congratulations', 
               'congrats', 'allureisourduty', 'pls', 'hey', 'make', 'look', 'post', 'pic', 'really', 'thanks', 'ri', 'veri',
              'much', 'one', 'please', 'would', 'keep', 'wait']

In [7]:
def preprocess(x):
    #lower case
    x = x.lower()
    
    #remove useless noise
#     for lol in remove_list:
#         x = x.replace(lol, '')
    words = nltk.word_tokenize(x)
#     print(words)
    words = [w for w in words if w not in remove_list]
#     print(words)
#     #remove punctuation
#     tokenizer = RegexpTokenizer(r'\w+')
#     tokens = tokenizer.tokenize(x)
#     tokens = words[3:]
    
    #remove stopwords
    filtered_words = [w for w in words if  w not in stop]
    text = " ".join(filtered_words)
    
#     #autocorrect spelling error
#     spells = [spell(w) for w in (nltk.word_tokenize(text))]
#     text = " ".join(spells)
    
    #lemmatization
    text = lemmatization(text)
    text = [w for w in text if w not in remove_list]
#     print(text)
    text = " ".join(text)
    output = text
    
    #remove digits
#     output = ''.join(c for c in text if not c.isdigit())
    
    
    return output


In [8]:
preprocess(df['comments'][134])

'nice gent'

In [9]:
df['clean_text'] = df['comments'].apply(preprocess)

In [10]:
df.head(10)

Unnamed: 0,comments,Categories,google_score,textblob_score,avg_score,clean_text
0,thanks,Nice content,0.3,0.2,0.25,
1,Nice Designs,Praise on product,0.8,0.6,0.7,nice design
2,How much?,Query about price or oder or product,0.0,0.2,0.1,?
3,Looks good! Price?,Query about price or oder or product,0.3,0.875,0.5875,good ! price ?
4,Fantastic,Nice content,0.9,0.4,0.65,fantastic
5,Lovely shoes. It looks like a hole in the welt...,Praise on product,0.1,0.25,0.175,lovely shoe . like hole welt stitch skip towar...
6,Beautiful,Nice content,0.8,0.85,0.825,beautiful
7,How much for this piece?,Query about price or oder or product,-0.1,0.2,0.05,piece ?
8,kindly DM what size available,Query about price or oder or product,0.0,0.5,0.25,kindly dm size available
9,Beautiful colour,Nice content,0.8,0.85,0.825,beautiful colour


In [11]:
# # Import the wordcloud library
# from wordcloud import WordCloud

# # useless_list = ['hi', 'seamlessbespoke', 'll', 'congratulation', 'congrats']

# # Join the different processed titles together.
# long_string = ' '.join(list(df['clean_text'].values))

# # long_string_list = long_string.split()
# # long_string_list = [w for w in long_string_list if w not in useless_list]
# # long_string = ' '.join(long_string_list)

# # Create a WordCloud object
# wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')

# # Generate a word cloud
# wordcloud.generate(long_string)

# # Visualize the word cloud
# wordcloud.to_image()

In [12]:
# # Load the library with the CountVectorizer method
# from sklearn.feature_extraction.text import CountVectorizer
# import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_style('whitegrid')
# %matplotlib inline 

# # Helper function
# def plot_10_most_common_words(count_data, count_vectorizer):
#     import matplotlib.pyplot as plt
#     words = count_vectorizer.get_feature_names()
#     total_counts = np.zeros(len(words))
#     for t in count_data:
#         total_counts+=t.toarray()[0]
    
#     count_dict = (zip(words, total_counts))
#     count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:50]
#     words = [w[0] for w in count_dict]
#     counts = [w[1] for w in count_dict]
#     x_pos = np.arange(len(words)) 
    
#     plt.figure(2, figsize=(15, 15/1.6180))
#     plt.subplot(title='50 most common words')
#     sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
#     sns.barplot(x_pos, counts, palette='husl')
#     plt.xticks(x_pos, words, rotation=90) 
#     plt.xlabel('words')
#     plt.ylabel('counts')
#     plt.show()

# # Initialise the count vectorizer with the English stop words    
# count_vectorizer = CountVectorizer(stop_words='english')
# # Fit and transform the processed titles
# count_data = count_vectorizer.fit_transform(df['clean_text'])
# # Visualise the 10 most common words
# plot_10_most_common_words(count_data, count_vectorizer)

In [13]:
# import warnings
# warnings.simplefilter("ignore", DeprecationWarning)

# # Load the LDA model from sk-learn
# from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# # Helper function
# def print_topics(model, count_vectorizer, n_top_words):
#     words = count_vectorizer.get_feature_names()
#     for topic_idx, topic in enumerate(model.components_):
#         print("\nTopic #%d:" % (topic_idx+1))
#         print(" ".join([words[i]
#                         for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# # Tweak the two parameters below
# number_topics = 3
# number_words = 10

# # Create and fit the LDA model
# lda = LDA(n_components=number_topics, n_jobs=-1)
# lda.fit(count_data)

# # Print the topics found by the LDA model
# print("Topics found via LDA:")
# print_topics(lda, count_vectorizer, number_words)

In [14]:
# %%time
# from pyLDAvis import sklearn as sklearn_lda
# import pickle 
# import pyLDAvis

# LDAvis_data_filepath = ''.join('./ldavis_prepared_'+str(number_topics))
# # # this is a bit time consuming - make the if statement True
# # # if you want to execute visualization prep yourself

# if 1 == 1:
#     LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)

In [15]:
# pyLDAvis.enable_notebook()
# pyLDAvis.display(LDAvis_prepared)

In [16]:
# with open(LDAvis_data_filepath, 'wb') as f:
#     pickle.dump(LDAvis_prepared, f)

# # load the pre-prepared pyLDAvis data from disk
# with open(LDAvis_data_filepath, 'rb') as f:
#     LDAvis_prepared = pickle.load(f)
# pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')

In [18]:
from gensim.models import Phrases
from nltk.tokenize import word_tokenize

In [19]:
tokens = []
for post in df['clean_text']:
    x = word_tokenize(post)
    x = [w for w in x if w.isalpha()]
    tokens.append(x)

In [20]:
df['tokens'] = tokens
df.head()

Unnamed: 0,comments,Categories,google_score,textblob_score,avg_score,clean_text,tokens
0,thanks,Nice content,0.3,0.2,0.25,,[]
1,Nice Designs,Praise on product,0.8,0.6,0.7,nice design,"[nice, design]"
2,How much?,Query about price or oder or product,0.0,0.2,0.1,?,[]
3,Looks good! Price?,Query about price or oder or product,0.3,0.875,0.5875,good ! price ?,"[good, price]"
4,Fantastic,Nice content,0.9,0.4,0.65,fantastic,[fantastic]


In [21]:
len(tokens)

328

In [22]:
#Prepare bi-grams and tri-grams
tokens = df['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

In [23]:
from gensim import corpora

In [24]:
#Prepare objects for LDA gensim implementation
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

In [25]:
#running LDA
from gensim import models
import numpy as np

In [26]:
np.random.seed(123)
num_topics = 2
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

Wall time: 617 ms


In [27]:
#looking at topics
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print("Topic #"+str(i)+": ")
    print(topic)
    print()

Topic #0: 
0.084*"great" + 0.081*"beautiful" + 0.061*"lovely" + 0.056*"wow" + 0.045*"perfect" + 0.045*"cool" + 0.045*"awesome" + 0.040*"order" + 0.040*"suit" + 0.033*"detail" + 0.030*"linen" + 0.026*"purchase" + 0.026*"stylish" + 0.026*"dm_price" + 0.020*"go" + 0.020*"still" + 0.020*"yanko" + 0.020*"nice_work" + 0.020*"classy" + 0.020*"pm_price"

Topic #1: 
0.176*"nice" + 0.092*"love" + 0.076*"like" + 0.060*"price" + 0.055*"good" + 0.050*"amazing" + 0.045*"photo" + 0.045*"shoe" + 0.030*"fabric" + 0.025*"could" + 0.024*"beautiful" + 0.022*"great" + 0.021*"picture" + 0.020*"right" + 0.020*"best" + 0.020*"top" + 0.020*"amaze" + 0.020*"collar" + 0.018*"wow" + 0.015*"gorgeous"



In [28]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary_LDA)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [30]:
import pickle
LDAvis_data_filepath = ''.join('./ldavis_prepared_'+str(num_topics))
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(vis, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(num_topics) +'.html')

In [32]:
lda_model[corpus[0]]
topics_list = [lda_model[corpus[i]] for i in range(len(df))]

In [33]:
topics_list

[[(0, 0.5), (1, 0.5)],
 [(1, 0.9901961)],
 [(0, 0.5), (1, 0.5)],
 [(1, 0.9950495)],
 [(0, 0.990196)],
 [(0, 0.3003802), (1, 0.6996198)],
 [(0, 0.99019605)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.990196)],
 [(0, 0.9950494)],
 [(0, 0.9950495)],
 [(1, 0.99019605)],
 [(1, 0.990196)],
 [(0, 0.99019605)],
 [(0, 0.9950495)],
 [(1, 0.99019605)],
 [(1, 0.99019605)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.72617346), (1, 0.27382657)],
 [(1, 0.99019605)],
 [(1, 0.99019605)],
 [(0, 0.5), (1, 0.5)],
 [(1, 0.99019605)],
 [(1, 0.99019605)],
 [(1, 0.9950495)],
 [(0, 0.9950494)],
 [(0, 0.9950495)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.99019605)],
 [(1, 0.9950495)],
 [(1, 0.9950495)],
 [(1, 0.99019605)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.99019605)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.99019605)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.9966887)],
 [(0, 0.99019605)],
 [(0, 0.9950495)],
 [(0, 0.5), (1, 0.5)],
 [(0, 0.99019605)],
 [(0, 0.9950495)],
 [(0, 0.99019605)],
 [(0, 0.

In [34]:
df['topic'] = topics_list

In [35]:
df.head()

Unnamed: 0,comments,Categories,google_score,textblob_score,avg_score,clean_text,tokens,topic
0,thanks,Nice content,0.3,0.2,0.25,,[],"[(0, 0.5), (1, 0.5)]"
1,Nice Designs,Praise on product,0.8,0.6,0.7,nice design,"[nice, design]","[(1, 0.9901961)]"
2,How much?,Query about price or oder or product,0.0,0.2,0.1,?,[],"[(0, 0.5), (1, 0.5)]"
3,Looks good! Price?,Query about price or oder or product,0.3,0.875,0.5875,good ! price ?,"[good, price]","[(1, 0.9950495)]"
4,Fantastic,Nice content,0.9,0.4,0.65,fantastic,[fantastic],"[(0, 0.990196)]"


In [36]:
df.to_excel('insta_comment_noemo_topic.xlsx', index=False)