In [1]:
## NOTE(review): hard-coded, user-specific working directory. Presumably needed
## so that the project-local `config` module can be imported -- confirm, and
## consider a path that does not depend on one machine's home folder.
%cd ~/Documents/iss_plp/private_project

/home/tictactoe/Documents/iss_plp/private_project


In [2]:
import os, json, re, time
import pandas as pd
import numpy as np
from collections import Counter
from config import DATASET

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## gensim
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

## **`LDA` for Topic Modelling**
___

* Assumptions:
    * Similar topics use similar set of words
    * Latent topics are found by searching for set of words that frequently occur together in documents across the corpus.
* These assumptions translate mathematically to:
    * Documents are probability distributions over latent topics
    * Topics themselves are probability distributions over words
* `LDA` represents documents as mixtures of topics that spit out words with certain probabilities.
* It assumes that documents are produced in the following fashion:
    * Picking a topic according to the document's multinomial distribution over topics (e.g. 60% business, 30% politics, 10% food).
    * Using the topic to generate the word itself (e.g. for food topic, the word 'apple' is generated with 60% probability, 'home' with 30% probability and so on).
* Assuming such generative model for the corpus, `LDA` then tries to backtrack from the documents to find a set of topics and corresponding words distribution that are likely to have generated the collection.

### **Backend algorithm of `LDA`**
___
* Fix the number of topics to `K`.
* Go through each document, and randomly assign each word in the document to one of the `K` topics.
    * This random assignment gives you both topic representations of all the documents and word distributions of all the topics.
* Iterate over every word in every document to improve these topics:
    * For every word in every document, and for each topic $t$, we compute:
        * $p(t | d)$: the proportion of words in document $d$ that are currently assigned to topic $t$.
        * $p(w | t)$: the proportion of all assignments to topic $t$, across every document, that come from word $w$.
        * The product $p(t | d) \cdot p(w | t)$ is, up to normalisation over the topics, the probability that topic $t$ generated word $w$ in document $d$.
* In the next iteration, re-assign `w` to a new topic, choosing topic `t` with probability proportional to $p(t | d) \cdot p(w | t)$.
* The process is repeated for a number of iterations until the topic assignments stabilize.
* At the end:
    * Each document is described by a mixture of topics (its most probable topic can be used as its label).
    * We can search for the most probable words assigned to a topic.

    

## **Load dataset**
___
* Load dataset
* Convert token sequence from string sequence to list
* Get *index2token* dictionary for all the tokens in the dataset
* Filter out tokens that appear in fewer than 2 documents or in more than 95% of the documents

In [3]:
## locate the tokenized Amazon Fashion reviews under the configured dataset root
data_path = os.path.join(
    DATASET,
    'AmazonFashion',
    'TokenizedReviews.csv',
)

## read the reviews into a DataFrame and report how many rows were loaded
df = pd.read_csv(filepath_or_buffer=data_path)
n_loaded = df.shape[0]
print("Total number of reviews loaded: ", n_loaded)

## preview the first five rows (last expression -> rich display)
df.head(n=5)

Total number of reviews loaded:  823046


Unnamed: 0,id,overall,asin,review,tokens
0,0,5.0,7106116521,Exactly what I needed.,exactly need
1,1,2.0,7106116521,"I agree with the other review, the opening is ...",agree review opening small almost bend hook ex...
2,2,4.0,7106116521,Love these... I am going to order another pack...,love go order pack keep work include always lo...
3,3,2.0,7106116521,too tiny an opening,tiny opening
4,4,3.0,7106116521,Okay,


In [4]:
## remove rows with no tokens
# .copy() makes the filtered result an independent frame, so the in-place
# assignment to the `tokens` column in the following cell cannot raise
# pandas' SettingWithCopyWarning or write into a slice of the raw frame.
df = df[df['tokens'].notna()].copy()
print("Total number of reviews loaded after removal: ", len(df))

Total number of reviews loaded after removal:  814211


In [5]:
# convert the tokens column from a whitespace-separated string to a list of tokens
# * split() with no separator never produces empty-string tokens when a row has
#   leading, trailing or repeated spaces (split(' ') does)
# * values that are already lists are passed through, so re-running this cell
#   does not fail
# * assign() returns a new frame instead of writing into the filtered one
df = df.assign(
    tokens=df['tokens'].map(lambda x: x if isinstance(x, list) else x.split())
)

In [6]:
## preview the frame again to check the converted `tokens` column
df.head()

Unnamed: 0,id,overall,asin,review,tokens
0,0,5.0,7106116521,Exactly what I needed.,"[exactly, need]"
1,1,2.0,7106116521,"I agree with the other review, the opening is ...","[agree, review, opening, small, almost, bend, ..."
2,2,4.0,7106116521,Love these... I am going to order another pack...,"[love, go, order, pack, keep, work, include, a..."
3,3,2.0,7106116521,too tiny an opening,"[tiny, opening]"
5,5,5.0,7106116521,Exactly what I wanted.,"[exactly, want]"


In [7]:
## build the gensim Dictionary (token id <-> token) from every tokenized review
dictionary = Dictionary(df['tokens'])

# show ten (id, token) pairs as a demonstration;
# pairing with range(10) ends the walk after the tenth entry
for shown, (idx, tk) in zip(range(10), dictionary.iteritems()):
    print(f'{idx} --> {tk}')

0 --> exactly
1 --> need
2 --> agree
3 --> almost
4 --> back
5 --> be
6 --> bend
7 --> buy
8 --> earring
9 --> expensive


In [8]:
## prune the vocabulary by document frequency (the call's result is not
## re-assigned; `dictionary` itself is the object used afterwards)
MIN_DOC_COUNT = 2      # tokens found in fewer documents than this are dropped
MAX_DOC_SHARE = 0.95   # tokens found in more than this share of documents are dropped
# NOTE(review): filter_extremes also accepts a `keep_n` cap that is left at its
# default here -- confirm the default vocabulary limit is intended.
dictionary.filter_extremes(no_below=MIN_DOC_COUNT, no_above=MAX_DOC_SHARE)

In [9]:
## encode every review as a bag of words: one list of (token id, count)
# pairs per document, produced by the dictionary's doc2bow
bow_corpus = list(map(dictionary.doc2bow, df['tokens']))

In [10]:
## inspect one review: its token list next to its bag-of-words encoding
SAMPLE_POS = 24  # positional row of the review to display
sample_tks = df['tokens'].iloc[SAMPLE_POS]
sample_bow = bow_corpus[SAMPLE_POS]

print("Sample token list: ")
print(sample_tks)
print("===============================")
print("Sample token index and count: ")
# each bag-of-words entry is a (token id, occurrences) pair
for token_id, n_occurrences in sample_bow:
    print(f"{token_id} --> {dictionary[token_id]} --> Count: {n_occurrences}")

Sample token list: 
['make', 'well', 'hold', 'small', 'pack', 'ciggs', 'smoke', 'use', 'tip', 'holder', 'work', 'buy', 'different', 'last', 'year', 'fall', 'apart', 'replace', 'brand', 'zipper', 'compartment', 'hold', 'small']
Sample token index and count: 
7 --> buy --> Count: 1
18 --> small --> Count: 2
30 --> pack --> Count: 1
34 --> work --> Count: 1
82 --> fall --> Count: 1
88 --> last --> Count: 1
91 --> make --> Count: 1
105 --> use --> Count: 1
106 --> year --> Count: 1
107 --> zipper --> Count: 1
109 --> brand --> Count: 1
111 --> hold --> Count: 2
116 --> well --> Count: 1
117 --> smoke --> Count: 1
164 --> compartment --> Count: 1
175 --> apart --> Count: 1
176 --> different --> Count: 1
177 --> holder --> Count: 1
178 --> replace --> Count: 1
179 --> tip --> Count: 1


## **`LDA` on the corpus bag of words**
___
* Documentation of `LdaMulticore` [here](https://radimrehurek.com/gensim/models/ldamulticore.html)

#### `Topics = 4`
___

In [13]:
## write a function to gather all the steps above
def get_lda_topics(bow_corpus, id2word, n_topics, n_passes, n_cores):
    print("Starting LDA learning......")
    print("Learning %.d topics across %.d cores......" %(n_topics, n_cores-1))
    start = time.perf_counter()
    lda_model = LdaMulticore(bow_corpus, num_topics=n_topics,
                            id2word=id2word, passes=n_passes, 
                            workers=n_cores-1)
    end = time.perf_counter()
    print("Completed training in %.3f seconds." %(end-start))
    
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: ', idx)
        print('Words: ', topic)

In [14]:
## run configuration for the 4-topic model
N_TOPICS = 4
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10   # core budget handed to get_lda_topics

get_lda_topics(
    bow_corpus=bow_corpus,
    id2word=dictionary,
    n_topics=N_TOPICS,
    n_passes=N_PASSES,
    n_cores=N_CORES,
)

Starting LDA learning......
Learning 4 topics across 9 cores......
Completed training in 90.780 seconds.
Topic:  0
Words:  0.020*"wear" + 0.017*"well" + 0.014*"fit" + 0.014*"old" + 0.013*"wash" + 0.013*"be" + 0.013*"year" + 0.011*"buy" + 0.010*"warm" + 0.010*"get"
Topic:  1
Words:  0.101*"love" + 0.057*"great" + 0.038*"fit" + 0.026*"perfect" + 0.025*"good" + 0.024*"look" + 0.021*"quality" + 0.019*"buy" + 0.017*"nice" + 0.017*"cute"
Topic:  2
Words:  0.033*"size" + 0.031*"small" + 0.029*"be" + 0.026*"fit" + 0.022*"order" + 0.018*"look" + 0.015*"wear" + 0.015*"dress" + 0.015*"large" + 0.011*"little"
Topic:  3
Words:  0.014*"be" + 0.014*"get" + 0.012*"use" + 0.011*"make" + 0.010*"wear" + 0.009*"bag" + 0.009*"time" + 0.008*"look" + 0.007*"back" + 0.007*"buy"


#### `Topics = 5`

In [13]:
## run configuration for the 5-topic model
N_TOPICS = 5
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10

# the model is given one worker fewer than the core budget
n_workers = N_CORES - 1

print("Starting LDA learning......")
print("Learning %.d topics across %.d cores......" %(N_TOPICS, n_workers))

# time the training call only
start = time.perf_counter()
lda_model = LdaMulticore(
    bow_corpus,
    num_topics=N_TOPICS,
    id2word=dictionary,
    passes=N_PASSES,
    workers=n_workers,
)
end = time.perf_counter()

elapsed = end - start
print("Completed training in %.3f seconds." %(elapsed))

Starting LDA learning......
Learning 5 topics across 9 cores......
Completed training in 89.056 seconds.


In [14]:
## list the learned topics, one id line and one weighted-words line per topic
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print('Topic: ', topic_id)
    print('Words: ', topic_terms)

Topic:  0
Words:  0.116*"love" + 0.068*"great" + 0.043*"fit" + 0.029*"perfect" + 0.023*"buy" + 0.021*"wear" + 0.018*"look" + 0.017*"comfortable" + 0.016*"get" + 0.016*"color"
Topic:  1
Words:  0.025*"look" + 0.023*"be" + 0.016*"get" + 0.014*"ring" + 0.012*"buy" + 0.012*"wear" + 0.011*"pretty" + 0.010*"gift" + 0.010*"love" + 0.009*"beautiful"
Topic:  2
Words:  0.017*"wear" + 0.014*"use" + 0.014*"get" + 0.012*"be" + 0.012*"time" + 0.010*"work" + 0.009*"keep" + 0.009*"shoe" + 0.008*"make" + 0.008*"well"
Topic:  3
Words:  0.035*"good" + 0.033*"nice" + 0.032*"quality" + 0.023*"look" + 0.018*"product" + 0.016*"color" + 0.015*"well" + 0.015*"price" + 0.014*"make" + 0.014*"expect"
Topic:  4
Words:  0.039*"size" + 0.037*"small" + 0.034*"fit" + 0.032*"be" + 0.022*"order" + 0.019*"large" + 0.018*"wear" + 0.018*"dress" + 0.014*"little" + 0.014*"big"


In [11]:
## write a function to gather all the steps above
def get_lda_topics(bow_corpus, id2word, n_topics, n_passes, n_cores):
    print("Starting LDA learning......")
    print("Learning %.d topics across %.d cores......" %(n_topics, n_cores-1))
    start = time.perf_counter()
    lda_model = LdaMulticore(bow_corpus, num_topics=n_topics,
                            id2word=id2word, passes=n_passes, 
                            workers=n_cores-1)
    end = time.perf_counter()
    print("Completed training in %.3f seconds." %(end-start))
    
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: ', idx)
        print('Words: ', topic)

#### `Topics = 6`

In [17]:
## run configuration for the 6-topic model
N_TOPICS = 6
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10   # core budget handed to get_lda_topics

get_lda_topics(
    bow_corpus=bow_corpus,
    id2word=dictionary,
    n_topics=N_TOPICS,
    n_passes=N_PASSES,
    n_cores=N_CORES,
)

Starting LDA learning......
Learning 6 topics across 9 cores......
Completed training in 88.923 seconds.
Topic:  0
Words:  0.054*"size" + 0.048*"small" + 0.033*"fit" + 0.031*"order" + 0.030*"be" + 0.023*"large" + 0.020*"wear" + 0.017*"dress" + 0.016*"big" + 0.014*"get"
Topic:  1
Words:  0.091*"love" + 0.024*"beautiful" + 0.021*"wear" + 0.021*"look" + 0.020*"buy" + 0.019*"get" + 0.018*"perfect" + 0.017*"great" + 0.015*"gift" + 0.014*"ring"
Topic:  2
Words:  0.026*"bag" + 0.018*"make" + 0.017*"great" + 0.017*"well" + 0.016*"pocket" + 0.015*"product" + 0.014*"be" + 0.014*"wallet" + 0.012*"love" + 0.012*"purse"
Topic:  3
Words:  0.018*"get" + 0.015*"wear" + 0.015*"be" + 0.015*"use" + 0.013*"time" + 0.010*"belt" + 0.009*"first" + 0.009*"day" + 0.009*"break" + 0.008*"put"
Topic:  4
Words:  0.037*"good" + 0.033*"look" + 0.025*"cheap" + 0.024*"quality" + 0.015*"make" + 0.013*"picture" + 0.012*"buy" + 0.012*"material" + 0.012*"shoe" + 0.011*"price"
Topic:  5
Words:  0.049*"fit" + 0.041*"great" 

#### `Topics = 7`

In [12]:
## run configuration for the 7-topic model
N_TOPICS = 7
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10   # core budget handed to get_lda_topics

get_lda_topics(
    bow_corpus=bow_corpus,
    id2word=dictionary,
    n_topics=N_TOPICS,
    n_passes=N_PASSES,
    n_cores=N_CORES,
)

Starting LDA learning......
Learning 7 topics across 9 cores......
Completed training in 83.763 seconds.
Topic:  0
Words:  0.017*"be" + 0.015*"get" + 0.014*"bag" + 0.012*"make" + 0.011*"back" + 0.010*"return" + 0.009*"look" + 0.009*"use" + 0.008*"pocket" + 0.008*"come"
Topic:  1
Words:  0.047*"great" + 0.041*"quality" + 0.040*"good" + 0.030*"price" + 0.028*"look" + 0.024*"beautiful" + 0.021*"product" + 0.020*"love" + 0.017*"ring" + 0.013*"nice"
Topic:  2
Words:  0.024*"shoe" + 0.024*"wear" + 0.022*"work" + 0.020*"use" + 0.016*"pair" + 0.015*"watch" + 0.014*"great" + 0.012*"foot" + 0.012*"well" + 0.012*"time"
Topic:  3
Words:  0.054*"fit" + 0.053*"size" + 0.046*"small" + 0.029*"order" + 0.027*"be" + 0.027*"dress" + 0.023*"large" + 0.018*"wear" + 0.014*"big" + 0.013*"shirt"
Topic:  4
Words:  0.030*"be" + 0.015*"look" + 0.013*"well" + 0.012*"little" + 0.012*"wear" + 0.012*"make" + 0.011*"review" + 0.010*"fit" + 0.008*"cute" + 0.007*"head"
Topic:  5
Words:  0.131*"love" + 0.037*"wear" + 0.

#### `Topics = 8`

In [15]:
## run configuration for the 8-topic model
N_TOPICS = 8
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10   # core budget handed to get_lda_topics

get_lda_topics(
    bow_corpus=bow_corpus,
    id2word=dictionary,
    n_topics=N_TOPICS,
    n_passes=N_PASSES,
    n_cores=N_CORES,
)

Starting LDA learning......
Learning 8 topics across 9 cores......
Completed training in 82.589 seconds.
Topic:  0
Words:  0.177*"love" + 0.047*"fit" + 0.041*"perfect" + 0.031*"cute" + 0.029*"buy" + 0.026*"comfortable" + 0.023*"wear" + 0.022*"daughter" + 0.020*"great" + 0.018*"soft"
Topic:  1
Words:  0.021*"bag" + 0.014*"use" + 0.014*"well" + 0.014*"be" + 0.013*"keep" + 0.013*"great" + 0.012*"strap" + 0.012*"work" + 0.012*"make" + 0.011*"pocket"
Topic:  2
Words:  0.077*"great" + 0.031*"look" + 0.030*"price" + 0.030*"quality" + 0.029*"beautiful" + 0.022*"fit" + 0.021*"product" + 0.021*"good" + 0.020*"love" + 0.018*"well"
Topic:  3
Words:  0.030*"old" + 0.030*"year" + 0.018*"wallet" + 0.018*"use" + 0.018*"be" + 0.016*"month" + 0.016*"get" + 0.012*"card" + 0.011*"purse" + 0.010*"buy"
Topic:  4
Words:  0.017*"get" + 0.014*"be" + 0.013*"look" + 0.013*"wear" + 0.012*"make" + 0.011*"time" + 0.011*"ring" + 0.010*"product" + 0.010*"review" + 0.009*"buy"
Topic:  5
Words:  0.054*"size" + 0.050*"s

#### `Topics = 9`

In [16]:
## run configuration for the 9-topic model
N_TOPICS = 9
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10   # core budget handed to get_lda_topics

get_lda_topics(
    bow_corpus=bow_corpus,
    id2word=dictionary,
    n_topics=N_TOPICS,
    n_passes=N_PASSES,
    n_cores=N_CORES,
)

Starting LDA learning......
Learning 9 topics across 9 cores......
Completed training in 79.466 seconds.
Topic:  0
Words:  0.030*"color" + 0.026*"shirt" + 0.024*"be" + 0.024*"look" + 0.023*"wear" + 0.020*"material" + 0.016*"picture" + 0.015*"nice" + 0.014*"dress" + 0.013*"love"
Topic:  1
Words:  0.167*"love" + 0.075*"cute" + 0.025*"super" + 0.024*"little" + 0.021*"big" + 0.020*"be" + 0.018*"daughter" + 0.015*"fit" + 0.015*"head" + 0.014*"really"
Topic:  2
Words:  0.052*"wear" + 0.043*"great" + 0.033*"love" + 0.028*"comfortable" + 0.027*"shoe" + 0.022*"work" + 0.021*"fit" + 0.020*"get" + 0.017*"pair" + 0.016*"buy"
Topic:  3
Words:  0.030*"size" + 0.024*"review" + 0.019*"year" + 0.016*"receive" + 0.016*"product" + 0.015*"purchase" + 0.015*"watch" + 0.014*"old" + 0.013*"get" + 0.013*"buy"
Topic:  4
Words:  0.051*"size" + 0.051*"small" + 0.041*"fit" + 0.033*"be" + 0.030*"order" + 0.025*"large" + 0.022*"dress" + 0.016*"wear" + 0.014*"big" + 0.013*"top"
Topic:  5
Words:  0.034*"bag" + 0.021*

#### `Topics = 10`

In [17]:
## run configuration for the 10-topic model
N_TOPICS = 10
N_PASSES = 3   # number of passes through the corpus during training
N_CORES = 10   # core budget handed to get_lda_topics

get_lda_topics(
    bow_corpus=bow_corpus,
    id2word=dictionary,
    n_topics=N_TOPICS,
    n_passes=N_PASSES,
    n_cores=N_CORES,
)

Starting LDA learning......
Learning 10 topics across 9 cores......
Completed training in 79.497 seconds.
Topic:  0
Words:  0.045*"love" + 0.025*"wear" + 0.024*"ring" + 0.023*"gift" + 0.018*"get" + 0.018*"be" + 0.016*"buy" + 0.016*"watch" + 0.015*"break" + 0.013*"look"
Topic:  1
Words:  0.084*"size" + 0.080*"small" + 0.043*"order" + 0.041*"fit" + 0.038*"large" + 0.030*"be" + 0.028*"big" + 0.018*"wear" + 0.017*"run" + 0.016*"medium"
Topic:  2
Words:  0.110*"love" + 0.076*"fit" + 0.072*"great" + 0.050*"perfect" + 0.023*"buy" + 0.021*"old" + 0.019*"comfortable" + 0.018*"daughter" + 0.017*"year" + 0.017*"wear"
Topic:  3
Words:  0.032*"cute" + 0.022*"be" + 0.021*"wear" + 0.017*"get" + 0.011*"super" + 0.011*"wash" + 0.011*"make" + 0.011*"well" + 0.010*"really" + 0.010*"back"
Topic:  4
Words:  0.044*"shoe" + 0.031*"pair" + 0.031*"wear" + 0.023*"foot" + 0.021*"sock" + 0.015*"good" + 0.015*"comfortable" + 0.013*"boot" + 0.012*"buy" + 0.011*"work"
Topic:  5
Words:  0.022*"review" + 0.022*"produc