In [1]:
import pandas as pd
import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [2]:
# Load the JSON-lines review file, rename the raw columns to the names used
# throughout the notebook, and make sure the review text is a plain string.
df = (
    pd.read_json('../example/data/data_processed_1prod_full.json', lines=True)
    .rename(columns={'reviewText': 'argument', 'overall': 'score'})
    .assign(argument=lambda frame: frame['argument'].astype(str))
)
df.head()

Unnamed: 0,argument,score
0,I always get a half size up in my tennis shoes...,3
1,Put them on and walked 3 hours with no problem...,5
2,excelente,5
3,The shoes fit well in the arch area. They are ...,4
4,Tried them on in a store before buying online ...,5


In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   argument  371 non-null    object
 1   score     371 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.9+ KB


## Preprocess text data

In [83]:
# Tokenise every review with spaCy and keep the lower-cased lemma of each
# token that survives the filters below.
nlp = spacy.load('en_core_web_md')

# Coarse part-of-speech tags whose tokens are discarded.
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM', 'VERB']

# One inner list per review; a token is dropped when its POS tag is in
# `removal`, when it is a stop word, or when it is not purely alphabetic.
tokens = [
    [
        tok.lemma_.lower()
        for tok in parsed_review
        if not (tok.pos_ in removal or tok.is_stop or not tok.is_alpha)
    ]
    for parsed_review in nlp.pipe(df['argument'])
]
df['tokens'] = tokens
df.head()

Unnamed: 0,argument,score,tokens
0,I always get a half size up in my tennis shoes...,3,"[half, size, tennis, shoe, reason, big, heel, ..."
1,Put them on and walked 3 hours with no problem...,5,"[hour, problem, light]"
2,excelente,5,[]
3,The shoes fit well in the arch area. They are ...,4,"[shoe, arch, area, little, wide, toe, area, sh..."
4,Tried them on in a store before buying online ...,5,"[store, good, durable, cross, training, shoe, ..."


In [173]:
# create dictionary
# Builds the gensim vocabulary (token -> integer id) from the per-review token lists.
dictionary = Dictionary(df['tokens'])

# no_below and no_above should be fine-tuned
# Per the gensim docs: no_below is a minimum document count, no_above a maximum
# fraction of documents, and keep_n caps the vocabulary size after filtering.
dictionary.filter_extremes(no_below=20, no_above=.5, keep_n=1000)

# Display the surviving vocabulary.
dictionary.token2id

{'big': 0,
 'feel': 1,
 'shoe': 2,
 'size': 3,
 'light': 4,
 'love': 5,
 'walk': 6,
 'fit': 7,
 'like': 8,
 'buy': 9,
 'comfortable': 10,
 'foot': 11,
 'good': 12,
 'great': 13,
 'look': 14,
 'purchase': 15,
 'training': 16,
 'recommend': 17,
 'pair': 18,
 'color': 19,
 'order': 20,
 'run': 21,
 'support': 22,
 'wear': 23,
 'weight': 24,
 'expect': 25,
 'nike': 26,
 'lightweight': 27,
 'time': 28,
 'use': 29,
 'nice': 30,
 'perfect': 31,
 'work': 32,
 'comfy': 33,
 'day': 34,
 'need': 35}

In [174]:
# create corpus: one bag-of-words representation per review, built with the
# filtered dictionary
corpus = [dictionary.doc2bow(review_tokens) for review_tokens in df['tokens']]

### LDA model + coherence score

- The search is time-consuming (about 1m40s to explore topic counts in the range [1, 20]).
- The search usually settles on only 1 topic, even though the coherence scores differ: my guess is that the reviews are mostly very general and cover multiple topics, which makes it difficult to separate the reviews by topic.

In [175]:
# build model
# Fit one LDA model per candidate topic count (1..20) and record its u_mass
# coherence; the count with the highest score is kept as `n_topics_optimal`.
seed = 13
n_topics = list(range(1, 21))
coherence_score = []

for num_topics in n_topics:
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        iterations=50,
        passes=10,
        workers=4,
        random_state=seed,
    )

    # coherence coefficient should be a bar to handle
    coherence_model = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence_score.append(coherence_model.get_coherence())

# First topic count that reaches the highest coherence score.
n_topics_optimal, max_coherence = max(
    zip(n_topics, coherence_score), key=lambda pair: pair[1]
)
n_topics_optimal, max_coherence

(1, -1.9020052371117042)

In [177]:
# topic modeling results
# Refit LDA with the topic count selected by the coherence sweep, using more
# iterations and passes than the sweep (100/100 here vs 50/10 there) and the
# same random seed, then list the word weights of the fitted topics.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=100, 
                         num_topics=n_topics_optimal, workers=4, 
                         passes=100, random_state=seed)
lda_model.print_topics(-1)

[(0,
  '0.139*"shoe" + 0.066*"comfortable" + 0.066*"love" + 0.060*"fit" + 0.047*"wear" + 0.040*"size" + 0.039*"foot" + 0.039*"great" + 0.030*"light" + 0.028*"good"')]

In [178]:
# Topic assignment (topic id, weight) the fitted LDA model gives the first review.
lda_model[corpus][0]

[(0, 1.0)]

### BERTopic Model

- The whole modeling process is modularized into sub-models. Each sub-model has various options to choose from.
- Runtime performance is better than training LDA models from scratch, because the models used in BERTopic are pre-trained.
- Can generate multiple but similar topics.

In [84]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [85]:
def join_tokens(tokens: list) -> str:
    """Join a list of tokens into a single space-separated string.

    A value that is already a string is returned unchanged. Without this
    guard, applying the function a second time (for example when the cell
    that overwrites the token column is re-run) would iterate over the
    characters of the string and return them separated by spaces.

    Args:
        tokens: list of token strings, or an already-joined string.

    Returns:
        The tokens joined by single spaces; '' for an empty list.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)

# Replace each review's token list with one space-separated string.
# NOTE(review): this overwrites the 'tokens' column in place, so from here on it
# holds strings rather than lists; the Dictionary / doc2bow cells are fed the
# same column and presumably need the list form — confirm the intended run order.
df['tokens'] = df['tokens'].apply(join_tokens)
df

Unnamed: 0,argument,score,tokens
0,I always get a half size up in my tennis shoes...,3,half size tennis shoe reason big heel area wide
1,Put them on and walked 3 hours with no problem...,5,hour problem light
2,excelente,5,
3,The shoes fit well in the arch area. They are ...,4,shoe arch area little wide toe area shoe like ...
4,Tried them on in a store before buying online ...,5,store good durable cross training shoe rigorou...
...,...,...,...
366,Favorite Nike shoe ever! The flex sole is exce...,5,favorite nike shoe flex sole excellent free fe...
367,"I wear these everyday to work, the gym, etc.",5,everyday gym etc
368,"Love these shoes! Great fit, very light weight.",5,shoe great fit light weight
369,Super comfortable and fit my small feet perfec...,5,super comfortable small foot flat foot lot sho...


In [132]:
# umap for dimensionality reduction
# random_state pins UMAP's otherwise stochastic embedding so that the topics
# found below are the same on every Restart & Run All.
umap_model = UMAP(n_neighbors=5, n_components=50, metric='cosine',
                  low_memory=False, random_state=13)

# HDBSCAN for clustering documents
hdbscan_model = HDBSCAN(min_cluster_size=7,
                        metric='euclidean', prediction_data=True)

# n_gram_range is passed as a tuple, the type BERTopic documents for it
# (it was a list before).
model = BERTopic(n_gram_range=(1, 1), language='english',
                 calculate_probabilities=True, verbose=True,
                 umap_model=umap_model,
                 hdbscan_model=hdbscan_model
                 )

# Fit on the space-joined token strings; `topics` holds one topic id per review.
topics, probs = model.fit_transform(df['tokens'])

# NOTE(review): `new_topics` is computed but not used by any later visible cell,
# so the model's topic table presumably still reports the original outlier
# topic. BERTopic documents `update_topics` for applying reassigned topics —
# confirm whether that call was intended here.
new_topics = model.reduce_outliers(df['tokens'], topics)

Batches: 100%|██████████| 12/12 [00:01<00:00, 11.74it/s]
2023-04-14 16:42:53,286 - BERTopic - Transformed documents to Embeddings
2023-04-14 16:42:55,168 - BERTopic - Reduced dimensionality
2023-04-14 16:42:55,210 - BERTopic - Clustered reduced embeddings
100%|██████████| 1/1 [00:00<00:00, 998.17it/s]


In [133]:
# Overview table of the fitted topics: topic id, document count and generated name.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,8,-1_nice_love_gracias_month
1,0,254,0_shoe_comfortable_foot_light
2,1,40,1_comfortable_color_comfy_fit
3,2,19,2_true___
4,3,15,3_daughter_shipping_fast_wife
5,4,13,4_narrow_large_size_one
6,5,13,5_fit_real_purpose_overpriced
7,6,9,6_excellent_perfect_great_


In [124]:
# (word, score) pairs describing topic 3 of the fitted BERTopic model.
model.get_topic(3)

[('true', 3.068052935133617),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05)]

In [119]:
# Compute the topic hierarchy from the joined-token documents, then render it.
hierarchical_topics = model.hierarchical_topics(df['tokens'])
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 10/10 [00:00<00:00, 221.74it/s]


In [106]:
# Render BERTopic's heatmap visualisation for the fitted model.
model.visualize_heatmap()

In [64]:
# guided model with seed topic list
# Each inner list seeds one topic. Seed words are spelled like the lemmatised
# tokens in the corpus: the dictionary contains 'comfortable' and 'recommend',
# so the earlier misspellings 'confortable' / 'recommand' could not match them.
seed_topic_list = [['color', 'black', 'pink'],
                   ['size', 'width', 'fit', 'comfortable'],
                   ['brand', 'nike', 'supreme'],
                   ['expect', 'recommend']]

guided_model = BERTopic(seed_topic_list=seed_topic_list)

# Fit the guided model on the same joined-token documents as the unguided one.
topics, probs = guided_model.fit_transform(df['tokens'])

guided_model.get_topic_info()

2023-04-14 16:12:27,136 - BERTopic - Transformed documents to Embeddings
2023-04-14 16:12:29,007 - BERTopic - Reduced dimensionality
2023-04-14 16:12:29,020 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,0,345,0_shoe_fit_comfortable_love
1,1,26,1_color_excellent_nice_excelente


In [114]:
# topic distributions
# Index of the review to inspect; also read by the token-level visualisation cell.
i = 4
# calculate_tokens=True additionally returns a per-token distribution for each document.
topic_distr, topic_token_distr = model.approximate_distribution(df['tokens'], calculate_tokens=True)
# Plot the approximate topic distribution of review i.
model.visualize_distribution(topic_distr[i])

100%|██████████| 1/1 [00:00<00:00, 28.58it/s]


In [146]:
# Approximate topic distribution of the first review (one entry per topic).
topic_distr[0]

array([1., 0., 0., 0., 0., 0., 0.])

In [115]:
# Per-token topic contributions for review i (i is set in the topic-distribution cell).
model.visualize_approximate_distribution(df['tokens'][i], topic_token_distr[i])

Unnamed: 0,store,good,durable,cross,training,shoe,rigorous,training.1,great,light,comfortable,grip,bottom,foot,plank,push,up,etc,satisfied,purchase
0_shoe_foot_comfortable_pair,0.0,0.0,0.113,0.216,0.216,0.216,0.102,0.119,0.233,0.233,0.233,0.114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1_comfortable_color_size_fit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116,0.224,0.224,0.224,0.108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2_light_comfortable_great_training,0.154,0.369,0.56,0.75,0.76,0.712,0.736,0.821,0.887,0.917,0.83,0.663,0.543,0.459,0.452,0.463,0.467,0.356,0.234,0.114
