In [1]:
import pandas as pd
import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [2]:
df = pd.read_json('../example/data/data_processed_1prod_full.json', lines=True)
df.rename({'reviewText': 'argument'}, axis=1, inplace=True)
df.rename({'overall': 'score'}, axis=1, inplace=True)
df['argument'] = df['argument'].astype(str)
df.head()

Unnamed: 0,argument,score
0,I always get a half size up in my tennis shoes...,3
1,Put them on and walked 3 hours with no problem...,5
2,excelente,5
3,The shoes fit well in the arch area. They are ...,4
4,Tried them on in a store before buying online ...,5


In [11]:
# split reviews into sentences and add argument and sentence id
nlp = spacy.load('en_core_web_md')
df_sent = pd.DataFrame()
argu_id = []
sent_id = []
text = []
score = []
for i, row in df.iterrows():
    doc = nlp(row['argument'])
    for j, sent in enumerate(doc.sents):
        argu_id.append(i)
        sent_id.append(j)
        text.append(str(sent))
        score.append(row['score'])
df_sent['argu_id'] = argu_id
df_sent['sent_id'] = sent_id
df_sent['text'] = text
df_sent['score'] = score

df_sent

Unnamed: 0,argu_id,sent_id,text,score
0,0,0,I always get a half size up in my tennis shoes.,3
1,0,1,For some reason these feel to big in the heel ...,3
2,1,0,Put them on and walked 3 hours with no problem!,5
3,1,1,Love them!,5
4,1,2,So light feeling,5
...,...,...,...,...
933,369,1,I have flat feet so a lot of shoes are not com...,5
934,369,2,I can wear the shoe all day long and they are ...,5
935,369,3,They are light colored so any dirt will be see...,5
936,369,4,Would definitely buy another pair in a differe...,5


## Preprocess text data

In [16]:
# tokenize arguments
nlp = spacy.load('en_core_web_md')

removal= ['PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM', 'VERB']

tokens = []
for argument in nlp.pipe(df_sent['text']):
    argu_tokens = []
    for token in argument:
        if token.pos_ not in removal and not token.is_stop and token.is_alpha:
            argu_tokens.append(token.lemma_.lower())
    tokens.append(argu_tokens)
df_sent['tokens'] = tokens
df_sent

Unnamed: 0,argu_id,sent_id,text,score,tokens
0,0,0,I always get a half size up in my tennis shoes.,3,"[half, size, tennis, shoe]"
1,0,1,For some reason these feel to big in the heel ...,3,"[reason, big, heel, area, wide]"
2,1,0,Put them on and walked 3 hours with no problem!,5,"[hour, problem]"
3,1,1,Love them!,5,[]
4,1,2,So light feeling,5,[light]
...,...,...,...,...,...
933,369,1,I have flat feet so a lot of shoes are not com...,5,"[flat, foot, lot, shoe, comfortable, long, per..."
934,369,2,I can wear the shoe all day long and they are ...,5,"[shoe, day, long, super, comfortable]"
935,369,3,They are light colored so any dirt will be see...,5,"[light, dirt, right, away, easy, shoe, ve]"
936,369,4,Would definitely buy another pair in a differe...,5,"[definitely, pair, different, color]"


In [22]:
# create dictionary
dictionary = Dictionary(df_sent['tokens'])

# no_below and no_above should be fine-tuned
dictionary.filter_extremes(no_below=10, no_above=.5, keep_n=1000)

dictionary.token2id

{'shoe': 0,
 'size': 1,
 'big': 2,
 'heel': 3,
 'wide': 4,
 'light': 5,
 'arch': 6,
 'like': 7,
 'little': 8,
 'lot': 9,
 'toe': 10,
 'good': 11,
 'cross': 12,
 'great': 13,
 'training': 14,
 'comfortable': 15,
 'foot': 16,
 'highly': 17,
 'pair': 18,
 'color': 19,
 'fit': 20,
 'large': 21,
 'perfectly': 22,
 'support': 23,
 'weight': 24,
 'high': 25,
 'right': 26,
 'nike': 27,
 'gym': 28,
 'time': 29,
 'workout': 30,
 'class': 31,
 'lightweight': 32,
 'comfort': 33,
 'running': 34,
 'sole': 35,
 'different': 36,
 'nice': 37,
 'true': 38,
 'second': 39,
 'bit': 40,
 'way': 41,
 'daughter': 42,
 'cute': 43,
 'perfect': 44,
 'favorite': 45,
 'style': 46,
 'comfy': 47,
 'day': 48,
 'super': 49,
 'small': 50,
 'long': 51,
 'extremely': 52,
 'sneaker': 53,
 'year': 54,
 'happy': 55,
 'flex': 56,
 'pain': 57}

In [23]:
# create corpus
corpus = []
for doc in df_sent['tokens']:
    corpus.append(dictionary.doc2bow(doc))

### LDA model + coherence score

- Converging process is time-consuming (±1m40s for exploring topic ranges of [1, 20]).
- Convergence usually results in only 1 topic, even with different coherence scores: my guess is that the reviews are mostly very general and covers multiple topics, which makes distinguish of reviews on topics diffficult.

In [24]:
# build model
n_topics = []
coherence_score = []
seed = 13

for i in range(1, 21, 1):
    lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, 
                         num_topics=i, workers=4, passes=10, random_state=seed)
    
    # coherence coefficient should be a bar to handle
    coherence_model = CoherenceModel(model=lda_model, 
                                    #  texts=df['tokens'],
                                     corpus=corpus,  
                                     dictionary=dictionary, coherence='u_mass')
    
    n_topics.append(i)
    coherence_score.append(coherence_model.get_coherence())
    
max_coherence = max(coherence_score)
n_topics_optimal = n_topics[coherence_score.index(max_coherence)]
n_topics_optimal, max_coherence

(1, -7.68783722747417)

In [25]:
# topic modeling results
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=100, 
                         num_topics=n_topics_optimal, workers=4, 
                         passes=100, random_state=seed)
lda_model.print_topics(-1)

[(0,
  '0.148*"shoe" + 0.070*"comfortable" + 0.043*"great" + 0.042*"foot" + 0.039*"size" + 0.036*"fit" + 0.031*"light" + 0.030*"good" + 0.028*"pair" + 0.024*"nike"')]

In [178]:
lda_model[corpus][0]

[(0, 1.0)]

### BERTopic Model

- The whole modeling process is modulized into sub-models. Each sub-model has various options to choose from. 
- Runtime performance is better than training LDA models from the beginning as those in BERTopic are pre-trained.
- Can generate multiple but similar topics. 

In [26]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

In [27]:
# umap for dimensionality reduction
umap_model = UMAP(n_neighbors=5, n_components=50, metric='cosine', low_memory=False)

# HDBSCAN for clustering documents
hdbscan_model = HDBSCAN(min_cluster_size=7, 
                        metric='euclidean', prediction_data=True)

model = BERTopic(n_gram_range=[1, 1], language='english', 
                 calculate_probabilities=True, verbose=True, 
                 umap_model=umap_model, 
                 hdbscan_model=hdbscan_model
                 )

topics, probs = model.fit_transform(df_sent['text'])
new_topics = model.reduce_outliers(df_sent['text'], topics)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

2023-05-01 15:49:21,904 - BERTopic - Transformed documents to Embeddings
2023-05-01 15:49:25,866 - BERTopic - Reduced dimensionality
2023-05-01 15:49:25,966 - BERTopic - Clustered reduced embeddings
100%|██████████| 1/1 [00:00<00:00, 101.51it/s]


In [28]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,294,-1_the_to_these_shoes
1,0,46,0_are_they_comfortable_light
2,1,44,1_shoe_very_sneakers_nice
3,2,41,2_training_for_gym_cross
4,3,37,3_light_lightweight_comfortable_weight
5,4,32,4_nike_nikes_flex_favorite
6,5,30,5_daughter_loves_them_love
7,6,24,6_fit_perfect_expected_as
8,7,24,7_pair_second_this_two
9,8,23,8_comfortable_looks_very_super


In [124]:
model.get_topic(3)

[('true', 3.068052935133617),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05)]

In [30]:
hierarchical_topics = model.hierarchical_topics(df_sent['text'])
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 33/33 [00:00<00:00, 445.27it/s]


In [106]:
model.visualize_heatmap()

In [64]:
# guided model with seed topic list
seed_topic_list = [['color', 'black', 'pink'], 
                   ['size', 'width', 'fit', 'confortable'], 
                   ['brand', 'nike', 'supreme'], 
                   ['expect', 'recommand']]

guided_model = BERTopic(seed_topic_list=seed_topic_list)

topics, probs = guided_model.fit_transform(df['tokens'])

guided_model.get_topic_info()

2023-04-14 16:12:27,136 - BERTopic - Transformed documents to Embeddings
2023-04-14 16:12:29,007 - BERTopic - Reduced dimensionality
2023-04-14 16:12:29,020 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,0,345,0_shoe_fit_comfortable_love
1,1,26,1_color_excellent_nice_excelente


In [114]:
# topic distributions
i = 4
topic_distr, topic_token_distr = model.approximate_distribution(df['tokens'], calculate_tokens=True)
model.visualize_distribution(topic_distr[i])

100%|██████████| 1/1 [00:00<00:00, 28.58it/s]


In [146]:
topic_distr[0]

array([1., 0., 0., 0., 0., 0., 0.])

In [115]:
model.visualize_approximate_distribution(df['tokens'][i], topic_token_distr[i])

Unnamed: 0,store,good,durable,cross,training,shoe,rigorous,training.1,great,light,comfortable,grip,bottom,foot,plank,push,up,etc,satisfied,purchase
0_shoe_foot_comfortable_pair,0.0,0.0,0.113,0.216,0.216,0.216,0.102,0.119,0.233,0.233,0.233,0.114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1_comfortable_color_size_fit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116,0.224,0.224,0.224,0.108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2_light_comfortable_great_training,0.154,0.369,0.56,0.75,0.76,0.712,0.736,0.821,0.887,0.917,0.83,0.663,0.543,0.459,0.452,0.463,0.467,0.356,0.234,0.114
