In [1]:
import pandas as pd
import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [2]:
# Load the line-delimited JSON reviews, rename the review text / rating columns
# to 'argument' / 'score', and make sure the review text is stored as str.
df = (
    pd.read_json('../example/data/data_processed_1prod_full.json', lines=True)
    .rename(columns={'reviewText': 'argument', 'overall': 'score'})
    .astype({'argument': str})
)
df.head()

Unnamed: 0,argument,score
0,I always get a half size up in my tennis shoes...,3
1,Put them on and walked 3 hours with no problem...,5
2,excelente,5
3,The shoes fit well in the arch area. They are ...,4
4,Tried them on in a store before buying online ...,5


In [3]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   argument  371 non-null    object
 1   score     371 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.9+ KB


## Preprocess text data

In [4]:
# Tokenize every argument with spaCy and keep only lowercase lemmas of
# content-bearing, alphabetic, non-stop-word tokens.
nlp = spacy.load('en_core_web_md')

# Coarse part-of-speech tags whose tokens are discarded.
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']

# One inner list of lemmas per document, in the same order as df['argument'].
tokens = [
    [
        token.lemma_.lower()
        for token in argument
        if token.is_alpha and not token.is_stop and token.pos_ not in removal
    ]
    for argument in nlp.pipe(df['argument'])
]
df['tokens'] = tokens
df.head()

Unnamed: 0,argument,score,tokens
0,I always get a half size up in my tennis shoes...,3,"[half, size, tennis, shoe, reason, feel, big, ..."
1,Put them on and walked 3 hours with no problem...,5,"[walk, hour, problem, love, light, feel]"
2,excelente,5,[excelente]
3,The shoes fit well in the arch area. They are ...,4,"[shoe, fit, arch, area, little, wide, toe, are..."
4,Tried them on in a store before buying online ...,5,"[try, store, buy, know, fit, good, look, durab..."


In [173]:
# Create the gensim dictionary (token <-> integer id mapping) from the
# per-document token lists in df['tokens'].
dictionary = Dictionary(df['tokens'])

# Prune the vocabulary. Per the gensim documentation: no_below drops tokens that
# occur in fewer than 20 documents, no_above drops tokens that occur in more than
# 50% of documents, and keep_n keeps at most the 1000 most frequent survivors.
# These thresholds should be fine-tuned for the dataset.
dictionary.filter_extremes(no_below=20, no_above=.5, keep_n=1000)

# Display the surviving vocabulary as a token -> id mapping.
dictionary.token2id

{'big': 0,
 'feel': 1,
 'shoe': 2,
 'size': 3,
 'light': 4,
 'love': 5,
 'walk': 6,
 'fit': 7,
 'like': 8,
 'buy': 9,
 'comfortable': 10,
 'foot': 11,
 'good': 12,
 'great': 13,
 'look': 14,
 'purchase': 15,
 'training': 16,
 'recommend': 17,
 'pair': 18,
 'color': 19,
 'order': 20,
 'run': 21,
 'support': 22,
 'wear': 23,
 'weight': 24,
 'expect': 25,
 'nike': 26,
 'lightweight': 27,
 'time': 28,
 'use': 29,
 'nice': 30,
 'perfect': 31,
 'work': 32,
 'comfy': 33,
 'day': 34,
 'need': 35}

In [174]:
# Create the corpus: one bag-of-words vector per document, in df['tokens'] order.
corpus = [dictionary.doc2bow(doc) for doc in df['tokens']]

In [175]:
# Build one LDA model per candidate topic count and record its coherence.
seed = 13
# NOTE(review): the sweep includes a single-topic model; confirm that a
# one-topic model is a meaningful candidate for the selection below.
n_topics = list(range(1, 21))
coherence_score = []

for k in n_topics:
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        passes=10,
        iterations=50,
        workers=4,
        random_state=seed,
    )

    # u_mass coherence is estimated from the bag-of-words corpus itself, so only
    # the corpus and the dictionary are handed to the coherence model.
    coherence_model = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass',
    )
    coherence_score.append(coherence_model.get_coherence())

# Pick the topic count with the highest coherence (first one wins on ties).
max_coherence = max(coherence_score)
best_position = coherence_score.index(max_coherence)
n_topics_optimal = n_topics[best_position]
n_topics_optimal, max_coherence

(1, -1.9020052371117042)

In [177]:
# Topic modeling results: refit LDA with the selected number of topics, using
# more passes and iterations than the sweep, then list every topic.
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics_optimal,
    passes=100,
    iterations=100,
    workers=4,
    random_state=seed,
)
lda_model.print_topics(num_topics=-1)

[(0,
  '0.139*"shoe" + 0.066*"comfortable" + 0.066*"love" + 0.060*"fit" + 0.047*"wear" + 0.040*"size" + 0.039*"foot" + 0.039*"great" + 0.030*"light" + 0.028*"good"')]

In [178]:
# Topic distribution of the first document, as (topic_id, probability) pairs.
lda_model[corpus][0]

[(0, 1.0)]

## BERTopic Topic Modeling

In [5]:
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def join_tokens(tokens: list) -> str:
    """Join a document's tokens into one space-separated string.

    Args:
        tokens: The tokens of a single document. A value that is already a
            string is accepted and returned unchanged.

    Returns:
        The tokens joined with single spaces; an empty list gives ''.
    """
    # Re-running the cell that applies this function feeds its own output back
    # in. Joining a str would put a space between every character, so a value
    # that is already joined is passed through untouched.
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)

# Replace each document's token list in df['tokens'] with a single
# space-joined string. After this cell the column no longer holds lists, so
# cells that read df['tokens'] as token lists (the gensim dictionary / corpus
# cells) have to be run before this one.
df['tokens'] = df['tokens'].apply(join_tokens)
df

Unnamed: 0,argument,score,tokens
0,I always get a half size up in my tennis shoes...,3,half size tennis shoe reason feel big heel are...
1,Put them on and walked 3 hours with no problem...,5,walk hour problem love light feel
2,excelente,5,excelente
3,The shoes fit well in the arch area. They are ...,4,shoe fit arch area little wide toe area shoe f...
4,Tried them on in a store before buying online ...,5,try store buy know fit good look durable cross...
...,...,...,...
366,Favorite Nike shoe ever! The flex sole is exce...,5,favorite nike shoe flex sole excellent love fr...
367,"I wear these everyday to work, the gym, etc.",5,wear everyday work gym etc
368,"Love these shoes! Great fit, very light weight.",5,love shoe great fit light weight
369,Super comfortable and fit my small feet perfec...,5,super comfortable fit small foot flat foot lot...


In [8]:

# Fit a BERTopic model with default settings on the space-joined token strings.
model = BERTopic()

# `topics` and `probs` are the two values returned by fit_transform.
# NOTE(review): no seed is configured here, so results may differ between
# runs — confirm whether reproducibility is required.
topics, probs = model.fit_transform(df['tokens'])

In [9]:
# Per-topic summary table (topic id, document count, generated name).
# In BERTopic, topic -1 is the outlier bucket.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,33,-1_expect_wear_great_good
1,0,221,0_shoe_foot_wear_comfortable
2,1,73,1_comfortable_fit_light_great
3,2,23,2_love_daughter_come_purchase
4,3,21,3_excellent_nice_excelente_perfect


In [10]:
# Compute the topic hierarchy from the same documents the model was fitted on,
# then pass it to the hierarchy visualization.
hierarchical_topics = model.hierarchical_topics(df['tokens'])
model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 3/3 [00:00<00:00, 214.29it/s]


In [11]:
# Topic similarity heatmap for the fitted model.
model.visualize_heatmap()

In [15]:
# Guided model with a seed topic list: each inner list holds the seed words for
# one intended topic. Seed words are spelled the way they occur in the
# lemmatized reviews ('comfortable', 'recommend'); a misspelled seed is not in
# the vocabulary and would not steer the topics.
seed_topic_list = [['color', 'black', 'pink'],
                   ['size', 'width', 'fit', 'comfortable'],
                   ['brand', 'nike', 'supreme'],
                   ['expect', 'recommend']]

guided_model = BERTopic(seed_topic_list=seed_topic_list)

# NOTE: this rebinds `topics` / `probs`, replacing the unguided model's results.
topics, probs = guided_model.fit_transform(df['tokens'])

guided_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,8,-1_color_pattern_artculo_equivocado
1,0,320,0_shoe_fit_comfortable_size
2,1,25,1_love_daughter_month_fit
3,2,18,2_excellent_nice_excelente_perfect
