In [59]:
import numpy as np
import pandas as pd

import nltk
import gensim
from tqdm import tqdm

import matplotlib.pyplot as plt

import os

import tba3102

In [60]:
# file_name = "cleaned-galaxy"
# file_name = "cleaned-apple_review"
file_name = "cleaned-samsung_vs_pixel"
df = pd.read_csv(f'../data/{file_name}.csv')
df['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)
df = df.dropna(subset=['Cleaned_Comment']).reset_index(drop=True)
df = df[df["Cleaned_Comment"].apply(lambda x: len(x) > 2)]
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)


Unnamed: 0,Comment ID,Comment Body,Author,Upvotes,Timestamp,Cleaned_Comment
0,m9aq3ty,Jeez this thread really opened up the fanboy w...,lunarmando,130,1737913000.0,je thread really open fancy war I feel like fi...
1,m9ark0z,They said that indoor photography improved thi...,Deleted,36,1737913000.0,say indoor photography improve generation say ...
2,m99b1im,[removed],Deleted,190,1737896000.0,remove
3,m99kwtr,Snapdragon 8 elite blows google’s tensor out o...,porygon766,147,1737900000.0,snapdragon elite blow gorge tenor water
4,m99f0bd,"Tensor chip, no UFS 4.0 but of course if it's ...",Teo_Yanchev,90,1737898000.0,tenor chip course goose issue fix flagship pho...


In [61]:
# df_merge = pd.DataFrame()
# names = ["cleaned-galaxy", "cleaned-apple_review", "cleaned-samsung_vs_pixel"]
# for name in names:
#     curr = pd.read_csv(f'../data/{name}.csv')
#     curr['Cleaned_Comment'].replace(r'^(\s)+$', np.nan, regex=True, inplace=True)
#     curr = curr.dropna(subset=['Cleaned_Comment']).reset_index(drop=True)
#     curr = curr[curr["Cleaned_Comment"].apply(lambda x: len(x) > 2)]
#     df_merge = pd.concat([df_merge, curr], axis=0)
    
# file_name = "ALL"
# df_merge.reset_index(inplace=True, drop=True)
# df = df_merge

In [62]:
df.head()

Unnamed: 0,Comment ID,Comment Body,Author,Upvotes,Timestamp,Cleaned_Comment
0,m9aq3ty,Jeez this thread really opened up the fanboy w...,lunarmando,130,1737913000.0,je thread really open fancy war I feel like fi...
1,m9ark0z,They said that indoor photography improved thi...,Deleted,36,1737913000.0,say indoor photography improve generation say ...
2,m99b1im,[removed],Deleted,190,1737896000.0,remove
3,m99kwtr,Snapdragon 8 elite blows google’s tensor out o...,porygon766,147,1737900000.0,snapdragon elite blow gorge tenor water
4,m99f0bd,"Tensor chip, no UFS 4.0 but of course if it's ...",Teo_Yanchev,90,1737898000.0,tenor chip course goose issue fix flagship pho...


In [63]:
tba3102.data_quality_report(df)

Unnamed: 0,Comment ID,Comment Body,Author,Upvotes,Timestamp,Cleaned_Comment
count,424,424,424,424.0,424.0,424
unique,424,411,208,,,410
top,m9aq3ty,[deleted],Deleted,,,delete
freq,1,9,27,,,9
mean,,,,6.365566,1738034579.806604,
std,,,,18.393668,533669.761588,
min,,,,-41.0,1737895938.0,
25%,,,,1.0,1737903550.5,
50%,,,,2.0,1737915513.5,
75%,,,,5.0,1737958033.5,


In [64]:
tba3102.set_default_pandas_options()

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

In [65]:
def normalize_corpus(papers):
    
    norm_papers = []
    
    for paper in papers:
        
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        
        if paper_tokens:
            
            norm_papers.append(paper_tokens)
    
    return norm_papers

In [66]:
def lda_topic_model_coherence_generator(corpus, texts, dictionary, 
                                    start_topic_count=2, end_topic_count=10, step=1, cpus=1):
    
    models = []
    coherence_scores = []

    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        
        lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                            chunksize=1740, alpha='auto',
                                            eta='auto', random_state=42,
                                            iterations=500, num_topics=topic_nums,
                                            passes=20, eval_every=None)
        
        cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model,
                                                            corpus=corpus,
                                                            texts=texts,
                                                            dictionary=dictionary,
                                                            coherence='c_v')
        
        coherence_score = cv_coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)
        models.append(lda_model)

    return models, coherence_scores

In [67]:
descriptions = df["Cleaned_Comment"].to_list()
norm_descriptions = normalize_corpus(descriptions)
# norm_descriptions

In [68]:
bigram = gensim.models.Phrases(norm_descriptions, min_count=2, threshold=20, delimiter='_')
bigram_model = gensim.models.phrases.Phraser(bigram)

norm_corpus_bigrams = [bigram_model[doc] for doc in norm_descriptions]

# Create a dictionary representation of the documents.
dictionary = gensim.corpora.Dictionary(norm_corpus_bigrams)

# Filter out words that occur less than 2 documents, or more than 60% of the documents.
dictionary.filter_extremes(no_below=2, no_above=0.6)

# Transforming corpus into bag of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigrams]
dictionary.token2id

{'bad': 0,
 'fancy': 1,
 'feel': 2,
 'fix': 3,
 'like': 4,
 'open': 5,
 'really': 6,
 'samson': 7,
 'since': 8,
 'smartphone': 9,
 'add': 10,
 'always': 11,
 'everyone': 12,
 'fix_pro': 13,
 'galaxy': 14,
 'generation': 15,
 'get': 16,
 'improve': 17,
 'indoor': 18,
 'light': 19,
 'look': 20,
 'mostly': 21,
 'option': 22,
 'photography': 23,
 'say': 24,
 'screen': 25,
 'second': 26,
 'still': 27,
 'take': 28,
 'thing': 29,
 'trade': 30,
 'ultra': 31,
 'would': 32,
 'remove': 33,
 'blow': 34,
 'gorge': 35,
 'snapdragon_elite': 36,
 'tenor': 37,
 'water': 38,
 'charge': 39,
 'chip': 40,
 'course': 41,
 'flagship': 42,
 'goose': 43,
 'hardware': 44,
 'issue': 45,
 'period': 46,
 'phone': 47,
 'price': 48,
 'supper': 49,
 'camera_hardware': 50,
 'follow': 51,
 'go_storage': 52,
 'list': 53,
 'mean': 54,
 'nothing': 55,
 'outdate': 56,
 'performance': 57,
 'siliconcarbon_battery': 58,
 'son': 59,
 'start': 60,
 'use': 61,
 'well': 62,
 'yet': 63,
 'grapheneo': 64,
 'much': 65,
 'powerful': 

In [69]:
lda_models, coherence_scores = lda_topic_model_coherence_generator(corpus=bow_corpus,
                                                                texts=norm_corpus_bigrams,
                                                                dictionary=dictionary,
                                                                start_topic_count=2,
                                                                end_topic_count=10, step=1,
                                                                cpus=16)

100%|██████████| 9/9 [01:05<00:00,  7.31s/it]


In [70]:
lda_coherence_df = pd.DataFrame({'Number of Topics': range(2, 11, 1), 'Coherence Score': np.round(coherence_scores, 4)})
lda_coherence_df_top10 = lda_coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(10)
print(lda_coherence_df_top10)
# lda_coherence_df_top10.to_csv("lda_top10.csv")

   Number of Topics  Coherence Score
1                 3           0.5999
2                 4           0.5931
0                 2           0.5715
4                 6           0.5575
6                 8           0.5567
7                 9           0.5544
3                 5           0.5501
5                 7           0.5490
8                10           0.5357


In [71]:
TOTAL_TOPICS = lda_coherence_df_top10["Number of Topics"].reset_index(drop=True).iloc[0]
TOTAL_TOPICS

3

In [72]:
model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary,
                                    chunksize=1740, alpha='auto',
                                    eta='auto', random_state=42,
                                    iterations=500, num_topics=TOTAL_TOPICS,
                                    passes=20, eval_every=None)

In [73]:
topics = [[(term, round(wt, 3))
                for term, wt in model.show_topic(n, topn=20)]
                    for n in range(0, model.num_topics)]

topics_df = pd.DataFrame([[term for term, wt in topic]
                            for topic in topics],
                        columns = ['Term'+str(i) for i in range(1, 21)],
                        index=['Topic '+str(t) for t in range(1, model.num_topics+1)]).T

topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                                for topic in topics],
                                columns = ['Terms per Topic'],
                                index=['Topic'+str(t) for t in range(1, model.num_topics+1)])



tm_results = model[bow_corpus]

corpus_topics = [sorted(topics, key=lambda record: -record[1])[0]
                 for topics in tm_results]

In [74]:
corpus_topic_df = pd.DataFrame()
corpus_topic_df['Document'] = range(0, len(descriptions))
corpus_topic_df['DominantTopic'] = [item[0]+1 for item in corpus_topics]
corpus_topic_df['ContributionPercent'] = [round(item[1]*100, 2) for item in corpus_topics]
corpus_topic_df['TopicDesc'] = [topics_df.iloc[t[0]]['Terms per Topic']
                                    for t in corpus_topics]
corpus_topic_df["Comment Body"] = df["Comment Body"]
corpus_topic_df['Cleaned_Comment'] = df["Cleaned_Comment"]

In [75]:
corpus_topic_df = corpus_topic_df.sort_values(by=["ContributionPercent"], ascending=False)
corpus_topic_df.to_csv(f"../data/results/{file_name}_topic_modelling.csv")
corpus_topic_df.groupby('DominantTopic').apply(lambda topic_set:( topic_set.sort_values(by=['ContributionPercent'], ascending=False).iloc[0]))

  corpus_topic_df.groupby('DominantTopic').apply(lambda topic_set:( topic_set.sort_values(by=['ContributionPercent'], ascending=False).iloc[0]))


Unnamed: 0_level_0,Document,DominantTopic,ContributionPercent,TopicDesc,Comment Body,Cleaned_Comment
DominantTopic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,239,1,99.87,"samson, fix, well, get, phone, like, one, appl...",Right and honestly like some people might not ...,right honestly like people might care throttle...
2,29,2,99.91,"phone, fix, samson, use, get, would, go, goose...",So I started my Journey with a Samsung S9 in 2...,I start journey samson last year last leg offe...
3,245,3,99.91,"fix, well, samson, use, get, phone, like, perf...",Well I don't think I would say everything. I m...,well I think I would say everything I mean man...


In [76]:
corpus_topic_df

Unnamed: 0,Document,DominantTopic,ContributionPercent,TopicDesc,Comment Body,Cleaned_Comment
29,29,2,99.91,"phone, fix, samson, use, get, would, go, goose...",So I started my Journey with a Samsung S9 in 2...,I start journey samson last year last leg offe...
245,245,3,99.91,"fix, well, samson, use, get, phone, like, perf...",Well I don't think I would say everything. I m...,well I think I would say everything I mean man...
8,8,2,99.89,"phone, fix, samson, use, get, would, go, goose...",The S25 (especially the Ultra) is better. The ...,especially ultra well snapdragon chip inside o...
325,325,2,99.87,"phone, fix, samson, use, get, would, go, goose...",I mean there are occasional bugs related to th...,I mean occasional bag relate third party launc...
126,126,2,99.87,"phone, fix, samson, use, get, would, go, goose...",Its.not fanboism as much as it's a disappointm...,itsnot fanboism much disappointment see large ...
239,239,1,99.87,"samson, fix, well, get, phone, like, one, appl...",Right and honestly like some people might not ...,right honestly like people might care throttle...
319,319,1,99.86,"samson, fix, well, get, phone, like, one, appl...","Software experience in Pixel has been subpar, ...",software experience fix supper least use case ...
208,208,2,99.86,"phone, fix, samson, use, get, would, go, goose...","Google is working to upgrade Tensor, but the m...",goose work upgrade tenor major upgrade next ye...
14,14,3,99.86,"fix, well, samson, use, get, phone, like, perf...",Wow such scrutinizing consumer reports here. /...,scrutinize consumer report honestly I think pl...
215,215,2,99.84,"phone, fix, samson, use, get, would, go, goose...",For me the fanboying is because of a couple th...,I annoying couple thing fix first non super bu...
