In [None]:
import requests
import os
from tqdm import tqdm
tqdm.pandas()

import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt


nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt_tab")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shaun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shaun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Shaun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
def download_file(url, filename):
    if os.path.exists(filename):
        print(f"File '{filename}' already exists. Skipping download.")
        return

    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total_size_in_bytes = int(r.headers.get("content-length", 0))
            block_size = 16384
            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
            with open(filename, "wb") as file:
                for data in r.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
            progress_bar.close()
            if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
                print("ERROR, something went wrong")
            else:
                print(f"File downloades successfully as {filename}")

    except requests.exceptions.RequestException as e:
        print(f"Error during download: {e}")

file_url = "https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/Appliances.jsonl.gz"
local_filename = "Appliances.jsonl.gz"
download_file(file_url, local_filename)

File 'Appliances.jsonl.gz' already exists. Skipping download.


In [3]:
df = pd.read_json("Appliances.jsonl.gz", compression="gzip", lines=True)

In [34]:
reviews_df = df[["text"]].copy()

In [35]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

def tokenize(text):
    return nltk.word_tokenize(text)

stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

reviews_df['text'] = reviews_df['text'].progress_apply(clean_text)
reviews_df['tokens'] = reviews_df['text'].progress_apply(tokenize)
reviews_df['tokens'] = reviews_df['tokens'].progress_apply(remove_stopwords)
reviews_df['tokens'] = reviews_df['tokens'].progress_apply(lemmatize)

100%|██████████| 100000/100000 [00:00<00:00, 180356.42it/s]
100%|██████████| 100000/100000 [00:10<00:00, 9668.78it/s]
100%|██████████| 100000/100000 [00:00<00:00, 180321.30it/s]
100%|██████████| 100000/100000 [00:08<00:00, 11947.27it/s]


In [36]:
dictionary = corpora.Dictionary(reviews_df['tokens'])
corpus = [dictionary.doc2bow(tokens) for tokens in reviews_df['tokens']]

In [37]:
num_topics = 5
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

In [38]:
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')


Topic: 0 
Words: 0.019*"one" + 0.018*"part" + 0.011*"old" + 0.011*"new" + 0.010*"dryer" + 0.008*"year" + 0.008*"would" + 0.007*"month" + 0.007*"working" + 0.007*"back"

Topic: 1 
Words: 0.044*"ice" + 0.041*"coffee" + 0.026*"use" + 0.020*"make" + 0.019*"cup" + 0.016*"love" + 0.016*"maker" + 0.013*"great" + 0.012*"work" + 0.010*"easy"

Topic: 2 
Words: 0.010*"br" + 0.008*"use" + 0.008*"machine" + 0.007*"get" + 0.007*"like" + 0.007*"little" + 0.007*"well" + 0.007*"would" + 0.006*"washer" + 0.006*"stove"

Topic: 3 
Words: 0.082*"great" + 0.072*"fit" + 0.058*"work" + 0.052*"easy" + 0.036*"perfect" + 0.036*"install" + 0.035*"good" + 0.033*"product" + 0.032*"perfectly" + 0.027*"price"

Topic: 4 
Words: 0.067*"filter" + 0.050*"water" + 0.017*"one" + 0.012*"brand" + 0.011*"good" + 0.011*"much" + 0.011*"better" + 0.011*"taste" + 0.011*"work" + 0.011*"price"



In [39]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

In [40]:
for index, score in sorted(lda_model[corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.3439691662788391	 
Topic: 0.010*"br" + 0.008*"use" + 0.008*"machine" + 0.007*"get" + 0.007*"like" + 0.007*"little" + 0.007*"well" + 0.007*"would" + 0.006*"washer" + 0.006*"stove"

Score: 0.29765933752059937	 
Topic: 0.044*"ice" + 0.041*"coffee" + 0.026*"use" + 0.020*"make" + 0.019*"cup" + 0.016*"love" + 0.016*"maker" + 0.013*"great" + 0.012*"work" + 0.010*"easy"

Score: 0.1925359070301056	 
Topic: 0.082*"great" + 0.072*"fit" + 0.058*"work" + 0.052*"easy" + 0.036*"perfect" + 0.036*"install" + 0.035*"good" + 0.033*"product" + 0.032*"perfectly" + 0.027*"price"

Score: 0.1455492377281189	 
Topic: 0.067*"filter" + 0.050*"water" + 0.017*"one" + 0.012*"brand" + 0.011*"good" + 0.011*"much" + 0.011*"better" + 0.011*"taste" + 0.011*"work" + 0.011*"price"

Score: 0.020286384969949722	 
Topic: 0.019*"one" + 0.018*"part" + 0.011*"old" + 0.011*"new" + 0.010*"dryer" + 0.008*"year" + 0.008*"would" + 0.007*"month" + 0.007*"working" + 0.007*"back"


In [None]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews_df['tokens'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.367801829609589

Coherence Score:  0.5366377020780211


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=15)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

start = 5
limit = 30
step = 5
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=reviews_df['tokens'], start=start, limit=limit, step=step)

x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))