## Topic modeling

In [None]:
# Data:
# https://www.kaggle.com/datasets/d4rklucif3r/restaurant-reviews/
# https://www.kaggle.com/datasets/choonkhonng/malaysia-restaurant-review-datasets
#https://www.kaggle.com/datasets/joebeachcapital/restaurant-reviews

In [15]:
from text_categorizer.build_model import BuildModel
import pandas as pd
import plotly.io as pio
import yaml
import pickle
import os
import urllib.request
from llama_cpp import Llama
pio.renderers.default = 'iframe'
pd.set_option('display.max_rows', 700)
# https://github.com/MaartenGr/BERTopic
# https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6
# https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8

### Reading Data

In [16]:
# Location of the raw reviews and the column that holds the review text.
file_path = '../data/Restaurant_Reviews.xlsx'
text_col = 'Review'

# Dataset name = file name up to its first dot; it names the model sub-folder.
folder_name = file_path.rsplit('/', 1)[-1].partition('.')[0]
model_path = "/".join(("../models", folder_name, "Unsupervised_model"))

data = pd.read_excel(file_path)
print(data.shape)

(1000, 2)


In [17]:
# Preview the first two rows of the loaded data.
data.head(2)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0


In [18]:
# First-time training: build a BuildModel without a pretrained model, bound to
# the loaded data and its text column.
model_obj = BuildModel(
    data=data,
    col_name=text_col,
    use_pretrained_model=False,
)

# # create object to load trained model
# model_obj_trained = BuildModel(use_pretrained_model=True, model_path=model_path )
# with open("../config.yml") as ymlfile:
#     cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

### Cleaning Data

In [19]:
# First-time training: run the cleaning step with the switches below.
model_obj.clean_data(
    delete_null_flag=True,
    ignore_short_flag=True,
    split_flag=False,
    remove_special_chart=False,
)

Data Cleaning for: 1000 rows
Data Cleaning - remove missing text: 1000 rows
Data Cleaning - remove white spaces: 1000 rows
Data Cleaning - ignore short text(less than 4 charchters): 1000 rows
Data Cleaning - Arabic text: 0
Data Cleaning - English text: 1000


### Train Topic model or Load trained model

In [20]:
# First-time training: fit the topic model and keep everything it returns.
(
    topic_model,
    topics,
    probs,
    embeddings,
    text_lst,
    data_lang,
) = model_obj.train_model()

# Save the trained model under model_path, using the dataset name as the
# config section.
model_obj.save_model(
    model_path=model_path,
    config_file_path='../config.yml',
    config_sec=folder_name,
)



In [21]:
# # load trained model
# topic_model, topics, probs, embeddings, data_lang = model_obj_trained.get_trained_model()

#### Get top 10 topics:

In [29]:
# Any BERTopic API can be used on `topic_model` from here on:
# https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html
topic_info = topic_model.get_topic_info()

# Keep the first 10 real topics. Filtering on the topic id (instead of blindly
# skipping row 0) stays correct even when the model produced no -1 outlier
# topic, and .copy() yields an independent frame so that adding columns to
# topics_df later does not trigger pandas' SettingWithCopyWarning.
topics_df = topic_info[topic_info["Topic"] != -1].head(10).copy()
topic_info.head(10)

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,310,-1_the_and_is_to,"Topic -1 ---> the, and, is, to","[the, and, is, to, of, was, it, had, with, my]","[The food is good., Of all the dishes, the sal..."
1,0,62,0_service_server_our_slow,"Topic 0 ---> service, server, our, slow","[service, server, our, slow, was, very, and, t...","[The service was extremely slow., I want to fi..."
2,1,60,1_back_be_will_go,"Topic 1 ---> back, be, will, go","[back, be, will, go, won, again, here, never, ...","[Will be back again!, Will not be back., Will ..."
3,2,58,2_place_this_recommend_to,"Topic 2 ---> place, this, recommend, to","[place, this, recommend, to, would, in, it, to...","[I would not recommend this place., This place..."
4,3,45,3_minutes_waited_we_took,"Topic 3 ---> minutes, waited, we, took","[minutes, waited, we, took, for, to, wait, wai...",[We waited for thirty minutes to be seated (al...
5,4,44,4_food_good_great_service,"Topic 4 ---> food, good, great, service","[food, good, great, service, and, prices, atmo...","[Food was good, service was good, Prices were ..."
6,5,43,5_disappointed_was_ambiance_so,"Topic 5 ---> disappointed, was, ambiance, so","[disappointed, was, ambiance, so, off, good, i...",[I guess maybe we went on an off night but it ...
7,6,38,6_flavor_and_was_bland,"Topic 6 ---> flavor, and, was, bland","[flavor, and, was, bland, tasteless, spicy, th...","[The descriptions said ""yum yum sauce"" and ano..."
8,7,32,7_food_was_terrible_gross,"Topic 7 ---> food, was, terrible, gross","[food, was, terrible, gross, wasn, the, dry, s...","[On the ground, right next to our table was a ..."
9,8,28,8_were_potato_fries_chips,"Topic 8 ---> were, potato, fries, chips","[were, potato, fries, chips, butter, also, tha...",[The sweet potato tots were good but the onion...


In [23]:
def create_topic_prompt(topic_id, prompt, model=None):
    """Fill a prompt template with one topic's keywords and representative docs.

    Args:
        topic_id: Id of the topic to describe.
        prompt: Template text containing the literal placeholders
            ``[KEYWORDS]`` and ``[DOCUMENTS]``.
        model: Object exposing ``get_topic(topic_id)`` (an iterable of
            ``(word, score)`` pairs) and a ``representative_docs_`` mapping.
            Defaults to the notebook-level ``topic_model``.

    Returns:
        The template with ``[KEYWORDS]`` replaced by the comma-separated
        keywords and ``[DOCUMENTS]`` replaced by one ``- <doc>`` line per
        representative document.
    """
    if model is None:
        # Backward-compatible default: fall back to the notebook global.
        model = topic_model

    # An empty keyword list yields an empty string rather than an IndexError.
    keywords = ", ".join(word for word, _ in model.get_topic(topic_id))
    prompt = prompt.replace("[KEYWORDS]", keywords)

    documents = "".join(f"- {doc}\n" for doc in model.representative_docs_[topic_id])
    return prompt.replace("[DOCUMENTS]", documents)

# Chat-formatted labeling template (system / user / assistant tags).
# create_topic_prompt() substitutes the [DOCUMENTS] and [KEYWORDS] placeholders.
prompt = (
    "<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>\n"
    "<|user|>\n"
    "I have a topic that contains the following documents:\n"
    "[DOCUMENTS]\n"
    "\n"
    "The topic is described by the following keywords: '[KEYWORDS]'.\n"
    "\n"
    "Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>\n"
    "<|assistant|>"
)

def generate_text(
    prompt="",
    max_tokens=256,
    temperature=0.1,
    top_p=0.5,
    echo=False,
    stop=["#"],
    model=None,
):
    """Run one completion and return the stripped text of its first choice.

    Args:
        prompt: Full prompt text sent to the model.
        max_tokens: Forwarded to the model as ``max_tokens``.
        temperature: Forwarded to the model as ``temperature``.
        top_p: Forwarded to the model as ``top_p``.
        echo: Forwarded to the model as ``echo``.
        stop: Forwarded to the model as ``stop``. The default list is shared
            between calls, so a copy is passed on rather than the list itself.
        model: Callable accepting ``(prompt, **sampling_kwargs)`` and returning
            a dict shaped like ``{"choices": [{"text": ...}]}``. Defaults to
            the notebook-level ``llm``.

    Returns:
        ``output["choices"][0]["text"]`` with surrounding whitespace removed.
    """
    if model is None:
        # Backward-compatible default: resolved at call time, so `llm` may be
        # created after this function is defined.
        model = llm

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        # Hand over a copy so the callee can never mutate the shared default.
        stop=list(stop) if isinstance(stop, list) else stop,
    )
    return output["choices"][0]["text"].strip()
# Load the local Zephyr 7B (Q4_0 GGUF) weights through llama-cpp.
llm = Llama(
    model_path="../../models-open-source/zephyr-7b-beta.Q4_0.gguf",
    n_ctx=512,
    n_batch=126,
)



llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../../models-open-source/zephyr-7b-beta.Q4_0.gguf (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     [  4096,  4096,  

In [24]:
# Label exactly the topics held in topics_df (instead of a hard-coded 0..9), so
# llm_topics always lines up row-for-row with that frame and no prompt is built
# for a topic id that does not exist.
llm_topics = []
for topic_id in topics_df["Topic"].tolist():  # .tolist() yields plain Python ints
    print(topic_id)  # progress marker; each generation takes tens of seconds
    topic_prompt = create_topic_prompt(topic_id=topic_id, prompt=prompt)
    llm_topics.append(generate_text(topic_prompt, max_tokens=356))

0
1



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =     7.03 ms /     9 runs   (    0.78 ms per token,  1280.41 tokens per second)
llama_print_timings: prompt eval time = 30221.22 ms /   153 tokens (  197.52 ms per token,     5.06 tokens per second)
llama_print_timings:        eval time =  1517.67 ms /     8 runs   (  189.71 ms per token,     5.27 tokens per second)
llama_print_timings:       total time = 31776.90 ms
Llama.generate: prefix-match hit


2



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    15.43 ms /    20 runs   (    0.77 ms per token,  1296.09 tokens per second)
llama_print_timings: prompt eval time = 14451.48 ms /    93 tokens (  155.39 ms per token,     6.44 tokens per second)
llama_print_timings:        eval time =  3510.53 ms /    19 runs   (  184.76 ms per token,     5.41 tokens per second)
llama_print_timings:       total time = 18034.38 ms
Llama.generate: prefix-match hit


3



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =     5.87 ms /     8 runs   (    0.73 ms per token,  1362.40 tokens per second)
llama_print_timings: prompt eval time = 14929.68 ms /    98 tokens (  152.34 ms per token,     6.56 tokens per second)
llama_print_timings:        eval time =  1284.44 ms /     7 runs   (  183.49 ms per token,     5.45 tokens per second)
llama_print_timings:       total time = 16241.27 ms
Llama.generate: prefix-match hit


4



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    26.77 ms /    35 runs   (    0.76 ms per token,  1307.53 tokens per second)
llama_print_timings: prompt eval time = 29356.71 ms /   173 tokens (  169.69 ms per token,     5.89 tokens per second)
llama_print_timings:        eval time =  8518.43 ms /    34 runs   (  250.54 ms per token,     3.99 tokens per second)
llama_print_timings:       total time = 38009.81 ms
Llama.generate: prefix-match hit


5



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    13.32 ms /    18 runs   (    0.74 ms per token,  1350.95 tokens per second)
llama_print_timings: prompt eval time = 15238.12 ms /   104 tokens (  146.52 ms per token,     6.82 tokens per second)
llama_print_timings:        eval time =  3246.19 ms /    17 runs   (  190.95 ms per token,     5.24 tokens per second)
llama_print_timings:       total time = 18547.53 ms
Llama.generate: prefix-match hit


6



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =     7.46 ms /    10 runs   (    0.75 ms per token,  1340.30 tokens per second)
llama_print_timings: prompt eval time = 15152.71 ms /   109 tokens (  139.02 ms per token,     7.19 tokens per second)
llama_print_timings:        eval time =  1718.08 ms /     9 runs   (  190.90 ms per token,     5.24 tokens per second)
llama_print_timings:       total time = 16905.09 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    56.22 ms /    76 runs   (    0.74 ms per token,  1351.88 tokens per second)
llama_print_timings: prompt eval time = 27631.01 ms /   171 tokens (  161.58 ms per token,     6.19 tokens per second)
llama_print_timings:        eval time = 13946.67 ms /    75 runs   (  185.96 ms per token,     5.38 tokens per second)
llama_print_timings:       total time = 41851.46 ms
Llama.generate: prefix-match hit


7
8



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    13.21 ms /    18 runs   (    0.73 ms per token,  1362.09 tokens per second)
llama_print_timings: prompt eval time = 16099.52 ms /   126 tokens (  127.77 ms per token,     7.83 tokens per second)
llama_print_timings:        eval time =  3269.10 ms /    18 runs   (  181.62 ms per token,     5.51 tokens per second)
llama_print_timings:       total time = 19434.39 ms
Llama.generate: prefix-match hit


9



llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    31.98 ms /    40 runs   (    0.80 ms per token,  1250.66 tokens per second)
llama_print_timings: prompt eval time = 18897.51 ms /   144 tokens (  131.23 ms per token,     7.62 tokens per second)
llama_print_timings:        eval time =  9824.61 ms /    39 runs   (  251.91 ms per token,     3.97 tokens per second)
llama_print_timings:       total time = 28889.14 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time = 26327.25 ms
llama_print_timings:      sample time =    14.21 ms /    19 runs   (    0.75 ms per token,  1336.90 tokens per second)
llama_print_timings: prompt eval time = 18516.42 ms /   138 tokens (  134.18 ms per token,     7.45 tokens per second)
llama_print_timings:        eval time =  3547.45 ms /    18 runs   (  197.08 ms per token,     5.07 tokens per second)
llama_print_timings:       total time = 22133.40 ms


In [25]:
# Show the labels generated by the LLM, in the order they were appended.
llm_topics

['Service variability: slow vs excellent',
 'Back (some/not all): Will, go, again, never, coming, not',
 'Place feedback: positive vs negative',
 'Long wait times: waited 20min to be acknowledged, 40min to place order, food took 35min with frequent forgetfulness at restaurant',
 'Food & Service Excellence: Good/Great Atmosphere & Prices',
 'Disappointing Ambiance: Off Performance',
 'Unflavored rolls with mislabeled sauces, overly sweet and bland food, tasteless burger with no charcoal flavor or spice. Keywords: flavorless, bland, tasteless, unflavored, mislabeled, overcooked, lacks flavor, sweetness dominant. Label: Dissatisfying Food Experience.',
 'Food quality: terrible, gross, dry, sucked, fried, but',
 'Sweet potato & potato sides: good totes, excellent rings, well-seasoned fries, homemade chips with bacon bits, served with hot bread and butter.',
 'Vegas buffet/restaurant: Highly recommended for food and service']

In [34]:
# Attach one LLM label per topic row. assign() returns a new frame rather than
# writing into topics_df in place, which avoids SettingWithCopyWarning when
# topics_df is a slice of another frame and keeps this cell safe to re-run.
topics_df = topics_df.assign(LLM_topic=llm_topics)
topics_df

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs,LLM_topic
1,0,62,0_service_server_our_slow,"Topic 0 ---> service, server, our, slow","[service, server, our, slow, was, very, and, t...","[The service was extremely slow., I want to fi...",Service variability: slow vs excellent
2,1,60,1_back_be_will_go,"Topic 1 ---> back, be, will, go","[back, be, will, go, won, again, here, never, ...","[Will be back again!, Will not be back., Will ...","Back (some/not all): Will, go, again, never, c..."
3,2,58,2_place_this_recommend_to,"Topic 2 ---> place, this, recommend, to","[place, this, recommend, to, would, in, it, to...","[I would not recommend this place., This place...",Place feedback: positive vs negative
4,3,45,3_minutes_waited_we_took,"Topic 3 ---> minutes, waited, we, took","[minutes, waited, we, took, for, to, wait, wai...",[We waited for thirty minutes to be seated (al...,Long wait times: waited 20min to be acknowledg...
5,4,44,4_food_good_great_service,"Topic 4 ---> food, good, great, service","[food, good, great, service, and, prices, atmo...","[Food was good, service was good, Prices were ...",Food & Service Excellence: Good/Great Atmosphe...
6,5,43,5_disappointed_was_ambiance_so,"Topic 5 ---> disappointed, was, ambiance, so","[disappointed, was, ambiance, so, off, good, i...",[I guess maybe we went on an off night but it ...,Disappointing Ambiance: Off Performance
7,6,38,6_flavor_and_was_bland,"Topic 6 ---> flavor, and, was, bland","[flavor, and, was, bland, tasteless, spicy, th...","[The descriptions said ""yum yum sauce"" and ano...","Unflavored rolls with mislabeled sauces, overl..."
8,7,32,7_food_was_terrible_gross,"Topic 7 ---> food, was, terrible, gross","[food, was, terrible, gross, wasn, the, dry, s...","[On the ground, right next to our table was a ...","Food quality: terrible, gross, dry, sucked, fr..."
9,8,28,8_were_potato_fries_chips,"Topic 8 ---> were, potato, fries, chips","[were, potato, fries, chips, butter, also, tha...",[The sweet potato tots were good but the onion...,"Sweet potato & potato sides: good totes, excel..."
10,9,28,9_vegas_buffet_in_breakfast,"Topic 9 ---> vegas, buffet, in, breakfast","[vegas, buffet, in, breakfast, for, to, is, se...",[This is one of the best bars with food in Veg...,Vegas buffet/restaurant: Highly recommended fo...


#### Some visualization

In [28]:
# Build the document-visualization figure from text_lst and the embeddings
# returned by training; uncomment the last line to render it.
fig = topic_model.visualize_documents(
    text_lst,
    embeddings=embeddings,
)
# fig
