# Part 2. Topic analysis of positive and negative comments using LDA models

In [1]:
import os
import csv
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis.gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

## 1. Calculation of the optimal number of topics

### Function Design : Use Perplexity and Coherence 

In [2]:
def get_optimal_topic_num_by_perplexity(file_path, num_topics_range, random_state=None):
    '''
    Pick the number of LDA topics whose model reaches the highest c_v coherence.

    NOTE: despite the function name and the printed label, the selection
    criterion is the c_v coherence score. Log-perplexity is not computed
    because it never takes part in the selection.

    Parameters:
    file_path: str, the file path of the preprocessed text file
        (one document per line, tokens separated by whitespace).
    num_topics_range: iterable of int, the candidate topic numbers to try.
    random_state: int or numpy.random.RandomState, optional. Forwarded to
        LdaModel so that runs can be made reproducible. The default (None)
        keeps gensim's unseeded behaviour.

    Returns:
    best_num_topics: int, the candidate with the highest coherence score
        (the first such candidate if several tie).

    Raises:
    ValueError: if num_topics_range contains no candidates.
    '''
    # Validate before any file access or training so a bad call fails fast.
    candidates = list(num_topics_range)
    if not candidates:
        raise ValueError("num_topics_range must contain at least one candidate")

    # Read the file: every line is one tokenised document.
    with open(file_path, 'r', encoding='utf-8') as f:
        texts = [line.strip().split() for line in f]

    # Build the dictionary and the bag-of-words corpus once; both are
    # identical for every candidate, so they do not belong inside the loop.
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # Train one model per candidate and score it immediately, so that only
    # one trained model has to be kept in memory at a time.
    coherence_scores = []
    for num_topics in candidates:
        lda_model = LdaModel(corpus=bow_corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             random_state=random_state)
        coherence_model = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')
        coherence_scores.append(coherence_model.get_coherence())

    # The best candidate is the one with the highest coherence score.
    best_num_topics_index = coherence_scores.index(max(coherence_scores))
    best_num_topics = candidates[best_num_topics_index]

    # The label is kept as-is so the notebook output stays comparable.
    print("Optimal number of topics (using perplexity method):", best_num_topics)

    return best_num_topics


### Function Design : Use Cosine_Similarity

In [3]:
def get_optimal_topic_num_by_similarity(file_path, num_topics_range, random_state=None):
    '''
    Pick the number of LDA topics from the cosine similarity between topics.

    For every candidate an LDA model is trained, the pairwise cosine
    similarity of its topic-word vectors is computed, and the smallest
    pairwise similarity is recorded. The candidate with the largest recorded
    value is returned.

    NOTE(review): taking the minimum over all topic pairs and then maximising
    it selected the smallest candidate in every recorded run of this
    notebook. The criterion is kept unchanged here; confirm it is the
    intended one.

    Parameters:
    file_path: str, the file path of the preprocessed text file
        (one document per line, tokens separated by whitespace).
    num_topics_range: iterable of int, the candidate topic numbers to try.
        Every candidate must be at least 2, because a pairwise similarity
        needs two topics.
    random_state: int or numpy.random.RandomState, optional. Forwarded to
        LdaModel so that runs can be made reproducible. The default (None)
        keeps gensim's unseeded behaviour.

    Returns:
    best_num_topics: int, the selected candidate (the first one on ties).

    Raises:
    ValueError: if num_topics_range is empty or contains a value below 2.
    '''
    # Validate before any file access or training so a bad call fails fast
    # instead of surfacing later as an opaque NumPy / max() error.
    candidates = list(num_topics_range)
    if not candidates:
        raise ValueError("num_topics_range must contain at least one candidate")
    if any(candidate < 2 for candidate in candidates):
        raise ValueError("every candidate in num_topics_range must be >= 2")

    # Read the file: every line is one tokenised document.
    with open(file_path, 'r', encoding='utf-8') as f:
        texts = [line.strip().split() for line in f]

    # Build the dictionary and the bag-of-words corpus once; both are
    # identical for every candidate.
    dictionary = gensim.corpora.Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    similarity_values = []
    for num_topics in candidates:
        lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                                    id2word=dictionary,
                                                    num_topics=num_topics,
                                                    random_state=random_state)
        # One row per topic, one column per dictionary term.
        topics_matrix = lda_model.get_topics()
        cosine_similarities = cosine_similarity(topics_matrix)
        # The strict upper triangle holds every unordered topic pair once
        # and excludes the diagonal (self-similarity).
        pair_indices = np.triu_indices(num_topics, k=1)
        similarity_values.append(np.min(cosine_similarities[pair_indices]))

    # Select the candidate with the largest recorded similarity value.
    best_num_topics_index = similarity_values.index(max(similarity_values))
    best_num_topics = candidates[best_num_topics_index]

    print("Optimal number of topics (using cosine similarity method):", best_num_topics)

    return best_num_topics


### Call the function to output the respective optimal number of topics

In [4]:
# Candidate topic counts to evaluate: every integer from 3 through 19.
num_topics_range = [*range(3, 20)]

#### For store 1

In [5]:
# Input: the polarity "P" review file of store 1.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_P.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
P1_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", P1_best_num_topics)

Optimal number of topics (using perplexity method): 18
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 10


In [6]:
# Input: the polarity "N" review file of store 1.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_N.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
N1_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", N1_best_num_topics)

Optimal number of topics (using perplexity method): 12
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 7


#### For store 2

In [7]:
# Input: the polarity "P" review file of store 2.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_P.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
P2_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", P2_best_num_topics)

Optimal number of topics (using perplexity method): 16
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 9


In [8]:
# Input: the polarity "N" review file of store 2.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_N.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
N2_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", N2_best_num_topics)

Optimal number of topics (using perplexity method): 14
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 8


#### For store 3

In [9]:
# Input: the polarity "P" review file of store 3.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_P.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
P3_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", P3_best_num_topics)

Optimal number of topics (using perplexity method): 3
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


In [10]:
# Input: the polarity "N" review file of store 3.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_N.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
N3_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", N3_best_num_topics)

Optimal number of topics (using perplexity method): 17
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 10


#### For store 4

In [11]:
# Input: the polarity "P" review file of store 4.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_P.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
P4_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", P4_best_num_topics)

Optimal number of topics (using perplexity method): 4
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


In [12]:
# Input: the polarity "N" review file of store 4.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_N.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
N4_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", N4_best_num_topics)

Optimal number of topics (using perplexity method): 19
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 11


## 2. Get Topic-Word Distribution

### Function design : LDA model training, topic-word distribution output and visualization

In [13]:
def lda_topic_modeling(file_path, num_topics):
    '''
    Train an LDA model on a tokenised text file, print its topics and save
    a pyLDAvis visualization of the model.

    Parameters:
    file_path: str, the file path of the preprocessed text file
        (one document per line, tokens separated by whitespace).
    num_topics: int, the number of topics to train.

    Returns:
    None. The dictionary (before and after filtering) and the top 20 words
    of every topic are printed, and the visualization is written to
    "pyLDAvis/lda_visualization_<input file name without extension>.html",
    relative to the current working directory.
    '''
    # Read the text file: every line is one tokenised document.
    with open(file_path, "r", encoding="utf-8") as f:
        texts = [line.strip().split() for line in f]

    # Build the dictionary; it is printed before and after filtering so the
    # effect of the filter on the vocabulary size is visible.
    dictionary = Dictionary(texts)
    print(dictionary)

    # Drop tokens that occur in fewer than 5 documents or in more than 10%
    # of all documents.
    dictionary.filter_extremes(no_below=5, no_above=0.1)
    print(dictionary)

    # Convert texts into bag-of-words format (filtered tokens are ignored).
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    # Train the LDA model; the fixed random_state makes the run repeatable.
    lda_model = LdaModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    # Print the top keywords for each topic.
    for topic_id in range(num_topics):
        topic_words = lda_model.show_topic(topic_id, topn=20)
        print("Topic {}: {}".format(topic_id, ", ".join([word for word, prob in topic_words])))

    # Compute visualization data for the LDA model.
    vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

    # Name the output after the input file (base name without extension).
    filename = os.path.splitext(os.path.basename(file_path))[0]
    filename = "pyLDAvis/lda_visualization_{}.html".format(filename)

    # Make sure the output directory exists; otherwise saving fails on a
    # fresh checkout after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Save the visualization as an HTML file.
    pyLDAvis.save_html(vis_data, filename)
    print("Visualization saved as:", filename)

### Call the function to complete the LDA topic analysis

#### For stores 1

#### Positive review

In [14]:
# Store 1, polarity "P" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_P.txt"
lda_topic_modeling(file_path=file_path, num_topics=P1_best_num_topics)

Dictionary<15034 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Dictionary<3069 unique tokens: ['一圈', '一家', '云集', '交通', '人气']...>
Topic 0: 一家, 新, 东安市场, 期间, 想, 装修, 老, 新东安, 打折, 门口, 电影院, 进门, 少, 卖, 灯光, 价格, 有点, 有些, 玩具, 牌子
Topic 1: 正好, 地下, 那么, 两层, 升级, 五六层, 哈哈哈, 商城, 店面, 中心, 风格, 停车场, 首饰, 菜, 鞋子, 互动, 口罩, 改造, 装饰, 好久
Topic 2: 超级, 发现, 说, 拍照, 太, 好看, 完, 来到, 点, 开, 电影, 外面, 出门, 好多, 逛商场, 家, 开门, 领, 高大, 心动
Topic 3: 朋友, 好去处, 爱, 全, 适合, 外婆家, 泡泡玛特, 展览, 可爱, 赶上, 主题, 宝宝, 小样, 拍, 游戏, 算, 海绵, 过年, 中庭, 约
Topic 4: 一些, 做, 东西, 停车, 干净, 没, 希望, 好像, 消毒, 走, 找, 前, 电梯, 旁边, 营业, 过, 楼下, 扫码, 开业, 第一次
Topic 5: ️, 娱乐, 布局, 会员, 休闲, 集, 游客, 一体, 宽敞, 必, 导航, 年轻化, 入驻, ⃣, 力度, 商业区, 洗手间, 地处, 溜, 一体化
Topic 6: 店铺, 餐饮, 苹果, 步行街, 美食, 楼上, 衣服, 餐厅, 很大, 大牌, 不少, 排队, 不是, 体验, 一楼, 好吃, 不过, 服务, 每次, 齐全
Topic 7: 王府井大街, 时尚, 位置, 年轻人, 整体, 位于, 年轻, 吸引, 更, 消费, 可, 设计, 地铁, 走, 购物中心, 豆, 晚上, pm, 时间, 热闹
Topic 8: 高端, 新东安市场, 大气, 各类, 商品, 东来顺, 遛弯儿, 吃完饭, 上档次, 明亮, 休息, 全新, 几层, 优雅, 一线, 情况, 鞋, OK, 印象, 氛围
Topic 9: 玩, 中, 没, 带, 孩子, 折扣, 点评, 开心, 楼层, 娃娃, 免费, 顾客, 工作人员, 空间, 盲盒, 券, 机, 商铺, 

#### Negative review

In [15]:
# Store 1, polarity "N" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_N.txt"
lda_topic_modeling(file_path=file_path, num_topics=N1_best_num_topics)

Dictionary<5557 unique tokens: ['休息', '地方', '坐下', '座位', '想']...>
Dictionary<686 unique tokens: ['休息', '想', '顾客', '不像', '不错']...>
Topic 0: 打折, 折扣, 划算, 年, 折, 一楼, 有个, 对面, 听说, 穿, 二层, 关门, 喜欢, 商户, 小样, 样机, 原因, 肉, 那种, 盲盒
Topic 1: 排队, 周末, 地下, 好多, 新东安, 里, 苹果, 走, 停车, 逛街, 停车场, 体验, 找, 朋友, 价格, 过, 开门, 那么, 发现, 位于
Topic 2: APM, 不错, 有点, 特别, 开, 溜达, 不少, 衣服, 每次, 服务, 希望, 中, 顾客, 一圈, 不过, 搞, 逛逛, 适合, 没什么, 王府井大街
Topic 3: 打卡, ️, 点评, 展览, 展, 拍照, 中庭, 月, 皮克斯, 电影院, 不会, 熊, 好看, 二楼, 区域, 失望, B, 快闪店, 草莓, 六层
Topic 4: 力度, 带, 里, 豆, 小朋友, 特意, 领, 朋友, 可爱, 晚上, 可, 百货大楼, 回忆, 不如, 一点, 老牌, 形象, 产品, 服务台, 想
Topic 5: 东安市场, 装修, 餐厅, 好像, 闭店, 楼上, 衣服, 美食, 一些, 牌子, 有些, 很大, 不过, 撤店, 兰蔻, 很少, 人气, 开业, 吃喝玩乐, 老
Topic 6: 店铺, 非常, 一些, 步行街, 层, 餐饮, 营业, 影响, 化妆品, 做, 改造, 店员, 工作人员, 设计, 完全, 算是, 设施, 好久没, 商品, 一楼
Visualization saved as: pyLDAvis/lda_visualization_reviews_1_polarity_N.html


#### For stores 2

#### Positive review

In [16]:
# Store 2, polarity "P" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_P.txt"
lda_topic_modeling(file_path=file_path, num_topics=P2_best_num_topics)

Dictionary<13572 unique tokens: ['cafelandmark', '一共', '一块', '一杯', '万朵']...>
Dictionary<2483 unique tokens: ['一共', '一块', '一杯', '万朵', '上班']...>
Topic 0: 怪兽, 可爱, 可可, 孩子, 活动, 巧克力, 主题, 展, 超级, 圣诞, ️, 做, 装饰, 好多, 新年, 奇趣, 气氛, 元旦, 发现, 互动
Topic 1: 疫情, 云集, 一家, 朋友, 好吃, 不少, 推荐, 走, 体验, 排队, 更, 过, 高档, 这家, 兑换, 溜达, 节日, 期间, 位于, 价格
Topic 2: 拍照, 展览, 停车场, 有趣, 好看, 拍, 艺术, 值得, 进门, 故宫, 定位, 人气, 高级, 西座, 不用, 装置, 场景, 地下, 照片, 唯一
Topic 3: 适合, 一些, 太, 小朋友, 高大, 吸引, 设计, 逛逛, 王府中環, 步行街, 奢侈品, 选择, 美食, 高, 少, 布置, 逛街, 王府井大街, 时尚, 网红
Topic 4: 小时, 乐坊, 文化, 芝, 一线, 创意, 客人, 半个, 众多, 顾客, SKP, 分钟, 每家, 潮流, 总体, 完全, 周边, 面积, 逛一逛, 十足
Topic 5: 挺, 草坪, 蛋糕, 制作, 可, 广场, 没, 点评, 元, 几个, 商城, 室外, 完, 好玩, 工作人员, 周末, 搞, 只能, 一杯, 甜品
Topic 6: 中, 王府, 怪物, 玩偶, 寰, 点, 卡通, 闺蜜, 没什么, 美味, 造型, 言, 合影, 漂亮, 半小时, 奢华, 童趣, 享受, 驻足, 场所
Topic 7: 工厂, 带, 吃饭, 服务, 干净, 积分, 卫生间, 娃, 没, 设施, 东西, 很大, 停车, 不过, 楼上, 人少, 算是, 店铺, 洗手间, 每次
Topic 8: 说, 中环, 爱, 想, 开, 家, 绝对, 圣诞节, 开业, 顺便, 款式, 品质, skp, 找, 豪华, 前, 款, 游客, 面包, 正
Visualization saved as: pyLDAvis/lda_visualization_reviews_2_polarity_P.html


#### Negative review

In [17]:
# Store 2, polarity "N" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_N.txt"
lda_topic_modeling(file_path=file_path, num_topics=N2_best_num_topics)

Dictionary<3895 unique tokens: ['B', 'DROPOFF', '不是', '东', '位置']...>
Dictionary<363 unique tokens: ['B', '不是', '位置', '停车场', '免费']...>
Topic 0: 牌子, 两个, 香港, 店, 逛街, 不好, 进, 只能, 积分, 服务, 货, 不会, 有个, 过, 不能, 奢侈品, 总体, 建议, 卖, 下次
Topic 1: 适合, 小时, 蛋糕, 二层, 早, 元, 咖啡节, 好多, 这样, 去年, 现场, 乐坊, 芝, 室外, 拍, 第一次, 不少, 点, 水, 分钟
Topic 2: 吃饭, 王府中環, 门口, 装修, 不少, 停车费, 位置, 带, ️, 奢侈, 位于, 购物中心, 大部分, 圣诞, 风格, 车, 步行街, 开门, 顾客, 娃
Topic 3: 工厂, 巧克力, 孩子, 怪兽, 展览, 怪物, 可可, 创意, 朋友, 小朋友, 走, 可惜, 有些, 步行街, 场景, 进门, skp, 还好, 好看, 发现
Topic 4: 特别, 冰场, 娃, 孩子, 喜欢, 卫生间, 带, 设施, 值得, 不过, 做, 购物, 外面, 滑雪, 够, 逛一逛, 打卡, 确实, 路过, 档次
Topic 5: 主题, 展, 一些, 每次, 赶上, 里, 全, 云集, 甜品, 冬天, 硬件, 照片, 相当, 感受, 几张, 应有尽有, 滑冰场, 逛逛, 低, 算是
Topic 6: 滑冰, 圣诞节, 没什么, 吸引, 不过, 店, 世界, 转, 爱, 真心, 想, 商业, 提前, 喝, 四层, 贵, 工作日, 奢华, 好喝, 停车
Topic 7: 停车场, 中环, 非常, 里, 布置, 期间, 高端, 做, 票, 网红, 一些, 有点, 饰品, 蛮, 点评, 风车, 拍照, 打卡, 逛逛, 店铺
Visualization saved as: pyLDAvis/lda_visualization_reviews_2_polarity_N.html


#### For stores 3

#### Positive review

In [18]:
# Store 3, polarity "P" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_P.txt"
lda_topic_modeling(file_path=file_path, num_topics=P3_best_num_topics)

Dictionary<8982 unique tokens: ['一楼', '五花八门', '价格', '全国劳模', '北京']...>
Dictionary<1432 unique tokens: ['一楼', '价格', '全国劳模', '品类', '商品']...>
Topic 0: ️, 专柜, 化妆品, 喷泉, 推荐, 优惠, 没, 疫情, 期间, 羊毛, 薅, 门口, 搞, 力度, 活动, 路过, 元, 节日, 吃饭, 雅诗兰黛
Topic 1: 记忆, 带, 吃, 胡同, 年代, 拍照, 过, 有个, 生活, 装修, 逛逛, 想, 开, 有意思, 回到, 感, 穿越, 小卖部, 卖, 适合
Topic 2: 购物, 环境, 店, 张秉贵, 年, 位于, 老牌, 百货, 第一, 全国, 最, 大楼, 王府井百货, 大型, 游客, 建筑, 更, 大牌, 商店, 发展
Visualization saved as: pyLDAvis/lda_visualization_reviews_3_polarity_P.html


#### Negative review

In [19]:
# Store 3, polarity "N" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_N.txt"
lda_topic_modeling(file_path=file_path, num_topics=N3_best_num_topics)

Dictionary<3461 unique tokens: ['一层', '之选', '优秀', '吃', '商场']...>
Dictionary<312 unique tokens: ['工作人员', '建议', '时代', '选择', '那种']...>
Topic 0: 步行街, 年, 问, 网红, 下次, 历史, 化妆品, 很大, 变化, 果局, 孩子, 购物中心, 旋转, 木马, 做, 游客, 喷泉, 人气, 打卡, 喜欢
Topic 1: 儿时, 拍照, 记忆, 张秉贵, 过年, 那么, 消费, 过, 北京市, 品牌, 店, 一楼, 装修, 旁边, 几十年, 化妆品, 活动, 位于, 糖果, 前
Topic 2: 火车, 排队, 卖, 满满的, 周末, 王府井大街, 小吃, 玩, 特别, 专柜, 转转, 建议, 王府井百货, 层, 平时, 负, 不好, 机器, 回到, 人多
Topic 3: 不少, 老式, 物件, 门口, 玩, 活动, 顾客, 商店, 印象, 一种, 最, 差, 哈姆雷, 柜台, 超级, 绿皮, 不够, 商城, 美好, 特别
Topic 4: 小样, 样机, 领, 停车, 兰蔻, 坐, 步行街, 爆米花, 雅诗兰黛, 路过, 薅, 羊毛, 发现, 老字号, 过, 不过, 眼霜, 走, 估计, 小朋友
Topic 5: 体验, 环境, 玩, 楼, 孩子, 生活, 玩具, 发现, 特别, 做, 卖, 北楼, 少, 小时, 小吃, 过, 进, 偶然, 走, 原因
Topic 6: 疫情, 每次, 算是, 不过, 满满的, 穿越, 确实, 希望, 大楼, 值得, 北京市, 这样, 大牌, 商户, 玩具, 地标, 建筑, 位置, 逛街, 期间
Topic 7: 一些, apm, 味道, 衣服, 朋友, 好多, 排队, 有些, 排, 录像厅, 对面, 一楼, 游戏机, 够, 活动, 牌子, 久, 好吃, 卖, 推荐
Topic 8: 不错, 年代, 童年, 怀旧, 品牌, 一些, 喜欢, 转, 餐饮, 去逛, 朋友, 一圈, 菓, 书店, 没想到, 局, 进门, 记忆, 步行街, 周边
Topic 9: 太, 疫情, 局, 菓, 场景, 期间, 有点, 时间, 晚上, 王府井百货, 打卡, 样子, 拍, 想, 果局, 繁华, 好久没, 还原, 

#### For stores 4

#### Positive review

In [20]:
# Store 4, polarity "P" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_P.txt"
lda_topic_modeling(file_path=file_path, num_topics=P4_best_num_topics)

Dictionary<6685 unique tokens: ['一风堂', '不错', '停车', '吃', '商场']...>
Dictionary<964 unique tokens: ['地铁站', '小厨', '很大', '快乐', '想']...>
Topic 0: 位置, 走, 很大, 交通, 大牌, 高端, 服务, 里, 时尚, 选择, 写字楼, 地铁站, 美食, 步行街, 停车场, 整体, 站, 位于, 便利, 云集
Topic 1: 买, 没, 东西, 拍照, 逛逛, 适合, 真的, 好多, 想, 最, 朋友, 里, 发现, 时间, 逛街, 找, 衣服, 超级, 店铺, 有点
Topic 2: 新天地, 娱乐, 休闲, 年, 小时, 积分, 东方广场, 商业街, 商业, 免费, 会员, 面积, 游客, 主题, 不少, 体验, 中国, 消费, 场所, 人流量
Visualization saved as: pyLDAvis/lda_visualization_reviews_4_polarity_P.html


#### Negative review

In [21]:
# Store 4, polarity "N" reviews: fit the LDA model with the averaged topic count.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_N.txt"
lda_topic_modeling(file_path=file_path, num_topics=N4_best_num_topics)

Dictionary<2630 unique tokens: ['一站式', '不便', '东长安街', '享受', '人流']...>
Dictionary<172 unique tokens: ['价格', '停车', '吃喝玩乐', '白领', '美食']...>
Topic 0: 带, 东方广场, 商圈, 很久没, 少, 写字楼, 大牌, 旁边, 位于, 商业, 长安街, 交通, 很大, 估计, 前, 做, 顾客, 路口, 不少, 一些
Topic 1: 逛逛, 太, 电影, 排队, 顾客, 喜欢, 价格, 步行街, 吃喝玩乐, 这样, 中午, 选择, 开车, 地理位置, 特别, 美食, 只能, 白领, 整体, 中庭
Topic 2: 保安, 太, 消费, 问, 门, 体验, 买, 人员, 不会, 差, 定位, 客人, 点评, 好, 两个, 有人, 西门, 路过, 不过, 购物
Topic 3: 积分, 过, 服务台, 位置, 站, 点, 购物, 找, 出, 口, 厕所, 好多, 不能, 王府井大街, 号线, 大牌, 很少, 小票, 完, 客人
Topic 4: 停车, 商铺, 停车场, 一圈, 溜达, 新天地, 找, 期间, 小时, 玩, 会员, 有些, 不错, 环境, 车库, 坐地铁, 逛街, 买, 新, 开车
Topic 5: 不错, 一些, 时间, 赶上, 店, 逛一逛, 餐饮, 拍照, 蛮, 第一次, 好久没, 逛街, 美食, 最, 长安街, 特别, 楼上, 样子, 开, 人员
Topic 6: 超市, 朋友, 有个, 特别, 服务, 跑, 休息, 只能, 找, 好多, 更, 整体, 面积, 问, 人员, 不到, 顾客, 新天地, 设施, 商户
Topic 7: 找到, 工作人员, 顾客, 餐饮, 打卡, 新开, 好, 店, 王府井大街, 美食, 更, 购物, 希望, 服务, 点, 商家, 买, 步行街, 位于, 超
Topic 8: 有点, 开门, 早, 中午, 冷清, 这家, 布局, 倒, 一家, 开, 咖啡, 人气, 特别, 上班, 乱, 长安街, 关门, 不过, 消费, 过
Topic 9: 第一次, 春节, 最, 人少, 转, 区, 希望, 店, 费劲, 写字楼, 找到, 餐厅, 好多, 东方, 不过, 购物, 特别, 影响, 商户, 公

## 3. LDA thematic analysis of all review texts in the business circle

### Get the optimal number of topics (same as above)

In [22]:
# Input: the merged review file.
file_path = "../data/review/processed/reviews_merged.txt"

# Run both selection helpers on the same file and candidate range.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(
    file_path=file_path, num_topics_range=num_topics_range
)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(
    file_path=file_path, num_topics_range=num_topics_range
)

# Final topic count: the mean of the two suggestions, truncated to an integer.
best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity) / 2)
print("Ultimate optimal number of topics :", best_num_topics)

Optimal number of topics (using perplexity method): 11
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 7


### Improve the "lda_topic_modeling" function: adjust parameters and add new features

In [23]:
def lda_topic_modeling_with_assignment(file_path, num_topics, output_file):
    '''
    Train an LDA model on a tokenised text file, assign every line its most
    probable topic, and save both the assignments and a pyLDAvis page.

    Parameters:
    file_path: str, the file path of the preprocessed text file
        (one comment per line, tokens separated by whitespace).
    num_topics: int, the number of topics to train.
    output_file: str, path of the CSV file to write. It gets the header
        "review_splitting,topic" followed by one row per input line.

    Returns:
    None. The dictionary (before and after filtering) and the top 20 words
    of every topic are printed, the CSV is written to output_file, and the
    visualization is written to
    "pyLDAvis/lda_visualization_<input file name without extension>.html",
    relative to the current working directory.
    '''
    # Read the text file; the stripped lines are kept for the CSV output.
    with open(file_path, "r", encoding="utf-8") as f:
        comments = [line.strip() for line in f]

    # Tokenise every comment once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized_comments = [comment.split() for comment in comments]

    # Build the dictionary; it is printed before and after filtering so the
    # effect of the filter on the vocabulary size is visible.
    dictionary = Dictionary(tokenized_comments)
    print(dictionary)
    # Drop tokens that occur in fewer than 5 documents or in more than 10%
    # of all documents.
    dictionary.filter_extremes(no_below=5, no_above=0.1)
    print(dictionary)

    # Convert comments into bag-of-words format (filtered tokens are ignored).
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_comments]

    # Train the LDA model; the fixed random_state makes the run repeatable.
    lda_model = LdaModel(corpus=corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    # Assign each comment the topic with the highest probability.
    # minimum_probability=0.0 asks for every topic, so the list handed to
    # max() cannot be emptied by the model's default probability cut-off.
    topic_assignments = []
    for bow in corpus:
        doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
        topic_assignments.append(max(doc_topics, key=lambda x: x[1])[0])

    # Print the top keywords for each topic.
    for topic_id in range(num_topics):
        topic_words = lda_model.show_topic(topic_id, topn=20)
        print("Topic {}: {}".format(topic_id, ", ".join([word for word, prob in topic_words])))

    # Make sure the CSV's directory exists (a bare file name has none).
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the topic assignments to a CSV file.
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["review_splitting", "topic"])
        writer.writerows(zip(comments, topic_assignments))
    print("Topic assignments saved in:", output_file)

    # Compute visualization data for the LDA model.
    vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

    # Name the output after the input file (base name without extension).
    filename = os.path.splitext(os.path.basename(file_path))[0]
    filename = "pyLDAvis/lda_visualization_{}.html".format(filename)

    # Make sure the visualization directory exists before saving.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Save the visualization as an HTML file.
    pyLDAvis.save_html(vis_data, filename)
    print("Visualization saved as:", filename)

In [24]:
# Merged review file: fit the LDA model and export the per-review topic assignments.
file_path = "../data/review/processed/reviews_merged.txt"
output_file = "../data/review/analysis/topics/topic_assignments.csv"

lda_topic_modeling_with_assignment(
    file_path=file_path,
    num_topics=best_num_topics,
    output_file=output_file,
)

Dictionary<30619 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Dictionary<6900 unique tokens: ['一圈', '一家', '云集', '交通', '人气']...>
Topic 0: 疫情, 东单, 餐饮, 位于, 王府井大街, 积分, 消费, 顾客, 写字楼, 会员, 购物中心, 商业, 商业街, 中国, 北京市, 新, 区, 面积, 原因, 大型
Topic 1: 购物, 地铁, 店铺, 大牌, 走, 位置, 最, 步行街, 长安街, 服务, 很大, 逛逛, 朋友, 交通, 餐厅, 逛街, 美食, 开, 整体, 说
Topic 2: 打卡, 时间, 和平, 拍照, 小时候, 价格, 菓, 路过, 局, 主题, B, 电影院, 可爱, 太, 转, 果局, 展览, 展, 综合性, 区域
Topic 3: 老, 东方广场, 更, 二层, 点, 过, 繁华, 进, 奢侈品, 味道, 多年, 营业, 不用, 人流量, 电梯, 关门, 第一, 历史, 逛商场, 想
Topic 4: 回忆, 记忆, 变化, 样子, 快, 客人, 布置, 扭蛋, 美食街, 儿时, 装饰, 依旧, 柜台, 显得, 很漂亮, 中, 有人, 童年, 鼠年, 发展
Topic 5: 做, 点评, 说, 胡同, 年代, 进门, 拍, 工作人员, 开业, 态度, 好看, 保安, 时代, 扫码, 不好, 随便, 真, 入口, 下班, A
Topic 6: 东方新天地, 停车, 东西, 吃饭, 一些, 不是, 不过, 不少, 新天地, 好多, 适合, 高端, 免费, 周末, 带, 年, 选择, 小时, 衣服, 少
Topic assignments saved in: ../data/review/analysis/topics/topic_assignments.csv
Visualization saved as: pyLDAvis/lda_visualization_reviews_merged.html
