# Part 2. Topic analysis of positive and negative comments using LDA models

In [40]:
import os
import csv
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis.gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

## 1. Calculation of the optimal number of topics

### Function Design : Use Perplexity and Coherence 

In [2]:
def get_optimal_topic_num_by_perplexity(file_path, num_topics_range):
    '''
    Compute the optimal number of topics.

    Parameters:
    file_path: str, the file path of the preprocessed text file.
    num_topics_range: list, the range of topic numbers to try.

    Returns:
    best_num_topics: int, the optimal number of topics.
    '''

    # Read the file and create the corpus.
    with open(file_path, 'r', encoding='utf-8') as f:
        corpus = [line.strip().split() for line in f]

    # Create the dictionary.
    dictionary = Dictionary(corpus)

    # Train the LDA models with different topic numbers, and calculate the perplexity values.
    perplexity_values = []
    lda_models = []
    for num_topics in num_topics_range:
        lda_model = LdaModel(corpus=[dictionary.doc2bow(text) for text in corpus], id2word=dictionary, num_topics=num_topics)
        lda_models.append(lda_model)
        perplexity_values.append(lda_model.log_perplexity([dictionary.doc2bow(text) for text in corpus]))

    # Calculate the coherence scores using the perplexity values.
    coherence_scores = [CoherenceModel(model=lda_model, texts=corpus, dictionary=dictionary, coherence='c_v').get_coherence() for lda_model in lda_models]

    # Find the optimal number of topics based on the coherence scores.
    best_num_topics_index = coherence_scores.index(max(coherence_scores))
    best_num_topics = num_topics_range[best_num_topics_index]
    
    print("Optimal number of topics (using perplexity method):", best_num_topics)

    return best_num_topics


### Function Design : Use Cosine_Similarity

In [3]:
def get_optimal_topic_num_by_similarity(file_path, num_topics_range):
    '''
    Compute the optimal number of topics.

    Parameters:
    file_path: str, the file path of the preprocessed text file.
    num_topics_range: list, the range of topic numbers to try.

    Returns:
    best_num_topics: int, the optimal number of topics.
    '''
    # Read file and create corpus
    with open(file_path, 'r', encoding='utf-8') as f:
        corpus = [line.strip().split() for line in f]

    # Create dictionary
    dictionary = gensim.corpora.Dictionary(corpus)

    # Train LDA model and compute cosine similarity between topics for different number of topics
    similarity_values = []
    for num_topics in num_topics_range:
        lda_model = gensim.models.ldamodel.LdaModel(corpus=[dictionary.doc2bow(text) for text in corpus], id2word=dictionary, num_topics=num_topics)
        topics_matrix = lda_model.get_topics()
        cosine_similarities = cosine_similarity(topics_matrix)
        similarity_values.append(np.min(cosine_similarities[np.triu_indices(num_topics, k=1)]))

    # Find the optimal number of topics
    best_num_topics_index = similarity_values.index(max(similarity_values))
    best_num_topics = num_topics_range[best_num_topics_index]
    
    print("Optimal number of topics (using cosine similarity method):", best_num_topics)
    
    return best_num_topics


### Call the function to output the respective optimal number of topics

In [4]:
# Create a list of integers for the range of number of topics.
num_topics_range = list(range(3, 10))

#### For stores 1

In [5]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P1_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P1_best_num_topics)

Optimal number of topics (using perplexity method): 8
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 5


In [6]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N1_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N1_best_num_topics)

Optimal number of topics (using perplexity method): 4
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


#### For stores 2

In [7]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P2_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P2_best_num_topics)

Optimal number of topics (using perplexity method): 8
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 5


In [8]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N2_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N2_best_num_topics)

Optimal number of topics (using perplexity method): 6
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 4


#### For stores 3

In [9]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P3_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P3_best_num_topics)

Optimal number of topics (using perplexity method): 3
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


In [10]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N3_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N3_best_num_topics)

Optimal number of topics (using perplexity method): 9
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 6


#### For stores 4

In [11]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P4_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P4_best_num_topics)

Optimal number of topics (using perplexity method): 8
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 5


In [12]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N4_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N4_best_num_topics)

Optimal number of topics (using perplexity method): 6
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 4


## 2. Get Topic-Word Distribution

### Function design : LDA model training, topic-word distribution output and visualization

In [26]:
def lda_topic_modeling(file_path, num_topics):
    # Read the text file
    with open(file_path, "r", encoding="utf-8") as f:
        texts = [line.strip().split() for line in f]

    # Build the dictionary and bag-of-words model
    dictionary = Dictionary(texts)
    print(dictionary)
    
    # Filter out extreme terms based on frequency and document proportion
    dictionary.filter_extremes(no_below=5, no_above=0.1)
    print(dictionary)
    
    # Convert texts into bag-of-words format
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    # Train the LDA model
    lda_model = LdaModel(corpus=corpus, 
                         num_topics=num_topics, 
                         id2word=dictionary, 
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    # Print the top keywords for each topic
    for topic_id in range(num_topics):
        topic_words = lda_model.show_topic(topic_id, topn=20)
        print("Topic {}: {}".format(topic_id, ", ".join([word for word, prob in topic_words])))

    # Compute visualization data for the LDA model
    vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

    # Extract a portion of the file path as the filename
    filename = os.path.splitext(os.path.basename(file_path))[0]
    filename = "pyLDAvis/lda_visualization_{}.html".format(filename)

    # Save the visualization as an HTML file
    pyLDAvis.save_html(vis_data, filename)
    print("Visualization saved as:", filename)

### Call the function to Complete LDA theme analysis

#### For stores 1

#### Postive review

In [14]:
file_path = "../data/review/analysis/polarity/reviews_1_polarity_P.txt"
lda_topic_modeling(file_path, P1_best_num_topics)

Dictionary<15034 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Dictionary<3103 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Topic 0: 不, 没, 走, 没有, 新东安, 打折, 装修, 消费, 更, 品牌
Topic 1: 活动, 可爱, 拍照, 展览, 小样, 游戏, 兰蔻, 领, 中心, 中庭
Topic 2: 很, 王府井, 逛, 好, 吃, 多, 还, 喜欢, 品牌, 非常
Topic 3: 购物, 品牌, 苹果, 北京, 时尚, 王府井大街, 步行街, 店, 美食, 餐饮
Topic 4: apm, 北京, 做, 东安市场, 打卡, 朋友, 正好, 好看, 好去处, 超级
Visualization saved as: pyLDAvis/lda_visualization_reviews_1_polarity_P.html


#### Negative review

In [15]:
file_path = "../data/review/analysis/polarity/reviews_1_polarity_N.txt"
lda_topic_modeling(file_path, N1_best_num_topics)

Dictionary<5557 unique tokens: ['休息', '地方', '坐下', '座位', '想']...>
Dictionary<715 unique tokens: ['休息', '地方', '想', '没有', '顾客']...>
Topic 0: 商场, 王府井, 品牌, 很, apm, 北京, 一些, 一层, 逛, 东安市场
Topic 1: 活动, apm, 打卡, 拍照, ️, 北京, 展览, 月, 时间, 玩
Topic 2: 不, 还, 没, 多, 买, 没有, 挺, 说, 逛, 疫情
Visualization saved as: pyLDAvis/lda_visualization_reviews_1_polarity_N.html


#### For stores 2

#### Postive review

In [16]:
file_path = "../data/review/analysis/polarity/reviews_2_polarity_P.txt"
lda_topic_modeling(file_path, P2_best_num_topics)

Dictionary<13572 unique tokens: ['cafelandmark', '一共', '一块', '一杯', '万朵']...>
Dictionary<2515 unique tokens: ['一共', '一块', '一杯', '万朵', '上班']...>
Topic 0: 很, 好, 商场, 喜欢, 非常, 特别, 环境, 可爱, 最, 里
Topic 1: 商场, 王府井, 品牌, 王府中环, 高端, 餐厅, 大牌, 环境, 不错, 购物
Topic 2: 很, 还, 不, 多, 吃, 王府中环, 买, 适合, 好, 拍照
Topic 3: 怪兽, 孩子, 带, 积分, 圣诞, 展览, 有趣, 玩, 兑换, 装饰
Topic 4: 可可, 咖啡, ️, 互动, 制作, 文化, 活动, 一群, 举办, 创意
Visualization saved as: pyLDAvis/lda_visualization_reviews_2_polarity_P.html


#### Negative review

In [17]:
file_path = "../data/review/analysis/polarity/reviews_2_polarity_N.txt"
lda_topic_modeling(file_path, N2_best_num_topics)

Dictionary<3895 unique tokens: ['B', 'DROPOFF', '不是', '东', '位置']...>
Dictionary<395 unique tokens: ['B', '不是', '位置', '停车场', '免费']...>
Topic 0: 工厂, 巧克力, 活动, 怪物, 创意, 怪兽, 圣诞节, 展览, 停车费, skp
Topic 1: 咖啡, 冰场, 票, 喝, 场地, 青年节, 咖啡节, 现场, 户外, 咖啡店
Topic 2: 玩, 买, 孩子, 小朋友, 说, 真的, 地方, 滑冰, 朋友, 走
Topic 3: 商场, 不, 王府中环, 很, 没有, 王府井, 地方, 多, 还, 品牌
Visualization saved as: pyLDAvis/lda_visualization_reviews_2_polarity_N.html


#### For stores 3

#### Postive review

In [18]:
file_path = "../data/review/analysis/polarity/reviews_3_polarity_P.txt"
lda_topic_modeling(file_path, P3_best_num_topics)

Dictionary<8982 unique tokens: ['一楼', '五花八门', '价格', '全国劳模', '北京']...>
Dictionary<1468 unique tokens: ['一楼', '价格', '全国劳模', '北京', '品类']...>
Topic 0: 化妆品, ️, 专柜, 买, 一层, 优惠, 推荐, 好, 羊毛, 薅
Topic 1: 很, 王府井, 商场, 北京, 好, 还, 逛, 感觉, 不错, 不
Topic 2: 老, 北京, 和平, 地下, 菓, 局, 回忆, 北京市, 新, 记忆
Visualization saved as: pyLDAvis/lda_visualization_reviews_3_polarity_P.html


#### Negative review

In [19]:
file_path = "../data/review/analysis/polarity/reviews_3_polarity_N.txt"
lda_topic_modeling(file_path, N3_best_num_topics)

Dictionary<3461 unique tokens: ['一层', '之选', '优秀', '吃', '商场']...>
Dictionary<343 unique tokens: ['一层', '吃', '商场', '地下', '地方']...>
Topic 0: 商场, 不, 没有, 王府井, 很, 东西, 多, 说, 不是, 真的
Topic 1: 排队, 小样, 领, apm, 挺, 化妆品, 旋转, 木马, 玩, 雅诗兰黛
Topic 2: 带, 很, 小时候, 孩子, 回忆, 玩, 买, 挺, 生活, 感觉
Topic 3: 体验, 负, 可, 自行车, 仿佛, 火车, 零食, 带, 玩, 玩具
Topic 4: 北京, 老, 地下, 王府井, 二层, 一层, 和平, 还, 没, 胡同
Topic 5: 王府井, 儿时, 记忆, 北京, 步行街, 选择, 打卡, 地下, 北楼, 场景
Visualization saved as: pyLDAvis/lda_visualization_reviews_3_polarity_N.html


#### For stores 4

#### Postive review

In [20]:
file_path = "../data/review/analysis/polarity/reviews_4_polarity_P.txt"
lda_topic_modeling(file_path, P4_best_num_topics)

Dictionary<6685 unique tokens: ['一风堂', '不错', '停车', '吃', '商场']...>
Dictionary<996 unique tokens: ['不错', '停车', '吃', '商场', '地下']...>
Topic 0: 疫情, 不, 没, 期间, 热闹, 里, 真的, 几个, 顾客, 喝
Topic 1: 停车, 逛街, 免费, 周末, 小时, 积分, 价格, 会员, 活动, 溜达
Topic 2: 王府井, 东方新天地, 购物, 东单, 北京, 长安街, 餐饮, 新天地, 高端, 交通
Topic 3: 年, 写字楼, 可爱, 东方广场, 一家, 繁华, 楼下, 喷泉, 最, 广场
Topic 4: 商场, 很, 好, 品牌, 东方新天地, 逛, 多, 喜欢, 王府井, 非常
Visualization saved as: pyLDAvis/lda_visualization_reviews_4_polarity_P.html


#### Negative review

In [21]:
file_path = "../data/review/analysis/polarity/reviews_4_polarity_N.txt"
lda_topic_modeling(file_path, N4_best_num_topics)

Dictionary<2630 unique tokens: ['一站式', '不便', '东长安街', '享受', '人流']...>
Dictionary<202 unique tokens: ['价格', '停车', '北京', '吃喝玩乐', '白领']...>
Topic 0: 东方新天地, 王府井, 活动, 东单, 逛, 还, 一层, 疫情, 地铁, 很
Topic 1: 说, 不, 积分, 还, 疫情, 没, 保安, 非常, 问, 期间
Topic 2: 地下, 多, 没有, 很, 不, 一层, 品牌, 特别, 开门, 店铺
Topic 3: 没有, 吃, 不是, 还, 逛, 没, 第一次, 找, 很, 地方
Visualization saved as: pyLDAvis/lda_visualization_reviews_4_polarity_N.html


## 3. LDA thematic analysis of all review texts in the business circle

### Get the optimal number of topics (same as below)

In [28]:
# Set the file path for a text file.
file_path = "../data/review/processed/reviews_merged.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", best_num_topics)

Optimal number of topics (using perplexity method): 9
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 6


### Improve " lda_topic_modeling " function, adjust parameters and add new functions

In [50]:
def lda_topic_modeling_with_assignment(file_path, num_topics, output_file):
    # Read the text file
    with open(file_path, "r", encoding="utf-8") as f:
        comments = [line.strip() for line in f]

    # Build the dictionary and bag-of-words model
    dictionary = Dictionary([comment.split() for comment in comments])
    print(dictionary)
    dictionary.filter_extremes(no_below=5, no_above=0.1)
    print(dictionary)
    
    # Convert comments into bag-of-words format
    corpus = [dictionary.doc2bow(comment.split()) for comment in comments]

    # Train the LDA model
    lda_model = LdaModel(corpus=corpus, 
                         num_topics=num_topics, 
                         id2word=dictionary, 
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    # Assign topics to comments
    topic_assignments = []
    for bow in corpus:
        doc_topics = lda_model.get_document_topics(bow)
        topic_assignments.append(max(doc_topics, key=lambda x: x[1])[0])

    # Print the top keywords for each topic
    for topic_id in range(num_topics):
        topic_words = lda_model.show_topic(topic_id, topn=20)
        print("Topic {}: {}".format(topic_id, ", ".join([word for word, prob in topic_words])))

    # Write the topic assignments to a CSV file
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["review_splitting", "topic"])
        for comment, topic in zip(comments, topic_assignments):
            writer.writerow([comment, topic])
    print("Topic assignments saved in:", output_file)

    # Compute visualization data for the LDA model
    vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

    # Extract a portion of the file path as the filename
    filename = os.path.splitext(os.path.basename(file_path))[0]
    filename = "pyLDAvis/lda_visualization_{}.html".format(filename)

    # Save the visualization as an HTML file
    pyLDAvis.save_html(vis_data, filename)
    print("Visualization saved as:", filename)

In [51]:
file_path = "../data/review/processed/reviews_merged.txt"
output_file = "../data/review/analysis/topics/topic_assignments.csv"

lda_topic_modeling_with_assignment(file_path, best_num_topics, output_file)

Dictionary<30619 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Dictionary<6900 unique tokens: ['一圈', '一家', '云集', '交通', '人气']...>
Topic 0: 购物, 东单, 餐饮, 停车, 东方广场, 新天地, 位于, 积分, 停车场, 消费, 小时, 顾客, 设施, 会员, 购物中心, 休闲, 商业, 娱乐, 商业街, 区
Topic 1: 东方新天地, 地铁, 吃饭, 店铺, 一些, 大牌, 逛街, 位置, 最, 步行街, 逛逛, 长安街, 不少, 很大, 适合, 朋友, 不过, 高端, 交通, 王府井大街
Topic 2: 时间, 免费, 小时候, 希望, 价格, 点, 打卡, 超市, 卖, 人少, 路过, 发现, 生活, 可爱, 记忆, 展览, 展, 综合性, 主题, 区域
Topic 3: 疫情, 说, 不是, 东西, 走, 想, 太, 过, 带, 找, 期间, 更, 少, 服务, 喷泉, 孩子, 北京市, 第一次, 有些, 晚上
Topic 4: 拍照, ️, 繁华, 有个, 拍, 分, 快, 满满的, 回到, 美食街, 儿时, 依旧, 下班, 中庭, 全国, 鼠年, B, 注册, 还原, 方向
Topic 5: 老, 和平, 年, 新, 做, 写字楼, 回忆, 二层, 中国, 广场, 点评, 前, 菓, 局, 建筑, 电影院, 胡同, 开门, 多年, 营业
Topic assignments saved in: ../data/review/analysis/topics/topic_assignments.csv
Visualization saved as: pyLDAvis/lda_visualization_reviews_merged.html
