# Part 2. Topic analysis of positive and negative comments using LDA models

In [1]:
import os
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis.gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

## 1. Calculation of the optimal number of topics

### Function Design : Use Perplexity and Coherence 

In [2]:
def get_optimal_topic_num_by_perplexity(file_path, num_topics_range):
    '''
    Compute the optimal number of topics.

    Parameters:
    file_path: str, the file path of the preprocessed text file.
    num_topics_range: list, the range of topic numbers to try.

    Returns:
    best_num_topics: int, the optimal number of topics.
    '''

    # Read the file and create the corpus.
    with open(file_path, 'r', encoding='utf-8') as f:
        corpus = [line.strip().split() for line in f]

    # Create the dictionary.
    dictionary = Dictionary(corpus)

    # Train the LDA models with different topic numbers, and calculate the perplexity values.
    perplexity_values = []
    lda_models = []
    for num_topics in num_topics_range:
        lda_model = LdaModel(corpus=[dictionary.doc2bow(text) for text in corpus], id2word=dictionary, num_topics=num_topics)
        lda_models.append(lda_model)
        perplexity_values.append(lda_model.log_perplexity([dictionary.doc2bow(text) for text in corpus]))

    # Calculate the coherence scores using the perplexity values.
    coherence_scores = [CoherenceModel(model=lda_model, texts=corpus, dictionary=dictionary, coherence='c_v').get_coherence() for lda_model in lda_models]

    # Find the optimal number of topics based on the coherence scores.
    best_num_topics_index = coherence_scores.index(max(coherence_scores))
    best_num_topics = num_topics_range[best_num_topics_index]
    
    print("Optimal number of topics (using perplexity method):", best_num_topics)

    return best_num_topics


### Function Design : Use Cosine_Similarity

In [3]:
def get_optimal_topic_num_by_similarity(file_path, num_topics_range):
    '''
    Compute the optimal number of topics.

    Parameters:
    file_path: str, the file path of the preprocessed text file.
    num_topics_range: list, the range of topic numbers to try.

    Returns:
    best_num_topics: int, the optimal number of topics.
    '''
    # Read file and create corpus
    with open(file_path, 'r', encoding='utf-8') as f:
        corpus = [line.strip().split() for line in f]

    # Create dictionary
    dictionary = gensim.corpora.Dictionary(corpus)

    # Train LDA model and compute cosine similarity between topics for different number of topics
    similarity_values = []
    for num_topics in num_topics_range:
        lda_model = gensim.models.ldamodel.LdaModel(corpus=[dictionary.doc2bow(text) for text in corpus], id2word=dictionary, num_topics=num_topics)
        topics_matrix = lda_model.get_topics()
        cosine_similarities = cosine_similarity(topics_matrix)
        similarity_values.append(np.min(cosine_similarities[np.triu_indices(num_topics, k=1)]))

    # Find the optimal number of topics
    best_num_topics_index = similarity_values.index(max(similarity_values))
    best_num_topics = num_topics_range[best_num_topics_index]
    
    print("Optimal number of topics (using cosine similarity method):", best_num_topics)
    
    return best_num_topics


### Call the function to output the respective optimal number of topics

In [4]:
# Create a list of integers for the range of number of topics.
num_topics_range = list(range(3, 10))

#### For stores 1

In [5]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P1_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P1_best_num_topics)

Optimal number of topics (using perplexity method): 5
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 4


In [6]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_1_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N1_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N1_best_num_topics)

Optimal number of topics (using perplexity method): 7
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 5


#### For stores 2

In [7]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P2_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P2_best_num_topics)

Optimal number of topics (using perplexity method): 3
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


In [8]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_2_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N2_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N2_best_num_topics)

Optimal number of topics (using perplexity method): 3
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


#### For stores 3

In [9]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P3_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P3_best_num_topics)

Optimal number of topics (using perplexity method): 4
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


In [10]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_3_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N3_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N3_best_num_topics)

Optimal number of topics (using perplexity method): 5
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 4


#### For stores 4

In [11]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_P.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
P4_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", P4_best_num_topics)

Optimal number of topics (using perplexity method): 3
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 3


In [12]:
# Set the file path for a text file.
file_path = "../data/review/analysis/polarity/reviews_4_polarity_N.txt"

# Get the optimal number of topics based on perplexity and cosine similarity.
best_num_topics_perplexity = get_optimal_topic_num_by_perplexity(file_path, num_topics_range)
best_num_topics_similarity = get_optimal_topic_num_by_similarity(file_path, num_topics_range)

# Print the ultimate optimal number of topics.
N4_best_num_topics = int((best_num_topics_perplexity + best_num_topics_similarity)/2)
print("Ultimate optimal number of topics :", N4_best_num_topics)

Optimal number of topics (using perplexity method): 7
Optimal number of topics (using cosine similarity method): 3
Ultimate optimal number of topics : 5


## 2. Get Topic-Word Distribution

### Function design : LDA model training, topic-word distribution output and visualization

In [36]:
def lda_topic_modeling(file_path, num_topics):
    # Read the text file
    with open(file_path, "r", encoding="utf-8") as f:
        texts = [line.strip().split() for line in f]

    # Build the dictionary and bag-of-words model
    dictionary = Dictionary(texts)
    print(dictionary)
    
    # Filter out extreme terms based on frequency and document proportion
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    print(dictionary)
    
    # Convert texts into bag-of-words format
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    # Train the LDA model
    lda_model = LdaModel(corpus=corpus, 
                         num_topics=num_topics, 
                         id2word=dictionary, 
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    # Print the top keywords for each topic
    for topic_id in range(num_topics):
        topic_words = lda_model.show_topic(topic_id, topn=10)
        print("Topic {}: {}".format(topic_id, ", ".join([word for word, prob in topic_words])))

    # Compute visualization data for the LDA model
    vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

    # Extract a portion of the file path as the filename
    filename = os.path.splitext(os.path.basename(file_path))[0]
    filename = "pyLDAvis/lda_visualization_{}.html".format(filename)

    # Save the visualization as an HTML file
    pyLDAvis.save_html(vis_data, filename)
    print("Visualization saved as:", filename)

### Call the function to Complete LDA theme analysis

#### For stores 1

#### Postive review

In [37]:
file_path = "../data/review/analysis/polarity/reviews_1_polarity_P.txt"
lda_topic_modeling(file_path, P1_best_num_topics)

Dictionary<15034 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Dictionary<3103 unique tokens: ['apm', '一圈', '一家', '不错', '云集']...>
Topic 0: 打卡, 拍照, 好看, 消毒, 主题, 电影院, 点评, 喜欢, 价格, 超级
Topic 1: apm, 活动, 北京, 兰蔻, 可爱, 泡泡玛特, pm, 小样, ️, 扫码
Topic 2: 很, 逛, 好, 吃, 多, 王府井, apm, 还, 喜欢, 非常
Topic 3: 北京, 品牌, 王府井, apm, 购物, 时尚, 步行街, 王府井大街, 餐饮, 美食
Visualization saved as: pyLDAvis/lda_visualization_reviews_1_polarity_P.html


#### Negative review

In [38]:
file_path = "../data/review/analysis/polarity/reviews_1_polarity_N.txt"
lda_topic_modeling(file_path, N1_best_num_topics)

Dictionary<5557 unique tokens: ['休息', '地方', '坐下', '座位', '想']...>
Dictionary<715 unique tokens: ['休息', '地方', '想', '没有', '顾客']...>
Topic 0: 商场, 王府井, 很, apm, 北京, 品牌, 真的, 东安市场, 地下, 非常
Topic 1: apm, 活动, 打卡, 北京, 拍照, ️, 展览, 月, 影响, 点评
Topic 2: 不, 多, 没, 还, 没有, 买, 说, 逛, 挺, 店
Topic 3: 店员, 券, 完全, 钱, 这样, 一件, 厕所, 懒得, 记得, 工作人员
Topic 4: 商场, 东西, 吃, 品牌, 王府井, 玩, 不, 很, 不错, 吃饭
Visualization saved as: pyLDAvis/lda_visualization_reviews_1_polarity_N.html


#### For stores 2

#### Postive review

In [39]:
file_path = "../data/review/analysis/polarity/reviews_2_polarity_P.txt"
lda_topic_modeling(file_path, P2_best_num_topics)

Dictionary<13572 unique tokens: ['cafelandmark', '一共', '一块', '一杯', '万朵']...>
Dictionary<2515 unique tokens: ['一共', '一块', '一杯', '万朵', '上班']...>
Topic 0: 怪兽, 可爱, 可可, 工厂, 王府中环, 展览, 吸引, 巧克力, 展, 主题
Topic 1: 商场, 很, 王府井, 好, 环境, 品牌, 王府中环, 逛, 非常, 喜欢
Topic 2: 很, 还, 买, 不, 孩子, 好, 喜欢, 带, 咖啡, 多
Visualization saved as: pyLDAvis/lda_visualization_reviews_2_polarity_P.html


#### Negative review

In [40]:
file_path = "../data/review/analysis/polarity/reviews_2_polarity_N.txt"
lda_topic_modeling(file_path, N2_best_num_topics)

Dictionary<3895 unique tokens: ['B', 'DROPOFF', '不是', '东', '位置']...>
Dictionary<395 unique tokens: ['B', '不是', '位置', '停车场', '免费']...>
Topic 0: 工厂, 活动, 巧克力, 怪兽, 主题, 展览, 很, 怪物, 可可, 创意
Topic 1: 孩子, 咖啡, 玩, 冰场, 带, 很, 票, 一些, 特别, 娃
Topic 2: 商场, 不, 没有, 王府中环, 很, 王府井, 说, 多, 还, 买
Visualization saved as: pyLDAvis/lda_visualization_reviews_2_polarity_N.html


#### For stores 3

#### Postive review

In [41]:
file_path = "../data/review/analysis/polarity/reviews_3_polarity_P.txt"
lda_topic_modeling(file_path, P3_best_num_topics)

Dictionary<8982 unique tokens: ['一楼', '五花八门', '价格', '全国劳模', '北京']...>
Dictionary<1468 unique tokens: ['一楼', '价格', '全国劳模', '北京', '品类']...>
Topic 0: 化妆品, ️, 专柜, 买, 一层, 优惠, 推荐, 好, 羊毛, 薅
Topic 1: 很, 王府井, 商场, 北京, 好, 还, 逛, 感觉, 不错, 不
Topic 2: 老, 北京, 和平, 地下, 菓, 局, 回忆, 北京市, 新, 记忆
Visualization saved as: pyLDAvis/lda_visualization_reviews_3_polarity_P.html


#### Negative review

In [42]:
file_path = "../data/review/analysis/polarity/reviews_3_polarity_N.txt"
lda_topic_modeling(file_path, N3_best_num_topics)

Dictionary<3461 unique tokens: ['一层', '之选', '优秀', '吃', '商场']...>
Dictionary<343 unique tokens: ['一层', '吃', '商场', '地下', '地方']...>
Topic 0: 王府井, 商场, 没有, 不, 很, 还, 说, 北京, 没, 真的
Topic 1: 老, 北京, 地下, 一层, 二层, 吃, 胡同, 回忆, 怀旧, 王府井
Topic 2: 感觉, 买, 小时候, 回忆, 很, 胡同, 东西, 地下, 带, 儿时
Topic 3: 菓, 和平, 局, 负, 店, 老, 几十年, 八十年代, 玩具, 餐饮
Visualization saved as: pyLDAvis/lda_visualization_reviews_3_polarity_N.html


#### For stores 4

#### Postive review

In [43]:
file_path = "../data/review/analysis/polarity/reviews_4_polarity_P.txt"
lda_topic_modeling(file_path, P4_best_num_topics)

Dictionary<6685 unique tokens: ['一风堂', '不错', '停车', '吃', '商场']...>
Dictionary<996 unique tokens: ['不错', '停车', '吃', '商场', '地下']...>
Topic 0: 不, 疫情, 里, 没, 拍照, 没有, 想, 买, 多, 好多
Topic 1: 很, 商场, 好, 品牌, 逛, 东方新天地, 喜欢, 多, 非常, 不错
Topic 2: 王府井, 东方新天地, 购物, 北京, 东单, 长安街, 餐饮, 商场, 新天地, 东方广场
Visualization saved as: pyLDAvis/lda_visualization_reviews_4_polarity_P.html


#### Negative review

In [44]:
file_path = "../data/review/analysis/polarity/reviews_4_polarity_N.txt"
lda_topic_modeling(file_path, N4_best_num_topics)

Dictionary<2630 unique tokens: ['一站式', '不便', '东长安街', '享受', '人流']...>
Dictionary<202 unique tokens: ['价格', '停车', '北京', '吃喝玩乐', '白领']...>
Topic 0: 东方新天地, 活动, 还, 消费, 非常, 店铺, 王府井, 东单, 不, 店
Topic 1: 说, 不, 积分, 没, 还, 疫情, 保安, 问, 没有, 服务台
Topic 2: 没有, 不, 地下, 还, 多, 服务, 品牌, 开门, 北京, 东方新天地
Topic 3: 王府井, 不是, 没有, 吃, 不, 电影, 写字楼, 没, 还, 长安街
Topic 4: 很, 东方新天地, 逛, 地下, 地方, 王府井, 一层, 多, 地铁, 品牌
Visualization saved as: pyLDAvis/lda_visualization_reviews_4_polarity_N.html
