## 3. Content Analysis

In [None]:
#install
!pip install pandas numpy
!pip install torch==2.5.1
!pip install gensim==4.3.2 scipy==1.11.4
!pip install scikit-learn==1.6.1
!pip install umap-learn==0.5.6
!pip install bertopic==0.16.4


In [None]:
#import
# Core libraries
import pandas as pd
import numpy as np
import random
import torch

# Word embeddings & text processing
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

# Topic modeling
from bertopic import BERTopic
from umap import UMAP


### 3.1 CBOW embedding: preparing inputs for topic modeling



In [None]:
# Load ready_text.csv into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='ready_text.csv')
df.head(n=5)

Unnamed: 0,score,num_comments,date,text,clean_text,ready_text
0,12867,3277,2025-03-22 20:59:47,Assassin's Creed Shadows Hits 2 Million Player...,Assassin's Creed Shadows Hits 2 Million Player...,hit million player day release ubisoft say sur...
1,11555,1053,2025-03-21 01:00:58,Amid Japan Concern About Assassin's Creed Shad...,Amid Japan Concern About Assassin's Creed Shad...,amid japan concern ubisoft confirms dayone pat...
2,18144,2726,2024-09-25 10:52:29,Ubisoft Cancels Press Previews of Assassin's C...,Ubisoft Cancels Press Previews of Assassin's C...,ubisoft cancel press preview bet reassuring go...
3,11677,957,2024-12-12 01:44:30,"Assassin's Creed Shadows adds a ""canon mode"" t...","Assassin's Creed Shadows adds a ""canon mode"" t...",add canon mode make choice fan spent year unsu...
4,15573,1316,2024-10-22 11:00:38,Ubisoft Cancels Assassin's Creed Shadows Early...,Ubisoft Cancels Assassin's Creed Shadows Early...,ubisoft cancel early access wasnt announced wa...


In [None]:
# Summarise the columns: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221 entries, 0 to 220
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   score         221 non-null    int64 
 1   num_comments  221 non-null    int64 
 2   date          221 non-null    object
 3   text          221 non-null    object
 4   clean_text    221 non-null    object
 5   ready_text    221 non-null    object
dtypes: int64(2), object(4)
memory usage: 10.5+ KB


In [None]:
# Seed the Python, NumPy and PyTorch global random number generators.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Train a CBOW Word2Vec model (sg=0) on the whitespace-tokenised `ready_text`.
# gensim does not use the global seeds above: Word2Vec has its own `seed`
# argument, and training is only deterministic with a single worker thread
# (multiple workers introduce ordering jitter from OS thread scheduling).
# For reproducibility across interpreter launches, PYTHONHASHSEED must also be
# fixed before the kernel starts.
model = Word2Vec(
    sentences=df['ready_text'].str.split(),
    vector_size=15,   # dimensionality of each word vector
    window=3,         # context window size on each side
    min_count=2,      # ignore words that appear fewer than 2 times
    workers=1,        # single thread so the run is deterministic
    seed=42,          # gensim's own RNG seed
    sg=0              # 0 = CBOW
)
# Persist the trained model so it can be reloaded without retraining.
model.save("word2vec_cbow.model")

In [None]:
## Use TF-IDF scores to weight the word vectors
# Cast every entry to str before vectorizing
df['ready_text'] = df['ready_text'].astype(str)

# Fit TF-IDF on the corpus, keeping at most 5000 vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['ready_text'])

# Build a {term: mean TF-IDF score across all documents} lookup
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
mean_tfidf_per_term = tfidf_matrix.toarray().mean(axis=0)
tfidf_scores = {
    term: score
    for term, score in zip(tfidf_feature_names, mean_tfidf_per_term)
}

In [None]:
## Weighted document embedding
def get_weighted_embedding(text, model, tfidf_scores):
    """Return the mean of the TF-IDF-scaled word vectors of ``text``.

    Args:
        text: Whitespace-separated tokens.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            word) and ``vector_size``.
        tfidf_scores: Mapping from word to its TF-IDF weight.

    Returns:
        A 1-D array: the element-wise mean of ``model.wv[word] * weight`` over
        every token found in both ``model.wv`` and ``tfidf_scores`` (repeated
        tokens count once per occurrence), or a zero vector of length
        ``model.vector_size`` when no token qualifies.
    """
    word_vectors = model.wv
    scaled_vectors = []
    for token in text.split():
        # Keep only tokens known to both the embedding model and the TF-IDF lookup
        if token in word_vectors and token in tfidf_scores:
            scaled_vectors.append(word_vectors[token] * tfidf_scores[token])

    if not scaled_vectors:
        # No usable tokens: fall back to an all-zero embedding
        return np.zeros(model.vector_size)

    return np.mean(scaled_vectors, axis=0)

# Compute one weighted embedding per document and store it in a new column
def _embed_document(text):
    """Embed a single document with the trained model and TF-IDF lookup."""
    return get_weighted_embedding(text, model, tfidf_scores)

df['embeddings'] = df['ready_text'].apply(_embed_document)


### 3.2 BERTopic modeling

In [None]:
## BERTopic modeling
# Stack the per-document CBOW embeddings into one array (one row per document)
embeddings_array = np.array(df['embeddings'].tolist())

# Drop documents whose embedding is all zeros (no usable words): cosine
# distance is undefined for a zero vector, which can break UMAP.
non_zero_indices = np.any(embeddings_array != 0, axis=1)
filtered_embeddings = embeddings_array[non_zero_indices]
filtered_texts = df.loc[non_zero_indices, 'ready_text'].tolist()

# Define UMAP parameters
umap_model = UMAP(
    n_neighbors=10,        # neighborhood size
    n_components=3,        # output dimensions
    metric='cosine',
    random_state=42        # fixed seed
)


# BERTopic model with custom UMAP
topic_model = BERTopic(
    language="english",
    n_gram_range=(2, 3),         # can be adjusted according to the results
    top_n_words=15,              # display 15 keywords per topic
    calculate_probabilities=True,
    min_topic_size=5,
    umap_model=umap_model
)

# Fit on the filtered documents and their embeddings. The original call passed
# the unfiltered `df['ready_text']` / `embeddings_array`, so the zero-vector
# filtering above had no effect.
# NOTE: `topics` and `probs` are aligned with `filtered_texts`, i.e. with
# `df[non_zero_indices]`; they match `df` row-for-row only when no document
# was dropped.
topics, probs = topic_model.fit_transform(filtered_texts, filtered_embeddings)


In [None]:
# Inspect the topic results
# Per-topic summary table returned by the fitted model
topic_info = topic_model.get_topic_info()

# Keywords of every topic, as {topic_id: [(word, score), ...]}
all_topics = topic_model.get_topics()

# One row per topic: its id and a comma-separated keyword string
keyword_rows = []
for topic_id, word_scores in all_topics.items():
    joined_keywords = ', '.join(word for word, _ in word_scores)
    keyword_rows.append({'Topic': topic_id, 'Keywords': joined_keywords})
topic_keywords = pd.DataFrame(keyword_rows)

# Attach the keyword strings to the topic summary (left join on the topic id)
result = topic_info.merge(topic_keywords, on='Topic', how='left')

# Preview the merged table
result.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Keywords
0,-1,38,-1_feel like_open world_monster hunter_ghost t...,"[feel like, open world, monster hunter, ghost ...",[physical copy leak nearly month ahead release...,"feel like, open world, monster hunter, ghost t..."
1,0,51,0_feel like_open world_ghost tsushima_black flag,"[feel like, open world, ghost tsushima, black ...",[skill wasnt much fun handson impression main ...,"feel like, open world, ghost tsushima, black f..."
2,1,20,1_dragon age_first time_wa great_felt like,"[dragon age, first time, wa great, felt like, ...",[still looking good crazy lot looking good par...,"dragon age, first time, wa great, felt like, f..."
3,2,20,2_final fantasy_super mario_legend zelda_metal...,"[final fantasy, super mario, legend zelda, met...",[favourite every console youve owned list chan...,"final fantasy, super mario, legend zelda, meta..."
4,3,19,3_mass effect_ive played_elden ring_feel like,"[mass effect, ive played, elden ring, feel lik...",[keep spreadsheet gaming please note ranked mu...,"mass effect, ive played, elden ring, feel like..."


In [None]:
# Write the merged topic table to CSV without the DataFrame index
result.to_csv(path_or_buf='topic_result.csv', index=False)