In [None]:
import os
import re

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

import langid
from collections import Counter
from tqdm import tqdm
import json

# 读取文件

In [None]:
TWITTER_POST_PATH = "./data/2024_0131/Twitter/meibo.json"
post_all_data = []
# Each line of the file is decoded with json.loads into one record; tqdm
# reports progress over the lines that were read.
with open(TWITTER_POST_PATH, 'r', encoding='utf-8') as post_file:
    post_all_data.extend(json.loads(raw_line) for raw_line in tqdm(post_file.readlines()))
print(len(post_all_data))

In [None]:
# Load lang_distribution.json into a dict (json is imported in the first cell).
with open('./data/2024_0131/Twitter/lang_distribution.json', 'r', encoding='utf-8') as f:
    lang_dict = json.load(f)
# Number of entries stored under each top-level key of the file.
lang_count = {lang: len(entries) for lang, entries in lang_dict.items()}


# 处理英文语料
1. 目前的预处理考虑，去除换行符、将网址转换为空字符串、去掉@的人？

In [None]:
# Select the entries stored under the 'en' key and report how many there are.
en_data = lang_dict['en']
print(len(en_data))

In [None]:

def remove_newlines(text):
    """Return ``text`` with every newline character replaced by a single space."""
    return ' '.join(text.split('\n'))
def remove_urls(text):
    """Delete URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character; it is replaced by an empty
    string, so the whitespace around it is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
def remove_after_at(text):
    """Strip ``@word`` mentions from ``text``.

    Each ``@`` followed by one or more word characters is removed, together
    with at most one whitespace character directly after it.
    """
    return re.sub(r'@\w+\s?', '', text)
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace.

    Underscores count as word characters and therefore survive.
    """
    non_word_pattern = re.compile(r'[^\w\s]')
    return non_word_pattern.sub('', text)
def convert_to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered
# English stopwords removed from every document during preprocessing.
# The preprocessing loop lower-cases the text before filtering, so entries must
# be lower-case to ever match: 'rt' (retweet marker) is the effective entry,
# while the original upper-case 'RT' is kept only for backward compatibility.
english_stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these',
    'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'RT', 'rt'
]
def remove_stopwords(text, stopwords):
    """Remove stopwords from whitespace-separated text.

    Args:
        text: The string to filter; it is split on runs of whitespace.
        stopwords: Iterable of words to drop. Matching is exact and
            case-sensitive.

    Returns:
        The remaining words joined by single spaces.
    """
    # Build a set once so each word costs a constant-time lookup instead of a
    # scan over the whole stopword list.
    if isinstance(stopwords, (set, frozenset)):
        stopword_lookup = stopwords
    else:
        stopword_lookup = frozenset(stopwords)
    return ' '.join(word for word in text.split() if word not in stopword_lookup)

In [None]:
# Clean the 'text' field of every record in en_data, in place.
# Steps, in order: newlines -> spaces, drop URLs, drop @mentions, drop
# punctuation, lower-case, drop stopwords.
for data in tqdm(en_data):
    text = remove_newlines(data['text'])
    text = remove_urls(text)
    text = remove_after_at(text)
    text = remove_punctuation(text)
    text = convert_to_lowercase(text)
    # Write back to the record being processed. The previous version indexed
    # en_data with a counter that was never incremented, so every result
    # overwrote en_data[0] and all other records stayed uncleaned.
    data['text'] = remove_stopwords(text, english_stopwords)

# 数据标签

In [None]:
# Count how many records carry each 'query' value; keys appear in order of
# first occurrence. Converted back to a plain dict so the cell output is a dict.
en_query_dict = dict(Counter(record['query'] for record in en_data))
en_query_dict



In [None]:
# Keep the queries that langid classifies as English or Spanish.
# NOTE(review): accepting 'es' alongside 'en' looks deliberate, but the reason is not visible here — confirm.
en_topic = [
    query_text
    for query_text in en_query_dict
    if langid.classify(query_text)[0] in ('en', 'es')
]
print(f"en_topic: {len(en_topic)}")

# Per-topic record counts; records whose query is not a kept topic fall into 'other'.
en_topic_dict = {'other': 0, 'U.S. Presidential Election': 0}
for data in en_data:
    label = data['query'] if data['query'] in en_topic else 'other'
    en_topic_dict[label] = en_topic_dict.get(label, 0) + 1

In [None]:

# Order the topics from the largest count to the smallest.
en_topic_count = dict(
    sorted(en_topic_dict.items(), key=lambda pair: pair[1], reverse=True)
)
topic_labels = list(en_topic_count.keys())
topic_totals = list(en_topic_count.values())

plt.figure(figsize=(15, 5))
plt.bar(topic_labels, topic_totals, color='skyblue')
plt.xticks(rotation=90)
# Write each bar's count just above it.
for label, total in zip(topic_labels, topic_totals):
    plt.text(label, total + 5, '%d' % total, ha='center', va='bottom', rotation=90, fontsize=7)
plt.title('en topic count')
plt.show()

In [None]:
category_names = list(en_topic_count)
print(f"category_names: {len(category_names)}")
# Drop the catch-all 'other' bucket so only real topic names remain.
category_names.remove('other')
print(f"category_names: {len(category_names)}")

In [None]:
# Integer label per record: the position of its query in category_names, or -1
# when the query is not a named category.
# A lookup table replaces the per-record list scan (`in` + `.index`);
# setdefault keeps the first position of a name, exactly like list.index.
category_index = {}
for position, name in enumerate(category_names):
    category_index.setdefault(name, position)
categories = [category_index.get(data['query'], -1) for data in en_data]

In [None]:
from bertopic import BERTopic

# One cleaned text per record, in the same order as `categories`.
docs = [record['text'] for record in en_data]
# Fit with the integer category labels passed as `y`.
topic_model = BERTopic(language="english", verbose=True)
topic_model = topic_model.fit(docs, y=categories)

In [None]:
# Persist the fitted model to "225_semi_en_model" using pickle serialization.
# NOTE(review): this runs before reduce_topics is called further down, so the saved file holds the unreduced model.
topic_model.save("225_semi_en_model",serialization="pickle")

In [None]:
# Show the model's topic overview right after fitting, before any topic reduction.
topic_model.get_topic_info()

In [None]:
# Reduce the fitted model to 100 topics (nr_topics=100), using the same docs it was fitted on.
topic_model.reduce_topics(docs, nr_topics=100)

In [None]:
# Topic overview again, this time after the reduce_topics call.
topic_model.get_topic_info()

In [None]:
# Render BERTopic's topic visualization for the current model.
topic_model.visualize_topics()

In [None]:
# Class label per document. `categories` uses -1 for documents whose query is
# not a named category; indexing category_names[-1] would silently hand them
# the LAST category name, so they are mapped to 'other' explicitly.
classes = ['other' if i < 0 else category_names[i] for i in categories]
topics_per_class = topic_model.topics_per_class(docs, classes=classes)

In [None]:
# Plot the result computed by topics_per_class.
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
# Compute the topic hierarchy from docs; the result is reused by the
# hierarchical document visualization in a later cell.
hierarchical_topics = topic_model.hierarchical_topics(docs)
# Display the hierarchy that was just computed.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
from umap import UMAP
from sentence_transformers import SentenceTransformer

# Embed every document once with the all-MiniLM-L6-v2 sentence model.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)

# Reduce the embeddings to 2-D up front and pass them in as reduced_embeddings.
# The earlier extra call with the raw embeddings was removed: it was not the
# cell's last expression, so its figure was computed and then thrown away.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)

In [None]:
# Show the model's per-document information for docs.
topic_model.get_document_info(docs)

In [None]:
# Topic overview of the model in its current (reduced) state.
topic_model.get_topic_info()

# 加载已保存的模型

In [None]:
from bertopic import BERTopic
# Reload the model from "225_semi_en_model", replacing the in-memory topic_model.
# NOTE(review): that file is written with serialization="pickle" earlier in this notebook; unpickling can execute
# arbitrary code, so only load files you produced yourself.
topic_model = BERTopic.load("225_semi_en_model")

In [None]:
# Rebuild the document list from en_data.
# NOTE(review): en_data must already be loaded and cleaned by the earlier cells — this cell does not load it itself.
docs = [data['text'] for data in en_data]
# Reduce the reloaded model to 100 topics, then show its topic overview.
topic_model.reduce_topics(docs, nr_topics=100)
topic_model.get_topic_info()

In [None]:
import pandas as pd
# Hold the topic overview in a DataFrame so it can be sliced below.
# NOTE(review): get_topic_info presumably already returns a DataFrame, which would make this wrapper redundant — confirm.
pd_topic = pd.DataFrame(topic_model.get_topic_info())

In [None]:
# Show rows at positions 20 through 39 of the topic table.
pd_topic.iloc[20:40]