In [11]:
from sentence_transformers import SentenceTransformer, util
import os
# from utils import freq
import re
import pandas as pd
import numpy as np

os.environ['http_proxy'] = '127.0.0.1:7890'
os.environ['https_proxy'] = '127.0.0.1:7890'
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [14]:
######

# 查找Top 10的关键词，全是英文
from collections import Counter
import re
def remove_stopwords(text, stopwords):
    words = text.split()  # 将文本分词为单词列表
    clean_words = [word for word in words if word not in stopwords]  # 去除停用词
    clean_text = ' '.join(clean_words)  # 将列表中的单词重新组合成文本
    return clean_text
def remove_newlines(text):
    # 将换行符替换为空格
    clean_text = text.replace('\n', ' ')
    return clean_text
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # 将匹配到的网址替换为空字符串
    clean_text = url_pattern.sub('', text)
    return clean_text
def remove_after_at(text):
    # 匹配@符号后面的单词的正则表达式
    after_at_pattern = re.compile(r'@\w+\s?')
    clean_text = after_at_pattern.sub('', text)
    return clean_text
def remove_punctuation(text):
    clean_text = re.sub(r'[^\w\s]', '', text)
    return clean_text
def convert_to_lowercase(text):
    return text.lower()
english_stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these',
    'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now' ,'RT' ,'weibo', 'rt' ,'cctv' ,'time',
    'daiichi' ,'the' ,'fukushima' ,'[#' ,'#]' ,'5th'
]
def preprocess_text(text, stopwords):
    text = remove_newlines(text)
    text = remove_urls(text)
    text = remove_after_at(text)
    text = remove_punctuation(text)
    text = convert_to_lowercase(text)
    text = remove_stopwords(text, stopwords)
    return text
def find_top_n_words(text, n):
    words = ' '.join(text).split()  # 将所有文本拼接成一个长字符串后分词
    word_freq = Counter(words)  # 统计词频
    top_n_words = word_freq.most_common(n)  # 获取词频最高的前n个单词
    return top_n_words


def get_word_freq_list(text):
    text = text.apply(lambda x: preprocess_text(x, english_stopwords))
    top_words = find_top_n_words(text, 10)
    return top_words




def cal_RW_SCORE(A_posts, B_posts):
    A_texts = get_word_freq_list(A_posts['text_trans'])
    B_texts = get_word_freq_list(B_posts['text_trans'])
    print(A_texts, B_texts)
    A_word_list = [word[0] for word in A_texts]
    B_word_list = [word[0] for word in B_texts]
    embed_A = model.encode(A_word_list, convert_to_tensor=True)
    embed_B = model.encode(B_word_list, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embed_A, embed_B)
    cosine_scores = cosine_scores.cpu().numpy()
    rw_res = []
    for i in range(len(A_word_list)):
        # print(f"{A_word_list[i]} <--> {B_word_list[cosine_scores[i].argmax()]}, score: {cosine_scores[i].max()}")
        rw_res.append({
            'source': A_word_list[i],
            'target': B_word_list[cosine_scores[i].argmax()],
            'score': cosine_scores[i].max()
        })
    if len(rw_res) == 0:
        rw_score = 0
    else:
        rw_score = sum([item['score'] for item in rw_res]) / len(rw_res)
    # print(f"rw_score: {rw_score}")
    return rw_score, rw_res


def cal_hashtag_score(A_posts, B_posts):
    A_hashtags = find_hashtags(A_posts).tolist()
    B_hashtags = find_hashtags(B_posts).tolist()
    A_hashtags_list = [item for sublist in A_hashtags for item in sublist]
    B_hashtags_list = [item for sublist in B_hashtags for item in sublist]
    # print(f"A_hashtags: {A_hashtags_list}")
    # print(f"B_hashtags: {B_hashtags_list}")
    embed_A = model.encode(A_hashtags_list, convert_to_tensor=True)
    embed_B = model.encode(B_hashtags_list, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embed_A, embed_B)
    cosine_scores = cosine_scores.cpu().numpy()
    hashtag_res = []
    for i in range(len(A_hashtags_list)):
        hashtag_res.append({
            'source': A_hashtags_list[i],
            'target': B_hashtags_list[cosine_scores[i].argmax()],
            'score': cosine_scores[i].max()
        })
    if len(hashtag_res) == 0:
        hashtag_score = 0
    else:
        hashtag_score = sum([item['score'] for item in hashtag_res]) / len(hashtag_res)
    # print(f"hashtag_score: {hashtag_score}")
    # print(hashtag_res)
    return hashtag_score, hashtag_res


def find_hashtags(df, col='text_trans'):
    return df[col].str.findall(r'#\w+#|\B#\w+\b')


def find_posts_with_url(df, col='text'):
    return df[df[col].str.contains(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',na=False)]


def find_same_url(df, col='text'):
    url_list = []
    post_id_list = []
    for index, row in df.iterrows():
        url = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', row[col])
        if url:
            for u in url:
                url_list.append(u)
                post_id_list.append(row['post_id'])
    return url_list, post_id_list


def find_posts_mention_other_platform(df, platform, col='text'):
    return df[df[col].str.contains(platform, na=False)]


def find_posts_with_engagement(df, threshold, col=["cnt_retweet", "cnt_agree", "cnt_comment"]):
    return df[df[col] > threshold]


def cal_cluster_factor(A, B, df_data, date, cycle, all_posts, all_users,debug = False):
    A_posts = df_data[df_data['from'] == A]
    B_posts = df_data[df_data['from'] == B]
    end_time = pd.to_datetime(date).strftime('%Y-%m-%d')
    start_time = pd.to_datetime(date) - pd.Timedelta(days=cycle)
    start_time = start_time.strftime('%Y-%m-%d')
    start_time_half = pd.to_datetime(date) - pd.Timedelta(days=cycle / 2)
    start_time_half = start_time_half.strftime('%Y-%m-%d')
    B_posts = B_posts[(B_posts['publish_time'] >= start_time_half) & (B_posts['publish_time'] <= end_time)]
    print(f"A: {A_posts.shape[0]} ,B: {B_posts.shape[0]}")
    ##############计算sal_factor######################
    all_A_posts = all_posts[all_posts['from'] == A]
    all_A_posts = all_A_posts[(all_A_posts['publish_time'] >= start_time) & (all_A_posts['publish_time'] <= end_time)]
    all_B_posts = all_posts[all_posts['from'] == B]
    all_B_posts = all_B_posts[(all_B_posts['publish_time'] >= start_time_half) & (all_B_posts['publish_time'] <= end_time)]
    print(f"all_A: {all_A_posts.shape[0]} ,all_B: {all_B_posts.shape[0]}")
    # 计算salience factor
    if all_A_posts.shape[0] == 0 or all_B_posts.shape[0] == 0:
        sal_factor = 0
    else:
        sal_factor = (A_posts.shape[0] * B_posts.shape[0]) / (all_A_posts.shape[0] * all_B_posts.shape[0])
    print(f"sal_factor: {sal_factor}")
    ##############计算A B的文本RW_SCORE #######################
    rw_score, rw_res = cal_RW_SCORE(A_posts, B_posts)
    # print(f"rw_score: {rw_score}")
    ##############计算A B的文本Hashtag_score#######################
    hashtag_score, hashtag_res = cal_hashtag_score(A_posts, B_posts)
    # print(f"hashtag_score: {hashtag_score}")
    ##############计算A B的SameURL #######################
    A_posts_url = find_posts_with_url(A_posts)
    B_posts_url = find_posts_with_url(B_posts)
    A_posts_url_list,_ = find_same_url(A_posts_url)
    B_posts_url_list,_ = find_same_url(B_posts_url)
    same_url = list(set(A_posts_url_list).intersection(set(B_posts_url_list)))
    same_url_count = len(same_url)
    # print(f"same_url_count: {same_url_count}")
    ##############计算A B的 DirectURL #######################
    A_direct_url = A_posts['url'].dropna().tolist()
    B_direct_url = B_posts['url'].dropna().tolist()
    direct_url = list(set(A_direct_url).intersection(set(B_direct_url)))
    direct_url_count = len(direct_url)
    # print(f"direct_url_count: {direct_url_count}")
    ##############计算A B的 Refer #######################
    B_mention_A = find_posts_mention_other_platform(B_posts, A, 'text_trans')
    B_mention_A_num = B_mention_A.shape[0]
    # print(f"B mention A: {B_mention_A_num}")
    ##############计算A B的 KOL Inf #######################
    # 注意只计算A的KOL
    Inf_posts = find_posts_with_engagement(A_posts, 100)
    Inf_posts_num = Inf_posts.shape[0]
    # 找出A_posts中的账号
    A_user = A_posts['user_id'].drop_duplicates().tolist()
    # 在self.all_users中找出这些账号
    A_user_info = all_users[all_users['user_id'].isin(A_user)]
    A_user_info['fan'] = A_user_info['fan'].astype(int)
    # 找出粉丝数量大于500的账号
    InfAccts = A_user_info[A_user_info['fan'] > 500].shape[0]
    max_Inf_post = find_posts_with_engagement(all_A_posts, 100)
    max_Inf_post_num = max_Inf_post.shape[0]
    all_A_user = all_A_posts['user_id'].drop_duplicates().tolist()
    all_A_user_info = all_users[all_users['user_id'].isin(all_A_user)]
    all_A_user_info['fan'] = all_A_user_info['fan'].astype(int)
    max_InfAccts = all_A_user_info[all_A_user_info['fan'] > 500].shape[0]
    # print(f"A engagement: {Inf_posts_num}, fan > 500: {InfAccts}, max_Inf_post: {max_Inf_post_num}, max_fan > 500: {max_InfAccts}")
    KOL_inf = Inf_posts_num / max_Inf_post_num + InfAccts / max_InfAccts
    # print(f"KOL_inf: {KOL_inf}")
    #######################################################
    sim_con = 0.1 * rw_score + 0.3 * same_url_count + 0.5 * direct_url_count + 0.4 * B_mention_A_num + 0.1 * hashtag_score
    # 计算指数
    p = 1 - np.exp(-1 * sal_factor - KOL_inf * sim_con)
    # print(f"p: {p}")
    def print_data(debug = False):
        if debug:
            print(f"A: {A_posts.shape[0]} ,B: {B_posts.shape[0]}")
            print(f"all_A: {all_A_posts.shape[0]} ,all_B: {all_B_posts.shape[0]}")
            print(f"sal_factor: {sal_factor}")
            print(f"rw_score: {rw_score}")
            print(f"hashtag_score: {hashtag_score}")
            print(f"same_url_count: {same_url_count}")
            print(f"direct_url_count: {direct_url_count}")
            print(f"B mention A: {B_mention_A_num}")
            print(f"A engagement: {Inf_posts_num}, fan > 500: {InfAccts}, max_Inf_post: {max_Inf_post_num}, max_fan > 500: {max_InfAccts}")
            print(f"KOL_inf: {KOL_inf}")
            print(f"p: {p}")
    print_data(debug)
    return p, rw_res, hashtag_res


In [15]:
df_all_posts = pd.read_csv('all_posts.csv')
df_all_accounts = pd.read_csv('all_accounts.csv')

((16731, 19), (6904, 12))

In [16]:
hash_table = [
    (0, '240_china_nuclear_pollution', 165),
    (1, '70_billion_japan_water', 165),
    (2, 'Great_Wave_Kanagawa', 49),
    (3, 'cooling_water_nuclear_wastewater', 238),
    (4, 'foreign_affairs_questions', 27),
    (5, 'japan_nuclear_wastewater', 27),
    (6, 'korean_...', 392),
    (7, 'radioactive_condemn_water', 28),
    (8, 'radioactive_pollution_japan_sea', 96),
    (9, 'sue_TEPCO_japan', 202),
    (10, 'treatment_japan_waste_nuclear', 63)
]

# 提取中间一列作为 Python 列表
cluster_names = [item[1] for item in hash_table]
cluster_names

['240_china_nuclear_pollution',
 '70_billion_japan_water',
 'Great_Wave_Kanagawa',
 'cooling_water_nuclear_wastewater',
 'foreign_affairs_questions',
 'japan_nuclear_wastewater',
 'korean_...',
 'radioactive_condemn_water',
 'radioactive_pollution_japan_sea',
 'sue_TEPCO_japan',
 'treatment_japan_waste_nuclear']

In [17]:
start_time = "2021-04-20"
end_time = "2021-04-29"
cluster = cluster_names[2]
A = 'weibo'
B = 'twitter'
cycle = 10
df_data = df_all_posts[df_all_posts['cluster'] == cluster]
cal_cluster_factor(A,B,df_data,end_time,cycle,df_all_posts,df_all_accounts,debug = True)


A: 364 ,B: 21
all_A: 472 ,all_B: 67
sal_factor: 0.24171515304831773


AttributeError: 'float' object has no attribute 'replace'