In [None]:
!pip install transformers

import pandas as pd
import json
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
import re



In [None]:
def load_data(news_path, a_share_list_path):
    """Read the news workbook and the company list from disk.

    Args:
        news_path: Path of the Excel workbook holding the news table.
        a_share_list_path: Path of a UTF-8 encoded JSON file with the company list.

    Returns:
        A ``(news_df, a_share_list)`` tuple: the workbook as a DataFrame and the
        parsed JSON content.
    """
    news_table = pd.read_excel(news_path)
    with open(a_share_list_path, 'r', encoding='utf-8') as handle:
        listing = json.loads(handle.read())
    return news_table, listing

def build_company_dict(a_share_list):
    """Map each company's short name to its full name.

    Args:
        a_share_list: Iterable of records that each provide ``'name'`` and
            ``'fullname'`` keys.

    Returns:
        dict of ``name -> fullname``. When a name occurs more than once the
        last record wins.
    """
    return {entry['name']: entry['fullname'] for entry in a_share_list}

# Load the pretrained Chinese BERT tokenizer and encoder shared by the functions below.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# Move the encoder to the GPU when CUDA is available; otherwise keep it as loaded.
model = model.to('cuda') if torch.cuda.is_available() else model

def batch_text_to_vector(texts, batch_size=32, use_cls_token=False):
    """Embed texts with the module-level BERT ``tokenizer`` and ``model``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        use_cls_token: If True, use the last hidden state at position 0 as the
            text vector. Otherwise use the mean of the last hidden states over
            the positions marked by the attention mask.

    Returns:
        list with one 1-D CPU tensor per input text, in input order.
    """
    # Send inputs to wherever the weights actually live instead of re-querying
    # CUDA availability, so the two can never disagree.
    device = next(model.parameters()).device

    vectors = []
    for start in range(0, len(texts), batch_size):
        batch_texts = list(texts[start:start + batch_size])
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        if use_cls_token:
            # Hidden state of the first position of every sequence.
            batch_vectors = hidden[:, 0, :]
        else:
            # Average only over attended positions. A plain mean(dim=1) would
            # also average the padding positions, making a text's vector depend
            # on the longest text that happens to share its batch.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_vectors = (hidden * mask).sum(dim=1) / token_counts

        # Iterating a 2-D tensor yields one 1-D row per text.
        vectors.extend(batch_vectors.cpu())
    return vectors

def precompute_company_vectors(company_dict, use_cls_token=False):
    """Embed every company short name once and return a name -> vector mapping.

    Only the dictionary keys (short names) are embedded; the full names stored
    as values are not used here.

    Args:
        company_dict: Mapping of company short name to full name.
        use_cls_token: Pooling mode forwarded to ``batch_text_to_vector``. Pass
            the same value that will be given to ``filter_news`` so company and
            news vectors are pooled the same way.

    Returns:
        dict mapping each short name to its embedding tensor.
    """
    unique_names = list(company_dict.keys())
    vectors = batch_text_to_vector(unique_names, use_cls_token=use_cls_token)

    # The embeddings are already computed at this point; the progress bar only
    # tracks assembling the mapping.
    return dict(tqdm(zip(unique_names, vectors), total=len(unique_names), desc="Computing Vectors"))

def calculate_similarity(vector1, vector2):
    """Return the cosine similarity of two embedding tensors as a Python float.

    A 1-D input is promoted to shape ``(1, N)`` before the comparison. The
    similarity is taken along dim 1 and converted with ``.item()``, so the
    inputs must reduce to a single value.
    """
    lhs = vector1.unsqueeze(0) if vector1.dim() == 1 else vector1
    rhs = vector2.unsqueeze(0) if vector2.dim() == 1 else vector2
    score = F.cosine_similarity(lhs, rhs, dim=1)
    return score.item()

def filter_news(news_df, company_vectors, news_path, threshold=0.8, use_cls_token=False):
    """Score every news row against all company vectors and keep the close matches.

    For each row the best-matching company name and its cosine similarity are
    stored in the ``'SimilarityName'`` and ``'SimilarityScore'`` columns of
    ``news_df`` (added or overwritten in place). A row whose best similarity is
    not positive keeps score ``0.0`` and an empty name. The scored frame is then
    written to ``news_path``.

    Args:
        news_df: DataFrame with a ``'News'`` text column. Modified in place.
        company_vectors: Mapping of company name to embedding tensor, e.g. the
            result of ``precompute_company_vectors``.
        news_path: Excel path the scored frame is WRITTEN to (an output path,
            despite the name).
        threshold: Rows with a score strictly above this value are returned.
        use_cls_token: Pooling mode forwarded to ``batch_text_to_vector`` for
            the news texts.

    Returns:
        DataFrame holding the rows of ``news_df`` whose score exceeds
        ``threshold``, including the two similarity columns.
    """
    # Missing or non-string cells would make the tokenizer fail, so coerce them.
    news_texts = news_df['News'].fillna('').astype(str).tolist()
    news_vectors = batch_text_to_vector(news_texts, use_cls_token=use_cls_token)

    names = list(company_vectors.keys())
    best_scores = [0.0] * len(news_texts)
    best_names = [''] * len(news_texts)

    if news_vectors and names:
        # One matrix product of L2-normalised rows gives every news/company
        # cosine similarity at once, replacing a Python loop over all pairs.
        news_matrix = F.normalize(
            torch.stack([vec.reshape(-1) for vec in news_vectors]).float(), dim=1)
        company_matrix = F.normalize(
            torch.stack([company_vectors[name].reshape(-1) for name in names]).float(), dim=1)
        similarity = news_matrix @ company_matrix.T
        top_scores, top_columns = similarity.max(dim=1)

        for row, (score, column) in enumerate(zip(top_scores.tolist(), top_columns.tolist())):
            # Only a strictly positive similarity replaces the 0.0 / '' default.
            if score > 0.0:
                best_scores[row] = score
                best_names[row] = names[column]

    # Whole-column assignment is positional, so it stays correct when the frame
    # does not carry a default 0..n-1 index.
    news_df['SimilarityScore'] = best_scores
    news_df['SimilarityName'] = best_names

    # Save the scored frame to the Excel file.
    news_df.to_excel(news_path, index=False)

    # Boolean selection keeps columns and dtypes even when nothing matches.
    return news_df[news_df['SimilarityScore'] > threshold].copy()



tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [None]:
# Alternative news workbook (currently disabled): 'Irrelevant_News_All_副本.xlsx'
news_path = 'gen_shuffled.xlsx'  # Excel workbook that load_data reads into news_df
a_share_list_path = 'A_share_list.json'  # JSON file that load_data parses into a_share_list
# Read both inputs into memory.
news_df, a_share_list = load_data(news_path, a_share_list_path)

In [None]:
# Map company short names to full names, then embed every short name once.
company_dict = build_company_dict(a_share_list)
company_vectors = precompute_company_vectors(company_dict)

Computing Vectors:   0%|          | 0/4625 [00:00<?, ?it/s]

In [None]:
# Score all news rows with the default threshold and pooling; the third argument
# is the Excel file the scored frame is written to.
filtered_news_df = filter_news(news_df, company_vectors, 'Scored_Irrelevant_New_sample.xlsx')

  0%|          | 0/80 [00:00<?, ?it/s]

In [None]:
# Save the rows that passed the threshold. No index=False is given here, so the
# DataFrame index is written as the first column, unlike the scored workbook.
filtered_news_df.to_excel('filtered_news_sample.xlsx')