# 调用模型进行情感分析

基于 yiyanghkust/finbert-tone-chinese 的文本情感分析

https://huggingface.co/yiyanghkust/finbert-tone-chinese

In [None]:
import torch

# Report which GPU PyTorch can see, or say so when none is available.
if not torch.cuda.is_available():
    print("没有可用的 GPU")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU 型号: {device_name}")

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Use the GPU when PyTorch reports one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Directory holding the locally stored model files.
local_model_path = "models/finbert-tone-chinese"

# Load the tokenizer and classifier from the local directory, then move the
# classifier onto the selected device.
finbert_tone_chinese_tokenizer = AutoTokenizer.from_pretrained(local_model_path)
finbert_tone_chinese_model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
finbert_tone_chinese_model = finbert_tone_chinese_model.to(device)


In [None]:
def analyze_sentiment(text):
    """Score ``text`` with the module-level Chinese FinBERT-tone classifier.

    The text is tokenised (truncated to at most 512 tokens), run through
    ``finbert_tone_chinese_model`` without gradient tracking, and the logits
    are turned into probabilities with a softmax over dimension 1.

    Args:
        text: The text to classify.

    Returns:
        A flattened NumPy array holding the softmax probabilities.
    """
    encoded = finbert_tone_chinese_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = finbert_tone_chinese_model(**encoded).logits

    probabilities = torch.softmax(logits, dim=1)
    return probabilities.cpu().numpy().flatten()

In [None]:
# Score every not-yet-processed feather batch with the Chinese model and write
# the result, under the same file name, into ``output_dir``.
output_dir = 'finbert_tone_chinese_output'
os.makedirs(output_dir, exist_ok=True)
tqdm.pandas(desc="Processing sentiment analysis")

input_files = os.listdir("batch_save_xlcj_flash")

# Anything already present in the output directory is treated as done, so an
# interrupted run can be resumed.
processed_files = set(os.listdir(output_dir))

# Only process files that haven't been processed yet
files_to_process = [file for file in input_files if file not in processed_files]

for file in tqdm(files_to_process, desc="Processing sentiment analysis"):
    flash_batch_data = pd.read_feather(f"batch_save_xlcj_flash/{file}")

    # Rows without text cannot be scored. Dropping them leaves gaps in the
    # index, and pandas documents ``to_feather`` as requiring a default index,
    # so the index is rebuilt before anything is written.
    flash_batch_data = flash_batch_data.dropna(subset=['rich_text']).reset_index(drop=True)
    if flash_batch_data.empty:
        # Nothing to score; no output file is written for this batch.
        continue

    # Labels: 0 -> Neutral; 1 -> Positive; 2 -> Negative
    flash_batch_data[['Neutral', 'Positive', 'Negative']] = flash_batch_data['rich_text'].apply(
        lambda x: pd.Series(analyze_sentiment(x))
    )

    flash_batch_data.to_feather(f"{output_dir}/{file}")

In [None]:
# Drop the references to the first model so its memory can be reclaimed before
# the next models are loaded.
import gc

del finbert_tone_chinese_tokenizer
del finbert_tone_chinese_model

# Clear the cache: collect anything still held by reference cycles, then hand
# the CUDA memory PyTorch keeps cached back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

调用 Helsinki-NLP/opus-mt-zh-en 将中文翻译为英文之后，再调用原始 FinBERT 进行情感分析

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import torch
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use the GPU when PyTorch reports one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Translation model, loaded from a local directory.
local_translator_model_path = "models/opus-mt-zh-en"

translator_tokenizer = MarianTokenizer.from_pretrained(local_translator_model_path)
translator_model = MarianMTModel.from_pretrained(local_translator_model_path)
translator_model = translator_model.to(device)

# Sentiment classifier applied to the translated text, also loaded locally.
local_finbert_model_path = "models/FinancialBERT-Sentiment-Analysis"
finbert_tokenizer = AutoTokenizer.from_pretrained(local_finbert_model_path)
finbert_model = AutoModelForSequenceClassification.from_pretrained(local_finbert_model_path)
finbert_model = finbert_model.to(device)

In [None]:
def translate_text(text):
    """Translate ``text`` with the module-level translator, chunking long input.

    The text is tokenised without special tokens and split into chunks of at
    most ``max_length - 1`` tokens. The tokenizer's EOS id is then appended to
    every chunk, so each chunk is terminated the same way and never exceeds
    ``max_length`` tokens. Each chunk is translated on its own and the pieces
    are joined with single spaces.

    Args:
        text: The source text to translate.

    Returns:
        The translated text, or an empty string when ``text`` produces no
        tokens.
    """
    # Maximum number of tokens per chunk, EOS included.
    max_length = 512

    # Encode once, without special tokens, so EOS can be added per chunk.
    token_ids = translator_tokenizer(text, add_special_tokens=False)["input_ids"]
    if not token_ids:
        # Nothing to translate; skip the model call entirely.
        return ""

    eos_id = translator_tokenizer.eos_token_id
    body_length = max_length - 1  # leave room for the EOS token

    translated_texts = []
    for start in range(0, len(token_ids), body_length):
        chunk = token_ids[start:start + body_length] + [eos_id]

        # Build the input tensors directly on the target device.
        input_ids = torch.tensor([chunk], dtype=torch.long, device=device)
        attention_mask = torch.ones_like(input_ids)

        # Translate this chunk.
        translated_ids = translator_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        translated_text = translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        translated_texts.append(translated_text)

    # Join the per-chunk translations.
    return " ".join(translated_texts)

def analyze_sentiment(text):
    """Score ``text`` with the module-level ``finbert_model`` classifier.

    The text is tokenised (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the logits are turned into
    probabilities with a softmax over dimension 1.

    Args:
        text: The text to classify.

    Returns:
        A flattened NumPy array holding the softmax probabilities.
    """
    encoded = finbert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = finbert_model(**encoded).logits

    probabilities = torch.softmax(logits, dim=1)
    return probabilities.cpu().numpy().flatten()

def process_df(df):
    """Translate the ``rich_text`` column and attach sentiment probabilities.

    Adds a ``translate_content`` column produced by ``translate_text`` and the
    columns ``Negative``, ``Neutral`` and ``Positive`` produced by
    ``analyze_sentiment``. The frame is modified in place.

    Args:
        df: DataFrame with a ``rich_text`` column.

    Returns:
        The same DataFrame, with the new columns added.
    """
    tqdm.pandas()

    translated = df['rich_text'].apply(translate_text)
    df['translate_content'] = translated

    # Labels: 0 -> Negative; 1 -> Neutral; 2 -> Positive
    sentiment_columns = ['Negative', 'Neutral', 'Positive']
    scores = df['translate_content'].apply(lambda x: pd.Series(analyze_sentiment(x)))
    df[sentiment_columns] = scores

    return df




In [None]:

import os
import pandas as pd
from tqdm import tqdm

# Translate and score every not-yet-processed feather batch, writing each
# result under the same file name into ``output_dir``.
output_dir = 'translate_en_finbert_origin_output'
os.makedirs(output_dir, exist_ok=True)
tqdm.pandas(desc="Processing sentiment analysis")

input_files = os.listdir("batch_save_xlcj_flash")
# Anything already present in the output directory is treated as done.
processed_files = set(os.listdir(output_dir))

# Files that raised during the most recent pass; reported after the loop.
error_files = []
# Files with no usable rows. They never produce an output file, so they are
# tracked separately to keep them from being picked up again on every pass.
skipped_files = set()
# Failed files are retried, but only this many passes are made in total, so a
# file that always fails cannot keep the loop running forever.
max_passes = 3
passes_done = 0

# Keep making passes until every file is handled or the pass limit is reached.
while True:
    # Only look at files that are neither written out nor known to be empty.
    files_to_process = [
        file for file in input_files
        if file not in processed_files and file not in skipped_files
    ]

    if not files_to_process:
        print("所有文件已处理完成。")
        break
    if passes_done >= max_passes:
        # What is left failed on every pass; it is reported below.
        break
    passes_done += 1
    # Start each pass with a clean list so the final report has no duplicates.
    error_files = []

    for file in tqdm(files_to_process):
        try:
            flash_batch_data = pd.read_feather(f"batch_save_xlcj_flash/{file}")
            # Dropping rows leaves gaps in the index, and pandas documents
            # ``to_feather`` as requiring a default index, so rebuild it.
            flash_batch_data = flash_batch_data.dropna(subset=['rich_text']).reset_index(drop=True)
            if flash_batch_data.empty:
                print(f"{file} 是空的，跳过。")
                skipped_files.add(file)
                continue

            flash_batch_data_with_senti = process_df(flash_batch_data)
            flash_batch_data_with_senti.to_feather(f"{output_dir}/{file}")
            processed_files.add(file)  # mark as done for later passes
        except Exception as e:
            # Top-level boundary: one bad batch must not stop the others.
            print(f"处理文件 {file} 时发生错误: {e}")
            error_files.append(file)

# Print the files that still failed on the last pass.
if error_files:
    print("以下文件在处理时发生了错误:")
    for error_file in error_files:
        print(error_file)