基于 yiyanghkust/finbert-tone-chinese 的文本情感分析

https://huggingface.co/yiyanghkust/finbert-tone-chinese

In [1]:
import pandas as pd
import os
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The tokenizer and the classifier are both loaded from the same checkpoint.
FINBERT_TONE_CHINESE_CHECKPOINT = "yiyanghkust/finbert-tone-chinese"
finbert_tone_chinese_tokenizer = AutoTokenizer.from_pretrained(FINBERT_TONE_CHINESE_CHECKPOINT)
finbert_tone_chinese_model = AutoModelForSequenceClassification.from_pretrained(
    FINBERT_TONE_CHINESE_CHECKPOINT
)
finbert_tone_chinese_model = finbert_tone_chinese_model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def analyze_sentiment(text):
    """Score one text with the finbert-tone-chinese classifier.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``finbert_tone_chinese_model`` without gradient tracking, and the logits
    are turned into probabilities with a softmax over the class dimension.

    Args:
        text: The text to classify.

    Returns:
        A flat NumPy array of class probabilities in label-index order. The
        batch loop in this notebook stores them as Neutral, Positive, Negative
        (labels 0, 1, 2).
    """
    encoded = finbert_tone_chinese_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        class_logits = finbert_tone_chinese_model(**encoded).logits
        class_probs = torch.softmax(class_logits, dim=1)

    return class_probs.cpu().numpy().flatten()


In [None]:
# Directory holding the input feather batches, and the directory for scored batches.
input_dir = "batch_save_xlcj_flash"
output_dir = 'finbert_tone_chinese_output'
os.makedirs(output_dir, exist_ok=True)
# Registers `progress_apply` on pandas objects; the loop below uses plain `apply`.
tqdm.pandas(desc="Processing sentiment analysis")

# Keep regular files only (sub-directories cannot be read as feather) and sort
# the names so every run walks the batches in the same order.
input_files = sorted(
    name for name in os.listdir(input_dir)
    if os.path.isfile(os.path.join(input_dir, name))
)

processed_files = set(os.listdir(output_dir))

# Only process files that haven't been processed yet
files_to_process = [file for file in input_files if file not in processed_files]

for file in tqdm(files_to_process, desc="Processing sentiment analysis"):
    flash_batch_data = pd.read_feather(os.path.join(input_dir, file))

    # Labels: 0 -> Neutral; 1 -> Positive; 2 -> Negative
    flash_batch_data[['Neutral', 'Positive', 'Negative']] = flash_batch_data['rich_text'].apply(
        lambda x: pd.Series(analyze_sentiment(x))
    )

    # Write under a temporary name and rename once the write has finished.
    # The resume check above treats any file named like an input as done, so a
    # run interrupted mid-write must never leave a partial file under that name.
    output_path = os.path.join(output_dir, file)
    partial_path = f"{output_path}.partial"
    flash_batch_data.to_feather(partial_path)
    os.replace(partial_path, output_path)

Processing sentiment analysis:   0%|          | 10/2144 [00:14<57:21,  1.61s/it] 

调用 Helsinki-NLP/opus-mt-zh-en 将中文翻译为英文之后，再调用英文 FinBERT 类模型（ahmedrachid/FinancialBERT-Sentiment-Analysis）进行情感分析

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Chinese -> English translation model (tokenizer and seq2seq weights share one checkpoint).
TRANSLATOR_CHECKPOINT = "Helsinki-NLP/opus-mt-zh-en"
translator_tokenizer = AutoTokenizer.from_pretrained(TRANSLATOR_CHECKPOINT)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATOR_CHECKPOINT)
translator_model = translator_model.to(device)

# English sentiment classifier applied to the translated text.
FINBERT_CHECKPOINT = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = finbert_model.to(device)



In [None]:
def translate_text(text):
    """Translate one text to English with the opus-mt-zh-en translator.

    The input is tokenized with ``translator_tokenizer``, decoded with
    ``translator_model.generate`` and the first generated sequence is returned
    as a plain string with special tokens removed.

    Args:
        text: The source text to translate.

    Returns:
        The translated text as a string.
    """
    # Cap the input at 512 tokens, the same limit `analyze_sentiment` uses.
    # Without truncation an over-long text is sent to the model at full length;
    # NOTE(review): 512 is assumed to be this checkpoint's maximum input
    # length -- confirm against the model config.
    inputs = translator_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only; made explicit to match the sentiment functions.
    with torch.no_grad():
        translated_ids = translator_model.generate(**inputs)

    return translator_tokenizer.decode(translated_ids[0], skip_special_tokens=True)

# Function: run sentiment analysis
def analyze_sentiment(text):
    """Score one (translated) text with the FinancialBERT sentiment classifier.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``finbert_model`` without gradient tracking, and the logits are turned
    into probabilities with a softmax over the class dimension.

    NOTE: this redefines the ``analyze_sentiment`` from the
    finbert-tone-chinese section earlier in the notebook; once this cell has
    run, the name refers to this version.

    Args:
        text: The text to classify.

    Returns:
        A flat NumPy array of class probabilities in label-index order.
        ``process_df`` stores them as Negative, Neutral, Positive
        (labels 0, 1, 2).
    """
    encoded = finbert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        class_logits = finbert_model(**encoded).logits
        class_probs = torch.softmax(class_logits, dim=1)

    return class_probs.cpu().numpy().flatten()

def process_df(df):
    """Translate each row's text and attach sentiment probabilities.

    Reads the ``rich_text`` column, stores its translation in a new
    ``translate_content`` column, then scores the translation and stores the
    three class probabilities in ``Negative``, ``Neutral`` and ``Positive``.

    Args:
        df: DataFrame with a ``rich_text`` column. It is modified in place.

    Returns:
        The same DataFrame object, with the four columns added.
    """
    # Labels: 0 -> Negative; 1 -> Neutral; 2 -> Positive
    label_columns = ['Negative', 'Neutral', 'Positive']

    def _scores_as_series(english_text):
        # One Series per row makes `apply` expand the result into columns.
        return pd.Series(analyze_sentiment(english_text))

    df['translate_content'] = df['rich_text'].apply(translate_text)
    sentiment_scores = df['translate_content'].apply(_scores_as_series)
    df[label_columns] = sentiment_scores
    return df


In [None]:
# Directory holding the input feather batches, and the directory for scored batches.
input_dir = "batch_save_xlcj_flash"
output_dir = 'translate_en_finbert_origin_output'
os.makedirs(output_dir, exist_ok=True)
# Registers `progress_apply` on pandas objects; the loop below does not call it itself.
tqdm.pandas(desc="Processing sentiment analysis")

# Keep regular files only (sub-directories cannot be read as feather) and sort
# the names so every run walks the batches in the same order.
input_files = sorted(
    name for name in os.listdir(input_dir)
    if os.path.isfile(os.path.join(input_dir, name))
)

processed_files = set(os.listdir(output_dir))

# Only process files that haven't been processed yet
files_to_process = [file for file in input_files if file not in processed_files]

for file in tqdm(files_to_process, desc="Processing sentiment analysis"):
    flash_batch_data = pd.read_feather(os.path.join(input_dir, file))
    flash_batch_data_with_senti = process_df(flash_batch_data)

    # Write under a temporary name and rename once the write has finished.
    # The resume check above treats any file named like an input as done, so a
    # run interrupted mid-write must never leave a partial file under that name.
    output_path = os.path.join(output_dir, file)
    partial_path = f"{output_path}.partial"
    flash_batch_data_with_senti.to_feather(partial_path)
    os.replace(partial_path, output_path)