In [None]:
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

def extract_text_from_pdf(pdf_path) -> str:
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages concatenated in page order, with no extra
        separator inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with ``+=`` page by page.
        return "".join(page.get_text() for page in doc)

# Path to the PDF file (Apple example)
pdf_path = "./pdf_filings/AAPL/2023-11-02_10-K_aapl-20230930.htm.pdf"
full_text = extract_text_from_pdf(pdf_path)
print("전체 텍스트 길이:", len(full_text))

# Maximum sequence length (special tokens included) passed to the model.
# Every tokenizer call below is capped explicitly at this value.
MAX_TOKENS = 512

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Use the fast tokenizer to split the whole text into windows of at most
# MAX_TOKENS tokens; consecutive windows overlap by `stride` tokens.
encoding = tokenizer(
    full_text,
    max_length=MAX_TOKENS,
    truncation=True,
    return_overflowing_tokens=True,
    stride=50,
)

# Decode each window back to text so it can be handed to the pipeline.
# The slice is a defensive cap; the tokenizer call above already limits
# each window to MAX_TOKENS ids.
chunks = [
    tokenizer.decode(enc.ids[:MAX_TOKENS], skip_special_tokens=True)
    for enc in encoding.encodings
]
print("생성된 청크 개수:", len(chunks))

# Run sentiment analysis on every chunk.
# The pipeline re-tokenizes the decoded text, and that round trip can yield
# more tokens than the window it came from: a previous run failed with a
# 515-vs-512 tensor size mismatch even though truncation=True was set
# (presumably the tokenizer carries no usable default maximum length --
# TODO confirm). Passing max_length explicitly makes the truncation effective.
results_per_chunk = [
    sentiment_analyzer(chunk, truncation=True, max_length=MAX_TOKENS)
    for chunk in chunks
]


# Aggregate the results
# Running totals of the reported scores, keyed by lower-cased label.
score_totals = {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}

for chunk_result in results_per_chunk:
    for prediction in chunk_result:
        sentiment = prediction['label'].lower()
        confidence = prediction['score']
        # Labels outside the three tracked classes are ignored.
        if sentiment in score_totals:
            score_totals[sentiment] += confidence

positive_sum = score_totals['positive']
negative_sum = score_totals['negative']
neutral_sum = score_totals['neutral']
# One entry in results_per_chunk per analysed chunk.
chunk_count = len(results_per_chunk)

# Each total is divided by the number of chunks, not by the number of
# predictions that carried that label.
avg_positive = positive_sum / chunk_count
avg_negative = negative_sum / chunk_count
avg_neutral = neutral_sum / chunk_count

print("전체 감정 분석 결과 (청크 평균):")
print(f"긍정 점수: {avg_positive:.4f}")
print(f"부정 점수: {avg_negative:.4f}")
print(f"중립 점수: {avg_neutral:.4f}")


전체 텍스트 길이: 201124


Device set to use mps:0


생성된 청크 개수: 87


RuntimeError: The size of tensor a (515) must match the size of tensor b (512) at non-singleton dimension 1