In [9]:
import os
import sys
import PyPDF2
import re
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from collections import Counter, defaultdict
import numpy as np

# Load the pretrained sequence-classification model and its matching tokenizer.
model_name = "j-hartmann/emotion-english-distilroberta-base"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# This script is GPU-only: stop right away when CUDA is not available.
if not torch.cuda.is_available():
    print("cuda is NOT available")
    sys.exit()

# Device index handed to the pipeline below; the model itself is moved to the GPU.
device = 0
model.to('cuda')

# Build the classification pipeline on the selected device.
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

# Folder that holds the PDF files to analyse.
pdf_folder = r'C:/Users/dlsdn/Desktop/paper/nuclear'

# Extract the text of every PDF found in a folder.
def extract_text_from_pdfs(folder_path):
    """Return the text of all PDF files located directly inside *folder_path*.

    Only entries whose name ends in ``.pdf`` (case-insensitive) are read;
    sub-folders are not searched. A progress line is printed per file.

    Args:
        folder_path: Directory to scan.

    Returns:
        One string holding the extracted text of every page, with pages
        separated by a newline. An empty string when no PDF is present.
    """
    page_texts = []
    # Sorted so the result does not depend on the order os.listdir returns.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a falsy extraction result for a page.
            page_texts.extend(page.extract_text() or "" for page in reader.pages)
    # Joining with a newline keeps the last word of one page from fusing with
    # the first word of the next, and avoids repeated string concatenation.
    return "\n".join(page_texts)

# Text preprocessing.
def preprocess_text(text):
    """Normalise raw text before sentence splitting and classification.

    Steps, in order: lowercase; drop digit runs; rewrite "leader" and
    "leadership" to "kim"; strip every character that is not a word
    character, whitespace, '.' or ','; remove English month names; tokenize
    with NLTK and drop English stop words.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    print("Preprocessing text...")
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # One pass with the optional suffix: replacing "leader" first would turn
    # "leadership" into "kimship" and the longer word would never be matched.
    text = re.sub(r'leader(?:ship)?', 'kim', text)
    text = re.sub(r'[^\w\s.,]', '', text)
    months = [
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december'
    ]
    # NOTE(review): this also removes the ordinary words "may" and "march".
    text = re.sub(r'\b(?:' + '|'.join(months) + r')\b', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    print("Text preprocessing completed.")
    return ' '.join(filtered_tokens)

# Custom sentence splitter.
def custom_sentence_split(text):
    """Split *text* into fragments at whitespace that follows a '.' or a ','.

    The delimiter character stays attached to the fragment before it.
    Fragments are stripped and empty ones are discarded.

    Args:
        text: String to split.

    Returns:
        List of non-empty, stripped fragments in their original order.
    """
    print("Starting sentence splitting...")
    boundary = re.compile(r'(?<=\.)\s+|(?<=\,)\s+')
    fragments = []
    for chunk in boundary.split(text):
        trimmed = chunk.strip()
        if trimmed:
            fragments.append(trimmed)
    print("Sentence splitting completed.")
    return fragments

# Split the text into sentences and classify each one.
def analyze_sentiment_text(text):
    """Classify every usable sentence of *text* and aggregate the labels.

    The text is split with ``custom_sentence_split``; fragments with fewer
    than 5 or more than 512 NLTK tokens are dropped, as are fragments whose
    model-tokenizer encoding is longer than 512 ids. Each remaining fragment
    is passed to the ``sentiment_analysis`` pipeline.

    Args:
        text: Preprocessed text.

    Returns:
        A tuple ``(label_count, label_scores)``: a ``Counter`` mapping each
        predicted label to its number of occurrences, and a
        ``defaultdict(list)`` mapping each label to the list of scores.
    """
    print("Starting sentiment analysis...")

    print("# 1: split")
    sentences = custom_sentence_split(text)

    print("# 2: filtering")
    filtered_sentences = [
        sentence for sentence in sentences
        if 5 <= len(word_tokenize(sentence)) <= 512
    ]

    print("# 3: sentiment analysis")
    results = []
    for sentence in filtered_sentences:
        # Truncation must be off for this check: a truncated encoding can never
        # be longer than the model limit, which made the skip below unreachable
        # and let over-long sentences through to the pipeline.
        if len(tokenizer.encode(sentence, truncation=False)) > 512:
            continue
        results.append(sentiment_analysis(sentence))

    print("# 4: result")
    label_count = Counter()
    label_scores = defaultdict(list)

    for result in results:
        # Only the first entry of each per-sentence result is used.
        top = result[0]
        label_count[top['label']] += 1
        label_scores[top['label']].append(top['score'])

    print("Sentiment analysis completed.")

    return label_count, label_scores

# End-to-end pipeline.
def sentiment_analysis_pipeline(folder_path):
    """Run extraction, preprocessing and classification for one folder.

    Prints progress messages and, per label, the occurrence count and the
    mean score.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        The ``(label_count, label_scores)`` pair produced by
        ``analyze_sentiment_text``.
    """
    print("Starting sentiment analysis pipeline...")

    cleaned_text = preprocess_text(extract_text_from_pdfs(folder_path))
    label_counts, scores_by_label = analyze_sentiment_text(cleaned_text)

    print("Sentiment analysis pipeline completed.")

    # Report the per-label summary.
    for emotion, count in label_counts.items():
        avg_score = np.mean(scores_by_label[emotion])
        print(f"{emotion.capitalize()}: {count} occurrences, Average Score: {avg_score:.4f}")

    return label_counts, scores_by_label

# Run the full pipeline on the configured PDF folder.
result, scores = sentiment_analysis_pipeline(pdf_folder)



Starting sentiment analysis pipeline...
Processing file: 2006_message_1.pdf
Processing file: 2006_news_1.pdf
Processing file: 2009_news1.pdf
Processing file: 2012_news1.pdf
Processing file: 2012_news2.pdf
Processing file: 2012_news3.pdf
Processing file: 2014_news1.pdf
Processing file: 2014_news10.pdf
Processing file: 2014_news2.pdf
Processing file: 2014_news3.pdf
Processing file: 2014_news4.pdf
Processing file: 2014_news5.pdf
Processing file: 2014_news6.pdf
Processing file: 2014_news7.pdf
Processing file: 2014_news8.pdf
Processing file: 2014_news9.pdf
Processing file: 2014_statement1.pdf
Processing file: 2014_statement2.pdf
Processing file: 2014_statement3.pdf
Processing file: 2014_statement4.pdf
Processing file: 2015_news1.pdf
Processing file: 2015_news2.pdf
Processing file: 2015_news3.pdf
Processing file: 2015_news4.pdf
Processing file: 2015_news5.pdf
Processing file: 2015_news6.pdf
Processing file: 2015_statement1.pdf
Processing file: 2015_statement2.pdf
Processing file: 2015_statem