모델 학습코드

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import ConcatDataset, DataLoader
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import classification_report
import torch

# ================================
# Environment setup
# ================================
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the original comment labelled this "KorSci-BERT", but the
# checkpoint actually loaded is "kykim/bert-kor-base" - confirm which is meant.
tokenizer = BertTokenizer.from_pretrained("kykim/bert-kor-base")
model = BertForSequenceClassification.from_pretrained(
    "kykim/bert-kor-base",
    num_labels=2,
    use_safetensors=True  # original note: works around a security issue
).to(device)

# ================================
# Dataset loading
# ================================
# Each split concatenates a label-1 dataset and a label-0 dataset.
# NOTE(review): the three-argument ClauseDataset(name, tokenizer, label) used
# here is not defined in this cell - confirm where it comes from.
train_dataset = ConcatDataset([
    ClauseDataset('T_P', tokenizer, 1),
    ClauseDataset('T_N', tokenizer, 0)
])
val_dataset = ConcatDataset([
    ClauseDataset('V_P', tokenizer, 1),
    ClauseDataset('V_N', tokenizer, 0)
])

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# ================================
# Class-imbalance-corrected loss
# ================================
# NOTE(review): hard-coded sample counts - presumably the sizes of the label-1
# and label-0 training sets; confirm they match the data actually loaded.
pos_count = 5932
neg_count = 3172
total = pos_count + neg_count

# Index 0 = unfavorable (label=0), index 1 = favorable (label=1).
# Each class is weighted by total / class_count, so the rarer class weighs more.
class_weights = torch.tensor([total / neg_count, total / pos_count], dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# ================================
# ✅ 학습 루프 함수
# ================================
def train(model, dataloader):
    """Run one training epoch and return ``(mean_loss, accuracy)``.

    The loss is computed with the module-level ``criterion`` (the
    class-weighted ``CrossEntropyLoss``).  Passing ``labels`` to the model
    and using ``outputs.loss`` would optimise the model's own built-in loss
    instead, leaving the class weights without any effect.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Tuple of the loss averaged over batches and the fraction of samples
        whose arg-max prediction equals the label.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``.
    """
    model.train()
    total_loss, correct, total = 0, 0, 0

    for batch in tqdm(dataloader, desc="Train"):
        input_ids, attn_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        # No ``labels`` argument: the loss is computed below with the weights.
        logits = model(input_ids=input_ids, attention_mask=attn_mask).logits
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(dataloader), correct / total

# ================================
# ✅ 검증 루프 함수
# ================================
def evaluate(model, dataloader):
    """Evaluate the model and return ``(mean_loss, accuracy, report)``.

    The loss uses the module-level class-weighted ``criterion`` so that the
    validation loss is measured with the same objective as the weighted
    loss defined for training.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Tuple of the loss averaged over batches, the accuracy, and the text
        produced by ``classification_report``.

    Uses the module-level ``device`` and ``criterion``.
    """
    model.eval()
    total_loss, correct, total = 0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Val"):
            input_ids, attn_mask, labels = [x.to(device) for x in batch]
            logits = model(input_ids=input_ids, attention_mask=attn_mask).logits
            loss = criterion(logits, labels)

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # Pin the label set: without ``labels`` the report raises when one class
    # is absent from both predictions and targets, because the two
    # ``target_names`` no longer match the number of classes found.
    report = classification_report(
        all_labels,
        all_preds,
        labels=[0, 1],
        target_names=["불리", "유리"],
        digits=4,
    )
    return total_loss / len(dataloader), correct / total, report

# ================================
# ✅ 학습 실행 (최고 모델 저장 포함)
# ================================
# Best validation accuracy seen so far and the number of consecutive epochs
# without improvement.
best_acc = 0.0
early_stop_count = 0
EARLY_STOP = 3   # stop after this many epochs without improvement
EPOCHS = 10      # upper bound on the number of epochs

for epoch in range(1, EPOCHS + 1):
    print(f"\n🔁 Epoch {epoch}")
    
    # Train for one epoch
    train_loss, train_acc = train(model, train_loader)
    
    # Validate
    val_loss, val_acc, val_report = evaluate(model, val_loader)

    print(f"📘 Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")
    print(f"📗 Val Loss:   {val_loss:.4f} | Acc: {val_acc:.4f}")
    print(f"\n{val_report}")

    # Save the weights whenever validation accuracy strictly improves;
    # an improvement also resets the early-stop counter.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_korsci_bert_model.pt")
        print("✅ Best model saved.")
        early_stop_count = 0
    else:
        early_stop_count += 1
        print(f"⏸ No improvement. Early stop count: {early_stop_count}")

    # Early-stopping condition
    if early_stop_count >= EARLY_STOP:
        print("⛔️ Early stopping triggered.")
        break

SHAP 분석 코드

In [None]:
import os
import re
import json
import torch
import shap
import numpy as np
from collections import defaultdict
from transformers import BertTokenizerFast, BertForSequenceClassification
import matplotlib.pyplot as plt
from soynlp.noun import LRNounExtractor_v2

# Model setup: load tokenizer and classifier from MODEL_PATH, move the model
# to the selected device and switch it to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = "./domain_model"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
model.to(device)
model.eval()

# Load stopwords: one entry per non-blank line, surrounding whitespace removed.
with open("stopwords_ko.txt", "r", encoding="utf-8") as f:
    STOPWORDS = set(line.strip() for line in f if line.strip())

# ✅ 유틸 함수
def is_structural_phrase(word):
    """Tell whether ``word`` is clause numbering rather than content.

    Truthy for ``제N항``, ``제N조``, ``제N``, a bare run of digits, or one of
    the circled numerals ①-⑦; falsy otherwise.
    """
    numbering_patterns = (r'^제\d+항$', r'^제\d+조$', r'^제\d+$', r'^\d+$')
    for pattern in numbering_patterns:
        hit = re.match(pattern, word)
        if hit:
            return hit
    return word in {'①','②','③','④','⑤','⑥','⑦'}

def remove_clause_title(text):
    """Drop a leading ``제N조`` title line from a clause and return the body.

    Args:
        text: Clause text, optionally starting with a title line such as
            ``제5조 (청약철회)``.

    Returns:
        The stripped text after the first line when that line starts with
        ``제N조`` and something follows it; otherwise ``text`` unchanged.
    """
    lines = text.strip().split("\n")
    if not re.match(r'^제\d+조', lines[0]):
        return text
    body = "\n".join(lines[1:]).strip()
    # A clause written on a single line ("제1조 (목적) ...") has nothing after
    # the title line; returning "" would pass an empty sentence downstream,
    # so keep the original text in that case.
    return body if body else text

def load_all_clauses(folders):
    """Collect clause strings from every ``*.json`` file in ``folders``.

    For each JSON file the ``"clauseArticle"`` entry (an empty list when the
    key is missing) is scanned, and every string element is appended with
    surrounding whitespace removed.  Non-string elements are ignored.

    Returns:
        A flat list of clause strings, in directory-listing order.
    """
    collected = []
    for folder in folders:
        json_names = [name for name in os.listdir(folder) if name.endswith(".json")]
        for name in json_names:
            with open(os.path.join(folder, name), 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            for entry in payload.get("clauseArticle", []):
                if isinstance(entry, str):
                    collected.append(entry.strip())
    return collected

# ✅ SHAP용 래퍼
class BertWrapper:
    """Callable adapter that maps a batch of texts to class probabilities.

    Calling an instance tokenizes the texts (padded, truncated to 128
    tokens), runs the model without gradient tracking, and returns the
    softmax over the logits as a NumPy array.
    """

    def __init__(self, model, tokenizer, device):
        self.model, self.tokenizer, self.device = model, tokenizer, device

    def __call__(self, texts):
        batch = self.tokenizer(
            list(texts),
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128,
        )
        ids = batch['input_ids'].to(self.device)
        mask = batch['attention_mask'].to(self.device)
        with torch.no_grad():
            output = self.model(input_ids=ids, attention_mask=mask)
            probabilities = torch.softmax(output.logits, dim=1)
        return probabilities.cpu().numpy()

# ✅ SHAP 단어 병합
def merge_tokens_by_offset(text, shap_values, tokenizer, label_idx):
    """Merge sub-word SHAP scores into word-level ``(word, score)`` pairs.

    The text is re-tokenized with offset mapping.  Tokens whose character
    spans touch (a token starts exactly where the previous one ended) are
    joined into one word and their scores for ``label_idx`` are summed.
    ``[CLS]``/``[SEP]`` and zero-width tokens are ignored.

    Words are dropped when they are a single character, consist only of
    non-word characters/digits, appear in ``STOPWORDS``, or are structural
    numbering according to ``is_structural_phrase``.

    Returns:
        List of ``(word, summed_score)`` tuples in text order.
    """
    encoding = tokenizer(text, return_offsets_mapping=True, return_tensors='pt', truncation=True, max_length=128)
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    offsets = encoding['offset_mapping'][0].tolist()
    values = shap_values.values[0][:, label_idx]

    merged = []
    buf_word, buf_score, last_end = '', 0.0, -1
    for token, (start, end), score in zip(tokens, offsets, values):
        if start == end or token in ('[CLS]', '[SEP]'):
            continue
        # A gap between this token and the previous one closes the current word.
        if buf_word and start != last_end:
            merged.append((buf_word, buf_score))
            buf_word, buf_score = '', 0.0
        buf_word = buf_word + text[start:end]
        buf_score = buf_score + score
        last_end = end

    if buf_word:
        merged.append((buf_word, buf_score))

    kept = []
    for word, word_score in merged:
        if len(word) <= 1:
            continue
        if re.match(r'^[\W\d]+$', word):
            continue
        if word in STOPWORDS:
            continue
        if is_structural_phrase(word):
            continue
        kept.append((word.strip(), word_score))
    return kept

# ✅ 핵심 구 추출
def extract_phrases_with_scores(sentence, top_words, word_score_dict, window=8):
    """Return the single highest-impact phrase of ``sentence``.

    Every run of 1..``window`` consecutive tokens (Hangul/Latin/digit
    sequences) is a candidate.  A candidate's score is the sum of
    ``word_score_dict`` values of the ``top_words`` that occur in it as
    substrings; candidates containing none are discarded.

    Returns:
        A list holding at most one ``(phrase, score)`` tuple - the candidate
        with the largest absolute score (earliest candidate wins ties).
    """
    tokens = re.findall(r'[가-힣a-zA-Z0-9]+', sentence)
    visited = set()
    candidates = []
    for start in range(len(tokens)):
        last = min(len(tokens), start + window)
        for stop in range(start + 1, last + 1):
            span = ' '.join(tokens[start:stop]).strip()
            if span in visited:
                continue
            visited.add(span)
            hits = [w for w in top_words if w in span]
            if not hits:
                continue
            candidates.append((span, sum(word_score_dict.get(w, 0.0) for w in hits)))
    candidates.sort(key=lambda item: abs(item[1]), reverse=True)
    return candidates[:1]

# ✅ 복합명사 기반 n-gram
def compute_ngram_shap_sum(nouns, word_score_dict, label_text, n=3):
    """Print up to three noun n-grams with the largest summed SHAP score.

    Nouns that contain one of the marker strings (제/조/항/호/번호), contain
    a character outside word characters/Hangul/space, or are structural
    numbering are removed first.  Consecutive windows of ``n`` remaining
    nouns are scored by summing ``word_score_dict`` values; only windows
    with a positive sum are kept.  The ranked windows are then printed,
    skipping any window that shares a word with one already selected.

    Returns nothing; the result is written to stdout.
    """
    marker_strings = {"제","조","항","호","번호"}

    def contains_stopword(phrase):
        if any(marker in phrase for marker in marker_strings):
            return True
        return re.search(r'[^\w가-힣 ]', phrase)

    usable = []
    for noun in nouns:
        if contains_stopword(noun) or is_structural_phrase(noun):
            continue
        usable.append(noun)

    scored = {}
    for start in range(len(usable) - n + 1):
        window_words = usable[start:start + n]
        ngram = ' '.join(window_words)
        if ngram in scored:
            continue
        window_total = sum(word_score_dict.get(w, 0.0) for w in window_words)
        if window_total > 0 and not contains_stopword(ngram):
            scored[ngram] = window_total

    ranked = sorted(scored.items(), key=lambda pair: abs(pair[1]), reverse=True)
    used_words, top = set(), []
    for ngram, window_total in ranked:
        parts = ngram.split()
        # Skip n-grams overlapping an already selected one.
        if not used_words.isdisjoint(parts):
            continue
        top.append((ngram, window_total))
        used_words.update(parts)
        if len(top) >= 3:
            break

    print(f"\n📌 SHAP 기준, '{label_text}' 판단에 가장 큰 영향을 준 {n}-gram 표현 (복합명사 기반):")
    for ngram, window_total in top:
        print(f"  - '{ngram}': SUM = {window_total:.4f}")

# ✅ 복합명사 추출기 학습
# Train the compound-noun extractor on every clause found in the data folders
# plus the non-blank lines of processed_clauses.txt.
all_folders = ["C:/data/TL_유리", "C:/data/TL_불리", "C:/data/VL_유리", "C:/data/VL_불리"]
corpus = load_all_clauses(all_folders)
with open("processed_clauses.txt", "r", encoding="utf-8") as f:
    corpus += [line.strip() for line in f if line.strip()]
print(f"✅ 전체 학습 문장 수: {len(corpus)}")
noun_extractor = LRNounExtractor_v2(verbose=False)
noun_extractor.train(corpus)
nouns_score_dict = noun_extractor.extract()

# Sentence to analyse, given directly in the script.
input_text = "② 이 경우 회사가 회원에게 환급을 지연한 때에는 그 지연기간에 대하여 전자상거래 등에서의 소비자보호에 관한 법률 및 시행령에서 정하는 이율을 곱하여 산정한 지연이자를 지급해야 합니다."
sentence = remove_clause_title(input_text)

# Prediction: label 1 is reported as favorable, anything else as unfavorable.
wrapper = BertWrapper(model, tokenizer, device)
probs = wrapper([sentence])[0]
label = int(probs.argmax())
label_text = '유리' if label == 1 else '불리'
print(f"\n✅ 예측 결과: {label_text} ({probs[label]:.4f})")

# SHAP explanation of the wrapped model using a text masker built on the tokenizer.
explainer = shap.Explainer(wrapper, shap.maskers.Text(tokenizer))
shap_values = explainer([sentence])
shap.plots.text(shap_values[0])

# Aggregate token-level scores for the predicted label into per-word totals
# (a word occurring several times accumulates its scores).
merged = merge_tokens_by_offset(sentence, shap_values, tokenizer, label)
word_score_dict = defaultdict(float)
for w, s in merged:
    word_score_dict[w] += s

# Ten words with the largest absolute score.
top_words = [w for w, _ in sorted(word_score_dict.items(), key=lambda x: abs(x[1]), reverse=True)[:10]]
print(f"\n🔍 SHAP 기준 상위 단어: {top_words}")

phrase = extract_phrases_with_scores(sentence, top_words, word_score_dict)
print(f"\n📌 {label_text} 판단 핵심 구(Phrase):")
for p, s in phrase:
    print(f"  - {p}: SUM = {s:.4f}")

# Extracted nouns that occur in the sentence, in the iteration order of
# nouns_score_dict, excluding structural numbering.
nouns = [n for n in nouns_score_dict if n in sentence and not is_structural_phrase(n)]
compute_ngram_shap_sum(nouns, word_score_dict, label_text=label_text, n=3)

약관 조항 전처리 코드

In [None]:
# 1. Mount Google Drive at /content/drive (Colab-only; the model and input
#    paths used below live under this mount point).
from google.colab import drive
drive.mount('/content/drive')

# 2. 필요 모듈 임포트
import os
import re
import json
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification

# 3. 조항 분리 함수 (간단히 '제 n 조' 기준 분리)
def split_clauses(text):
    """Split terms text into clauses at every ``제 N 조`` heading.

    A heading is ``제``, digits, ``조`` (optional whitespace in between) plus
    the rest of that line.  Each returned clause is the stripped heading,
    one space, and the stripped text up to the next heading.  Text before
    the first heading is discarded.
    """
    heading = re.compile(r'(제\s*\d+\s*조[^\n]*)')
    pieces = heading.split(text)
    # With one capture group, split() yields
    # [preamble, title, body, title, body, ...].
    titles, bodies = pieces[1::2], pieces[2::2]
    return [title.strip() + " " + body.strip() for title, body in zip(titles, bodies)]

# 4. Model directory (change to your own path)
MODEL_PATH = "/content/drive/MyDrive/domain_model"

# 5. Load tokenizer and classifier from MODEL_PATH; eval mode for inference
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()

# Run on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 6. 조항별 예측 함수 (softmax 확률 + 예측 라벨 반환)
def predict_with_probs(text):
    """Classify one clause and return ``(predicted_label, probabilities)``.

    The text is tokenized (truncated to 512 tokens), run through the
    module-level ``model`` on ``device`` without gradient tracking, and the
    logits are turned into probabilities with softmax.

    Returns:
        The arg-max class index as an ``int`` and the probabilities of the
        first (only) row as a plain list.
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        probabilities = F.softmax(model(**encoded).logits, dim=1)
        predicted = torch.argmax(probabilities, dim=1).item()
        as_list = probabilities[0].cpu().tolist()
    return predicted, as_list

# 7. 메인 실행 함수 (여러 파일 처리)
def main():
    """Classify every clause of each terms file, print the top 3s, save JSON.

    For each file the text is split into clauses, each clause is scored with
    ``predict_with_probs``, the three most favorable and three most
    unfavorable clauses are printed, and finally all per-file results are
    written to one JSON file.
    """
    # Terms files to analyse (replace with your own paths).
    terms_paths = [
        '/content/drive/MyDrive/001_개인정보취급방침_가공.xml',
        '/content/drive/MyDrive/001_가맹계약_가공.xml',
        '/content/drive/MyDrive/001_결혼정보서비스_가공.xml',
        '/content/drive/MyDrive/001_공급계약_가공.xml'
    ]

    # Printed banner -> probability key used for ranking.
    rankings = (
        ("=== ✅ 유리 확률 Top 3 ===", 'prob_favorable'),
        ("=== ⚠️ 불리 확률 Top 3 ===", 'prob_unfavorable'),
    )

    all_results = {}
    for path in terms_paths:
        with open(path, 'r', encoding='utf-8') as f:
            full_text = f.read()

        results = []
        for clause in split_clauses(full_text):
            label, probs = predict_with_probs(clause)
            results.append({
                "text": clause,
                "label": label,
                "prob_unfavorable": probs[0],  # probability of "unfavorable"
                "prob_favorable": probs[1]     # probability of "favorable"
            })

        filename = os.path.basename(path)
        all_results[filename] = results

        # Show the three highest-probability clauses for each class.
        print(f"\n📄 {filename} 결과:")
        for banner, prob_key in rankings:
            top3 = sorted(results, key=lambda row: row[prob_key], reverse=True)[:3]
            print(banner)
            for rank, row in enumerate(top3, 1):
                print(f"{rank}. 확률: {row[prob_key]:.4f} | 텍스트: {row['text'][:100]}...")

    # Persist every file's results as JSON.
    output_path = '/content/drive/MyDrive/clause_predictions_all.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 전체 결과 저장 완료: {output_path}")

# 8. Run
if __name__ == '__main__':
    main()


TF-IDF 단어 추출 코드

In [None]:
import re
import logging
from collections import Counter
from typing import List, Optional

import pandas as pd
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer


# ─────────────────── LOGGING ───────────────────
# Debug-level log written to terms_extractor_debug.log; filemode="w" starts a
# fresh file each time this configuration is applied.
logging.basicConfig(
    filename="terms_extractor_debug.log",
    filemode="w",
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger used by TermsDifficultWordsExtractor._dbg.
log = logging.getLogger(__name__)


class TermsDifficultWordsExtractor:
    """Rank terms-of-service nouns that are rare in an everyday corpus.

    Text is reduced to nouns with the ``Okt`` tagger, the terms document is
    vectorised together with the daily-corpus documents using TF-IDF, and
    nouns that occur in the terms but fewer than ``max_daily_freq`` times in
    the daily corpus are reported, highest TF-IDF first.
    """

    def __init__(
        self,
        *,
        min_word_length: int = 2,
        allow_single_char_noun: bool = True,
        debug: bool = True,
    ):
        """Create the tagger and store the filtering options.

        Args:
            min_word_length: Minimum noun length to keep.
            allow_single_char_noun: Keep one-character nouns regardless of
                ``min_word_length``.
            debug: Emit debug messages through the module logger.
        """
        self.okt = Okt()
        self.min_word_length = min_word_length
        self.allow_single_char_noun = allow_single_char_noun
        self.debug = debug

        # General particles/conjunctions plus words common to terms of service.
        self.stopwords = {
            "은", "는", "이", "가", "을", "를", "에", "의", "와", "과",
            "도", "로", "으로", "에서", "까지", "부터", "만", "라도",
            "조차", "마저", "에게", "한테", "및", "또는", "그리고",
            "하지만", "그러나", "따라서",
            "회사", "서비스", "회원", "이용", "약관", "고객", "사이트",
            "웹사이트", "본", "당사", "사용", "정보", "관련", "제공",
            "위", "이하", "경우", "때", "내용", "목적", "조항", "제공자",
            "회원원",
            "합니다", "합니다.",
        }

    # ── internal debug helper ──
    def _dbg(self, msg: str) -> None:
        """Log ``msg`` at DEBUG level when debugging is enabled."""
        if self.debug:
            log.debug(msg)

    # ── preprocessing: keep nouns only ──
    def preprocess_text(self, text: str) -> List[str]:
        """Return the nouns of ``text`` that pass the length/stopword filters.

        Characters other than word characters, whitespace and Hangul are
        replaced by spaces before tagging.  Tokens not tagged ``Noun``,
        stopwords, all-digit tokens and too-short tokens are dropped.
        """
        text = re.sub(r"[^\w\s가-힣]", " ", text)
        morphs = self.okt.pos(text, stem=True)

        tokens: List[str] = []
        for word, pos in morphs:
            if pos != "Noun":
                continue
            length_ok = len(word) >= self.min_word_length or (
                self.allow_single_char_noun and len(word) == 1
            )
            if not length_ok or word in self.stopwords or word.isdigit():
                continue
            tokens.append(word)

        self._dbg(
            f"Preprocessed {len(text)} chars → {len(tokens)} tokens "
            f"(sample: {tokens[:15]})"
        )
        return tokens

    # ── file → list of documents ──
    def load_corpus_from_file(self, path: str, lines_per_doc: int = 5) -> List[str]:
        """Read ``path`` and group its non-blank lines into documents.

        Every ``lines_per_doc`` consecutive stripped lines are joined with a
        space into one document; the last document may be shorter.
        """
        with open(path, encoding="utf-8") as f:
            lines = [ln.strip() for ln in f if ln.strip()]
        docs = [
            " ".join(lines[i : i + lines_per_doc])
            for i in range(0, len(lines), lines_per_doc)
        ]
        self._dbg(f"Loaded {len(lines)} lines → {len(docs)} docs")
        return docs

    # ── core: TF-IDF extraction ──
    def extract_difficult_words(
        self,
        terms_text: str,
        daily_corpus_path: str,
        *,
        top_n: int = 20,
        min_freq: int = 2,
        lines_per_doc: int = 5,
        max_daily_freq: int = 10,
    ):
        """Return up to ``top_n`` candidate difficult words for ``terms_text``.

        Args:
            terms_text: Raw terms-of-service text.
            daily_corpus_path: Path of the everyday-language corpus file.
            top_n: Maximum number of rows returned.
            min_freq: Minimum occurrences in the terms text.
            lines_per_doc: Lines grouped into one daily-corpus document.
            max_daily_freq: Words occurring this many times or more in the
                daily corpus are excluded.

        Returns:
            List of ``(word, tfidf, terms_freq, idf, terms_freq, daily_freq)``
            tuples sorted by TF-IDF, highest first.
        """
        daily_raw = self.load_corpus_from_file(daily_corpus_path, lines_per_doc)
        terms_tok = self.preprocess_text(terms_text)
        daily_tok_list = [self.preprocess_text(doc) for doc in daily_raw]

        # Document 0 is the terms text; the remaining ones are the daily corpus.
        docs = [" ".join(terms_tok)] + [" ".join(t) for t in daily_tok_list]

        vec = TfidfVectorizer(
            token_pattern=r"(?u)\b[\w가-힣]+\b", max_features=5000, min_df=1
        )
        tfidf_mat = vec.fit_transform(docs)
        feats = vec.get_feature_names_out()
        terms_tfidf = tfidf_mat[0].toarray().flatten()
        idf = vec.idf_

        terms_cnt = Counter(terms_tok)
        daily_cnt = Counter(tok for lst in daily_tok_list for tok in lst)

        words = []
        for i, w in enumerate(feats):
            tfidf_val = terms_tfidf[i]
            t_freq = terms_cnt.get(w, 0)
            d_freq = daily_cnt.get(w, 0)

            # Key filter: frequent everyday words are not "difficult".
            if (
                t_freq >= min_freq
                and tfidf_val > 0
                and d_freq < max_daily_freq
            ):
                words.append((w, tfidf_val, t_freq, idf[i], t_freq, d_freq))

        self._dbg(f"Candidates kept after daily_freq<{max_daily_freq}: {len(words)}")
        words.sort(key=lambda x: x[1], reverse=True)
        return words[:top_n]

    # ── run & print ──
    def analyze_and_display(
        self,
        terms_file: str,
        corpus_file: str,
        *,
        top_n: int = 20,
        min_freq: int = 2,
        lines_per_doc: int = 5,
        max_daily_freq: int = 10,
    ) -> Optional[pd.DataFrame]:
        """Extract difficult words from ``terms_file`` and print a table.

        Returns:
            A DataFrame with one row per word (including a ``difficulty``
            column, ``terms_freq / (daily_freq + 1)``), or ``None`` when no
            word passes the filters.
        """
        # Context manager so the file handle is closed deterministically.
        with open(terms_file, encoding="utf-8") as f:
            terms_text = f.read()

        res = self.extract_difficult_words(
            terms_text,
            corpus_file,
            top_n=top_n,
            min_freq=min_freq,
            lines_per_doc=lines_per_doc,
            max_daily_freq=max_daily_freq,
        )
        if not res:
            print(f"No difficult words found (after daily_freq < {max_daily_freq} filter).")
            return None

        df = pd.DataFrame(
            res, columns=["word", "tfidf", "tf", "idf", "terms_freq", "daily_freq"]
        )
        df["difficulty"] = df["terms_freq"] / (df["daily_freq"] + 1)

        # Main table
        print("=" * 90)
        print(f"Top {len(df)} Difficult Words  (daily_freq < {max_daily_freq})")
        print("=" * 90)
        print(
            f"{'Rank':<4} {'Word':<20} {'TF-IDF':<9} {'TF':<5} "
            f"{'IDF':<7} {'TermsFreq':<10} {'DailyFreq':<9} {'Difficulty':<10}"
        )
        print("-" * 90)
        for i, r in df.iterrows():
            print(
                f"{i+1:<4} {r.word:<20} {r.tfidf:<9.4f} {r.tf:<5} "
                f"{r.idf:<7.4f} {r.terms_freq:<10} {r.daily_freq:<9} "
                f"{r.difficulty:<10.2f}"
            )

        # Spotlight on words that never occur in the daily corpus.
        sparse = df[df["daily_freq"] == 0]
        if not sparse.empty:
            print("\n► Sparse words (absent from daily corpus):")
            for w in sparse["word"]:
                print("  •", w)

        return df


# ── Driver ──
def main() -> None:
    """Run the extractor on ``term2.txt`` against ``corpus.txt`` and print it."""
    extractor_options = dict(
        min_word_length=4, allow_single_char_noun=False, debug=True
    )
    run_options = dict(
        terms_file="term2.txt",
        corpus_file="corpus.txt",
        top_n=15,
        min_freq=1,
        lines_per_doc=10,
    )
    TermsDifficultWordsExtractor(**extractor_options).analyze_and_display(**run_options)

    separator = "=" * 90
    print("\n" + separator)
    print("Analysis completed.  See 'terms_extractor_debug.log' for details.")


if __name__ == "__main__":
    main()


도메인 분류 코드

In [None]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from sklearn.metrics import f1_score, confusion_matrix

# ============ 1. JSON -> CSV 변환 함수 ============
def json_dir_to_csv(dir_path, output_csv_path, exclude_fields=None, label=None):
    """Convert the clause JSON files of a directory into one CSV file.

    Args:
        dir_path: Directory containing ``*.json`` clause files.
        output_csv_path: Destination CSV path (written as UTF-8 with BOM).
        exclude_fields: Iterable of integer ``clauseField`` values to skip.
        label: Value stored in the ``label`` column of every row.

    Each JSON file contributes one row holding the integer ``clauseField``,
    the first ``clauseArticle`` entry (``''`` when missing or empty) and
    ``label``.  Files whose ``clauseField`` is absent or not a plain
    non-negative integer are skipped.
    """
    exclude_fields = set(exclude_fields) if exclude_fields else set()
    rows = []

    # Sorted so the row order does not depend on the directory listing order.
    for fname in sorted(os.listdir(dir_path)):
        if not fname.endswith('.json'):
            continue
        with open(os.path.join(dir_path, fname), encoding='utf-8') as f:
            data = json.load(f)

        raw_field = data.get('clauseField')
        # The field may be a JSON number or a string; normalise to text first
        # because ints have no ``isdigit``.
        field_text = '' if raw_field is None else str(raw_field)
        if not field_text.isdigit():
            continue
        field_id = int(field_text)
        if field_id in exclude_fields:
            continue

        # ``or ['']`` also covers an explicit empty list / null value.
        articles = data.get('clauseArticle') or ['']
        text = articles if isinstance(articles, str) else articles[0]
        rows.append({'clauseField': field_id, 'clauseArticle': text, 'label': label})

    # Explicit columns keep the header even when no row was collected, so the
    # CSV stays readable with pandas.
    df = pd.DataFrame(rows, columns=['clauseField', 'clauseArticle', 'label'])
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"CSV saved to {output_csv_path} (samples: {len(df)})")

# ============ 2. ClauseDataset (CSV 기반) ============
class ClauseDataset(Dataset):
    """Dataset of tokenized clauses read from a CSV file.

    The CSV must provide a ``clauseArticle`` (text) and a ``label`` column.
    All texts are tokenized once at construction time, padded/truncated to
    512 tokens; items are dicts with ``input_ids``, ``attention_mask`` and
    ``label``.
    """

    def __init__(self, csv_path, tokenizer):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer

        # Empty text cells come back from CSV as NaN (a float); normalise
        # everything to ``str`` so the tokenizer only ever receives strings.
        texts = self.df['clauseArticle'].fillna('').astype(str).tolist()
        self.encodings = tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        self.labels = torch.tensor(self.df['label'].values)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'label': self.labels[idx]
        }

# ============ 3. BertClassifier 정의 ============
class BertClassifier(nn.Module):
    """BERT encoder followed by a two-way linear classification head."""

    def __init__(self, pretrained_model_name='kykim/bert-kor-base'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)
        return logits

# ============ 4. 학습 / 평가 함수 ============
def train(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch is a dict with ``input_ids``, ``attention_mask`` and
    ``label``; tensors are moved to ``device`` before the forward pass.

    Returns:
        The loss averaged over the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(dataloader):
        ids, attn, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        optimizer.zero_grad()
        logits = model(ids, attn)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, return_all=False):
    """Compute accuracy of ``model`` over ``dataloader``.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            logits.
        dataloader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``label``.
        device: Device the batch tensors are moved to.
        return_all: When true, also return the F1 score and confusion matrix.

    Returns:
        ``accuracy``, or ``(accuracy, f1, confusion_matrix)`` when
        ``return_all`` is true.
    """
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            attn = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits = model(ids, attn)
            predictions += torch.argmax(logits, dim=1).cpu().tolist()
            targets += gold.cpu().tolist()

    hits = sum(1 for predicted, gold in zip(predictions, targets) if predicted == gold)
    accuracy = hits / len(targets)

    if not return_all:
        return accuracy
    f1 = f1_score(targets, predictions)
    cm = confusion_matrix(targets, predictions)
    return accuracy, f1, cm

# ============ 5. 메인 실행부 ============
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    # clauseField values of the validation data; rows with these fields are
    # excluded from the training CSVs below.
    val_clause_fields = [1, 2, 3, 5, 6, 7, 8, 11, 25, 27, 28, 29, 30, 31, 32, 33, 39, 43]

    # Data locations
    base_train_dir  = '/content/약관데이터/TL_2.약관/TL_2.약관/1.Training/라벨링데이터/TL_2.약관'
    base_val_dir = '/content/약관데이터/VL_2.약관/2.Validation/라벨링데이터/VL_2.약관'

    # JSON -> CSV (only needs to run once); label 1 for the "01.유리" folders,
    # label 0 for the "02.불리" folders.
    json_dir_to_csv(os.path.join(base_train_dir, '01.유리'), 'train_good.csv', exclude_fields=val_clause_fields, label=1)
    json_dir_to_csv(os.path.join(base_train_dir, '02.불리'), 'train_bad.csv', exclude_fields=val_clause_fields, label=0)
    json_dir_to_csv(os.path.join(base_val_dir, '01.유리'), 'val_good.csv', label=1)
    json_dir_to_csv(os.path.join(base_val_dir, '02.불리'), 'val_bad.csv', label=0)

    tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base')

    # Each split is the concatenation of its two per-label CSV datasets.
    train_dataset = ConcatDataset([
        ClauseDataset('train_good.csv', tokenizer),
        ClauseDataset('train_bad.csv', tokenizer)
    ])
    val_dataset = ConcatDataset([
        ClauseDataset('val_good.csv', tokenizer),
        ClauseDataset('val_bad.csv', tokenizer)
    ])

    # Only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=8, pin_memory=True, num_workers=4)

    model = BertClassifier().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    # Training loop: one training pass and one validation-accuracy check per epoch.
    epochs = 3
    for epoch in range(epochs):
        print(f"\n🌟 Epoch {epoch+1}")
        train_loss = train(model, train_loader, optimizer, criterion, device)
        val_acc = evaluate(model, val_loader, device)
        print(f"✅ Train Loss: {train_loss:.4f}, 🔍 Validation Accuracy: {val_acc:.4f}")

    # Full set of evaluation metrics on the validation data after training.
    final_acc, final_f1, final_cm = evaluate(model, val_loader, device, return_all=True)
    print("\n📊 전체 평가 지표:")
    print(f"✅ Accuracy: {final_acc:.4f}")
    print(f"🎯 F1 Score: {final_f1:.4f}")
    print("🧮 Confusion Matrix:")
    print(final_cm)

    # Save the model weights (state dict only).
    torch.save(model.state_dict(), 'bert_clause_model.pt')


LLM 프롬프트

In [None]:
import os
from typing import List
from pathlib import Path
from dotenv import load_dotenv

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import LLMChain

# ────────────────────────────────────
# 1. Environment variables & LLM
# ────────────────────────────────────
# Read variables from a .env file, then fail fast when the API key is missing.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise EnvironmentError("❗ OPENAI_API_KEY 환경 변수가 비어 있습니다.")

llm = ChatOpenAI(model_name="gpt-4o",
                 temperature=0.2,
                 openai_api_key=api_key)

# ────────────────────────────────────
# 2. Domain definitions
# ────────────────────────────────────
# The eleven categories (A-K) the model may choose from for each clause.
DOMAIN_CATS = [
    "A. 금융기관","B. 전자지급·핀테크","C. 보험","D. 증권·투자",
    "E. 유통·사이버몰","F. 프랜차이즈·공급·분양·신탁","G. 부동산·임대차·리스",
    "H. 운송·물류","I. 여행·레저·게임","J. 생활서비스","K. 기타 계약·보증",
]

# ────────────────────────────────────
# 3. Terms summary + domain classification
# ────────────────────────────────────
# Two output fields are requested: "terms_summary" and "domains".
summary_schema = ResponseSchema(
    name="terms_summary",
    description="약관 전문 이해하기 쉽도록 요약"
)
domain_schema = ResponseSchema(
    name="domains",
    description="조항 목록과 동일한 순서로 도메인 문자열 배열"
)
parser = StructuredOutputParser.from_response_schemas([summary_schema, domain_schema])
format_instr = parser.get_format_instructions()

# Prompt template; its placeholders are format_instr, terms_text, clauses and
# domain_list.
prompt = ChatPromptTemplate.from_template(
    """{format_instr}

약관 전문:
{terms_text}

조항 목록:
{clauses}

작업:
1) 약관 전문을 소비자 관점에서 이해하기 쉽도록 요약
2) 각 조항의 산업·서비스 분야(아래 11개 중 하나)를 선택하여 순서대로 나열

카테고리:
{domain_list}
"""
)
# Chain that sends the prompt to the LLM and parses the reply with `parser`.
summary_chain = LLMChain(llm=llm, prompt=prompt, output_parser=parser)

# ────────────────────────────────────
# 4. 조항 처리 함수 (임베딩 제거)
# ────────────────────────────────────
def process_clause(clause, label, domain):
    """Ask the LLM to explain one clause and, if needed, propose a revision.

    Args:
        clause: Original clause text.
        label: ``"불리"`` (unfavorable) requests a before/after revision;
            any other value requests optional improvement suggestions.
        domain: Domain category passed through to the result.

    Returns:
        Dict with ``domain``, the stripped LLM answer in ``llm_result``, an
        empty ``law_refs`` list and a placeholder ``shap_explanation``.
    """
    if label == "불리":
        revision_task = "불리 조항을 소비자에게 유리하게 개정하세요. 형식: 개정 전: / 개정 후:"
    else:
        revision_task = "유리 조항은 그대로 두고, 개선할 부분이 있으면 간단히 제안하세요."

    clause_prompt = f"""
[조항 원문]
{clause}

작업:
1) 위 조항이 소비자에게 왜 {label}한지 2~3문장으로 설명하세요.
2) {revision_task}
3) 가능하면 일반적인 관련 법령 예시도 함께 제공하세요.
"""
    answer = llm.predict(clause_prompt).strip()

    # SHAP explanation is not produced here; a placeholder note is returned.
    return {
        "domain": domain,
        "llm_result": answer,
        "law_refs": [],
        "shap_explanation": {"note": "SHAP 설명 생략됨 (임베딩 제거됨)"},
    }

# ────────────────────────────────────
# 5. 메인 파이프라인
# ────────────────────────────────────
def run_terms_analysis(terms_text: str, clauses: List[str], labels: List[str]):
    """Summarise the terms, classify each clause's domain and analyse clauses.

    Args:
        terms_text: Full terms-of-service text.
        clauses: Clause texts to analyse.
        labels: One label per clause, in the same order as ``clauses``.

    Returns:
        Dict with ``terms_summary`` and ``clause_results`` (one
        ``process_clause`` result per clause).

    Raises:
        ValueError: If ``clauses`` and ``labels`` differ in length.
    """
    if len(clauses) != len(labels):
        raise ValueError("clauses와 labels 길이가 다릅니다.")

    # Summary + domain classification in a single LLM call.
    parsed = summary_chain.predict_and_parse(
        terms_text=terms_text,
        clauses="\n".join(f"{i+1}) {c}" for i, c in enumerate(clauses)),
        format_instr=format_instr,
        domain_list="\n".join(DOMAIN_CATS)
    )
    terms_summary = parsed["terms_summary"]
    domains = parsed["domains"]

    # Fall back to the catch-all category unless the model returned a real
    # sequence with one entry per clause.  A length check alone is not
    # enough: a plain string of matching length would be zipped character by
    # character below.
    if not isinstance(domains, (list, tuple)) or len(domains) != len(clauses):
        domains = ["K. 기타 계약·보증"] * len(clauses)

    clause_results = [
        process_clause(c, l, d)
        for c, l, d in zip(clauses, labels, domains)
    ]

    return {
        "terms_summary": terms_summary,
        "clause_results": clause_results
    }
# ────────────────────────────────────
# 9. 데모
# ────────────────────────────────────

if __name__ == "__main__":
    # Demo: analyse two hand-written clauses against the text in terms.txt.
    terms = Path("terms.txt").read_text(encoding="utf-8")
    clauses_demo = [
        "제22조 (환불) 회사는 해지 신청 후 7일 내 환불한다.",
        "제5조 (청약철회) 고객은 상품 수령 후 7일 이내 청약철회 가능하다."
    ]
    # One label per demo clause, in the same order.
    labels_demo = ["불리", "유리"]

    report = run_terms_analysis(terms, clauses_demo, labels_demo)

    print("\n◆ 약관 요약 ◆\n", report["terms_summary"])
    for i, (r, lab) in enumerate(zip(report["clause_results"], labels_demo), 1):
        print(f"\n◇ 조항 {i} ({lab})")
        print("도메인:", r["domain"])
        print(r["llm_result"])
        # Printed only when the result carries legal references.
        if r["law_refs"]:
            print("관련 법령:", ", ".join(r["law_refs"])) 


배포) 프론트 코드

In [None]:
import React, { useState } from 'react';
import ReactMarkdown from 'react-markdown';
import './App.css';

// 각 분석 결과를 표시하기 위한 분리된 컴포넌트들

// 1. TF-IDF 분석 결과 컴포넌트 (수치 제거)
const TfidfResult = ({ result }) => (
  <section className="result-section">
    <h2>📚 어려운 단어 및 용어 설명</h2>
    {result.difficult_words && result.difficult_words.length > 0 ? (
      <ul>
        {result.difficult_words.map((item, index) => (
          <li key={index}>
            {/* 요구사항 4: TF-IDF 점수 수치를 제거합니다. */}
            <strong>{item.word}:</strong> {item.definition}
          </li>
        ))}
      </ul>
    ) : (
      <p>특별히 어려운 단어는 발견되지 않았습니다.</p>
    )}
  </section>
);

// 2. SHAP/BERT 단일 조항 심층 분석 결과 컴포넌트
const ShapResult = ({ result }) => (
  <section className="result-section">
    <h2>💡 AI의 상세 설명 및 솔루션</h2>
    <div className="summary-box">
      <h3>📌 AI 판단 요약</h3>
      <p>
        AI가 이 조항을 <strong>'{result.prediction}'</strong>하다고 판단했습니다.<br/>
        주요 판단 근거는 <strong>"{result.key_phrase}"</strong> 와(과) 관련된 내용으로 보입니다.
      </p>
    </div>
    <div className="solution-box">
      <h3>💬 상세 해설</h3>
      <ReactMarkdown children={result.llm_explanation} />
    </div>
    {/* 관련 키워드 섹션은 TfidfResult와 겹치므로 여기서는 생략하거나 다르게 표현할 수 있습니다. */}
  </section>
);

// 3. 전체 약관 Top 3 필터링 결과 컴포넌트 (수치 제거)
const Top3Result = ({ result }) => (
  <section className="result-section">
    <h2>📊 전체 약관 핵심 조항 필터링</h2>
    <p>총 {result.total_clauses_found}개의 조항 중에서 가장 주목할 만한 조항들입니다.</p>
    <div className="result-card danger">
      <h3>❌ 가장 불리한 Top 3 조항</h3>
      {result.top_unfavorable_clauses.map((c, i) => (
        <div key={`unfavorable-${i}`} className="clause-item">
          {/* 요구사항 4: 확률(%) 수치를 제거하고 조항 텍스트만 표시합니다. */}
          <p>{c.text}</p>
        </div>
      ))}
    </div>
    <div className="result-card safe">
      <h3>✅ 가장 유리한 Top 3 조항</h3>
      {result.top_favorable_clauses.map((c, i) => (
        <div key={`favorable-${i}`} className="clause-item">
          {/* 요구사항 4: 확률(%) 수치를 제거하고 조항 텍스트만 표시합니다. */}
          <p>{c.text}</p>
        </div>
      ))}
    </div>
  </section>
);


// 메인 앱 컴포넌트
function App() {
  const [inputText, setInputText] = useState("");
  const [tfidfResult, setTfidfResult] = useState(null);
  const [shapResult, setShapResult] = useState(null);
  const [top3Result, setTop3Result] = useState(null);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState(null);

  const handleComprehensiveAnalysis = async () => {
    if (!inputText.trim()) {
      setError("분석할 텍스트를 입력해주세요.");
      return;
    }
    
    setIsLoading(true);
    setError(null);
    setTfidfResult(null);
    setShapResult(null);
    setTop3Result(null);

    try {
      // Promise.all을 사용해 3개의 API를 동시에 호출
      const [tfidfResponse, shapResponse, top3Response] = await Promise.all([
        fetch("http://localhost:8080/extract-difficult-words", {
          method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: inputText }),
        }),
        fetch("http://localhost:8080/analyze-clause-with-shap", {
          method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text: inputText }),
        }),
        fetch("http://localhost:8080/analyze-full-terms-top3", {
          method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ full_text: inputText }),
        }),
      ]);

      if (!tfidfResponse.ok || !shapResponse.ok || !top3Response.ok) {
        throw new Error("하나 이상의 분석 API에서 오류가 발생했습니다.");
      }

      const tfidfData = await tfidfResponse.json();
      const shapData = await shapResponse.json();
      const top3Data = await top3Response.json();
      
      setTfidfResult(tfidfData);
      setShapResult(shapData);
      setTop3Result(top3Data);

    } catch (err) {
      setError(err.message);
    } finally {
      setIsLoading(false);
    }
  };

  return (
    <div className="container">
      <header>
        <h1>🧠 AI 약관 종합 분석</h1>
        <p>입력한 약관에 대해 3가지 AI 분석 결과를 한 번에 제공합니다.</p>
      </header>

      <section className="input-section">
        <textarea
          rows="10"
          value={inputText}
          onChange={(e) => setInputText(e.target.value)}
          placeholder="여기에 약관 조항 또는 약관 전체를 입력하세요..."
        />
        <button onClick={handleComprehensiveAnalysis} disabled={isLoading}>
          {isLoading ? "🔄 모든 AI 분석 중..." : "🔍 종합 분석하기"}
        </button>
      </section>
      
      {isLoading && <p className="loading-message">AI가 약관을 다각도로 분석 중입니다. 잠시만 기다려주세요...</p>}
      {error && <p className="error-message">오류: {error}</p>}

      {/* 모든 결과가 도착했을 때만, 요구사항에 맞게 재배치된 순서로 렌더링 */}
      {tfidfResult && shapResult && top3Result && (
        <div className="all-results-container">
          {/* 요구사항 1: Top3 결과를 가장 먼저 표시 */}
          <Top3Result result={top3Result} />
          
          {/* 요구사항 2: AI 상세 설명을 두 번째로 표시 */}
          <ShapResult result={shapResult} />
          
          {/* 요구사항 3: 단어 설명을 마지막으로 표시 */}
          <TfidfResult result={tfidfResult} />
        </div>
      )}
    </div>
  );
}

export default App;

배포) 백엔드 코드

In [None]:
# ===================================================================
# 1. 라이브러리 임포트
# ===================================================================
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import os
import re
from typing import List
from dotenv import load_dotenv
from fastapi.middleware.cors import CORSMiddleware
import torch
import torch.nn.functional as F
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET
import shap
from collections import defaultdict, Counter
from transformers import BertTokenizerFast, BertForSequenceClassification
import openai
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.schema.runnable import RunnableSequence
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer

# ===================================================================
# 2. 초기 설정 (FastAPI 앱, API 키, 모델 로드)
# ===================================================================
# Browser origins allowed to call this API (presumably the local front-end
# dev server on port 3000 — confirm against the deployment setup).
ALLOWED_ORIGINS = ["http://localhost:3000", "http://127.0.0.1:3000"]

app = FastAPI(
    version="5.0.0",
    title="[최종] 통합 약관 분석 API",
    description="모든 분석 기능을 제공하는 최종 버전입니다.",
)

# CORS: any method and header is accepted, but only from the origins above.
app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Credentials are read from the environment so that real keys never have to
# be committed. The previous hard-coded placeholder values are kept as
# fallbacks, so behaviour is unchanged when the variables are not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "open_ai_key")

NAVER_CLIENT_ID = os.getenv("NAVER_CLIENT_ID", "naver_api")

NAVER_CLIENT_SECRET = os.getenv("NAVER_CLIENT_SECRET", "naver_api_key")

openai.api_key = OPENAI_API_KEY

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Location of the fine-tuned classifier. The MODEL_PATH environment variable
# overrides the original developer-machine path, which remains the default.
MODEL_PATH = os.getenv(
    "MODEL_PATH",
    "C:/Users/xison/Desktop/텍마배포/ML/backend/model/domain_model",
)
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
model.to(device)
# Inference only: switch the model to evaluation mode.
model.eval()

# Shared chat model used by every LLM-backed feature in this module.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.2, openai_api_key=OPENAI_API_KEY)

# ===================================================================
# 3. 모든 기능별 로직 및 헬퍼 함수
# ===================================================================

# --- Shared helper: splitting the full terms text into clauses ---

# A clause header is "제 n조" at the start of a line; leading whitespace or
# bullet/bracket symbols are tolerated. Anchoring to the line start keeps
# in-body references such as "제3조에 따라" from being treated as headers.
_CLAUSE_HEADER_RE = re.compile(r'^[^\w\n]*제\s*\d+\s*조', re.MULTILINE)


def split_clauses(text: str) -> List[str]:
    """Split terms text into clauses on line-leading '제 n조' headers.

    Each clause runs from its header up to the next header (or the end of the
    text). Any preamble before the first header is discarded. A clause is
    returned as ``"<header line>\\n<body>"`` with both parts stripped; a clause
    written entirely on its header line is returned as that single line.

    Args:
        text: Raw terms-of-service text.

    Returns:
        The clauses in document order. If no header is found, the whole
        stripped text is returned as a single clause; blank input yields ``[]``.
    """
    starts = [match.start() for match in _CLAUSE_HEADER_RE.finditer(text)]
    if not starts:
        stripped = text.strip()
        return [stripped] if stripped else []

    # Pair every header position with the next one; the last clause runs to
    # the end of the text.
    bounds = starts + [len(text)]
    clauses = []
    for begin, end in zip(bounds, bounds[1:]):
        header, _, body = text[begin:end].strip().partition("\n")
        header, body = header.strip(), body.strip()
        clauses.append(f"{header}\n{body}" if body else header)
    return clauses

# --- 기능 A: TF-IDF 로직 ---
class TermsDifficultWordsExtractor:
    """Pick out hard-to-understand nouns from terms text using TF-IDF.

    The terms text is scored against an everyday-language corpus: nouns that
    weigh heavily in the terms but are rare in the corpus are reported.
    """

    def __init__(self, min_word_length: int = 2):
        self.okt = Okt()
        self.min_word_length = min_word_length
        # Particles and domain-generic nouns that are never reported.
        self.stopwords = {"은", "는", "이", "가", "을", "를", "에", "의", "와", "과", "도", "로", "으로", "에서", "회사", "서비스", "회원", "이용", "약관", "고객", "사이트", "본", "당사", "사용", "정보", "관련", "제공", "위", "이하", "경우", "때", "내용", "목적", "조항"}

    def preprocess_text(self, text: str) -> List[str]:
        """Return the stemmed nouns of *text* that pass the word filters.

        Punctuation is replaced by spaces first; nouns shorter than
        ``min_word_length``, stopwords and purely numeric tokens are dropped.
        """
        text = re.sub(r"[^\w\s가-힣]", " ", text)
        morphs = self.okt.pos(text, stem=True)
        return [
            word
            for word, pos in morphs
            if pos == "Noun"
            and len(word) >= self.min_word_length
            and word not in self.stopwords
            and not word.isdigit()
        ]

    def load_corpus_from_file(self, path: str, lines_per_doc: int = 5) -> List[str]:
        """Read a corpus file and group its non-blank lines into documents.

        Every ``lines_per_doc`` consecutive non-blank lines are joined with a
        space into one document. A missing file yields an empty list.
        """
        try:
            with open(path, encoding="utf-8") as f:
                lines = [ln.strip() for ln in f if ln.strip()]
        except FileNotFoundError:
            return []
        return [" ".join(lines[i : i + lines_per_doc]) for i in range(0, len(lines), lines_per_doc)]

    def extract_difficult_words(self, terms_text: str, daily_corpus_path: str, top_n: int = 5) -> List[dict]:
        """Return up to *top_n* difficult words from *terms_text*.

        Args:
            terms_text: The terms text to analyse.
            daily_corpus_path: Path of the everyday-language corpus file.
            top_n: Maximum number of words to return.

        Returns:
            Dicts with keys ``word``, ``tfidf``, ``terms_freq`` and
            ``daily_freq``, sorted by descending TF-IDF score. Empty when the
            text is blank or contains no usable nouns.
        """
        # Nothing to analyse: return early rather than letting the vectorizer
        # fail on an empty vocabulary (which happens when the corpus is also
        # missing or empty).
        if not terms_text or not terms_text.strip():
            return []
        terms_tok = self.preprocess_text(terms_text)
        if not terms_tok:
            return []

        daily_raw = self.load_corpus_from_file(daily_corpus_path)
        daily_tok_list = [self.preprocess_text(doc) for doc in daily_raw]

        # Document 0 is the terms text; the remaining ones are the corpus.
        docs = [" ".join(terms_tok)] + [" ".join(t) for t in daily_tok_list]
        vec = TfidfVectorizer(token_pattern=r"(?u)\b[\w가-힣]+\b", max_features=5000, min_df=1)
        tfidf_mat = vec.fit_transform(docs)
        feats = vec.get_feature_names_out()
        terms_tfidf = tfidf_mat[0].toarray().flatten()

        terms_cnt = Counter(terms_tok)
        daily_cnt = Counter(tok for lst in daily_tok_list for tok in lst)

        # Keep words that occur in the terms, carry TF-IDF weight there and
        # appear fewer than 10 times in the everyday corpus.
        # NOTE(review): only words of exactly 4 characters are kept, as in the
        # original implementation — confirm this restriction is intended.
        words = [
            {"word": w, "tfidf": terms_tfidf[i], "terms_freq": terms_cnt.get(w, 0), "daily_freq": daily_cnt.get(w, 0)}
            for i, w in enumerate(feats)
            if terms_cnt.get(w, 0) >= 1
            and terms_tfidf[i] > 0
            and daily_cnt.get(w, 0) < 10
            and len(w) == 4
        ]
        words.sort(key=lambda x: x["tfidf"], reverse=True)
        return words[:top_n]


# Module-level extractor instance, created once at import time and used by
# the /extract-difficult-words endpoint.
tfidf_extractor = TermsDifficultWordsExtractor()

# --- 기능 B, C, D 공용 헬퍼 함수 ---
def predict_clause_probabilities(text: str) -> dict:
    """Classify one clause and return its softmax class probabilities.

    The text is tokenized (truncated to 512 tokens) and run through the
    module-level BERT classifier without gradient tracking. Class index 0 is
    reported as ``prob_unfavorable`` and index 1 as ``prob_favorable``.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    ).to(device)
    with torch.no_grad():
        class_probs = F.softmax(model(**encoded).logits, dim=1)[0].cpu().tolist()
    return {"prob_unfavorable": class_probs[0], "prob_favorable": class_probs[1]}

# Naver encyclopedia search API lookup
def search_naver_encyc(query: str):
    """Return the first Naver encyclopedia description for *query*.

    HTML tags are stripped from the description. A fixed "no result" message
    is returned when nothing usable comes back, and any exception raised by
    the request or the XML parsing is reported as an error-message string
    instead of being propagated.
    """
    url = "https://openapi.naver.com/v1/search/encyc.xml?query=" + urllib.parse.quote(query)
    request = urllib.request.Request(url)
    auth_headers = (
        ("X-Naver-Client-Id", NAVER_CLIENT_ID),
        ("X-Naver-Client-Secret", NAVER_CLIENT_SECRET),
    )
    for header_name, header_value in auth_headers:
        request.add_header(header_name, header_value)

    no_result = "사전 검색 결과가 없습니다."
    try:
        with urllib.request.urlopen(request, timeout=5) as response:
            if response.getcode() != 200:
                return no_result
            # Only the first <item> of the response channel is considered.
            item = ET.fromstring(response.read()).find("channel/item")
        description = item.findtext("description") if item is not None else None
        if not description:
            return no_result
        return re.sub('<.+?>', '', description).strip()
    except Exception as e:
        return f"사전 API 호출 중 오류({type(e).__name__}) 발생"

# --- 기능 B: SHAP/BERT 개선된 분석 로직 ---
class BertWrapper:
    """Callable adapter: a batch of texts in, an array of class probabilities out."""

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def __call__(self, texts):
        # Tokenize the whole batch at once, truncated to 128 tokens per text.
        batch = self.tokenizer(
            list(texts),
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128,
            add_special_tokens=True,
        ).to(self.device)
        with torch.no_grad():
            logits = self.model(**batch).logits
            return F.softmax(logits, dim=1).cpu().numpy()
# Module-level SHAP explainer built once at import time: it wraps the BERT
# classifier through BertWrapper and uses a Text masker created from the tokenizer.
explainer = shap.Explainer(BertWrapper(model, tokenizer, device), shap.maskers.Text(tokenizer))

def analyze_clause_with_shap(text: str) -> dict:
    """Predict a clause's label and derive its most influential words/phrase via SHAP.

    Returns a dict with:
      - "prediction": '유리' when class index 1 has the highest probability, else '불리'.
      - "awkward_key_phrase": the candidate word window with the largest absolute
        summed SHAP score, or a fixed fallback string when there is no candidate.
      - "top_words": up to four words (longer than one character) ranked by
        absolute SHAP score.
    """
    # Probabilities come back in dict order: [unfavorable, favorable].
    probabilities = list(predict_clause_probabilities(text).values())
    label_idx = torch.argmax(torch.tensor(probabilities)).item()
    label_text = '유리' if label_idx == 1 else '불리'
    shap_values = explainer([text])
    # Re-tokenize with offsets so token scores can be mapped back onto the text.
    # NOTE(review): the prediction above truncates at 512 tokens, while this
    # encoding and the SHAP wrapper truncate at 128.
    encoding = tokenizer(text, return_offsets_mapping=True, return_tensors='pt', truncation=True, max_length=128)
    # NOTE(review): zipping below assumes the SHAP value rows line up one-to-one
    # with these tokenizer tokens — confirm against the masker's segmentation.
    tokens, offsets, values = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]), encoding['offset_mapping'][0].tolist(), shap_values.values[0][:, label_idx]
    words, scores, current_word, current_score, prev_end = [], [], '', 0.0, -1
    # Merge tokens whose character spans are contiguous into one word and sum
    # their scores; a gap between spans starts a new word.
    for token, (start, end), score in zip(tokens, offsets, values):
        # Skip special tokens and tokens with an empty character span.
        if token in ['[CLS]', '[SEP]'] or start == end: continue
        if start != prev_end and current_word:
            words.append(current_word); scores.append(current_score); current_word, current_score = '', 0.0
        current_word += text[start:end]; current_score += score; prev_end = end
    # Flush the last word still being accumulated.
    if current_word: words.append(current_word); scores.append(current_score)
    # When a word occurs more than once, the score of its last occurrence is kept.
    word_score_dict = defaultdict(float, {w.strip(): s for w, s in zip(words, scores)})
    top_words = [w for w, _ in sorted(word_score_dict.items(), key=lambda x: abs(x[1]), reverse=True) if len(w)>1][:4]
    sentence_words = re.findall(r'[가-힣a-zA-Z0-9]+', text)
    seen, phrases = set(), []
    # Candidate phrases: every window of 1 to 8 consecutive words that contains
    # at least one top word as a substring.
    for i in range(len(sentence_words)):
        for j in range(i + 1, min(len(sentence_words), i + 8) + 1):
            chunk = ' '.join(sentence_words[i:j]).strip()
            if chunk in seen or not any(w in chunk for w in top_words): continue
            seen.add(chunk)
            # A phrase scores the sum of its words' scores; words not in the dict add nothing.
            phrases.append((chunk, sum(word_score_dict.get(w, 0.0) for w in chunk.split() if w in word_score_dict)))
    phrases.sort(key=lambda x: abs(x[1]), reverse=True)
    awkward_key_phrase = phrases[0][0] if phrases else "핵심 구를 찾을 수 없음"
    return {"prediction": label_text, "awkward_key_phrase": awkward_key_phrase, "top_words": top_words}

# --- 기능 C: LangChain 로직 ---
# Industry categories the LLM chooses from when classifying each clause.
DOMAIN_CATS = [
    "A. 금융기관",
    "B. 전자지급·핀테크",
    "C. 보험",
    "D. 증권·투자",
    "E. 유통·사이버몰",
    "F. 프랜차이즈·공급·분양·신탁",
    "G. 부동산·임대차·리스",
    "H. 운송·물류",
    "I. 여행·레저·게임",
    "J. 생활서비스",
    "K. 기타 계약·보증",
]

# Structured output: one overall summary plus the per-clause domains.
summary_schema = ResponseSchema(name="terms_summary", description="약관 전문 요약")
domain_schema = ResponseSchema(name="domains", description="조항별 도메인 문자열 배열")
parser = StructuredOutputParser.from_response_schemas([summary_schema, domain_schema])

prompt_template = ChatPromptTemplate.from_template(
    "{format_instr}\n\n"
    "약관 전문:\n{terms_text}\n\n"
    "조항 목록:\n{clauses}\n\n"
    "작업:\n"
    "1) 약관 전문을 요약하세요.\n"
    "2) 각 조항의 산업 분야를 아래 카테고리 중에서 선택하여 나열하세요.\n\n"
    "카테고리:\n{domain_list}"
)

# Prompt -> chat model -> structured-output parser.
summary_chain = prompt_template | llm | parser

def process_clause_with_langchain(clause: str, label: str, domain: str):
    """Ask the LLM to explain one labelled clause and propose an improvement.

    The second task in the prompt depends on the label: an amendment proposal
    for a '불리' clause, an optional refinement proposal otherwise.

    Returns a dict with the domain, the original clause, the label and the
    stripped LLM answer under "analysis_and_suggestion".
    """
    if label == "불리":
        suggestion_task = '불리 조항을 소비자에게 유리하게 개정하는 안을 제시하세요. "개정 제안:" 이라는 제목으로 시작해주세요.'
    else:
        suggestion_task = '해당 조항은 소비자에게 유리하지만, 더 개선할 부분이 있다면 제안해주세요. "개선 제안:" 이라는 제목으로 시작해주세요.'

    prompt_lines = [
        "[조항 원문]",
        f"{clause}",
        "",
        "[분석 작업]",
        "당신은 소비자의 입장에서 약관을 분석하는 AI 법률 전문가입니다.",
        f"1. 위 조항이 왜 소비자에게 '{label}'한지 명확하고 이해하기 쉽게 2~3문장으로 설명하세요.",
        f"2. {suggestion_task}",
        "3. 이 조항과 관련된 일반적인 법률(예: 약관규제법, 전자상거래법 등)이 있다면 어떤 것인지 언급해주세요.",
    ]
    answer = llm.predict("\n".join(prompt_lines)).strip()
    return {
        "domain": domain,
        "original_clause": clause,
        "label": label,
        "analysis_and_suggestion": answer,
    }


def run_terms_analysis(terms_text: str, clauses: List[str], labels: List[str]):
    """Summarise the full terms and analyse every clause with the LLM.

    The summary chain is asked for an overall summary and one domain per
    clause. If that call fails, the summary becomes an error message and every
    clause gets the catch-all domain; the same fallback applies when the number
    of returned domains does not match the number of clauses.

    Returns a dict with "terms_summary" and "clause_results" (one entry per
    clause/label/domain triple).
    """
    fallback_domains = ["K. 기타 계약·보증"] * len(clauses)
    try:
        chain_input = {
            "terms_text": terms_text,
            "clauses": "\n".join(f"{idx + 1}) {clause}" for idx, clause in enumerate(clauses)),
            "format_instr": parser.get_format_instructions(),
            "domain_list": "\n".join(DOMAIN_CATS),
        }
        parsed = summary_chain.invoke(chain_input)
        terms_summary = parsed["terms_summary"]
        domains = parsed["domains"]
    except Exception as e:
        terms_summary = f"요약/도메인 분류 실패: {e}"
        domains = fallback_domains

    if len(domains) != len(clauses):
        domains = fallback_domains

    clause_results = [
        process_clause_with_langchain(clause, label, domain)
        for clause, label, domain in zip(clauses, labels, domains)
    ]
    return {"terms_summary": terms_summary, "clause_results": clause_results}


# ===================================================================
# 4. FastAPI Pydantic 모델 및 엔드포인트
# ===================================================================

# --- 입력 모델 ---
# Request body with a single text to analyse.
class TextIn(BaseModel):
    text: str


# Request body carrying the full terms document.
class FullTermsIn(BaseModel):
    full_text: str


# Request body for the LangChain analysis: full text plus parallel
# lists of clauses and their labels.
class LangchainIn(BaseModel):
    terms_text: str
    clauses: List[str]
    labels: List[str]

# --- 엔드포인트 ---

@app.post("/extract-difficult-words", summary="[기능 1] TF-IDF 어려운 단어 추출 (기존 기능)")
async def endpoint_extract_difficult_words(payload: TextIn):
    # Extract up to five difficult words, attach an encyclopedia definition to
    # each, and report any failure as an HTTP 500.
    try:
        entries = tfidf_extractor.extract_difficult_words(payload.text, "corpus.txt", top_n=5)
        for entry in entries:
            entry["definition"] = search_naver_encyc(entry["word"])
    except Exception as e:
        raise HTTPException(500, f"TF-IDF 분석 오류: {e}")
    return {"difficult_words": entries}

@app.post("/analyze-clause-with-shap", summary="[기능 2] 단일 조항 심층 분석 (개선됨)")
async def endpoint_analyze_clause_with_shap(payload: TextIn):
    # Pipeline: SHAP analysis -> LLM rewrite of the raw key phrase -> LLM
    # Markdown explanation -> encyclopedia definitions for the top words.
    # Failures are reported as an HTTP 500 with a message, matching the
    # error-handling style of the other analysis endpoints.
    try:
        analysis = analyze_clause_with_shap(payload.text)
        # The SHAP key phrase is a mechanical word window, so the LLM is asked
        # to turn it into one natural sentence first.
        natural_phrase_prompt = f"다음 문장은 컴퓨터 분석 결과로 어색합니다. 핵심 의미는 유지하되, 자연스러운 한국어 한 문장으로 다듬어 주세요: '{analysis['awkward_key_phrase']}'"
        natural_key_phrase = llm.predict(natural_phrase_prompt)
        explanation_prompt = f"""[분석 대상 조항]\n"{payload.text}"\n\n[AI 분석 결과]\n- 이 조항은 '{analysis['prediction']}'하다고 판단됩니다.\n- 판단 근거는 '{natural_key_phrase}' 내용입니다.\n\n[요청 작업]\nAI 법률 전문가로서, 위 분석 결과를 바탕으로 아래 항목에 대해 Markdown 형식으로 설명해주세요.\n\n- **핵심 문제점/혜택**:\n- **소비자에게 미치는 영향**:\n- **권장 대응 방안**:"""
        llm_explanation = llm.predict(explanation_prompt)
        definitions = [{"word": w, "definition": search_naver_encyc(w)} for w in analysis['top_words']]
        return {"prediction": analysis['prediction'], "key_phrase": natural_key_phrase, "llm_explanation": llm_explanation, "keywords_definitions": definitions}
    except Exception as e:
        raise HTTPException(500, f"SHAP 분석 오류: {e}")

@app.post("/analyze-terms-with-langchain", summary="[기능 3] LangChain 전체 약관 분석 (기존 기능)")
async def endpoint_analyze_terms_with_langchain(payload: LangchainIn):
    # Clauses and labels are parallel lists, so their lengths must match.
    clause_count, label_count = len(payload.clauses), len(payload.labels)
    if clause_count != label_count:
        raise HTTPException(400, "clauses와 labels 개수 불일치")
    try:
        return run_terms_analysis(payload.terms_text, payload.clauses, payload.labels)
    except Exception as e:
        raise HTTPException(500, f"LangChain 분석 오류: {e}")

@app.post("/analyze-full-terms-top3", summary="[기능 4] 전체 약관 Top 3 필터링 (신규 추가)")
async def endpoint_analyze_full_terms_top3(payload: FullTermsIn):
    # Split the document into clauses, score each one, then return the three
    # highest-probability clauses for each class.
    clauses = split_clauses(payload.full_text)
    if not clauses:
        raise HTTPException(400, "분석할 조항을 찾을 수 없음")

    scored = []
    for clause in clauses:
        scored.append(dict(text=clause, **predict_clause_probabilities(clause)))

    def top3_by(prob_key):
        # Highest probability first; ties keep document order.
        return sorted(scored, key=lambda row: row[prob_key], reverse=True)[:3]

    return {
        "total_clauses_found": len(clauses),
        "top_favorable_clauses": top3_by('prob_favorable'),
        "top_unfavorable_clauses": top3_by('prob_unfavorable'),
    }

@app.get("/", summary="서버 상태 확인")
def read_root():
    # Health check: returns a fixed status message.
    status_message = "약관 분석 API 서버 v5.0 (모든 기능 통합)이 정상 동작 중입니다."
    return {"status": status_message}