In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ======================================================
# Cell 1. Library installation and basic setup
# ======================================================
# Install PyTorch Geometric (PyG) and its companion libraries
import torch
import os

# Install the PyG extension wheels that match the running PyTorch version.
# IPython substitutes {torch.__version__} into the shell command, so the
# wheel index URL follows whatever torch build is installed.
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-geometric

# Install Hugging Face and the remaining libraries
!pip install -q transformers datasets scikit-learn pandas numpy matplotlib seaborn networkx

# Basic setup: use the GPU when one is available, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# ======================================================
# Cell 2. 데이터 로드 및 전처리
# ======================================================
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Locations of the three processed CSV files on the mounted drive
base_path = '/content/drive/MyDrive/졸업논문/data/processed/'
contextual_path = os.path.join(base_path, 'contextual_merged.csv')
paraphrase_path = os.path.join(base_path, 'paraphrase_merged.csv')
template_path = os.path.join(base_path, 'template_merged.csv')

# Load all three frames together; if any file is missing, every frame is
# set to None so the main loop can detect the failure and skip training.
try:
    contextual_df, paraphrase_df, template_df = [
        pd.read_csv(csv_path)
        for csv_path in (contextual_path, paraphrase_path, template_path)
    ]
except FileNotFoundError as err:
    print(f"🚨 파일 로드 오류: {err}")
    print("파일 경로를 다시 확인해주세요. 예: /content/drive/MyDrive/path/to/your/data.csv")
    contextual_df, paraphrase_df, template_df = None, None, None
else:
    print("✅ 데이터셋 로드 완료.")

# Dataset name -> DataFrame (or None when loading failed)
datasets = dict(
    contextual=contextual_df,
    paraphrase=paraphrase_df,
    template=template_df,
)

# ======================================================
# Cell 3. GNN 모델 및 학습 루프 정의
# ======================================================
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel
from sklearn.neighbors import NearestNeighbors
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from torch_geometric.utils import to_networkx

# --- GCN 모델 정의 ---
class GCNClassifier(torch.nn.Module):
    """Stack of GCNConv layers followed by a linear classification head.

    Args:
        in_dim: Feature size of the input node vectors.
        hidden_dim: Output size of every graph convolution.
        out_dim: Number of logits produced per node.
        n_layers: Number of graph convolutions; at least one is always built.
        dropout: Dropout probability applied after each convolution.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, n_layers=2, dropout=0.5):
        super().__init__()
        # The first layer consumes the raw features; the rest map hidden -> hidden.
        layer_inputs = [in_dim] + [hidden_dim] * (n_layers - 1)
        self.convs = torch.nn.ModuleList(
            [GCNConv(width, hidden_dim) for width in layer_inputs]
        )
        self.lin = torch.nn.Linear(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return per-node logits for features ``x`` on graph ``edge_index``."""
        hidden = x
        for layer in self.convs:
            activated = F.relu(layer(hidden, edge_index))
            # Dropout is only active while the module is in training mode.
            hidden = F.dropout(activated, p=self.dropout, training=self.training)
        return self.lin(hidden)

# --- 평균 풀링 함수 ---
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions enabled by the mask.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (presumably shaped batch x tokens x dim).
        attention_mask: Mask with one entry per token; masked-out tokens
            (value 0) do not contribute to the average.

    Returns:
        One pooled vector per sequence. The divisor is clamped to 1e-9 so a
        fully masked sequence yields zeros instead of a division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts

# --- 학습 및 평가 파이프라인 함수 ---
def run_gnn_pipeline(df, dataset_name, base_save_dir):
    """Train and evaluate a kNN-graph GCN classifier on one dataset.

    The pipeline embeds every sentence with a sentence-transformer, links
    each sentence to its nearest neighbours (cosine distance), trains a
    ``GCNClassifier`` on a random 80/20 node split and finally writes the
    best checkpoint, per-sentence predictions, a confusion matrix and a
    drawing of the graph below ``base_save_dir``.

    Args:
        df: DataFrame with a ``sentence`` column (text) and a ``predicate``
            column (label; missing values count as "Not Dark Pattern").
            The frame itself is not modified.
        dataset_name: Short name used in log messages and in the output
            directory name ``gnn_<dataset_name>_model``.
        base_save_dir: Directory under which the output directory is created.

    Raises:
        KeyError: If ``df`` lacks the ``sentence`` or ``predicate`` column.
        ValueError: If ``df`` has no rows.
    """
    # Validate the schema first so a wrong CSV fails fast with a readable
    # message, before any directory is created or any model is downloaded.
    missing_cols = [col for col in ('sentence', 'predicate') if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"[{dataset_name}] missing required column(s) {missing_cols}; "
            f"available columns: {list(df.columns)}"
        )
    if len(df) == 0:
        raise ValueError(f"[{dataset_name}] dataset has no rows")

    print("\n" + "="*20)
    print(f"🚀 {dataset_name.upper()} 데이터셋 GNN 파이프라인 실행 시작")
    print("="*20)

    # --- 0. Hyper-parameters ---
    EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    K_NEIGHBORS = 8
    HIDDEN_DIM = 128
    GNN_LAYERS = 2
    LR = 0.01
    EPOCHS = 200
    BATCH_SIZE = 32

    # --- 1. Output directory ---
    SAVE_DIR = os.path.join(base_save_dir, f"gnn_{dataset_name}_model")
    os.makedirs(SAVE_DIR, exist_ok=True)
    model_path = os.path.join(SAVE_DIR, "gnn_model.pt")
    print(f"결과 저장 경로: {SAVE_DIR}")

    # --- 2. Label encoding ---
    target_name = "Not Dark Pattern"
    # Fill on a derived Series instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and stops working in pandas 3.0.
    labels_raw = df['predicate'].fillna(target_name).tolist()
    sentences = df['sentence'].tolist()

    # 1 = "Not Dark Pattern", 0 = any dark-pattern predicate.
    labels_enc = np.array([1 if label == target_name else 0 for label in labels_raw])
    # The task is binary by construction. Deriving the class count from the
    # labels that happen to be present would build a 1-logit model (and break
    # the report) whenever a dataset contains a single class.
    num_classes = 2
    class_names = np.array(['Dark Pattern (0)', f'{target_name} (1)'])

    print(f"데이터 샘플 수: {len(sentences)}")
    print(f"클래스 수: {num_classes}")
    print(f"클래스: {class_names}")

    # --- 3. Sentence embeddings ---
    embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL)
    embed_model = AutoModel.from_pretrained(EMBED_MODEL).to(DEVICE)
    embed_model.eval()

    all_embeddings = []
    for start in tqdm(range(0, len(sentences), BATCH_SIZE), desc=f"[{dataset_name}] Embedding"):
        batch_texts = sentences[start:start + BATCH_SIZE]
        inputs = embed_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            outputs = embed_model(**inputs)
            batch_emb = mean_pooling(outputs, inputs['attention_mask'])
        all_embeddings.append(batch_emb.cpu().numpy())
    emb = np.vstack(all_embeddings)
    print(f"임베딩 Shape: {emb.shape}")

    # --- 4. kNN graph ---
    n_neighbors = min(K_NEIGHBORS, len(emb) - 1)
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, metric="cosine").fit(emb)
    _, indices = nbrs.kneighbors(emb)

    edge_index_list = []
    for i, row in enumerate(indices):
        # Drop the node itself explicitly: with duplicate sentences the query
        # point is not guaranteed to come back as the first neighbour, and
        # slicing off position 0 would then create a self-loop.
        neighbours = [int(j) for j in row if j != i][:n_neighbors]
        for j in neighbours:
            edge_index_list.append((i, j))
            edge_index_list.append((j, i))
    # reshape keeps the (2, E) layout even when there are no edges at all.
    edge_index = np.array(edge_index_list, dtype=np.int64).reshape(-1, 2).T
    print(f"Edge index Shape: {edge_index.shape}")

    # --- 5. PyG Data object and random 80/20 node split ---
    x = torch.tensor(emb, dtype=torch.float)
    y = torch.tensor(labels_enc, dtype=torch.long)
    edge_index_t = torch.tensor(edge_index, dtype=torch.long)

    data = Data(x=x, edge_index=edge_index_t, y=y)

    num_nodes = data.num_nodes
    perm = np.random.permutation(num_nodes)
    train_ratio = 0.8
    n_train = int(train_ratio * num_nodes)
    train_idx, val_idx = perm[:n_train], perm[n_train:]

    data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    data.train_mask[train_idx] = True
    data.val_mask[val_idx] = True
    print(data)

    # --- 6. Model, optimizer, loss ---
    model_gnn = GCNClassifier(in_dim=x.size(1), hidden_dim=HIDDEN_DIM, out_dim=num_classes, n_layers=GNN_LAYERS).to(DEVICE)
    optimizer = torch.optim.Adam(model_gnn.parameters(), lr=LR, weight_decay=1e-5)
    criterion = torch.nn.CrossEntropyLoss()
    data = data.to(DEVICE)

    # --- 7. Training loop ---
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; starting at 0 with a strict ">" left no file (or a stale
    # one from an earlier run) whenever validation F1 never rose above 0.
    best_val_f1 = -1.0
    best_epoch = 0
    print("\n GNN 모델 학습 시작...")
    for epoch in range(1, EPOCHS + 1):
        model_gnn.train()
        optimizer.zero_grad()
        out = model_gnn(data.x, data.edge_index)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        # Validation on the held-out nodes (full-graph forward pass).
        model_gnn.eval()
        with torch.no_grad():
            out_val = model_gnn(data.x, data.edge_index)
            preds_val = out_val[data.val_mask].argmax(dim=1).cpu().numpy()
            true_val = data.y[data.val_mask].cpu().numpy()
            val_acc = accuracy_score(true_val, preds_val)
            # average='binary' scores label 1, i.e. the "Not Dark Pattern" class.
            _, _, f, _ = precision_recall_fscore_support(true_val, preds_val, average='binary', zero_division=0)

        if f > best_val_f1:
            best_val_f1 = f
            best_epoch = epoch
            torch.save(model_gnn.state_dict(), model_path)

        if epoch % 20 == 0:
            print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Val Acc: {val_acc:.4f} | Val F1: {f:.4f}")

    print(f"✅ 학습 완료. Best F1: {best_val_f1:.4f} at Epoch {best_epoch}")
    print(f"최적 모델 저장 완료: {model_path}")

    # --- 8. Final evaluation and artefacts ---
    # Reload the best checkpoint onto the device the model lives on.
    model_gnn.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model_gnn.eval()
    with torch.no_grad():
        out_final = model_gnn(data.x, data.edge_index)
        preds = out_final.argmax(dim=1).cpu().numpy()

    # Per-sentence predictions
    df_res = pd.DataFrame({
        "sentence": sentences,
        "true_label": labels_raw,
        "true_label_enc": labels_enc,
        "pred_label_enc": preds
    })
    pred_path = os.path.join(SAVE_DIR, "gnn_predictions.csv")
    df_res.to_csv(pred_path, index=False, encoding="utf-8-sig")
    print(f"예측 결과 저장 완료: {pred_path}")

    # NOTE(review): the report below covers ALL nodes, training nodes
    # included, so it is not a held-out estimate; the validation F1 printed
    # above is the held-out figure.
    # Passing labels explicitly keeps the report and the 2x2 matrix
    # well-formed even if one class is absent from the data or predictions.
    class_ids = [0, 1]
    report = classification_report(
        labels_enc,
        preds,
        labels=class_ids,
        target_names=[str(name) for name in class_names],
        zero_division=0,
    )
    print("\n" + "="*15, "최종 성능 평가 리포트", "="*15)
    print(report)
    print("="*55)

    # Confusion matrix heat map
    cm = confusion_matrix(labels_enc, preds, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'GNN Model Confusion Matrix ({dataset_name})')
    cm_path = os.path.join(SAVE_DIR, "gnn_confusion_matrix.png")
    plt.savefig(cm_path)
    plt.show()
    plt.close()  # release the figure; the pipeline runs once per dataset
    print(f"Confusion Matrix 저장 완료: {cm_path}")

    # kNN graph drawing, nodes coloured by true label
    G = to_networkx(data.cpu(), to_undirected=True)
    color_map = ['#ff7f0e' if label == 0 else '#1f77b4' for label in data.y.cpu().numpy()]
    plt.figure(figsize=(15, 15))
    pos = nx.spring_layout(G, iterations=50, seed=42)
    nx.draw(G, pos, node_color=color_map, with_labels=False, node_size=60, width=0.5, edge_color='grey')
    plt.title(f'kNN Graph of Sentence Embeddings ({dataset_name})', fontsize=20)

    # Empty scatter artists serve only as legend handles.
    blue_patch = plt.scatter([], [], c='#1f77b4', label=f'Class 1 ({target_name})')
    orange_patch = plt.scatter([], [], c='#ff7f0e', label='Class 0 (Dark Pattern)')
    plt.legend(handles=[blue_patch, orange_patch], loc='upper right', fontsize=14, title='Labels')

    graph_path = os.path.join(SAVE_DIR, "gnn_graph.png")
    plt.savefig(graph_path, dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()
    print(f"GNN 그래프 저장 완료: {graph_path}")
    print(f"🎉 {dataset_name.upper()} 데이터셋 파이프라인 종료.")


# ======================================================
# Cell 4. 메인 실행 루프
# ======================================================
base_save_directory = '/content/drive/MyDrive/졸업논문/models/'

# Run the pipeline only when every dataset loaded; a single missing file
# aborts the whole batch, while an empty frame is skipped on its own.
if any(frame is None for frame in datasets.values()):
    print("🚨 데이터셋 로드에 실패하여 GNN 파이프라인을 실행할 수 없습니다.")
else:
    for name, df in datasets.items():
        if df is None or df.empty:
            print(f"⚠️ {name} 데이터셋이 비어있거나 로드되지 않아 건너뜁니다.")
            continue
        # Hand over a copy so the loaded frame stays untouched.
        run_gnn_pipeline(df.copy(), name, base_save_directory)

Looking in links: https://data.pyg.org/whl/torch-2.8.0+cu126.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_scatter-2.1.2%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_sparse-0.6.18%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_cluster-1.6.3%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-spline-conv
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_s

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['predicate'].fillna('Not Dark Pattern', inplace=True)


KeyError: 'sentence'