In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import pandas as pd
contextual_df = pd.read_csv('/content/drive/MyDrive/졸업논문/data/processed/contextual_merged.csv')
paraphrase_df = pd.read_csv('/content/drive/MyDrive/졸업논문/data/processed/paraphrase_merged.csv')
templete_df = pd.read_csv('/content/drive/MyDrive/졸업논문/data/processed/template_merged.csv')
contextual_df['predicate'].fillna('Not Dark Pattern', inplace=True)
paraphrase_df['predicate'].fillna('Not Dark Pattern', inplace=True)
templete_df['predicate'].fillna('Not Dark Pattern', inplace=True)
contextual_df['predicate'].unique()
paraphrase_df['predicate'].unique()
templete_df['predicate'].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  contextual_df['predicate'].fillna('Not Dark Pattern', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  paraphrase_df['predicate'].fillna('Not Dark Pattern', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

array(['Not Dark Pattern', 'Activity Notifications', 'Countdown Timers',
       'Limited-time Messages', 'Low-stock Messages', 'Confirmshaming',
       'Pressured Selling', 'High-demand Messages', 'Trick Questions',
       'Testimonials of Uncertain Origin'], dtype=object)

In [16]:
pip install pandas scikit-learn torch transformers datasets



In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
import joblib

# ==============================================================================
# 1. 예시 데이터프레임 (contextual_df는 미리 정의되어 있어야 함)
# ==============================================================================
datasets = {
    "contextual": contextual_df
}
print("✅ 예시 데이터프레임 준비 완료.")

# ==============================================================================
# 2. 파인튜닝 파이프라인 함수 정의
# ==============================================================================
def fine_tune_and_evaluate(df, dataset_name, text_col='String', label_col='predicate'):
    print("\n" + "="*50)
    print(f"   [{dataset_name}] 데이터셋 파인튜닝 시작")
    print("="*50)

    # --- LabelEncoder ---
    le = LabelEncoder()
    df['label'] = le.fit_transform(df[label_col])
    num_labels = len(le.classes_)

    # --- 데이터 분할 ---
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # --- 토크나이저 ---
    model_name = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize_function(examples):
        return tokenizer(examples[text_col], padding="max_length", truncation=True)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # --- 모델 로드 ---
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # --- 평가 지표 ---
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, predictions),
            'f1': f1_score(labels, predictions, average='weighted'),
            'precision': precision_score(labels, predictions, average='weighted', zero_division=0),
            'recall': recall_score(labels, predictions, average='weighted', zero_division=0)
        }

    # --- 절대경로 지정 ---
    output_dir = f"/content/drive/MyDrive/졸업논문/models/{dataset_name}_model"
    os.makedirs(output_dir, exist_ok=True)

    # --- 학습 설정 ---
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        logging_steps=1,
        learning_rate=5e-5,
        report_to="none"
    )

    # --- Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    print("\n--- 모델 파인튜닝 시작 ---")
    trainer.train()
    print("✅ 모델 파인튜닝 완료.")

    # --- 모델 저장 ---
    print(f"\n--- 최종 모델 저장 중... 경로: {output_dir} ---")
    trainer.save_model(output_dir)
    print(f"✅ 모델 저장 완료.")

    # --- 토크나이저 저장 추가 ---
    tokenizer.save_pretrained(output_dir)  # 반드시 추가

    # --- LabelEncoder 저장 ---
    le_path = os.path.join(output_dir, f"label_encoder_{dataset_name}.pkl")
    joblib.dump(le, le_path)
    print(f"✅ LabelEncoder 저장 완료 → {le_path}")

    # --- 최종 성능 평가 ---
    print("\n--- 최종 성능 평가 ---")
    eval_results = trainer.evaluate()
    print(f"[{dataset_name}] 최종 성능:")
    print(f"  Accuracy = {eval_results['eval_accuracy']:.4f}")
    print(f"  Precision = {eval_results['eval_precision']:.4f}")
    print(f"  Recall = {eval_results['eval_recall']:.4f}")
    print(f"  F1-Score = {eval_results['eval_f1']:.4f}")

    return eval_results, le

# ==============================================================================
# 3. 메인 실행 블록
# ==============================================================================
all_results = {}
label_encoders = {}

for name, df in datasets.items():
    results, le = fine_tune_and_evaluate(df, name)
    all_results[name] = results
    label_encoders[name] = le

print("\n\n" + "="*50)
print("         모든 데이터셋에 대한 최종 성능 요약")
print("="*50)
for name, result in all_results.items():
    print(f"[{name}] Accuracy: {result['eval_accuracy']:.4f}, "
          f"Precision: {result['eval_precision']:.4f}, "
          f"Recall: {result['eval_recall']:.4f}, "
          f"F1-Score: {result['eval_f1']:.4f}")

✅ 예시 데이터프레임 준비 완료.

   [contextual] 데이터셋 파인튜닝 시작


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- 모델 파인튜닝 시작 ---


Step,Training Loss
1,2.5554
2,2.554
3,2.426
4,2.3541
5,2.0486
6,2.2207
7,1.8087
8,2.5137
9,1.675
10,1.8768


✅ 모델 파인튜닝 완료.

--- 최종 모델 저장 중... 경로: /content/drive/MyDrive/졸업논문/models/contextual_model ---
✅ 모델 저장 완료.
✅ LabelEncoder 저장 완료 → /content/drive/MyDrive/졸업논문/models/contextual_model/label_encoder_contextual.pkl

--- 최종 성능 평가 ---


[contextual] 최종 성능:
  Accuracy = 0.9717
  Precision = 0.9712
  Recall = 0.9717
  F1-Score = 0.9709


         모든 데이터셋에 대한 최종 성능 요약
[contextual] Accuracy: 0.9717, Precision: 0.9712, Recall: 0.9717, F1-Score: 0.9709


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
import joblib

# ==============================================================================
# 1. 예시 데이터프레임 (contextual_df는 미리 정의되어 있어야 함)
# ==============================================================================
datasets = {
    "paraphrase": paraphrase_df
}
print("✅ 예시 데이터프레임 준비 완료.")

# ==============================================================================
# 2. 파인튜닝 파이프라인 함수 정의
# ==============================================================================
def fine_tune_and_evaluate(df, dataset_name, text_col='String', label_col='predicate'):
    print("\n" + "="*50)
    print(f"   [{dataset_name}] 데이터셋 파인튜닝 시작")
    print("="*50)

    # --- LabelEncoder ---
    le = LabelEncoder()
    df['label'] = le.fit_transform(df[label_col])
    num_labels = len(le.classes_)

    # --- 데이터 분할 ---
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # --- 토크나이저 ---
    model_name = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize_function(examples):
        return tokenizer(examples[text_col], padding="max_length", truncation=True)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # --- 모델 로드 ---
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # --- 평가 지표 ---
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, predictions),
            'f1': f1_score(labels, predictions, average='weighted'),
            'precision': precision_score(labels, predictions, average='weighted', zero_division=0),
            'recall': recall_score(labels, predictions, average='weighted', zero_division=0)
        }

    # --- 절대경로 지정 ---
    output_dir = f"/content/drive/MyDrive/졸업논문/models/{dataset_name}_model"
    os.makedirs(output_dir, exist_ok=True)

    # --- 학습 설정 ---
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        logging_steps=1,
        learning_rate=5e-5,
        report_to="none"
    )

    # --- Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    print("\n--- 모델 파인튜닝 시작 ---")
    trainer.train()
    print("✅ 모델 파인튜닝 완료.")

    # --- 모델 저장 ---
    print(f"\n--- 최종 모델 저장 중... 경로: {output_dir} ---")
    trainer.save_model(output_dir)
    print(f"✅ 모델 저장 완료.")

    # --- 토크나이저 저장 추가 ---
    tokenizer.save_pretrained(output_dir)  # 반드시 추가

    # --- LabelEncoder 저장 ---
    le_path = os.path.join(output_dir, f"label_encoder_{dataset_name}.pkl")
    joblib.dump(le, le_path)
    print(f"✅ LabelEncoder 저장 완료 → {le_path}")

    # --- 최종 성능 평가 ---
    print("\n--- 최종 성능 평가 ---")
    eval_results = trainer.evaluate()
    print(f"[{dataset_name}] 최종 성능:")
    print(f"  Accuracy = {eval_results['eval_accuracy']:.4f}")
    print(f"  Precision = {eval_results['eval_precision']:.4f}")
    print(f"  Recall = {eval_results['eval_recall']:.4f}")
    print(f"  F1-Score = {eval_results['eval_f1']:.4f}")

    return eval_results, le

# ==============================================================================
# 3. 메인 실행 블록
# ==============================================================================
all_results = {}
label_encoders = {}

for name, df in datasets.items():
    results, le = fine_tune_and_evaluate(df, name)
    all_results[name] = results
    label_encoders[name] = le

print("\n\n" + "="*50)
print("         모든 데이터셋에 대한 최종 성능 요약")
print("="*50)
for name, result in all_results.items():
    print(f"[{name}] Accuracy: {result['eval_accuracy']:.4f}, "
          f"Precision: {result['eval_precision']:.4f}, "
          f"Recall: {result['eval_recall']:.4f}, "
          f"F1-Score: {result['eval_f1']:.4f}")

✅ 예시 데이터프레임 준비 완료.

   [paraphrase] 데이터셋 파인튜닝 시작


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- 모델 파인튜닝 시작 ---


Step,Training Loss
1,2.5554
2,2.554
3,2.1603
4,2.4104
5,1.9639
6,2.2667
7,1.7898
8,2.4174
9,1.6662
10,1.7395


✅ 모델 파인튜닝 완료.

--- 최종 모델 저장 중... 경로: /content/drive/MyDrive/졸업논문/models/paraphrase_model ---
✅ 모델 저장 완료.
✅ LabelEncoder 저장 완료 → /content/drive/MyDrive/졸업논문/models/paraphrase_model/label_encoder_paraphrase.pkl

--- 최종 성능 평가 ---


[paraphrase] 최종 성능:
  Accuracy = 0.9633
  Precision = 0.9634
  Recall = 0.9633
  F1-Score = 0.9630


         모든 데이터셋에 대한 최종 성능 요약
[paraphrase] Accuracy: 0.9633, Precision: 0.9634, Recall: 0.9633, F1-Score: 0.9630


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
import joblib

# ==============================================================================
# 1. 예시 데이터프레임 (contextual_df는 미리 정의되어 있어야 함)
# ==============================================================================
datasets = {
    "templete": templete_df
}
print("✅ 예시 데이터프레임 준비 완료.")

# ==============================================================================
# 2. 파인튜닝 파이프라인 함수 정의
# ==============================================================================
def fine_tune_and_evaluate(df, dataset_name, text_col='String', label_col='predicate'):
    print("\n" + "="*50)
    print(f"   [{dataset_name}] 데이터셋 파인튜닝 시작")
    print("="*50)

    # --- LabelEncoder ---
    le = LabelEncoder()
    df['label'] = le.fit_transform(df[label_col])
    num_labels = len(le.classes_)

    # --- 데이터 분할 ---
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # --- 토크나이저 ---
    model_name = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize_function(examples):
        return tokenizer(examples[text_col], padding="max_length", truncation=True)
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # --- 모델 로드 ---
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # --- 평가 지표 ---
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, predictions),
            'f1': f1_score(labels, predictions, average='weighted'),
            'precision': precision_score(labels, predictions, average='weighted', zero_division=0),
            'recall': recall_score(labels, predictions, average='weighted', zero_division=0)
        }

    # --- 절대경로 지정 ---
    output_dir = f"/content/drive/MyDrive/졸업논문/models/{dataset_name}_model"
    os.makedirs(output_dir, exist_ok=True)

    # --- 학습 설정 ---
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        logging_steps=1,
        learning_rate=5e-5,
        report_to="none"
    )

    # --- Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    print("\n--- 모델 파인튜닝 시작 ---")
    trainer.train()
    print("✅ 모델 파인튜닝 완료.")

    # --- 모델 저장 ---
    print(f"\n--- 최종 모델 저장 중... 경로: {output_dir} ---")
    trainer.save_model(output_dir)
    print(f"✅ 모델 저장 완료.")

    # --- 토크나이저 저장 추가 ---
    tokenizer.save_pretrained(output_dir)  # 반드시 추가

    # --- LabelEncoder 저장 ---
    le_path = os.path.join(output_dir, f"label_encoder_{dataset_name}.pkl")
    joblib.dump(le, le_path)
    print(f"✅ LabelEncoder 저장 완료 → {le_path}")

    # --- 최종 성능 평가 ---
    print("\n--- 최종 성능 평가 ---")
    eval_results = trainer.evaluate()
    print(f"[{dataset_name}] 최종 성능:")
    print(f"  Accuracy = {eval_results['eval_accuracy']:.4f}")
    print(f"  Precision = {eval_results['eval_precision']:.4f}")
    print(f"  Recall = {eval_results['eval_recall']:.4f}")
    print(f"  F1-Score = {eval_results['eval_f1']:.4f}")

    return eval_results, le

# ==============================================================================
# 3. 메인 실행 블록
# ==============================================================================
all_results = {}
label_encoders = {}

for name, df in datasets.items():
    results, le = fine_tune_and_evaluate(df, name)
    all_results[name] = results
    label_encoders[name] = le

print("\n\n" + "="*50)
print("         모든 데이터셋에 대한 최종 성능 요약")
print("="*50)
for name, result in all_results.items():
    print(f"[{name}] Accuracy: {result['eval_accuracy']:.4f}, "
          f"Precision: {result['eval_precision']:.4f}, "
          f"Recall: {result['eval_recall']:.4f}, "
          f"F1-Score: {result['eval_f1']:.4f}")

✅ 예시 데이터프레임 준비 완료.

   [templete] 데이터셋 파인튜닝 시작


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- 모델 파인튜닝 시작 ---


Step,Training Loss
1,2.5554
2,2.554
3,2.4054
4,2.3673
5,2.0484
6,2.2107
7,1.7892
8,2.4812
9,1.7363
10,1.9073


✅ 모델 파인튜닝 완료.

--- 최종 모델 저장 중... 경로: /content/drive/MyDrive/졸업논문/models/templete_model ---
✅ 모델 저장 완료.
✅ LabelEncoder 저장 완료 → /content/drive/MyDrive/졸업논문/models/templete_model/label_encoder_templete.pkl

--- 최종 성능 평가 ---


[templete] 최종 성능:
  Accuracy = 0.9700
  Precision = 0.9704
  Recall = 0.9700
  F1-Score = 0.9692


         모든 데이터셋에 대한 최종 성능 요약
[templete] Accuracy: 0.9700, Precision: 0.9704, Recall: 0.9700, F1-Score: 0.9692


In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib, torch

dataset_name = "contextual"
model_dir = f"/content/drive/MyDrive/졸업논문/models/{dataset_name}_model"

# 모델/토크나이저 로드
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# LabelEncoder 로드
le = joblib.load(f"{model_dir}/label_encoder_{dataset_name}.pkl")

# 새 문장 예측
new_text = "This is fantastic!"
inputs = tokenizer(new_text, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    logits = model(**inputs).logits

pred_id = torch.argmax(logits, dim=-1).item()
pred_label = le.inverse_transform([pred_id])[0]

print(f"예측 결과: {pred_label}")

예측 결과: Not Dark Pattern


In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib, torch

dataset_name = "paraphrase"
model_dir = f"/content/drive/MyDrive/졸업논문/models/{dataset_name}_model"

# 모델/토크나이저 로드
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# LabelEncoder 로드
le = joblib.load(f"{model_dir}/label_encoder_{dataset_name}.pkl")

# 새 문장 예측
new_text = "This is fantastic!"
inputs = tokenizer(new_text, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    logits = model(**inputs).logits

pred_id = torch.argmax(logits, dim=-1).item()
pred_label = le.inverse_transform([pred_id])[0]

print(f"예측 결과: {pred_label}")

예측 결과: Not Dark Pattern


In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib, torch

dataset_name = "templete"
model_dir = f"/content/drive/MyDrive/졸업논문/models/{dataset_name}_model"

# 모델/토크나이저 로드
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)  # 이제 오류 없음

# LabelEncoder 로드
le = joblib.load(f"{model_dir}/label_encoder_{dataset_name}.pkl")

# 새 문장 예측
new_text = "This is fantastic!"
inputs = tokenizer(new_text, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    logits = model(**inputs).logits

pred_id = torch.argmax(logits, dim=-1).item()
pred_label = le.inverse_transform([pred_id])[0]

print(f"예측 결과: {pred_label}")

예측 결과: Not Dark Pattern
