In [None]:
! pip install --upgrade transformers
! pip install --upgrade accelerate



In [None]:
!pip install transformers torch sentencepiece



In [None]:
!pip install -q -U transformers accelerate datasets

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# ✅ STEP 1) 데이터 로드 및 전처리

In [None]:
# Google review export. NOTE(review): the path lives under /content/drive, so this
# presumably needs Google Drive to be mounted in the Colab session first - confirm.
GOOGLE_REVIEWS_CSV = '/content/drive/MyDrive/멋쟁이사자처럼_DAB6/데이터톤/데이터/리뷰 데이터/googlever.csv'
google = pd.read_csv(GOOGLE_REVIEWS_CSV)

In [None]:
# Naver review export. NOTE(review): the path lives under /content/drive, so this
# presumably needs Google Drive to be mounted in the Colab session first - confirm.
NAVER_REVIEWS_CSV = '/content/drive/MyDrive/멋쟁이사자처럼_DAB6/데이터톤/데이터/리뷰 데이터/naver_review.csv'
naver = pd.read_csv(NAVER_REVIEWS_CSV)

In [None]:
# Text and rating columns of the Google review frame
text_col = 'content'
rating_col = 'rating'

# === 2) Rating -> binary label: 5 stars = 1 (positive), anything else = 0 (negative) ===
google["label"] = google[rating_col].map(lambda rating: int(float(rating) == 5))

# === 3) Stratified 80/20 train/validation split with a fixed seed ===
train_text, val_text, train_label, val_label = train_test_split(
    google[text_col],
    google["label"],
    test_size=0.2,
    random_state=42,
    stratify=google["label"],
)

------
## 1번째 모델 ("beomi/KcELECTRA-base")

In [None]:
import os, re, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
)
from pathlib import Path

In [None]:
# =========================================================
# Sentiment classification with an ELECTRA-family encoder (KcELECTRA / KoELECTRA)
# - Google reviews: 5 stars -> label 1, anything else -> label 0; used for train/validation
# - Reports accuracy / precision / recall / F1, a classification report and a confusion matrix
# - NOTE(review): an earlier header also advertised EarlyStopping and
#   load_best_model_at_end; the TrainingArguments used in this section set neither.
# =========================================================

# ------------------------------
# 0) Column settings
# ------------------------------
text_col = "content"        # Google review text column
rating_col = "rating"       # Google rating column
text_col_naver = "리뷰"      # Naver review text column

# Change only this checkpoint id to experiment with another model
MODEL_NAME = "beomi/KcELECTRA-base"
# MODEL_NAME = "daekeun-ml/koelectra-small-v3-nsmc"

# ------------------------------
# 1) Labeling (5 stars = 1, else = 0)
# ------------------------------
google["label"] = google[rating_col].map(lambda rating: int(float(rating) == 5))

def clean_text_series(s: pd.Series):
    """Return ``s`` as strings with missing values blanked and whitespace normalized.

    Missing entries become "", every run of whitespace collapses to a single
    space, and leading/trailing whitespace is stripped. No other character
    cleanup is applied.
    """
    as_text = s.fillna("").astype(str)
    single_spaced = as_text.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

# Normalize whitespace in both review corpora (overwrites the columns in place).
naver[text_col_naver] = clean_text_series(naver[text_col_naver])
google[text_col] = clean_text_series(google[text_col])

# ------------------------------
# 2) Train/validation split: stratified on the label, 80/20, fixed seed
# ------------------------------
train_text, val_text, train_label, val_label = train_test_split(
    google[text_col], google["label"],
    test_size=0.2, random_state=42, stratify=google["label"],
)




In [None]:
# ------------------------------
# 3) Tokenizer for the checkpoint named by MODEL_NAME
# ------------------------------
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Review texts (pandas Series or any sequence); missing values become "".
        labels: Integer class labels aligned with ``texts`` by position, or ``None``
            for an unlabeled dataset (items then carry no ``"labels"`` entry).
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` whose result
            maps names to tensors with a leading batch dimension of 1.
        max_length: Length every item is padded/truncated to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts: pd.Series, labels: pd.Series, tokenizer, max_length=160):
        # Wrapping in pd.Series lets callers pass plain lists as well as Series.
        self.texts = pd.Series(texts, dtype=object).fillna("").astype(str).tolist()
        if labels is None:
            self.labels = None
        else:
            # .values gives positional access, independent of the pandas index.
            self.labels = labels.values if isinstance(labels, pd.Series) else labels
            if len(self.labels) != len(self.texts):
                raise ValueError(
                    f"texts and labels differ in length: {len(self.texts)} vs {len(self.labels)}"
                )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop the batch dimension added by return_tensors="pt".
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        # Counted from the texts so unlabeled datasets have a length too.
        return len(self.texts)

# Wrap both splits with the current tokenizer (default max_length).
train_dataset = ReviewDataset(texts=train_text, labels=train_label, tokenizer=tokenizer)
val_dataset = ReviewDataset(texts=val_text, labels=val_label, tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/514 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# ------------------------------
# 4) Model: MODEL_NAME checkpoint with a 2-label sequence-classification head
# ------------------------------
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(device)

# ------------------------------
# 5) 지표 함수 & TrainingArguments (EarlyStopping + Best load)
# ------------------------------
from transformers.trainer_utils import IntervalStrategy

def compute_metrics(eval_pred):
    """Score predictions handed over by the Trainer.

    ``eval_pred`` unpacks into ``(predictions, labels)``. If the predictions
    arrive as a tuple, only its first element is used. The predicted class is
    the argmax over axis 1.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1"; the last three are
        computed with ``zero_division=0``.
    """
    scores, labels = eval_pred
    scores = scores[0] if isinstance(scores, tuple) else scores
    y_pred = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(labels, y_pred)}
    for name, scorer in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
        metrics[name] = scorer(labels, y_pred, zero_division=0)
    return metrics

# Quick single-epoch configuration. No periodic evaluation, checkpoint selection
# or early stopping is set up here.
args = TrainingArguments(
    output_dir="./result",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # reduced for a quick test
    weight_decay=0.01,
    warmup_ratio=0.06,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Turn off experiment-tracker integrations: with wandb installed and this left
    # unset, trainer.train() stopped at an interactive "paste an API key" prompt,
    # which blocks an unattended Run All.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Training and evaluation are launched from the following cell.

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# ------------------------------
# 6) Train & evaluate (metrics + classification report / confusion matrix)
# ------------------------------
train_out = trainer.train()
# load_best_model_at_end is not configured, so this scores the weights as they
# are at the end of training; the "Best Eval" heading is kept for output compatibility.
best_eval = trainer.evaluate()

print("\n=== Best Eval (auto-selected) ===")
for k, v in best_eval.items():
    if k.startswith("eval_") or k == "epoch":
        print(f"{k}: {v}")

# Detailed report / confusion matrix on the validation split
pred_out = trainer.predict(val_dataset)
y_true = pred_out.label_ids
y_pred = np.argmax(pred_out.predictions, axis=1)
print("\n=== Classification report ===")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

# Save model + tokenizer into a per-checkpoint subfolder. Every model section of
# this notebook wrote to ./best_model directly, so each run overwrote the last.
best_dir = Path("./best_model") / MODEL_NAME.replace("/", "__")
best_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(best_dir))
tokenizer.save_pretrained(str(best_dir))
print(f"[Saved best model] {best_dir}")


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33michaeyeon6366[0m ([33michaeyeon6366-no[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss



=== Best Eval (auto-selected) ===
eval_loss: 0.5637186169624329
eval_accuracy: 0.7236842105263158
eval_precision: 0.7386861313868613
eval_recall: 0.8433333333333334
eval_f1: 0.7875486381322957
eval_runtime: 2.9449
eval_samples_per_second: 335.496
eval_steps_per_second: 21.053
epoch: 1.0

=== Classification report ===
              precision    recall  f1-score   support

           0     0.6898    0.5387    0.6049       388
           1     0.7387    0.8433    0.7875       600

    accuracy                         0.7237       988
   macro avg     0.7142    0.6910    0.6962       988
weighted avg     0.7195    0.7237    0.7158       988

Confusion matrix:
 [[209 179]
 [ 94 506]]
[Saved best model] best_model


---
## 2번째 모델 (daekeun-ml/koelectra-small-v3-nsmc)

In [None]:
# =========================================================
# Sentiment classification with an ELECTRA-family encoder (KcELECTRA / KoELECTRA)
# - Google reviews: 5 stars -> label 1, anything else -> label 0; used for train/validation
# - Reports accuracy / precision / recall / F1, a classification report and a confusion matrix
# - Naver reviews are scored (0/1 + positive probability) in a later cell of this section
# - NOTE(review): an earlier header also advertised EarlyStopping and
#   load_best_model_at_end; the TrainingArguments used in this section set neither.
# =========================================================

# ------------------------------
# 0) Column settings
# ------------------------------
text_col = "content"        # Google review text column
rating_col = "rating"       # Google rating column
text_col_naver = "리뷰"      # Naver review text column

# Change only this checkpoint id to experiment with another model
# MODEL_NAME = "beomi/KcELECTRA-base"
MODEL_NAME = "daekeun-ml/koelectra-small-v3-nsmc"

# ------------------------------
# 1) Labeling (5 stars = 1, else = 0)
# ------------------------------
google["label"] = google[rating_col].map(lambda rating: int(float(rating) == 5))

def clean_text_series(s: pd.Series):
    """Return ``s`` as strings with missing values blanked and whitespace normalized.

    Missing entries become "", every run of whitespace collapses to a single
    space, and leading/trailing whitespace is stripped. No other character
    cleanup is applied.
    """
    as_text = s.fillna("").astype(str)
    single_spaced = as_text.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

# Normalize whitespace in both review corpora (overwrites the columns in place).
naver[text_col_naver] = clean_text_series(naver[text_col_naver])
google[text_col] = clean_text_series(google[text_col])

# ------------------------------
# 2) Train/validation split: stratified on the label, 80/20, fixed seed
# ------------------------------
train_text, val_text, train_label, val_label = train_test_split(
    google[text_col], google["label"],
    test_size=0.2, random_state=42, stratify=google["label"],
)




In [None]:
# ------------------------------
# 3) Tokenizer for the checkpoint named by MODEL_NAME
# ------------------------------
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Review texts (pandas Series or any sequence); missing values become "".
        labels: Integer class labels aligned with ``texts`` by position, or ``None``
            for an unlabeled dataset (items then carry no ``"labels"`` entry).
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` whose result
            maps names to tensors with a leading batch dimension of 1.
        max_length: Length every item is padded/truncated to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts: pd.Series, labels: pd.Series, tokenizer, max_length=160):
        # Wrapping in pd.Series lets callers pass plain lists as well as Series.
        self.texts = pd.Series(texts, dtype=object).fillna("").astype(str).tolist()
        if labels is None:
            self.labels = None
        else:
            # .values gives positional access, independent of the pandas index.
            self.labels = labels.values if isinstance(labels, pd.Series) else labels
            if len(self.labels) != len(self.texts):
                raise ValueError(
                    f"texts and labels differ in length: {len(self.texts)} vs {len(self.labels)}"
                )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop the batch dimension added by return_tensors="pt".
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        # Counted from the texts so unlabeled datasets have a length too.
        return len(self.texts)

# Wrap both splits with the current tokenizer (default max_length).
train_dataset = ReviewDataset(texts=train_text, labels=train_label, tokenizer=tokenizer)
val_dataset = ReviewDataset(texts=val_text, labels=val_label, tokenizer=tokenizer)


tokenizer_config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# ------------------------------
# 4) Model: MODEL_NAME checkpoint with a 2-label sequence-classification head
# ------------------------------
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model_2 = model_2.to(device)

# ------------------------------
# 5) 지표 함수 & TrainingArguments (EarlyStopping + Best load)
# ------------------------------
from transformers.trainer_utils import IntervalStrategy

def compute_metrics(eval_pred):
    """Score predictions handed over by the Trainer.

    ``eval_pred`` unpacks into ``(predictions, labels)``. If the predictions
    arrive as a tuple, only its first element is used. The predicted class is
    the argmax over axis 1.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1"; the last three are
        computed with ``zero_division=0``.
    """
    scores, labels = eval_pred
    scores = scores[0] if isinstance(scores, tuple) else scores
    y_pred = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(labels, y_pred)}
    for name, scorer in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
        metrics[name] = scorer(labels, y_pred, zero_division=0)
    return metrics

# Quick single-epoch configuration. No periodic evaluation, checkpoint selection
# or early stopping is set up here.
args = TrainingArguments(
    output_dir="./result",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # reduced for a quick test
    weight_decay=0.01,
    warmup_ratio=0.06,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Turn off experiment-tracker integrations: with wandb installed and this left
    # unset, trainer.train() can stop at an interactive "paste an API key" prompt,
    # which blocks an unattended Run All.
    report_to="none",
)

trainer = Trainer(
    model=model_2,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Training and evaluation are launched from the following cell.

config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/56.5M [00:00<?, ?B/s]

In [None]:

# ------------------------------
# 6) Train & evaluate (metrics + classification report / confusion matrix)
# ------------------------------
train_out = trainer.train()
# load_best_model_at_end is not configured, so this scores the weights as they
# are at the end of training; the "Best Eval" heading is kept for output compatibility.
best_eval = trainer.evaluate()

print("\n=== Best Eval (auto-selected) ===")
for k, v in best_eval.items():
    if k.startswith("eval_") or k == "epoch":
        print(f"{k}: {v}")

# Detailed report / confusion matrix on the validation split
pred_out = trainer.predict(val_dataset)
y_true = pred_out.label_ids
y_pred = np.argmax(pred_out.predictions, axis=1)
print("\n=== Classification report ===")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

# Save model + tokenizer into a per-checkpoint subfolder. Every model section of
# this notebook wrote to ./best_model directly, so each run overwrote the last.
best_dir = Path("./best_model") / MODEL_NAME.replace("/", "__")
best_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(best_dir))
tokenizer.save_pretrained(str(best_dir))
print(f"[Saved best model] {best_dir}")


Step,Training Loss



=== Best Eval (auto-selected) ===
eval_loss: 0.5483928918838501
eval_accuracy: 0.7388663967611336
eval_precision: 0.7329700272479565
eval_recall: 0.8966666666666666
eval_f1: 0.8065967016491754
eval_runtime: 1.4244
eval_samples_per_second: 693.61
eval_steps_per_second: 43.526
epoch: 1.0

=== Classification report ===
              precision    recall  f1-score   support

           0     0.7559    0.4948    0.5981       388
           1     0.7330    0.8967    0.8066       600

    accuracy                         0.7389       988
   macro avg     0.7444    0.6958    0.7024       988
weighted avg     0.7420    0.7389    0.7247       988

Confusion matrix:
 [[192 196]
 [ 62 538]]
[Saved best model] best_model


In [None]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# (optional) debugging aid: make CUDA kernel launches synchronous
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# 1) Tokenizer built from the same checkpoint id that model_2 was loaded from
MODEL_ID_2 = "daekeun-ml/koelectra-small-v3-nsmc"
tokenizer2 = AutoTokenizer.from_pretrained(MODEL_ID_2)

# 2) Device selection; try_cpu_if_error=True re-runs a failing GPU batch on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
try_cpu_if_error = True

# Inference mode on the chosen device
model_2.to(device)
model_2.eval()
print(f"Using device: {device}")

# 3) Text cleanup
def to_str_list(series: pd.Series):
    """Return the values of ``series`` as a list of whitespace-normalized strings.

    Missing values become "", whitespace runs collapse to one space, and
    leading/trailing whitespace is stripped.
    """
    as_text = series.fillna("").astype(str)
    normalized = as_text.str.replace(r"\s+", " ", regex=True).str.strip()
    return list(normalized)

text_col_naver = "리뷰"  # Naver review text column (fixed)
naver_texts = to_str_list(naver[text_col_naver])

print(f"Type of naver_texts: {type(naver_texts)}")
print(f"First 5 samples: {naver_texts[:5]}")


# 4) Batched inference with per-batch error isolation
def _score_batch(texts, run_device):
    """Score ``texts`` with model_2 on ``run_device``.

    Returns a ``(positive_probabilities, labels)`` pair of plain Python lists.
    Nothing is appended to the global result lists here, so a failure anywhere
    inside cannot leave them with different lengths.
    """
    # Must be the tokenizer that belongs to model_2.
    enc = tokenizer2(
        texts,
        padding=True,
        truncation=True,
        max_length=160,
        return_tensors="pt"
    )
    enc = {k: v.to(run_device) for k, v in enc.items()}
    logits = model_2.to(run_device)(**enc).logits
    probs = torch.softmax(logits, dim=1)[:, 1]  # probability of class 1 (positive)
    labels = (probs >= 0.5).long()
    return probs.detach().cpu().numpy().tolist(), labels.detach().cpu().numpy().tolist()


pred_labels, pos_probs = [], []
bs = 64

with torch.no_grad():
    for i in range(0, len(naver_texts), bs):
        batch = naver_texts[i:i+bs]
        result = None
        try:
            result = _score_batch(batch, device)
        except Exception as e:
            print(f"[Batch Error] idx {i}~{min(i+bs, len(naver_texts))} on {device}: {repr(e)}")
            if try_cpu_if_error and device.type == "cuda":
                # Retry only this batch on the CPU; the next GPU batch moves the
                # model back because _score_batch calls model_2.to(run_device).
                print("→ Retrying this batch on CPU...")
                try:
                    result = _score_batch(batch, torch.device("cpu"))
                except Exception as cpu_err:
                    print(f"→ CPU retry failed as well: {repr(cpu_err)}")

        if result is None:
            # Keep row alignment: skipped rows get NaN / <NA> placeholders.
            # Previously nothing was appended, so the column assignment below
            # failed with a length mismatch after any skipped batch.
            print("→ Skipping this batch. Problematic texts snapshot:")
            print(batch[:3])
            result = ([float("nan")] * len(batch), [None] * len(batch))

        batch_probs, batch_labels = result
        pos_probs.extend(batch_probs)
        pred_labels.extend(batch_labels)

assert len(pred_labels) == len(pos_probs) == len(naver), "predictions are misaligned with the Naver rows"

# 5) Save results
naver_out = naver.copy()
# Nullable Int64 so skipped rows stay <NA>; fully scored data writes the same CSV as before.
naver_out["pred_label"] = pd.array(pred_labels, dtype="Int64")
naver_out["positive_prob"] = np.array(pos_probs, dtype=float)

save_csv = "/content/naver_predict_딥러닝2.csv"
naver_out.to_csv(save_csv, index=False)
print(f"[Saved predictions] {save_csv}")

# A CPU retry on the final batch may have left the model on the CPU.
model_2.to(device)

Using device: cuda
Type of naver_texts: <class 'list'>
First 5 samples: ['겸사겸사 에버랜드 다시 방문요😇😇 🍄🥬🍄🥬🍄🥬🍄🥬🍄🥬🍄🥬🍄🥬🍄 어머낫~아들이 좋아하는 케이팝~~~열광😍😍 요즘 한참 인기있는 케이팝이죠~~~👍🥰😍 🍁🍂🍃🍁🍂🍃🍁🍂🍃🍁🍂🍃🍁🍂🍃🍁 아들이 줄이 길어도 기다려서 보고가야 한다네요😪😭😱 뜨악😱😱기다리는 시간이 1시간을 넘어서 입장요🥹🥹😂 🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩🍒🍋\u200d🟩 들어가니.....🤣🤣🤣그냥 좋아요 🤩😍🥰🤣😂😍 😅😇🤣😂🥰😍😇😅🤣🥰😍😂😇😅🥰🤣😂😍 아드님은 너무 좋았다고 😇😇😇 또 선물까지🥰👍🤩 역시 에버는 의자🪑가 필수입니다 🤣🤣🤣 🧸🐸🧸🐸🧸🐸🧸🐸🧸🐸🧸🐸🧸🐸🧸 아드님과 하루 마무리 즐겁게 잘보내고 왓지요😇😇 🌱🌹🐅🦧🐒⚘️🌱🌺💐🌼🌴🪴💐🌳☘️🥀🌲 접기', '키이스케이프와 콜라보한 방탈출이 있다고 해서 다녀왔는데 너무너무너무 재미있었습니다. 표값이 조금도 아깝지 않을 정도로 너무 재밌게 즐기고 왔습니다 !! 에버랜드에서 이 공간를 잘 활용해서 앞으로도 체험형 방탈출 많이 콜라보해주면 좋겠어요~ 그럼 표 끊어서 또 갈게요~ 접기', '에버랜드 1박2일 가족 나들이~ 화창한 날씨가 아닌 흐리고 비도 오고 아쉬움이 많긴했지만 미리 예약한 3번의 사파리투어를 통해 동물 친구들과 즐거운 시간도 보내고... 첫째날은 다행히 비가 오지 않아 야간 퍼레이드도 신나게... 우중 놀이기구도 색다른 즐거움을 주는 시간이었어요... 기다림이 많은 에버랜드는 큐패스와 부지런히 예약을 하고 온다면 가족들과 알차고 즐겁게 보낼 수 있어욤~ 가을 맞이 다양한 행사도 많으니 시간이 허락한다면 고고!!! 접기', '에버랜드에 오랜만에 갔어요^^ 오랜만에가니 새로워요 먹거리도 많고 볼거리 놀거리 진짜 최고네요 팬더랑 동물 보는데 아이가 너무좋아해서 덩달아 행복합니다 시설도 깔끔하고 관리잘되어있고 테

---
## 3번째 모델 (klue/roberta-base)

In [None]:
# =========================================================
# Sentiment classification, third model (klue/roberta-base, a RoBERTa checkpoint)
# - Google reviews: 5 stars -> label 1, anything else -> label 0; used for train/validation
# - Reports accuracy / precision / recall / F1, a classification report and a confusion matrix
# - NOTE(review): an earlier header also advertised EarlyStopping,
#   load_best_model_at_end and a Naver prediction export; the TrainingArguments
#   used in this section set neither of the first two.
# =========================================================

# ------------------------------
# 0) Column settings
# ------------------------------
text_col = "content"        # Google review text column
rating_col = "rating"       # Google rating column
text_col_naver = "리뷰"      # Naver review text column

# Change only this checkpoint id to experiment with another model
MODEL_NAME = "klue/roberta-base"

# ------------------------------
# 1) Labeling (5 stars = 1, else = 0)
# ------------------------------
google["label"] = google[rating_col].map(lambda rating: int(float(rating) == 5))

def clean_text_series(s: pd.Series):
    """Return ``s`` as strings with missing values blanked and whitespace normalized.

    Missing entries become "", every run of whitespace collapses to a single
    space, and leading/trailing whitespace is stripped. No other character
    cleanup is applied.
    """
    as_text = s.fillna("").astype(str)
    single_spaced = as_text.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

# Normalize whitespace in both review corpora (overwrites the columns in place).
naver[text_col_naver] = clean_text_series(naver[text_col_naver])
google[text_col] = clean_text_series(google[text_col])

# ------------------------------
# 2) Train/validation split: stratified on the label, 80/20, fixed seed
# ------------------------------
train_text, val_text, train_label, val_label = train_test_split(
    google[text_col], google["label"],
    test_size=0.2, random_state=42, stratify=google["label"],
)




In [None]:
# ------------------------------
# 3) Tokenizer for the checkpoint named by MODEL_NAME
# ------------------------------
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Review texts (pandas Series or any sequence); missing values become "".
        labels: Integer class labels aligned with ``texts`` by position, or ``None``
            for an unlabeled dataset (items then carry no ``"labels"`` entry).
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` whose result
            maps names to tensors with a leading batch dimension of 1.
        max_length: Length every item is padded/truncated to.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts: pd.Series, labels: pd.Series, tokenizer, max_length=160):
        # Wrapping in pd.Series lets callers pass plain lists as well as Series.
        self.texts = pd.Series(texts, dtype=object).fillna("").astype(str).tolist()
        if labels is None:
            self.labels = None
        else:
            # .values gives positional access, independent of the pandas index.
            self.labels = labels.values if isinstance(labels, pd.Series) else labels
            if len(self.labels) != len(self.texts):
                raise ValueError(
                    f"texts and labels differ in length: {len(self.texts)} vs {len(self.labels)}"
                )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # Drop the batch dimension added by return_tensors="pt".
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        # Counted from the texts so unlabeled datasets have a length too.
        return len(self.texts)

# Wrap both splits with the current tokenizer (default max_length).
train_dataset = ReviewDataset(texts=train_text, labels=train_label, tokenizer=tokenizer)
val_dataset = ReviewDataset(texts=val_text, labels=val_label, tokenizer=tokenizer)


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
# ------------------------------
# 4) Model: MODEL_NAME checkpoint with a 2-label sequence-classification head
# ------------------------------
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_3 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model_3 = model_3.to(device)

# ------------------------------
# 5) 지표 함수 & TrainingArguments (EarlyStopping + Best load)
# ------------------------------
from transformers.trainer_utils import IntervalStrategy

def compute_metrics(eval_pred):
    """Score predictions handed over by the Trainer.

    ``eval_pred`` unpacks into ``(predictions, labels)``. If the predictions
    arrive as a tuple, only its first element is used. The predicted class is
    the argmax over axis 1.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1"; the last three are
        computed with ``zero_division=0``.
    """
    scores, labels = eval_pred
    scores = scores[0] if isinstance(scores, tuple) else scores
    y_pred = np.argmax(scores, axis=1)

    metrics = {"accuracy": accuracy_score(labels, y_pred)}
    for name, scorer in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
        metrics[name] = scorer(labels, y_pred, zero_division=0)
    return metrics

# Quick single-epoch configuration. No periodic evaluation, checkpoint selection
# or early stopping is set up here.
args = TrainingArguments(
    output_dir="./result",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # reduced for a quick test
    weight_decay=0.01,
    warmup_ratio=0.06,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Turn off experiment-tracker integrations: with wandb installed and this left
    # unset, trainer.train() can stop at an interactive "paste an API key" prompt,
    # which blocks an unattended Run All.
    report_to="none",
)

trainer = Trainer(
    model=model_3,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Training and evaluation are launched from the following cell.

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# ------------------------------
# 6) Train & evaluate (metrics + classification report / confusion matrix)
# ------------------------------
train_out = trainer.train()
# load_best_model_at_end is not configured, so this scores the weights as they
# are at the end of training; the "Best Eval" heading is kept for output compatibility.
best_eval = trainer.evaluate()

print("\n=== Best Eval (auto-selected) ===")
for k, v in best_eval.items():
    if k.startswith("eval_") or k == "epoch":
        print(f"{k}: {v}")

# Detailed report / confusion matrix on the validation split
pred_out = trainer.predict(val_dataset)
y_true = pred_out.label_ids
y_pred = np.argmax(pred_out.predictions, axis=1)
print("\n=== Classification report ===")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

# Save model + tokenizer into a per-checkpoint subfolder. Every model section of
# this notebook wrote to ./best_model directly, so each run overwrote the last.
best_dir = Path("./best_model") / MODEL_NAME.replace("/", "__")
best_dir.mkdir(parents=True, exist_ok=True)
trainer.save_model(str(best_dir))
tokenizer.save_pretrained(str(best_dir))
print(f"[Saved best model] {best_dir}")


Step,Training Loss



=== Best Eval (auto-selected) ===
eval_loss: 0.5379731059074402
eval_accuracy: 0.7419028340080972
eval_precision: 0.765793528505393
eval_recall: 0.8283333333333334
eval_f1: 0.7958366693354684
eval_runtime: 2.6842
eval_samples_per_second: 368.073
eval_steps_per_second: 46.195
epoch: 1.0

=== Classification report ===
              precision    recall  f1-score   support

           0     0.6962    0.6082    0.6492       388
           1     0.7658    0.8283    0.7958       600

    accuracy                         0.7419       988
   macro avg     0.7310    0.7183    0.7225       988
weighted avg     0.7384    0.7419    0.7383       988

Confusion matrix:
 [[236 152]
 [103 497]]
[Saved best model] best_model


----
# 텍스트분석

In [None]:
# Install the Nanum font package, rebuild the fontconfig cache, then delete
# matplotlib's cache directory (presumably so matplotlib re-scans the fonts on
# its next import - confirm; a runtime restart may still be needed).
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 1s (10.8 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package fonts-nanum.
(Reading database ... 125081 files and direc

In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.0 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (495 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.9/495.9 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.6.0 konlpy-0.6.0
