In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AlbertConfig, AutoConfig
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
class BERT:
    def __init__(self, df, model_name='bert-base-uncased', max_length=64, learning_rate=2e-5, batch_size=16, epochs=10, patience=3, device=None):
        self.df = df
        self.model_name = model_name
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Tokenizer와 Config 불러오기
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.config = AutoConfig.from_pretrained(self.model_name, hidden_dropout_prob=0.5, num_labels=1)  # num_labels=1로 설정
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config)

        # 분류기 레이어를 이진 분류에 맞게 수정
        self.model.classifier = torch.nn.Linear(self.model.config.hidden_size, 1)

        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-2)
        self.best_model_state_dict = None
        self.best_accuracy = 0

    @staticmethod
    def set_seed(seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def preprocess_data(self, df):
        inputs = self.tokenizer(
            list(df['prepro']),
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        labels = torch.tensor(df['label'].values)
        return DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=self.batch_size, shuffle=True)

    def train(self):
      df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
      df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
      train_loader = self.preprocess_data(df_train)
      val_loader = self.preprocess_data(df_val)

      self.model.to(self.device)

      patience_counter = 0
      min_val_loss = float('inf')

      try:
          for epoch in range(self.epochs):
              print(f"\nEpoch {epoch + 1}/{self.epochs}")
              self.model.train()

              for input_batch in tqdm(train_loader, desc="Training Batches", leave=False):
                  input_ids, attention_mask, label_batch = [tensor.to(self.device) for tensor in input_batch]

                  self.optimizer.zero_grad()
                  outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

                  logits = outputs.logits[:, 0].squeeze(dim=-1)
                  loss = F.binary_cross_entropy_with_logits(logits, label_batch.float())

                  loss.backward()
                  self.optimizer.step()

              # Validation 단계
              self.model.eval()
              val_loss_total = 0
              val_predictions_all = []
              val_labels_all = []

              with torch.inference_mode():
                  for val_batch in tqdm(val_loader, desc="Validation Batches", leave=False):
                      input_ids, attention_mask, val_labels = [tensor.to(self.device) for tensor in val_batch]

                      val_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                      val_logits = val_outputs.logits[:, 0].squeeze(dim=-1)

                      val_predictions = torch.round(torch.sigmoid(val_logits))
                      val_loss = F.binary_cross_entropy_with_logits(val_logits, val_labels.float())
                      val_loss_total += val_loss.item()

                      val_predictions_all.append(val_predictions.cpu())
                      val_labels_all.append(val_labels.cpu())

              val_predictions_all = torch.cat(val_predictions_all)
              val_labels_all = torch.cat(val_labels_all)

              val_accuracy = accuracy_score(val_labels_all, val_predictions_all)
              val_f1 = f1_score(val_labels_all, val_predictions_all)
              val_recall = recall_score(val_labels_all, val_predictions_all)
              val_precision = precision_score(val_labels_all, val_predictions_all)

              val_loss_total /= len(val_loader)
              print(f'\nValidation Loss: {val_loss_total:.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}, Recall: {val_recall:.4f}, Precision: {val_precision:.4f}')

              if val_loss_total < min_val_loss:
                  min_val_loss = val_loss_total
                  patience_counter = 0
                  self.best_model_state_dict = self.model.state_dict().copy()
              else:
                  patience_counter += 1

              if patience_counter >= self.patience:
                print(f"Early stopping at epoch {epoch + 1}")
                # 가장 좋은 모델 상태 저장
                torch.save(self.best_model_state_dict, "best_model(BERT)_checkpoint.pth")
                break

      except Exception as e:
          print(f"An error occurred during training: {str(e)}")
          print("Saving current model weights...")
          # 현재까지의 가중치를 저장
          torch.save(self.model.state_dict(), "error_model(Elec).pth")
          raise  # 오류를 다시 발생시켜서 코드 실행 중단

    def evaluate(self):
      if self.best_model_state_dict is None:
          raise ValueError("No trained model found. Please train the model first.")

      df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
      df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
      test_loader = self.preprocess_data(df_test)

      # 가장 좋은 모델 가중치 로드
      best_model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config)
      best_model.load_state_dict(self.best_model_state_dict, strict=False)  # strict=False로 설정하여 일부 키 불일치를 무시
      best_model.to(self.device)

      best_model.eval()
      test_predictions_all = []
      test_labels_all = []

      try:
          with torch.inference_mode():
              for test_batch in tqdm(test_loader, desc="Test Batches", leave=False):
                  input_ids, attention_mask, test_labels = [tensor.to(self.device) for tensor in test_batch]

                  test_outputs = best_model(input_ids=input_ids, attention_mask=attention_mask)
                  test_logits = test_outputs.logits[:, 0].squeeze(dim=-1)

                  test_predictions = torch.round(torch.sigmoid(test_logits))

                  test_predictions_all.append(test_predictions.cpu())
                  test_labels_all.append(test_labels.cpu())

          test_predictions_all = torch.cat(test_predictions_all)
          test_labels_all = torch.cat(test_labels_all)

          accuracy = accuracy_score(test_labels_all, test_predictions_all)
          f1 = f1_score(test_labels_all, test_predictions_all)
          recall = recall_score(test_labels_all, test_predictions_all)
          precision = precision_score(test_labels_all, test_predictions_all)

          print(f'Test Accuracy: {accuracy:.4f}')
          print(f'Test F1 Score: {f1:.4f}')
          print(f'Test Recall: {recall:.4f}')
          print(f'Test Precision: {precision:.4f}')

          # 최종 모델 저장
          torch.save(best_model.state_dict(), "best_model(BERT).pth")

      except Exception as e:
          print(f"An error occurred during evaluation: {str(e)}")
          print("Saving current model weights...")
          torch.save(best_model.state_dict(), "error_model(BERT_evaluation).pth")
          raise


def main():
    df = pd.read_csv("/content/drive/MyDrive/IMCOM_Edtech_apps(prepro+sentiment).csv")
    classifier = BERT(df)
    classifier.set_seed(42)
    classifier.train()
    classifier.evaluate()

if __name__ == "__main__":
    main()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10





Validation Loss: 0.3622, Accuracy: 0.8626, F1: 0.8657, Recall: 0.9391, Precision: 0.8029

Epoch 2/10





Validation Loss: 0.3328, Accuracy: 0.8879, F1: 0.8851, Recall: 0.9161, Precision: 0.8562

Epoch 3/10





Validation Loss: 0.3236, Accuracy: 0.8947, F1: 0.8848, Recall: 0.8577, Precision: 0.9137

Epoch 4/10





Validation Loss: 0.3179, Accuracy: 0.8976, F1: 0.8934, Recall: 0.9106, Precision: 0.8768

Epoch 5/10





Validation Loss: 0.3311, Accuracy: 0.8974, F1: 0.8938, Recall: 0.9161, Precision: 0.8726

Epoch 6/10





Validation Loss: 0.3241, Accuracy: 0.9011, F1: 0.8955, Recall: 0.8991, Precision: 0.8919

Epoch 7/10





Validation Loss: 0.3186, Accuracy: 0.9007, F1: 0.8961, Recall: 0.9081, Precision: 0.8844
Early stopping at epoch 7


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.8968
Test F1 Score: 0.8907
Test Recall: 0.9032
Test Precision: 0.8785
