In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import torch
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AlbertConfig, AutoConfig
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
class Electra:
    def __init__(self, df, model_name="google/electra-small-discriminator", max_length=64, learning_rate=2e-5, batch_size=16, epochs=10, patience=3, device=None):
        self.df = df
        self.model_name = model_name
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        # Tokenizer와 ALBERT 전용 Config 불러오기
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.config = AutoConfig.from_pretrained(self.model_name, hidden_dropout_prob=0.5, num_labels=1)  # num_labels=1로 설정
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config, ignore_mismatched_sizes=True)

        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-2)
        self.best_model_state_dict = None
        self.best_accuracy = 0

    @staticmethod
    def set_seed(seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def preprocess_data(self, df):
        inputs = self.tokenizer(
            list(df['prepro']),
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        labels = torch.tensor(df['label'].values)
        return DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=self.batch_size, shuffle=True)

    def train(self):
        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        train_loader = self.preprocess_data(df_train)
        val_loader = self.preprocess_data(df_val)

        self.model.to(self.device)

        patience_counter = 0
        min_val_loss = float('inf')

        for epoch in range(self.epochs):
            print(f"\nEpoch {epoch + 1}/{self.epochs}")
            self.model.train()

            for input_batch in tqdm(train_loader, desc="Training Batches", leave=False):
                input_ids, attention_mask, label_batch = [tensor.to(self.device) for tensor in input_batch]

                self.optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

                # Logits 크기 및 타겟 크기 확인
                #print(f"Logits size: {outputs.logits.size()}, Labels size: {label_batch.size()}")  # 로깅

                # 이진 분류의 경우 logits 차원 축소
                logits = outputs.logits.squeeze(dim=-1)
                #print(f"Logits after squeeze: {logits.size()}")  # Log the logits after squeezing

                # loss 계산 시 레이블을 float으로 변환
                loss = F.binary_cross_entropy_with_logits(logits, label_batch.float())  # Binary classification

                loss.backward()
                self.optimizer.step()

            # Validation 단계
            self.model.eval()
            val_loss_total = 0
            val_predictions_all = []
            val_labels_all = []

            with torch.inference_mode():
                for val_batch in tqdm(val_loader, desc="Validation Batches", leave=False):
                    input_ids, attention_mask, val_labels = [tensor.to(self.device) for tensor in val_batch]

                    val_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    val_logits = val_outputs.logits.squeeze(dim=-1)

                    # Binary classification: use sigmoid and round for predictions
                    val_predictions = torch.round(torch.sigmoid(val_logits))

                    # Validation loss 계산
                    val_loss = F.binary_cross_entropy_with_logits(val_logits, val_labels.float())
                    val_loss_total += val_loss.item()

                    val_predictions_all.append(val_predictions.cpu())
                    val_labels_all.append(val_labels.cpu())

            # 리스트들을 연결
            val_predictions_all = torch.cat(val_predictions_all)
            val_labels_all = torch.cat(val_labels_all)

            # Validation 지표 계산
            val_accuracy = accuracy_score(val_labels_all, val_predictions_all)
            val_f1 = f1_score(val_labels_all, val_predictions_all)
            val_recall = recall_score(val_labels_all, val_predictions_all)
            val_precision = precision_score(val_labels_all, val_predictions_all)

            val_loss_total /= len(val_loader)
            print(f'\nValidation Loss: {val_loss_total:.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}, Recall: {val_recall:.4f}, Precision: {val_precision:.4f}')

            # Early stopping logic
            if val_loss_total < min_val_loss:
                min_val_loss = val_loss_total
                patience_counter = 0
                self.best_model_state_dict = self.model.state_dict().copy()
            else:
                patience_counter += 1

            if patience_counter >= self.patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    def evaluate(self):
        if self.best_model_state_dict is None:
            raise ValueError("No trained model found. Please train the model first.")

        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        test_loader = self.preprocess_data(df_test)

        best_model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        best_model.load_state_dict(self.best_model_state_dict)
        best_model.to(self.device)

        best_model.eval()
        test_predictions_all = []
        test_labels_all = []

        with torch.inference_mode():
            for test_batch in tqdm(test_loader, desc="Test Batches", leave=False):
                input_ids, attention_mask, test_labels = [tensor.to(self.device) for tensor in test_batch]

                test_outputs = best_model(input_ids=input_ids, attention_mask=attention_mask)
                test_logits = test_outputs.logits.squeeze(dim=-1)

                # Binary classification: use sigmoid and round for predictions
                test_predictions = torch.round(torch.sigmoid(test_logits))

                test_predictions_all.append(test_predictions.cpu())
                test_labels_all.append(test_labels.cpu())

        # 리스트들을 연결
        test_predictions_all = torch.cat(test_predictions_all)
        test_labels_all = torch.cat(test_labels_all)

        # 평가 지표 계산
        accuracy = accuracy_score(test_labels_all, test_predictions_all)
        f1 = f1_score(test_labels_all, test_predictions_all)
        recall = recall_score(test_labels_all, test_predictions_all)
        precision = precision_score(test_labels_all, test_predictions_all)

        print(f'Test Accuracy: {accuracy:.4f}')
        print(f'Test F1 Score: {f1:.4f}')
        print(f'Test Recall: {recall:.4f}')
        print(f'Test Precision: {precision:.4f}')

        # 모델 저장
        torch.save(best_model.state_dict(), "best_model(Electra).pth")



In [None]:
def main():
    df = pd.read_csv("/content/drive/MyDrive/IMCOM_Edtech_apps(prepro+sentiment).csv")
    classifier = Electra(df)
    classifier.set_seed(42)
    classifier.train()
    classifier.evaluate()

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10





Validation Loss: 0.4947, Accuracy: 0.7931, F1: 0.8121, Recall: 0.9483, Precision: 0.7100

Epoch 2/10





Validation Loss: 0.3663, Accuracy: 0.8685, F1: 0.8675, Recall: 0.9132, Precision: 0.8262

Epoch 3/10





Validation Loss: 0.3611, Accuracy: 0.8811, F1: 0.8777, Recall: 0.9051, Precision: 0.8519

Epoch 4/10





Validation Loss: 0.3405, Accuracy: 0.8823, F1: 0.8763, Recall: 0.8847, Precision: 0.8680

Epoch 5/10





Validation Loss: 0.3468, Accuracy: 0.8807, F1: 0.8776, Recall: 0.9076, Precision: 0.8495

Epoch 6/10





Validation Loss: 0.3407, Accuracy: 0.8882, F1: 0.8815, Recall: 0.8828, Precision: 0.8803

Epoch 7/10





Validation Loss: 0.3571, Accuracy: 0.8854, F1: 0.8828, Recall: 0.9154, Precision: 0.8524
Early stopping at epoch 7


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.8856
Test F1 Score: 0.8818
Test Recall: 0.9169
Test Precision: 0.8493


### ELECTRA_base 모델 비교

In [3]:
class Electra_base:
    def __init__(self, df, model_name="google/electra-base-discriminator", max_length=64, learning_rate=2e-5, batch_size=16, epochs=10, patience=3, device=None):
        self.df = df
        self.model_name = model_name
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.config = AutoConfig.from_pretrained(self.model_name, hidden_dropout_prob=0.5, num_labels=1)  # num_labels=1로 설정
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config, ignore_mismatched_sizes=True)
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-2)
        self.best_model_state_dict = None
        self.best_accuracy = 0

    @staticmethod
    def set_seed(seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def preprocess_data(self, df):
        inputs = self.tokenizer(
            list(df['prepro']),
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        labels = torch.tensor(df['label'].values)
        return DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=self.batch_size, shuffle=True)

    def train(self):
        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        train_loader = self.preprocess_data(df_train)
        val_loader = self.preprocess_data(df_val)

        self.model.to(self.device)

        patience_counter = 0
        min_val_loss = float('inf')

        for epoch in range(self.epochs):
            print(f"\nEpoch {epoch + 1}/{self.epochs}")
            self.model.train()

            for input_batch in tqdm(train_loader, desc="Training Batches", leave=False):
                input_ids, attention_mask, label_batch = [tensor.to(self.device) for tensor in input_batch]

                self.optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

                # Logits 크기 및 타겟 크기 확인
                #print(f"Logits size: {outputs.logits.size()}, Labels size: {label_batch.size()}")  # 로깅

                # 이진 분류의 경우 logits 차원 축소
                logits = outputs.logits.squeeze(dim=-1)
                #print(f"Logits after squeeze: {logits.size()}")  # Log the logits after squeezing

                # loss 계산 시 레이블을 float으로 변환
                loss = F.binary_cross_entropy_with_logits(logits, label_batch.float())  # Binary classification

                loss.backward()
                self.optimizer.step()

            # Validation 단계
            self.model.eval()
            val_loss_total = 0
            val_predictions_all = []
            val_labels_all = []

            with torch.inference_mode():
                for val_batch in tqdm(val_loader, desc="Validation Batches", leave=False):
                    input_ids, attention_mask, val_labels = [tensor.to(self.device) for tensor in val_batch]

                    val_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    val_logits = val_outputs.logits.squeeze(dim=-1)

                    # Binary classification: use sigmoid and round for predictions
                    val_predictions = torch.round(torch.sigmoid(val_logits))

                    # Validation loss 계산
                    val_loss = F.binary_cross_entropy_with_logits(val_logits, val_labels.float())
                    val_loss_total += val_loss.item()

                    val_predictions_all.append(val_predictions.cpu())
                    val_labels_all.append(val_labels.cpu())

            # 리스트들을 연결
            val_predictions_all = torch.cat(val_predictions_all)
            val_labels_all = torch.cat(val_labels_all)

            # Validation 지표 계산
            val_accuracy = accuracy_score(val_labels_all, val_predictions_all)
            val_f1 = f1_score(val_labels_all, val_predictions_all)
            val_recall = recall_score(val_labels_all, val_predictions_all)
            val_precision = precision_score(val_labels_all, val_predictions_all)

            val_loss_total /= len(val_loader)
            print(f'\nValidation Loss: {val_loss_total:.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}, Recall: {val_recall:.4f}, Precision: {val_precision:.4f}')

            # Early stopping logic
            if val_loss_total < min_val_loss:
                min_val_loss = val_loss_total
                patience_counter = 0
                self.best_model_state_dict = self.model.state_dict().copy()
            else:
                patience_counter += 1

            if patience_counter >= self.patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    def evaluate(self):
        if self.best_model_state_dict is None:
            raise ValueError("No trained model found. Please train the model first.")

        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        test_loader = self.preprocess_data(df_test)

        best_model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        best_model.load_state_dict(self.best_model_state_dict)
        best_model.to(self.device)

        best_model.eval()
        test_predictions_all = []
        test_labels_all = []

        with torch.inference_mode():
            for test_batch in tqdm(test_loader, desc="Test Batches", leave=False):
                input_ids, attention_mask, test_labels = [tensor.to(self.device) for tensor in test_batch]

                test_outputs = best_model(input_ids=input_ids, attention_mask=attention_mask)
                test_logits = test_outputs.logits.squeeze(dim=-1)

                # Binary classification: use sigmoid and round for predictions
                test_predictions = torch.round(torch.sigmoid(test_logits))

                test_predictions_all.append(test_predictions.cpu())
                test_labels_all.append(test_labels.cpu())

        # 리스트들을 연결
        test_predictions_all = torch.cat(test_predictions_all)
        test_labels_all = torch.cat(test_labels_all)

        # 평가 지표 계산
        accuracy = accuracy_score(test_labels_all, test_predictions_all)
        f1 = f1_score(test_labels_all, test_predictions_all)
        recall = recall_score(test_labels_all, test_predictions_all)
        precision = precision_score(test_labels_all, test_predictions_all)

        print(f'Test Accuracy: {accuracy:.4f}')
        print(f'Test F1 Score: {f1:.4f}')
        print(f'Test Recall: {recall:.4f}')
        print(f'Test Precision: {precision:.4f}')

        # 모델 저장
        torch.save(best_model.state_dict(), "best_model(Electra_base).pth")

def main():
    df = pd.read_csv("/content/drive/MyDrive/IMCOM_Edtech_apps(prepro+sentiment).csv")
    classifier = Electra_base(df)
    classifier.set_seed(42)
    classifier.train()
    classifier.evaluate()

if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10





Validation Loss: 0.5599, Accuracy: 0.7888, F1: 0.8070, Recall: 0.9371, Precision: 0.7087

Epoch 2/10





Validation Loss: 0.4039, Accuracy: 0.8854, F1: 0.8810, Recall: 0.8997, Precision: 0.8630

Epoch 3/10





Validation Loss: 0.4376, Accuracy: 0.8731, F1: 0.8741, Recall: 0.9340, Precision: 0.8213

Epoch 4/10





Validation Loss: 0.3803, Accuracy: 0.8905, F1: 0.8869, Recall: 0.9109, Precision: 0.8641

Epoch 5/10





Validation Loss: 0.4098, Accuracy: 0.8918, F1: 0.8884, Recall: 0.9134, Precision: 0.8646

Epoch 6/10





Validation Loss: 0.5547, Accuracy: 0.8598, F1: 0.8504, Recall: 0.8454, Precision: 0.8554

Epoch 7/10





Validation Loss: 0.3503, Accuracy: 0.8877, F1: 0.8868, Recall: 0.9331, Precision: 0.8449

Epoch 8/10





Validation Loss: 0.3963, Accuracy: 0.8927, F1: 0.8872, Recall: 0.8954, Precision: 0.8793

Epoch 9/10





Validation Loss: 0.3813, Accuracy: 0.8977, F1: 0.8921, Recall: 0.8976, Precision: 0.8867

Epoch 10/10





Validation Loss: 0.3535, Accuracy: 0.8981, F1: 0.8918, Recall: 0.8905, Precision: 0.8931
Early stopping at epoch 10


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.8952
Test F1 Score: 0.8877
Test Recall: 0.8893
Test Precision: 0.8860
