In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from torch.optim import AdamW 
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AlbertConfig, AutoConfig
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
class Distilroberta:
    def __init__(self, df, model_name='distilbert/distilroberta-base', max_length=64, learning_rate=2e-5, batch_size=16, epochs=10, patience=3, device=None):
        self.df = df
        self.model_name = model_name
        self.max_length = max_length
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.patience = patience
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        
        # Tokenizer와 ALBERT 전용 Config 불러오기
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.config = AutoConfig.from_pretrained(self.model_name, hidden_dropout_prob=0.2, num_labels=1)  # num_labels=1로 설정
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config, ignore_mismatched_sizes=True)
        
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        self.best_model_state_dict = None
        self.best_accuracy = 0

    @staticmethod
    def set_seed(seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def preprocess_data(self, df):
        inputs = self.tokenizer(
            list(df['prepro']),
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        labels = torch.tensor(df['label'].values)
        return DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=self.batch_size, shuffle=True)

    def train(self):
        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        train_loader = self.preprocess_data(df_train)
        val_loader = self.preprocess_data(df_val)

        self.model.to(self.device)

        patience_counter = 0
        min_val_loss = float('inf')

        for epoch in range(self.epochs):
            print(f"\nEpoch {epoch + 1}/{self.epochs}")
            self.model.train()

            for input_batch in tqdm(train_loader, desc="Training Batches", leave=False):
                input_ids, attention_mask, label_batch = [tensor.to(self.device) for tensor in input_batch]

                self.optimizer.zero_grad()
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                
                # Logits 크기 및 타겟 크기 확인
                #print(f"Logits size: {outputs.logits.size()}, Labels size: {label_batch.size()}")  # 로깅
                
                # 이진 분류의 경우 logits 차원 축소
                logits = outputs.logits.squeeze(dim=-1)
                #print(f"Logits after squeeze: {logits.size()}")  # Log the logits after squeezing
                
                # loss 계산 시 레이블을 float으로 변환
                loss = F.binary_cross_entropy_with_logits(logits, label_batch.float())  # Binary classification

                loss.backward()
                self.optimizer.step()

            # Validation 단계
            self.model.eval()
            val_loss_total = 0
            val_predictions_all = []
            val_labels_all = []

            with torch.inference_mode():
                for val_batch in tqdm(val_loader, desc="Validation Batches", leave=False):
                    input_ids, attention_mask, val_labels = [tensor.to(self.device) for tensor in val_batch]

                    val_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                    val_logits = val_outputs.logits.squeeze(dim=-1)

                    # Binary classification: use sigmoid and round for predictions
                    val_predictions = torch.round(torch.sigmoid(val_logits))

                    # Validation loss 계산
                    val_loss = F.binary_cross_entropy_with_logits(val_logits, val_labels.float())
                    val_loss_total += val_loss.item()

                    val_predictions_all.append(val_predictions.cpu())
                    val_labels_all.append(val_labels.cpu())

            # 리스트들을 연결
            val_predictions_all = torch.cat(val_predictions_all)
            val_labels_all = torch.cat(val_labels_all)

            # Validation 지표 계산
            val_accuracy = accuracy_score(val_labels_all, val_predictions_all)
            val_f1 = f1_score(val_labels_all, val_predictions_all)
            val_recall = recall_score(val_labels_all, val_predictions_all)
            val_precision = precision_score(val_labels_all, val_predictions_all)

            val_loss_total /= len(val_loader)
            print(f'\nValidation Loss: {val_loss_total:.4f}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}, Recall: {val_recall:.4f}, Precision: {val_precision:.4f}')

            # Early stopping logic
            if val_loss_total < min_val_loss:
                min_val_loss = val_loss_total
                patience_counter = 0
                self.best_model_state_dict = self.model.state_dict().copy()
            else:
                patience_counter += 1

            if patience_counter >= self.patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    def evaluate(self):
        if self.best_model_state_dict is None:
            raise ValueError("No trained model found. Please train the model first.")

        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        test_loader = self.preprocess_data(df_test)

        best_model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        best_model.load_state_dict(self.best_model_state_dict)
        best_model.to(self.device)

        best_model.eval()
        test_predictions_all = []
        test_labels_all = []
        
        with torch.inference_mode():
            for test_batch in tqdm(test_loader, desc="Test Batches", leave=False):
                input_ids, attention_mask, test_labels = [tensor.to(self.device) for tensor in test_batch]

                test_outputs = best_model(input_ids=input_ids, attention_mask=attention_mask)
                test_logits = test_outputs.logits.squeeze(dim=-1)

                # Binary classification: use sigmoid and round for predictions
                test_predictions = torch.round(torch.sigmoid(test_logits))

                test_predictions_all.append(test_predictions.cpu())
                test_labels_all.append(test_labels.cpu())

        # 리스트들을 연결
        test_predictions_all = torch.cat(test_predictions_all)
        test_labels_all = torch.cat(test_labels_all)

        # 평가 지표 계산
        accuracy = accuracy_score(test_labels_all, test_predictions_all)
        f1 = f1_score(test_labels_all, test_predictions_all)
        recall = recall_score(test_labels_all, test_predictions_all)
        precision = precision_score(test_labels_all, test_predictions_all)

        print(f'Test Accuracy: {accuracy:.4f}')
        print(f'Test F1 Score: {f1:.4f}')
        print(f'Test Recall: {recall:.4f}')
        print(f'Test Precision: {precision:.4f}')

        # 모델 저장
        torch.save(best_model.state_dict(), "best_model(DistilRoBERTa).pth")



In [3]:
def main():
    df = pd.read_csv("C:\\Users\\USER\\Desktop\\충원's project\\IMCOM\\IMCOM_Edtech_apps(prepro+sentiment).csv")
    classifier = Distilroberta(df)
    classifier.set_seed(42)
    classifier.train()
    classifier.evaluate()

if __name__ == "__main__":
    main()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10


                                                                     


Validation Loss: 0.2922, Accuracy: 0.8971, F1: 0.8905, Recall: 0.8879, Precision: 0.8930

Epoch 2/10


                                                                     


Validation Loss: 0.2703, Accuracy: 0.9006, F1: 0.8956, Recall: 0.9040, Precision: 0.8873

Epoch 3/10


                                                                     


Validation Loss: 0.2816, Accuracy: 0.8978, F1: 0.8885, Recall: 0.8643, Precision: 0.9142

Epoch 4/10


                                                                     


Validation Loss: 0.3055, Accuracy: 0.8988, F1: 0.8898, Recall: 0.8667, Precision: 0.9141

Epoch 5/10


                                                                     


Validation Loss: 0.2907, Accuracy: 0.8976, F1: 0.8934, Recall: 0.9112, Precision: 0.8764
Early stopping at epoch 5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                               

Test Accuracy: 0.8972
Test F1 Score: 0.8914
Test Recall: 0.9071
Test Precision: 0.8763


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class Distilroberta:
    def __init__(self, df, model_name='distilbert/distilroberta-base', max_length=64, batch_size=16, device=None):
        self.df = df
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        
        # Tokenizer와 모델 초기화
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.config = AutoConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.config)
        self.model.to(self.device)

    def preprocess_data(self, df):
        inputs = self.tokenizer(
            list(df['prepro']),
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        labels = torch.tensor(df['label'].values)
        return DataLoader(TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels), batch_size=self.batch_size, shuffle=False)

    def load_weights(self, path):
        # 저장된 가중치 로드
        self.model.load_state_dict(torch.load(path, map_location=self.device))
        self.model.to(self.device)

    def evaluate(self):
        df_train, df_temp = train_test_split(self.df, test_size=0.4, random_state=42)
        df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)
        test_loader = self.preprocess_data(df_test)

        self.model.eval()  # 평가 모드로 전환
        test_predictions_all = []
        test_labels_all = []

        with torch.inference_mode():
            for test_batch in tqdm(test_loader, desc="Test Batches", leave=False):
                input_ids, attention_mask, test_labels = [tensor.to(self.device) for tensor in test_batch]

                test_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                test_logits = test_outputs.logits.squeeze(dim=-1)

                # Binary classification: use sigmoid and round for predictions
                test_predictions = torch.round(torch.sigmoid(test_logits))

                test_predictions_all.append(test_predictions.cpu())
                test_labels_all.append(test_labels.cpu())

        # 리스트들을 연결
        test_predictions_all = torch.cat(test_predictions_all)
        test_labels_all = torch.cat(test_labels_all)

        # 평가 지표 계산
        accuracy = accuracy_score(test_labels_all, test_predictions_all)
        f1 = f1_score(test_labels_all, test_predictions_all)
        recall = recall_score(test_labels_all, test_predictions_all)
        precision = precision_score(test_labels_all, test_predictions_all)

        # 평가 결과 출력
        print(f'Test Accuracy: {accuracy:.4f}')
        print(f'Test F1 Score: {f1:.4f}')
        print(f'Test Recall: {recall:.4f}')
        print(f'Test Precision: {precision:.4f}')



In [5]:

def main():
    df = pd.read_csv("C:\\Users\\USER\\Desktop\\충원's project\\IMCOM\\IMCOM_Edtech_apps(prepro+sentiment).csv")
    classifier = Distilroberta(df)
    classifier.load_weights("best_model(DistilRoBERTa).pth")  # 저장된 가중치 로드
    classifier.evaluate()  # 로드된 가중치로 Test 데이터 평가

if __name__ == "__main__":
    main()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.model.load_state_dict(torch.load(path, map_location=self.device))
                                                               

Test Accuracy: 0.8972
Test F1 Score: 0.8914
Test Recall: 0.9071
Test Precision: 0.8763
