#### BERT 모델 학습 및 실행(선수, 팀)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

# CSV 파일 경로
file_path = '/content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/train_pt.csv'

# CSV 파일 읽기
df = pd.read_csv(file_path)

df

Unnamed: 0,sentence,label
0,선수,0
1,이번 경기에서 선수의 기록이 어땠어?,0
2,최근에 선수의 성적이 어떤가요?,0
3,저번 시즌에 선수가 어떤 성적을 냈나요?,0
4,선수의 최근 성적을 알 수 있을까요?,0
...,...,...
5494,충북청주FC의 최근 승부 결과를 분석하고 싶어요.,1
5495,충북청주FC의 현재 전략을 평가하고 싶습니다.,1
5496,충북청주FC의 최근 경기에서의 성적을 분석하고 싶어요.,1
5497,충북청주FC의 최근 승부 결과에 대해 알고 싶습니다.,1


In [3]:
len(df)

5499

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BERT 토크나이저 불러오기
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# 원본 데이터 불러오기
df = pd.read_csv('/content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/train_pt.csv')
x = list(df['sentence'].values)
y = list(df['label'].values)

# # 예시 데이터
# x = ['K리그1', 'K리그1에서의 기록을 확인하고 싶어요.', 'K리그1의 최신 기록을 살펴보고 싶어요.', 'K리그2', 'K리그2 경기 기록이 궁금해요.', 'K리그2에서의 팀 기록을 보여주세요.']
# y = [0, 0, 0, 1, 1, 1]

# 학습 데이터와 검증 데이터로 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(x, y, test_size = 0.2, random_state = 11)

# 학습 데이터와 검증 데이터를 토크나이징하여 인코딩
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# 학습 데이터와 검증 데이터를 텐서로 변환
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 16)

# BERT 모델 불러오기
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels = 2)
model.to(device)

# 옵티마이저 및 스케줄러 설정
optimizer = AdamW(model.parameters(), lr = 5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(train_loader)*3)

# 손실 함수 설정
criterion = torch.nn.CrossEntropyLoss()

# 학습 함수 정의
def train(model, train_loader, optimizer, scheduler, criterion):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(train_loader)

# 검증 함수 정의
def evaluate(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim = 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return total_loss / len(val_loader), accuracy

# 학습 및 검증
num_epochs = 1
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler, criterion)
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch+1}: Train Loss - {train_loss:.4f}, Val Loss - {val_loss:.4f}, Val Accuracy - {val_accuracy:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss - 0.0218, Val Loss - 0.0001, Val Accuracy - 1.0000


In [None]:
# 학습 및 검증 루프
num_epochs = 4
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler, criterion)
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch+1}: Train Loss - {train_loss:.4f}, Val Loss - {val_loss:.4f}, Val Accuracy - {val_accuracy:.4f}")

# 모든 학습이 끝난 후 최종 모델과 토크나이저 저장
final_model_save_path = "/content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/checkpoint/3/final_model.bin"
final_tokenizer_save_path = "/content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/checkpoint/3/final_tokenizer"

torch.save(model.state_dict(), final_model_save_path)
tokenizer.save_pretrained(final_tokenizer_save_path)

print(f"Final model and tokenizer saved to {final_model_save_path} and {final_tokenizer_save_path}")


Epoch 1: Train Loss - 0.0009, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 2: Train Loss - 0.0009, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 3: Train Loss - 0.0009, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 4: Train Loss - 0.0054, Val Loss - 0.0005, Val Accuracy - 1.0000
Final model and tokenizer saved to /content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/final_model.bin and /content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/final_tokenizer


In [None]:
# 예측 수행
print('선수와 팀 중 검색하고자 하는 카테고리를 선택해 주세요.(ex. 팀 성적이 궁금합니다.) \n')
user_question = str(input())
test_texts = list(user_question)

test_encodings = tokenizer(user_question, truncation=True, padding=True)
test_dataset = TensorDataset(torch.tensor([test_encodings['input_ids']]),  # 리스트 형태로 변환하여 텐서 생성
                             torch.tensor([test_encodings['attention_mask']]))  # 리스트 형태로 변환하여 텐서 생성
test_loader = DataLoader(test_dataset, batch_size = 16)

model.eval()
tp_prediction = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        tp_prediction.extend(preds.cpu().numpy())

print("Predictions:", tp_prediction)

선수와 팀 중 검색하고자 하는 카테고리를 선택해 주세요.(ex. 팀 성적이 궁금합니다.) 

팀의 순위를 확인하고 싶어요.
Predictions: [1]


In [None]:
# 예측 수행
print('선수와 팀 중 검색하고자 하는 카테고리를 선택해 주세요.(ex. 팀 성적이 궁금합니다.) \n')
user_question = str(input())
test_texts = list(user_question)

test_encodings = tokenizer(user_question, truncation=True, padding=True)
test_dataset = TensorDataset(torch.tensor([test_encodings['input_ids']]),  # 리스트 형태로 변환하여 텐서 생성
                             torch.tensor([test_encodings['attention_mask']]))  # 리스트 형태로 변환하여 텐서 생성
test_loader = DataLoader(test_dataset, batch_size = 16)

model.eval()
tp_prediction = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        tp_prediction.extend(preds.cpu().numpy())

print("Predictions:", tp_prediction)

선수와 팀 중 검색하고자 하는 카테고리를 선택해 주세요.(ex. 팀 성적이 궁금합니다.) 

선수의 통계는 어떻게 되나요?
Predictions: [0]


#### BERT 모델 학습 및 실행(리그 선택)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BERT 토크나이저 불러오기
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# 원본 데이터 불러오기
df = pd.read_csv('/content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/train.csv')
x = list(df['sentence'].values)
y = list(df['label'].values)

# # 예시 데이터
# x = ['K리그1', 'K리그1에서의 기록을 확인하고 싶어요.', 'K리그1의 최신 기록을 살펴보고 싶어요.', 'K리그2', 'K리그2 경기 기록이 궁금해요.', 'K리그2에서의 팀 기록을 보여주세요.']
# y = [0, 0, 0, 1, 1, 1]

# 학습 데이터와 검증 데이터로 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(x, y, test_size = 0.2, random_state = 11)

# 학습 데이터와 검증 데이터를 토크나이징하여 인코딩
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# 인코딩 한 학습 데이터와 검증 데이터를 텐서로 변환
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 16)

# BERT 모델 불러오기
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels = 2)
model.to(device)

# 옵티마이저 및 스케줄러 설정
optimizer = AdamW(model.parameters(), lr = 5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = len(train_loader)*3)

# 손실 함수 설정
criterion = torch.nn.CrossEntropyLoss()

# 학습 함수 정의
def train(model, train_loader, optimizer, scheduler, criterion):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    return total_loss / len(train_loader)

# 검증 함수 정의
def evaluate(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            preds = torch.argmax(logits, dim = 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return total_loss / len(val_loader), accuracy

# 학습 및 검증
num_epochs = 4
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler, criterion)
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch+1}: Train Loss - {train_loss:.4f}, Val Loss - {val_loss:.4f}, Val Accuracy - {val_accuracy:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss - 0.3518, Val Loss - 0.0009, Val Accuracy - 1.0000
Epoch 2: Train Loss - 0.0043, Val Loss - 0.0834, Val Accuracy - 0.9887
Epoch 3: Train Loss - 0.0080, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 4: Train Loss - 0.0048, Val Loss - 0.0005, Val Accuracy - 1.0000


In [None]:
import os

# 학습 및 검증 루프
num_epochs = 4
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler, criterion)
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch+1}: Train Loss - {train_loss:.4f}, Val Loss - {val_loss:.4f}, Val Accuracy - {val_accuracy:.4f}")

# 마지막 에폭에서 모델과 토크나이저 저장
checkpoint_dir = "/content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/checkpoint"

# 체크포인트 디렉토리가 없다면 생성
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# 모델 저장
model_save_path = os.path.join(checkpoint_dir, "final_model.bin")
torch.save(model.state_dict(), model_save_path)

# 토크나이저 저장
tokenizer_save_path = os.path.join(checkpoint_dir, "final_tokenizer")
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer have been saved to {model_save_path} and {tokenizer_save_path}")


Epoch 1: Train Loss - 0.0039, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 2: Train Loss - 0.0009, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 3: Train Loss - 0.0009, Val Loss - 0.0005, Val Accuracy - 1.0000
Epoch 4: Train Loss - 0.0009, Val Loss - 0.0005, Val Accuracy - 1.0000
Model and tokenizer have been saved to /content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/checkpoint/final_model.bin and /content/drive/MyDrive/hjh_kita_directory/hjh2차챗봇프로젝트/project_model/checkpoint/final_tokenizer


In [None]:
# 예측 수행
print('K리그1과 K리그2 중 검색하고자 하는 리그를 선택해주세요. \n')
user_question = str(input())
test_texts = list(user_question)

test_encodings = tokenizer(user_question, truncation=True, padding=True)
test_dataset = TensorDataset(torch.tensor([test_encodings['input_ids']]),  # 리스트 형태로 변환하여 텐서 생성
                             torch.tensor([test_encodings['attention_mask']]))  # 리스트 형태로 변환하여 텐서 생성
test_loader = DataLoader(test_dataset, batch_size = 16)

model.eval()
league_prediction = []
with torch.no_grad():
    for input_ids, attention_mask in test_loader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        league_prediction.extend(preds.cpu().numpy())

print("Predictions:", league_prediction)

K리그1과 K리그2 중 검색하고자 하는 리그를 선택해주세요. 

K 1부 리그
Predictions: [0]
