In [None]:
import shutil

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# 키워드 생성 및 감정 라벨 동시 학습 (데이터 전처리 완료)

In [None]:
# Define the dataset class for review to keyword+sentiment generation
class ReviewWithSentimentDataset(Dataset):
    """Map-style dataset pairing a review with its keyword+sentiment target.

    Each item is tokenized on access. Both the review and the target are
    padded/truncated to ``max_len`` tokens. Padded positions of the target are
    replaced by ``IGNORE_INDEX`` so the training loss is computed only on real
    target tokens instead of also rewarding the model for predicting padding.
    """

    # Label value skipped by the cross-entropy loss used for seq2seq training.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Store the data source and tokenization settings.

        Args:
            dataframe (pd.DataFrame): DataFrame with ``Review`` and
                ``Keywords_Sentiments`` columns.
            tokenizer (T5Tokenizer): Tokenizer for processing text data. It is
                called with ``return_tensors="pt"`` and must return both
                ``input_ids`` and ``attention_mask``.
            max_len (int): Maximum token length for input and target sequences.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Tokenize the row at positional ``index``.

        Args:
            index (int): Positional row index (looked up with ``iloc``).

        Returns:
            dict: ``input_ids`` and ``attention_mask`` for the review, and
            ``labels`` for the target with padded positions set to
            ``IGNORE_INDEX``. The leading batch axis added by the tokenizer is
            squeezed away from every tensor.
        """
        row = self.data.iloc[index]
        input_text = row['Review']
        target_text = row['Keywords_Sentiments']  # Combined keyword+sentiment

        # Tokenize input and target text
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Mask padding in the labels. The target attention mask is 0 exactly on
        # padded positions; clone first so the tokenizer output is not mutated.
        labels = targets['input_ids'].squeeze(0).clone()
        labels[targets['attention_mask'].squeeze(0) == 0] = self.IGNORE_INDEX

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }


In [None]:
# Load the review dataset (CSV uploaded to the Colab workspace) into a DataFrame
file_path = "/content/Transformed_Reviews_with_Sentiments.csv"  # Dataset path
data = pd.read_csv(file_path)
# Optional: uncomment to strip stray whitespace from the column names
# data.columns = [col.strip() for col in data.columns]

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state makes the
# split identical on every run
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

In [None]:
# Choose the pretrained checkpoint and load its tokenizer; the same
# model_name is reused later when the model weights are loaded
model_name = "paust/pko-t5-base"  # Pretrained Korean T5 model
tokenizer = T5TokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.90M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

In [None]:
# Wrap the train/validation splits in datasets and batch them with DataLoaders
max_len, batch_size = 128, 16  # token budget per sequence, examples per batch

train_dataset = ReviewWithSentimentDataset(
    dataframe=train_data,
    tokenizer=tokenizer,
    max_len=max_len,
)
val_dataset = ReviewWithSentimentDataset(
    dataframe=val_data,
    tokenizer=tokenizer,
    max_len=max_len,
)

# Only the training split is shuffled; validation keeps its row order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Pick the GPU when one is available, then load the pretrained weights onto it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Kept as the last expression so the cell still displays the module structure
model.to(device)

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(50358, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50358, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
# AdamW over every model parameter with a fixed learning rate of 5e-5
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=5e-5,
)

In [None]:
# Fine-tuning loop with periodic logging and per-epoch validation
def train_model(model, dataloader, val_loader, optimizer, device, num_epochs=3):
    """Fine-tune ``model`` and report training/validation loss for each epoch.

    Args:
        model: Seq2seq model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        dataloader: Iterable of training batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors); must support ``len``.
        val_loader: Iterable of validation batches with the same layout.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device every batch tensor is moved to before the forward pass.
        num_epochs (int): Number of passes over ``dataloader``.

    Returns:
        None. Progress is printed; the model is left in training mode.

    Note:
        The sample decoded every 100 steps uses the ``tokenizer`` variable from
        the surrounding notebook namespace, not a parameter of this function.
    """
    log_every = 20       # steps between loss printouts
    sample_every = 100   # steps between generated-sample printouts

    def _forward(batch):
        """Move one batch to ``device`` and run the model with labels."""
        return model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )

    model.train()
    for epoch_idx in range(num_epochs):
        epoch_no = epoch_idx + 1
        running_loss = 0

        for step, batch in enumerate(dataloader, start=1):
            optimizer.zero_grad()

            loss = _forward(batch).loss
            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            running_loss += step_loss

            if step % log_every == 0:
                print(f"Epoch {epoch_no}, Step {step}, Loss: {step_loss:.4f}")

            if step % sample_every == 0:
                # Decode the first example of the current batch as a sanity
                # check, switching to eval mode only for the generation call.
                model.eval()
                with torch.no_grad():
                    generated = model.generate(
                        input_ids=batch['input_ids'][0:1].to(device),
                        attention_mask=batch['attention_mask'][0:1].to(device),
                        max_length=128,
                        num_beams=5,
                        early_stopping=True,
                    )
                    preview = tokenizer.decode(generated[0], skip_special_tokens=True)
                    print(f"Step {step}: Generated Output: {preview}")
                model.train()

        print(f"Epoch {epoch_no}, Training Loss: {running_loss / len(dataloader):.4f}")

        # Validation pass: eval mode, no gradients, mean of per-batch losses
        model.eval()
        with torch.no_grad():
            val_total = sum(_forward(val_batch).loss.item() for val_batch in val_loader)
        print(f"Epoch {epoch_no}, Validation Loss: {val_total / len(val_loader):.4f}")
        model.train()


In [None]:
# Run fine-tuning for 3 epochs; step losses, a generated sample every 100 steps
# and per-epoch training/validation losses are printed as the loop progresses
train_model(model, train_loader, val_loader, optimizer, device, num_epochs=3)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Step 20, Loss: 13.3036
Epoch 1, Step 40, Loss: 9.3269
Epoch 1, Step 60, Loss: 3.5980
Epoch 1, Step 80, Loss: 1.9045
Epoch 1, Step 100, Loss: 1.1744
Step 100: Generated Output: 맛(Positive), 마늘탕수육(Positive), 차돌짬뽕(Positive)
Epoch 1, Step 120, Loss: 0.6046
Epoch 1, Step 140, Loss: 0.4714
Epoch 1, Step 160, Loss: 0.3063
Epoch 1, Training Loss: 4.5971
Epoch 1, Validation Loss: 0.2062
Epoch 2, Step 20, Loss: 0.3438
Epoch 2, Step 40, Loss: 0.2588
Epoch 2, Step 60, Loss: 0.2480
Epoch 2, Step 80, Loss: 0.2138
Epoch 2, Step 100, Loss: 0.2508
Step 100: Generated Output: 멘(Positive), (Positive)
Epoch 2, Step 120, Loss: 0.2153
Epoch 2, Step 140, Loss: 0.2318
Epoch 2, Step 160, Loss: 0.2403
Epoch 2, Training Loss: 0.2398
Epoch 2, Validation Loss: 0.1425
Epoch 3, Step 20, Loss: 0.2438
Epoch 3, Step 40, Loss: 0.1912
Epoch 3, Step 60, Loss: 0.1758
Epoch 3, Step 80, Loss: 0.1536
Epoch 3, Step 100, Loss: 0.1437
Step 100: Generated Output: 우래옥(Negative), 우래옥(Negative), 우래옥(Negative)
Epoch 3, Step 

In [None]:
# Save the fine-tuned model and its tokenizer into the same local directory
# so the pair can be reloaded together from that single path
output_dir = "./t5_fine_tuned_keywords_sentiments"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ./t5_fine_tuned_keywords_sentiments


In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files can be written to it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Persist the fine-tuned model to Google Drive (Drive must already be mounted)
# Destination directory on Drive
drive_output_dir = '/content/drive/MyDrive/t5_fine_tuned_keywords_sentiments'

# Copy the saved model directory with shutil rather than `!cp -r`:
# - when the destination already exists (e.g. this cell is re-run), `cp -r src dst`
#   nests the copy at dst/src, while copytree(..., dirs_exist_ok=True) refreshes
#   the files in place;
# - copytree raises on failure, so the message below is printed only after the
#   copy actually succeeded.
shutil.copytree('./t5_fine_tuned_keywords_sentiments', drive_output_dir, dirs_exist_ok=True)

print(f"Model saved to Google Drive at {drive_output_dir}")

Model saved to Google Drive at /content/drive/MyDrive/t5_fine_tuned_keywords_sentiments


# test data 모델 성능 테스트

In [None]:
# Test the model
def test_model(review, model, tokenizer, max_len, device):
    model.eval()
    inputs = tokenizer(
        review,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_len,
            num_beams=5,
            early_stopping=True
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [None]:
# Sample Korean reviews used to spot-check the fine-tuned model by hand;
# each string is passed through test_model individually
reviews = [
    "대표메뉴는 소고기보신탕. 굴국밥도 괜찮고 여름철에 콩국수가 별미.",
    "소고기보신탕 특. 맛은 있는데. 특인데 푸짐하지 않음.",
    "웬만한 일본우동집보다 맛있어요. 면발에 놀라고 튀김 바삭거림에 놀라고. 양에 놀라고. 메뉴에 텐동 추가해주면 좋겠네요",
    "제가 원한 거보단 면이 좀 덜 쫀득해서...🥹 우동가조쿠를 이기는 곳이 없네",
    "그냥 적당히 괜찮은곳 줄설정도는 아닌거같은데...",
    "객관적으로 맛이 없어요 특히 돈까스가 너무 질기고 냄새나요",
    "우동,소바 가성비👍🏻가격 저렴해서 서비스에 대해선 언급 안하겟음  돈까스 치즈돈까스 냄새심함 두개 먹고 다 버림 … 왜 줄서서 먹는지 이해불가",
    "서초구 24년 7월 방문. 치쿠와 붓가케우동 12,000. 어묵튀김인데 붓가케소스에 푹 찍어먹으니 맛남. 서초구 24년 5월 방문. 붓가케우동세트15,000 (평일점심엔14,000). 몇년 만에 오래간만에 방문. 면발 쫄~깃 여전히 맛있네! 단품에도 간장계란밥이 나오고 세트엔 4종튀김(새우,단호박,고구마,고추)이 추가 됨"]

In [None]:
# Generate and print the keyword+sentiment prediction for every sample review
for review in reviews:
    # max_len=128 is used for both tokenization and generation length in test_model
    output = test_model(review, model, tokenizer, max_len=128, device=device)
    print("Review:", review)
    print("Generated Output:", output)
    print("-" * 50)  # visual separator between reviews

Review: 대표메뉴는 소고기보신탕. 굴국밥도 괜찮고 여름철에 콩국수가 별미.
Generated Output: 소고기보신탕(Positive), 굴국밥(Positive), 콩국수(Positive)
--------------------------------------------------
Review: 소고기보신탕 특. 맛은 있는데. 특인데 푸짐하지 않음.
Generated Output: 소고기보신탕(Positive), 특(Positive), 푸짐하지 않음(Negative), 특(Negative)
--------------------------------------------------
Review: 웬만한 일본우동집보다 맛있어요. 면발에 놀라고 튀김 바삭거림에 놀라고. 양에 놀라고. 메뉴에 텐동 추가해주면 좋겠네요
Generated Output: 면발에 놀라고(Positive), 튀김 바삭거림에 놀라고(Positive), 양에 놀라고(Positive), 메뉴에 텐동 추가(Positive)
--------------------------------------------------
Review: 제가 원한 거보단 면이 좀 덜 쫀득해서...🥹 우동가조쿠를 이기는 곳이 없네
Generated Output: 우동가조쿠(Negative), 면 쫀득함(Negative), 우동가조쿠(Negative), 우동가조쿠(Negative)
--------------------------------------------------
Review: 그냥 적당히 괜찮은곳 줄설정도는 아닌거같은데...
Generated Output: 적당히 괜찮은곳(Positive), 줄설정도는 아닌거같은데(Negative)
--------------------------------------------------
Review: 객관적으로 맛이 없어요 특히 돈까스가 너무 질기고 냄새나요
Generated Output: 맛(Negative), 돈까스(Negative), 질기고 냄새(Negative), 돈까스(N