In [None]:
import shutil

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# 키워드 생성 및 감정 라벨 동시 학습 (데이터 전처리 완료) / Joint training of keyword generation and sentiment labels (data preprocessing complete)

In [None]:
# Define the dataset class for review to keyword+sentiment generation
class ReviewWithSentimentDataset(Dataset):
    """Map-style dataset that turns review rows into seq2seq training tensors.

    Each item pairs the tokenized ``Review`` text (encoder input) with the
    tokenized ``Keywords_Sentiments`` text (decoder target). Padded target
    positions are replaced by ``IGNORE_INDEX`` so that the loss is computed
    only over real target tokens instead of being dominated by padding.
    """

    # Label value that Hugging Face seq2seq models exclude from the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame with Review and Keywords_Sentiments columns.
            tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``
                tensors (called with ``return_tensors="pt"``).
            max_len (int): Maximum token length for input and target sequences.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string, padded/truncated to exactly ``max_len`` tokens."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        Rows are addressed by position (``iloc``), so the DataFrame's own index
        labels do not matter. All returned tensors are 1-D of length ``max_len``.
        """
        row = self.data.iloc[index]
        inputs = self._encode(row['Review'])
        targets = self._encode(row['Keywords_Sentiments'])  # Combined keyword+sentiment

        # Positions where the target attention mask is 0 are padding: mark them so
        # the loss ignores them rather than rewarding the model for predicting pads.
        labels = targets['input_ids'].squeeze(0)
        padding_positions = targets['attention_mask'].squeeze(0) == 0
        labels = labels.masked_fill(padding_positions, self.IGNORE_INDEX)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }


In [None]:
# Load the preprocessed review dataset
file_path = "/content/Transformed_Reviews_with_Sentiments.csv"  # Dataset path
data = pd.read_csv(file_path)

# Fail fast, with an actionable message, if the CSV lacks the columns that the
# Dataset class later indexes by name (otherwise the KeyError only surfaces
# in the middle of the first training batch).
required_columns = ["Review", "Keywords_Sentiments"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"{file_path} is missing required column(s) {missing_columns}; "
        f"found {list(data.columns)} (check for stray whitespace in the header)"
    )

In [None]:
# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable
train_data, val_data = train_test_split(
    data,
    random_state=42,
    test_size=0.1,
)

In [None]:
# Define the checkpoint name and load its tokenizer
# (the model weights for the same checkpoint are loaded in a later cell)
model_name = "paust/pko-t5-base"  # Pretrained Korean T5 model
tokenizer = T5TokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.90M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

In [None]:
# Token-length cap and mini-batch size shared by the training and validation splits
max_len = 128
batch_size = 16

# Wrap each split in the Dataset class, using the same tokenizer and length cap
train_dataset, val_dataset = (
    ReviewWithSentimentDataset(split_frame, tokenizer, max_len)
    for split_frame in (train_data, val_data)
)

# Only the training batches are shuffled; the validation order is left untouched
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Load the pretrained seq2seq model and place it on the GPU when one is available
model = T5ForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assign the result instead of ending the cell with a bare `model.to(device)`
# expression, so the notebook does not echo the full module repr as cell output.
model = model.to(device)

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(50358, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50358, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
# Optimizer: AdamW over every model parameter with a learning rate of 5e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [None]:
# Training function
def _mean_or_nan(total, count):
    """Return total / count, or NaN when nothing was accumulated."""
    return total / count if count else float("nan")


def _evaluate(model, val_loader, device):
    """Return the mean loss over val_loader with gradients disabled.

    The model is switched to eval mode for the pass and back to train mode afterwards.
    """
    model.eval()
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            total_loss += outputs.loss.item()
            num_batches += 1
    model.train()
    return _mean_or_nan(total_loss, num_batches)


def _print_generation_sample(model, tokenizer, input_ids, attention_mask, step, max_length):
    """Beam-search one sequence for the given single-example inputs and print it."""
    model.eval()
    with torch.no_grad():
        generated_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )
    if tokenizer is not None:
        decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    else:
        # No tokenizer available: show the raw token ids rather than failing mid-training.
        decoded_output = generated_output[0].tolist()
    print(f"Step {step}: Generated Output: {decoded_output}")
    model.train()


def train_model(model, dataloader, val_loader, optimizer, device, num_epochs=3,
                tokenizer=None, log_every=20, sample_every=100, gen_max_length=128):
    """Fine-tune ``model`` and report training/validation loss for every epoch.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels`` whose
            output exposes ``.loss``; it must also provide ``generate()`` for samples.
        dataloader: Iterable of training batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors).
        val_loader: Iterable of validation batches with the same keys.
        optimizer: Optimizer stepping the model's parameters.
        device: Device every batch tensor is moved to.
        num_epochs (int): Number of passes over ``dataloader``.
        tokenizer: Tokenizer used to decode sample generations. When omitted, a
            module-level ``tokenizer`` is used if one exists (the previous implicit
            behaviour); otherwise raw token ids are printed.
        log_every (int): Print the batch loss every this many steps (0/None disables).
        sample_every (int): Print a generated sample every this many steps
            (0/None disables).
        gen_max_length (int): ``max_length`` passed to ``generate()`` for samples.

    Returns:
        list[dict]: One ``{"epoch", "train_loss", "val_loss"}`` entry per epoch. A loss
        is NaN when the corresponding loader produced no batches.
    """
    if tokenizer is None:
        # Backward compatibility with callers that relied on the notebook-level name.
        tokenizer = globals().get("tokenizer")

    history = []
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0  # counted while iterating, so empty loaders are handled too
        for step, batch in enumerate(dataloader, start=1):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches = step

            if log_every and step % log_every == 0:
                print(f"Epoch {epoch+1}, Step {step}, Loss: {loss.item():.4f}")

            if sample_every and step % sample_every == 0:
                # Generate from the first example of the current batch (kept 2-D via [:1]).
                _print_generation_sample(
                    model, tokenizer, input_ids[:1], attention_mask[:1], step, gen_max_length
                )

        train_loss = _mean_or_nan(total_loss, num_batches)
        print(f"Epoch {epoch+1}, Training Loss: {train_loss:.4f}")

        val_loss = _evaluate(model, val_loader, device)
        print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}")

        history.append({"epoch": epoch + 1, "train_loss": train_loss, "val_loss": val_loss})

    return history


In [None]:
# Fine-tune for three epochs; progress and losses are printed by train_model itself
train_model(
    model=model,
    dataloader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=3,
)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Step 20, Loss: 13.3036
Epoch 1, Step 40, Loss: 9.3269
Epoch 1, Step 60, Loss: 3.5980
Epoch 1, Step 80, Loss: 1.9045
Epoch 1, Step 100, Loss: 1.1744
Step 100: Generated Output: Îßõ(Positive), ÎßàÎäòÌÉïÏàòÏú°(Positive), Ï∞®ÎèåÏß¨ÎΩï(Positive)
Epoch 1, Step 120, Loss: 0.6046
Epoch 1, Step 140, Loss: 0.4714
Epoch 1, Step 160, Loss: 0.3063
Epoch 1, Training Loss: 4.5971
Epoch 1, Validation Loss: 0.2062
Epoch 2, Step 20, Loss: 0.3438
Epoch 2, Step 40, Loss: 0.2588
Epoch 2, Step 60, Loss: 0.2480
Epoch 2, Step 80, Loss: 0.2138
Epoch 2, Step 100, Loss: 0.2508
Step 100: Generated Output: Î©ò(Positive), (Positive)
Epoch 2, Step 120, Loss: 0.2153
Epoch 2, Step 140, Loss: 0.2318
Epoch 2, Step 160, Loss: 0.2403
Epoch 2, Training Loss: 0.2398
Epoch 2, Validation Loss: 0.1425
Epoch 3, Step 20, Loss: 0.2438
Epoch 3, Step 40, Loss: 0.1912
Epoch 3, Step 60, Loss: 0.1758
Epoch 3, Step 80, Loss: 0.1536
Epoch 3, Step 100, Loss: 0.1437
Step 100: Generated Output: Ïö∞ÎûòÏò•(Negative), Ïö∞ÎûòÏò•(Negat

In [None]:
# Save the fine-tuned model
# The model and the tokenizer are both written to the same local directory via
# save_pretrained; a later cell copies this directory to Google Drive.
output_dir = "./t5_fine_tuned_keywords_sentiments"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ./t5_fine_tuned_keywords_sentiments


In [None]:
# Mount Google Drive at /content/drive so later cells can write to it
# (the google.colab module is presumably only importable inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Save the fine-tuned model to Google Drive
# Destination directory on the mounted Drive
drive_output_dir = '/content/drive/MyDrive/t5_fine_tuned_keywords_sentiments'

# Copy the saved model directory. copytree with dirs_exist_ok=True overwrites the
# destination in place on re-runs (a shell `cp -r` would instead nest a second copy
# inside an already existing destination), and it raises on failure so the success
# message below is only printed when the copy actually happened.
shutil.copytree('./t5_fine_tuned_keywords_sentiments', drive_output_dir, dirs_exist_ok=True)

print(f"Model saved to Google Drive at {drive_output_dir}")

Model saved to Google Drive at /content/drive/MyDrive/t5_fine_tuned_keywords_sentiments


# test data 모델 성능 테스트 / Model performance check on test data

In [None]:
# Test the model
def test_model(review, model, tokenizer, max_len, device):
    model.eval()
    inputs = tokenizer(
        review,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_len,
            num_beams=5,
            early_stopping=True
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [None]:
# Example usage: sample review strings that a later cell passes, one at a time,
# to test_model for a qualitative look at the generated keyword+sentiment output.
reviews = [
    "ÎåÄÌëúÎ©îÎâ¥Îäî ÏÜåÍ≥†Í∏∞Î≥¥Ïã†ÌÉï. Íµ¥Íµ≠Î∞•ÎèÑ Í¥úÏ∞ÆÍ≥† Ïó¨Î¶ÑÏ≤†Ïóê ÏΩ©Íµ≠ÏàòÍ∞Ä Î≥ÑÎØ∏.",
    "ÏÜåÍ≥†Í∏∞Î≥¥Ïã†ÌÉï Ìäπ. ÎßõÏùÄ ÏûàÎäîÎç∞. ÌäπÏù∏Îç∞ Ìë∏ÏßêÌïòÏßÄ ÏïäÏùå.",
    "Ïõ¨ÎßåÌïú ÏùºÎ≥∏Ïö∞ÎèôÏßëÎ≥¥Îã§ ÎßõÏûàÏñ¥Ïöî. Î©¥Î∞úÏóê ÎÜÄÎùºÍ≥† ÌäÄÍπÄ Î∞îÏÇ≠Í±∞Î¶ºÏóê ÎÜÄÎùºÍ≥†. ÏñëÏóê ÎÜÄÎùºÍ≥†. Î©îÎâ¥Ïóê ÌÖêÎèô Ï∂îÍ∞ÄÌï¥Ï£ºÎ©¥ Ï¢ãÍ≤†ÎÑ§Ïöî",
    "Ï†úÍ∞Ä ÏõêÌïú Í±∞Î≥¥Îã® Î©¥Ïù¥ Ï¢Ä Îçú Ï´ÄÎìùÌï¥ÏÑú...ü•π Ïö∞ÎèôÍ∞ÄÏ°∞Ïø†Î•º Ïù¥Í∏∞Îäî Í≥≥Ïù¥ ÏóÜÎÑ§",
    "Í∑∏ÎÉ• Ï†ÅÎãπÌûà Í¥úÏ∞ÆÏùÄÍ≥≥ Ï§ÑÏÑ§Ï†ïÎèÑÎäî ÏïÑÎãåÍ±∞Í∞ôÏùÄÎç∞...",
    "Í∞ùÍ¥ÄÏ†ÅÏúºÎ°ú ÎßõÏù¥ ÏóÜÏñ¥Ïöî ÌäπÌûà ÎèàÍπåÏä§Í∞Ä ÎÑàÎ¨¥ ÏßàÍ∏∞Í≥† ÎÉÑÏÉàÎÇòÏöî",
    "Ïö∞Îèô,ÏÜåÎ∞î Í∞ÄÏÑ±ÎπÑüëçüèªÍ∞ÄÍ≤© Ï†ÄÎ†¥Ìï¥ÏÑú ÏÑúÎπÑÏä§Ïóê ÎåÄÌï¥ÏÑ† Ïñ∏Í∏â ÏïàÌïòÍ≤üÏùå  ÎèàÍπåÏä§ ÏπòÏ¶àÎèàÍπåÏä§ ÎÉÑÏÉàÏã¨Ìï® ÎëêÍ∞ú Î®πÍ≥† Îã§ Î≤ÑÎ¶º ‚Ä¶ Ïôú Ï§ÑÏÑúÏÑú Î®πÎäîÏßÄ Ïù¥Ìï¥Î∂àÍ∞Ä",
    "ÏÑúÏ¥àÍµ¨ 24ÎÖÑ 7Ïõî Î∞©Î¨∏. ÏπòÏø†ÏôÄ Î∂ìÍ∞ÄÏºÄÏö∞Îèô 12,000. Ïñ¥Î¨µÌäÄÍπÄÏù∏Îç∞ Î∂ìÍ∞ÄÏºÄÏÜåÏä§Ïóê Ìëπ Ï∞çÏñ¥Î®πÏúºÎãà ÎßõÎÇ®. ÏÑúÏ¥àÍµ¨ 24ÎÖÑ 5Ïõî Î∞©Î¨∏. Î∂ìÍ∞ÄÏºÄÏö∞ÎèôÏÑ∏Ìä∏15,000 (ÌèâÏùºÏ†êÏã¨Ïóî14,000). Î™áÎÖÑ ÎßåÏóê Ïò§ÎûòÍ∞ÑÎßåÏóê Î∞©Î¨∏. Î©¥Î∞ú Ï´Ñ~ÍπÉ Ïó¨Ï†ÑÌûà ÎßõÏûàÎÑ§! Îã®ÌíàÏóêÎèÑ Í∞ÑÏû•Í≥ÑÎûÄÎ∞•Ïù¥ ÎÇòÏò§Í≥† ÏÑ∏Ìä∏Ïóî 4Ï¢ÖÌäÄÍπÄ(ÏÉàÏö∞,Îã®Ìò∏Î∞ï,Í≥†Íµ¨Îßà,Í≥†Ï∂î)Ïù¥ Ï∂îÍ∞Ä Îê®"]

In [None]:
# Run each sample review through the model and print the input next to its output
for review in reviews:
    output = test_model(
        review=review,
        model=model,
        tokenizer=tokenizer,
        max_len=128,
        device=device,
    )
    print("Review:", review)
    print("Generated Output:", output)
    print("-" * 50)

Review: ÎåÄÌëúÎ©îÎâ¥Îäî ÏÜåÍ≥†Í∏∞Î≥¥Ïã†ÌÉï. Íµ¥Íµ≠Î∞•ÎèÑ Í¥úÏ∞ÆÍ≥† Ïó¨Î¶ÑÏ≤†Ïóê ÏΩ©Íµ≠ÏàòÍ∞Ä Î≥ÑÎØ∏.
Generated Output: ÏÜåÍ≥†Í∏∞Î≥¥Ïã†ÌÉï(Positive), Íµ¥Íµ≠Î∞•(Positive), ÏΩ©Íµ≠Ïàò(Positive)
--------------------------------------------------
Review: ÏÜåÍ≥†Í∏∞Î≥¥Ïã†ÌÉï Ìäπ. ÎßõÏùÄ ÏûàÎäîÎç∞. ÌäπÏù∏Îç∞ Ìë∏ÏßêÌïòÏßÄ ÏïäÏùå.
Generated Output: ÏÜåÍ≥†Í∏∞Î≥¥Ïã†ÌÉï(Positive), Ìäπ(Positive), Ìë∏ÏßêÌïòÏßÄ ÏïäÏùå(Negative), Ìäπ(Negative)
--------------------------------------------------
Review: Ïõ¨ÎßåÌïú ÏùºÎ≥∏Ïö∞ÎèôÏßëÎ≥¥Îã§ ÎßõÏûàÏñ¥Ïöî. Î©¥Î∞úÏóê ÎÜÄÎùºÍ≥† ÌäÄÍπÄ Î∞îÏÇ≠Í±∞Î¶ºÏóê ÎÜÄÎùºÍ≥†. ÏñëÏóê ÎÜÄÎùºÍ≥†. Î©îÎâ¥Ïóê ÌÖêÎèô Ï∂îÍ∞ÄÌï¥Ï£ºÎ©¥ Ï¢ãÍ≤†ÎÑ§Ïöî
Generated Output: Î©¥Î∞úÏóê ÎÜÄÎùºÍ≥†(Positive), ÌäÄÍπÄ Î∞îÏÇ≠Í±∞Î¶ºÏóê ÎÜÄÎùºÍ≥†(Positive), ÏñëÏóê ÎÜÄÎùºÍ≥†(Positive), Î©îÎâ¥Ïóê ÌÖêÎèô Ï∂îÍ∞Ä(Positive)
--------------------------------------------------
Review: Ï†úÍ∞Ä ÏõêÌïú Í±∞Î≥¥Îã® Î©¥Ïù¥ Ï¢Ä Îçú Ï´ÄÎìùÌï¥ÏÑú...ü•π Ïö∞ÎèôÍ∞ÄÏ°∞Ïø†Î•º Ïù¥Í∏∞Îäî Í≥≥Ïù¥ ÏóÜÎÑ§
Generated Output: Ïö∞ÎèôÍ∞ÄÏ°∞Ï