In [None]:
!pip install datasets sentencepiece
!pip install accelerate -U
!pip install transformers[torch] -U

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

# Load the preprocessed style-transfer corpus and wrap it as a Hugging Face
# dataset.
data_path = '/content/processed_smilestyle_dataset.csv'
df = pd.read_csv(data_path)
hf_dataset = Dataset.from_pandas(df)

# Hold out 10% of the rows for evaluation. The split is seeded so that
# restarting the kernel reproduces the same train/test partition (and therefore
# comparable validation losses between runs).
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)

# Inspect the schema and one raw example before tokenization.
print("Dataset features:", hf_dataset["train"].features)
print("First example:", hf_dataset["train"][0])

# Load the pretrained KoBART tokenizer and model that will be fine-tuned.
tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v1")
model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v1")

def preprocess_function(examples):
    """Tokenize a batch of style-transfer pairs for seq2seq training.

    Each input is rendered as ``"<style> source"`` and padded/truncated to 128
    tokens. Each target becomes a 128-long label sequence that ends with the
    tokenizer's EOS id (when it defines one) and is padded with -100.

    Args:
        examples: Batch mapping with parallel lists under the keys
            ``'style'``, ``'source'`` and ``'target'``. ``None`` targets are
            treated as empty strings.

    Returns:
        The tokenizer output for the inputs with an added ``'labels'`` entry
        (one list of 128 ints per example).
    """
    max_length = 128
    # Positions labelled -100 are skipped by the model's cross-entropy loss.
    ignore_index = -100

    input_texts = [f"<{style}> {source}" for style, source in zip(examples['style'], examples['source'])]

    # Replace missing targets with an empty string so tokenization never sees None.
    target_texts = [target if target is not None else "" for target in examples['target']]

    # Tokenize the inputs; padding here is harmless because the attention mask
    # tells the encoder which positions to ignore.
    model_inputs = tokenizer(input_texts, max_length=max_length, truncation=True, padding="max_length")

    # Tokenize the targets WITHOUT padding and leave one slot for EOS.
    # `text_target=` supersedes the deprecated `as_target_tokenizer()` context.
    target_encodings = tokenizer(text_target=target_texts, max_length=max_length - 1, truncation=True)

    eos_id = tokenizer.eos_token_id
    labels = []
    for token_ids in target_encodings['input_ids']:
        token_ids = list(token_ids)
        # Make sure every target ends with EOS so the model learns where to
        # stop; only append when the tokenizer has not already added it.
        if eos_id is not None and (not token_ids or token_ids[-1] != eos_id):
            token_ids.append(eos_id)
        # Pad with -100 rather than the pad token id: padding labelled with a
        # real token id would be scored by the loss, diluting it and teaching
        # the model to emit pad tokens.
        token_ids.extend([ignore_index] * (max_length - len(token_ids)))
        labels.append(token_ids)

    model_inputs['labels'] = labels
    return model_inputs

def tokenize_in_batches(dataset, batch_size=1000):
    """Apply ``preprocess_function`` to ``dataset`` in batches.

    Args:
        dataset: Object exposing ``map`` and ``column_names``.
        batch_size: Number of examples handed to ``preprocess_function`` per
            call.

    Returns:
        Whatever ``dataset.map`` returns; all of the dataset's original
        columns are requested to be removed, leaving the tokenizer outputs.
    """
    map_options = dict(
        batched=True,
        batch_size=batch_size,
        remove_columns=dataset.column_names,
        desc="Running tokenizer on dataset",
    )
    return dataset.map(preprocess_function, **map_options)

# Dataset cleaning
def preprocess_dataset(dataset):
    """Replace missing text fields with empty strings.

    Args:
        dataset: Object exposing ``map``; each example must provide the keys
            ``'source'``, ``'style'`` and ``'target'``.

    Returns:
        The result of ``dataset.map`` with ``None`` in any of the three fields
        replaced by ``""``.
    """
    text_columns = ('source', 'style', 'target')

    def fill_missing(example):
        # Only None is substituted; every other value passes through as-is.
        return {
            column: ("" if example[column] is None else example[column])
            for column in text_columns
        }

    return dataset.map(fill_missing)

# Clean and tokenize every split, keeping the results keyed by split name.
tokenized_datasets = {}
for split, split_dataset in hf_dataset.items():
    preprocessed_dataset = preprocess_dataset(split_dataset)
    tokenized_datasets[split] = tokenize_in_batches(preprocessed_dataset)

# Inspect the tokenized schema and one example as a sanity check.
print("Tokenized dataset features:", tokenized_datasets['train'].features)
print("First tokenized example:", tokenized_datasets['train'][0])

from transformers import EarlyStoppingCallback

# Collator that assembles tokenized examples into seq2seq training batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training configuration.
# A previous 100-epoch run showed validation loss bottoming out around epoch 3
# and rising afterwards while training loss kept falling, so the final weights
# were overfit. Checkpoints are now saved once per epoch (matching the
# evaluation schedule), the checkpoint with the lowest validation loss is
# restored at the end, and training stops early once validation loss has not
# improved for several consecutive evaluations.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',  # must match the evaluation schedule for best-model tracking
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,  # upper bound; early stopping normally ends the run sooner
    predict_with_generate=True,
    fp16=True,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Trainer wiring: model, data splits, tokenizer, collator and early stopping.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Dataset features: {'source': Value(dtype='string', id=None), 'style': Value(dtype='string', id=None), 'target': Value(dtype='string', id=None)}
First example: {'source': '색맹 때문에 물건을 구분하기 힘들 때는 있습니다.', 'style': 'chat', 'target': '색맹 때문에 물건을 구분하기 힘들 때는 있지'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


model.safetensors:   0%|          | 0.00/495M [00:00<?, ?B/s]

Map:   0%|          | 0/13338 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/13338 [00:00<?, ? examples/s]



Map:   0%|          | 0/1482 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1482 [00:00<?, ? examples/s]

Tokenized dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
First tokenized example: {'input_ids': [14686, 17426, 16881, 261, 16088, 10527, 14362, 22152, 18272, 14191, 15994, 15802, 20628, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.0712,0.060108
2,0.0544,0.05625
3,0.0409,0.054276
4,0.0372,0.055525
5,0.03,0.058225
6,0.0216,0.060938
7,0.0177,0.061347
8,0.0137,0.064606
9,0.0116,0.06621
10,0.0086,0.067803


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-defa

TrainOutput(global_step=83400, training_loss=0.009049009891021417, metrics={'train_runtime': 5717.0284, 'train_samples_per_second': 233.303, 'train_steps_per_second': 14.588, 'total_flos': 1.01658311000064e+17, 'train_loss': 0.009049009891021417, 'epoch': 100.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory
# so the checkpoint can later be reloaded from that single path.
save_dir = './KoBart_trained_transfer'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Non-default generation parameters: {'forced_eos_token_id': 1}


('./KoBart_trained_transfer/tokenizer_config.json',
 './KoBart_trained_transfer/special_tokens_map.json',
 './KoBart_trained_transfer/tokenizer.json')

In [None]:
import shutil
from google.colab import files

# Bundle the saved model directory into KoBart_trained_transfer.zip.
shutil.make_archive(
    base_name='KoBart_trained_transfer',
    format='zip',
    root_dir='./KoBart_trained_transfer',
)

# Ask Colab to download the archive to the local machine.
files.download('KoBart_trained_transfer.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 스타일별 문장 변경 코드
import torch
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

# Reload the fine-tuned tokenizer and model from the saved checkpoint directory.
model_name = "./KoBart_trained_transfer"  # path of the trained model
# Use the GPU when one is available and fall back to CPU otherwise, so this
# cell also runs on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Collect the distinct style tags from the training data frame (`df` is defined
# in the data-loading cell). Missing values are dropped so they are never used
# as a style tag.
styles = df['style'].dropna().unique()
print("Available styles:", styles)

def change_style(input_text, style):
    """Rewrite ``input_text`` in the requested style using the fine-tuned model.

    Args:
        input_text: Sentence to convert.
        style: Style tag; it is wrapped as ``<style>`` and prepended to the
            sentence, mirroring the prompt format used for training.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    prompt = f"<{style}> {input_text}"
    # A single sentence needs no padding, and the tensors follow the model's
    # own device instead of assuming CUDA.
    inputs = tokenizer(prompt, return_tensors='pt', max_length=128, truncation=True).to(model.device)
    # Only input_ids and attention_mask are forwarded: the tokenizer output
    # also contains token_type_ids, which generation is not expected to take
    # here. The mask is passed explicitly rather than left for generate() to
    # infer.
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=128,
        num_beams=5,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Available styles: ['gentle' 'sosim' 'informal' 'chat']


In [None]:
# Sentence used to demonstrate the style conversion.
test_sentence = "안녕 오늘따라 날씨도 좋은데 오늘 뭐 할 계획이야?"

# Convert the sentence into every available style, one at a time, and print
# each result next to its style tag.
conversions = ((name, change_style(test_sentence, name)) for name in styles)
for style, styled_sentence in conversions:
    print(f"Style: {style}\nGenerated Text: {styled_sentence}\n")

Style: gentle
Generated Text: 안녕하십니까. 오늘 날씨도 좋네요. 오늘 뭐하실 계획이십니까?

Style: sosim
Generated Text: 안녕 오늘 오늘따라 날씨도 좋은데 오늘 뭐할 계획이야 혹시..?

Style: informal
Generated Text: 안녕 오늘따라 날씨도 좋은데 오늘 뭐 할 계획이야?

Style: chat
Generated Text: ᄒᄋ 오늘 날씨도 좋긴한데 오늘 뭐 할 생각?

