# full conversation model_4

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

# Load the train/test splits from their CSV files
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'DoWADO/AI/tfs_train.csv',
        'test': 'DoWADO/AI/tfs_test.csv',
    },
)

# Load the tokenizer published with the klue/bert-base checkpoint
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

# Tokenize the text data
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``'text'`` field of ``examples``.

    Intended to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: Mapping that provides the raw texts under the ``'text'`` key.

    Returns:
        Whatever the tokenizer call returns for those texts, requested with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Tokenize every split in batches, rename the target column to "labels"
# (presumably the name the model/Trainer expects - confirm) and drop the raw
# text column, which is no longer needed once it has been tokenized.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
    .remove_columns(["text"])
)

# Have the datasets return their columns in "torch" format
tokenized_datasets.set_format("torch")

# Load klue/bert-base with a sequence-classification head sized for 43 labels
model = AutoModelForSequenceClassification.from_pretrained(
    'klue/bert-base',
    num_labels=43,
)

# Training configuration
#
# The recorded run overfits: validation loss is lowest at epoch 2 (1.587) and
# accuracy peaks at 0.616, yet the weights left in the Trainer after epoch 10
# (accuracy 0.598) are the ones that get saved afterwards. Checkpointing once
# per epoch and reloading the best checkpoint at the end makes the saved model
# the best one seen during training instead of the last one.
# NOTE(review): the best checkpoint is chosen on the split passed as
# `eval_dataset`; if that is the only held-out set, the reported accuracy is
# optimistic - consider carving out a separate validation split.
training_args = TrainingArguments(
    output_dir='DoWADO/AI/result_model_4',
    evaluation_strategy='epoch',
    # Must follow the same schedule as evaluation for load_best_model_at_end.
    save_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Restore the checkpoint with the highest evaluation accuracy when
    # training finishes ('accuracy' is the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Cap the number of checkpoints kept on disk; the best checkpoint is
    # retained in addition to the most recent one.
    save_total_limit=2,
)

# Evaluation metric
def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy. This replaces the previous
    ``load_metric('accuracy', trust_remote_code=True)`` object: that loader is
    deprecated and has to fetch and execute a metric script, which is
    unnecessary for a plain "fraction of correct predictions" score.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the reference class
            ids.

    Returns:
        dict: ``{'accuracy': <float>}``, the fraction of examples whose
        highest-scoring class equals the reference label.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1).reshape(-1)
    references = np.asarray(labels).reshape(-1)
    # Guard against silent broadcasting in the comparison below.
    if predictions.shape != references.shape:
        raise ValueError(
            f"Mismatch in the number of predictions ({predictions.shape[0]}) "
            f"and references ({references.shape[0]})"
        )
    return {'accuracy': float(np.mean(predictions == references))}

# Assemble the Trainer from the model, the training configuration, the
# tokenized train/test splits and the metric callback
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)

# Fine-tune the model; kept as the cell's last expression so its return
# value is displayed
trainer.train()


2024-07-15 11:46:04.516179: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 11:46:04.569919: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 11:46:04.569938: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 11:46:04.571397: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-15 11:46:04.580742: I tensorflow/core/platform/cpu_feature_guar

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks..

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.991443,0.514638
2,2.661900,1.587187,0.616333
3,2.661900,1.620897,0.608629
4,1.363100,1.598755,0.616333
5,0.847200,1.721264,0.602465
6,0.847200,1.823898,0.59322
7,0.523300,1.836281,0.605547
8,0.269300,1.954917,0.602465
9,0.269300,1.979437,0.599384
10,0.156000,1.992015,0.597843


TrainOutput(global_step=3250, training_loss=0.9045408272376427, metrics={'train_runtime': 2007.9043, 'train_samples_per_second': 25.898, 'train_steps_per_second': 1.619, 'total_flos': 1.3686811447296e+16, 'train_loss': 0.9045408272376427, 'epoch': 10.0})

In [2]:
# Save the trained model and, in a sibling directory, the tokenizer files
trainer.save_model(
    'DoWADO/AI/transformers/model_4_full_conversation/trained_model'
)
tokenizer.save_pretrained(
    'DoWADO/AI/transformers/model_4_full_conversation/trained_tokenizer'
)

('DoWADO/AI/transformers/model_4_full_conversation/trained_tokenizer/tokenizer_config.json',
 'DoWADO/AI/transformers/model_4_full_conversation/trained_tokenizer/special_tokens_map.json',
 'DoWADO/AI/transformers/model_4_full_conversation/trained_tokenizer/vocab.txt',
 'DoWADO/AI/transformers/model_4_full_conversation/trained_tokenizer/added_tokens.json',
 'DoWADO/AI/transformers/model_4_full_conversation/trained_tokenizer/tokenizer.json')

# 불용어 처리한 Kiwi full conversation model_5

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

# Load the train/test splits from their CSV files
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'DoWADO/AI/tfs_train_4.csv',
        'test': 'DoWADO/AI/tfs_test_4.csv',
    },
)

# Load the tokenizer published with the klue/bert-base checkpoint
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

# Tokenize the text data
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``'text'`` field of ``examples``.

    Intended to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: Mapping that provides the raw texts under the ``'text'`` key.

    Returns:
        Whatever the tokenizer call returns for those texts, requested with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Tokenize every split in batches, rename the target column to "labels"
# (presumably the name the model/Trainer expects - confirm) and drop the raw
# text column, which is no longer needed once it has been tokenized.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
    .remove_columns(["text"])
)

# Have the datasets return their columns in "torch" format
tokenized_datasets.set_format("torch")

# Load klue/bert-base with a sequence-classification head sized for 43 labels
model = AutoModelForSequenceClassification.from_pretrained(
    'klue/bert-base',
    num_labels=43,
)

# Training configuration
#
# The recorded run overfits: validation loss is lowest at epoch 3 (1.143) and
# accuracy peaks at 0.718 in epoch 9, yet the weights left in the Trainer
# after epoch 10 (accuracy 0.710) are the ones that get saved afterwards.
# Checkpointing once per epoch and reloading the best checkpoint at the end
# makes the saved model the best one seen during training instead of the last.
# NOTE(review): the best checkpoint is chosen on the split passed as
# `eval_dataset`; if that is the only held-out set, the reported accuracy is
# optimistic - consider carving out a separate validation split.
training_args = TrainingArguments(
    output_dir='DoWADO/AI/result_model_5',
    evaluation_strategy='epoch',
    # Must follow the same schedule as evaluation for load_best_model_at_end.
    save_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Restore the checkpoint with the highest evaluation accuracy when
    # training finishes ('accuracy' is the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Cap the number of checkpoints kept on disk; the best checkpoint is
    # retained in addition to the most recent one.
    save_total_limit=2,
)

# Evaluation metric
def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy. This replaces the previous
    ``load_metric('accuracy', trust_remote_code=True)`` object: that loader is
    deprecated and has to fetch and execute a metric script, which is
    unnecessary for a plain "fraction of correct predictions" score.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the reference class
            ids.

    Returns:
        dict: ``{'accuracy': <float>}``, the fraction of examples whose
        highest-scoring class equals the reference label.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1).reshape(-1)
    references = np.asarray(labels).reshape(-1)
    # Guard against silent broadcasting in the comparison below.
    if predictions.shape != references.shape:
        raise ValueError(
            f"Mismatch in the number of predictions ({predictions.shape[0]}) "
            f"and references ({references.shape[0]})"
        )
    return {'accuracy': float(np.mean(predictions == references))}

# Assemble the Trainer from the model, the training configuration, the
# tokenized train/test splits and the metric callback
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)

# Fine-tune the model; kept as the cell's last expression so its return
# value is displayed
trainer.train()


2024-07-15 16:00:49.519498: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 16:00:49.571282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 16:00:49.571325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 16:00:49.572404: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-15 16:00:49.582319: I tensorflow/core/platform/cpu_feature_guar

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5200 [00:00<?, ? examples/s]

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks..

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.382027,0.684129
2,1.985500,1.244085,0.676425
3,1.985500,1.142678,0.705701
4,0.822700,1.149194,0.708783
5,0.422800,1.32962,0.687211
6,0.422800,1.303891,0.694915
7,0.203600,1.305117,0.697997
8,0.098300,1.320279,0.705701
9,0.098300,1.355302,0.718028
10,0.053400,1.382517,0.710324


TrainOutput(global_step=3250, training_loss=0.5540604453453651, metrics={'train_runtime': 1973.4656, 'train_samples_per_second': 26.35, 'train_steps_per_second': 1.647, 'total_flos': 1.3686811447296e+16, 'train_loss': 0.5540604453453651, 'epoch': 10.0})

In [2]:
# Save the trained model and, in a sibling directory, the tokenizer files
trainer.save_model(
    'DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_model'
)
tokenizer.save_pretrained(
    'DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_tokenizer'
)

('DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_tokenizer/tokenizer_config.json',
 'DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_tokenizer/special_tokens_map.json',
 'DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_tokenizer/vocab.txt',
 'DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_tokenizer/added_tokens.json',
 'DoWADO/AI/transformers/model_5_kiwi_full_conversation_wo_stopwords/trained_tokenizer/tokenizer.json')