### パッケージ、自作関数のインポート

In [1]:
import pandas  as pd
import numpy as np
import torch
import io
from PIL import Image
import zipfile
import os
import warnings
from googletrans import Translator
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, Trainer, TrainingArguments, EarlyStoppingCallback, AdamW
import my_catr
warnings.simplefilter('ignore')

### 必要なファイルをコピー

In [2]:
!git clone https://github.com/saahiluppal/catr.git

fatal: destination path 'catr' already exists and is not an empty directory.


### 必要なパッケージをインポート

In [3]:
os.chdir("./catr")
!pip install -q -q -q -r requirements.txt
os.chdir("../")

### デバイスの設定

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

### データの読み込み、お試し用に抽出

In [5]:
train_df = pd.read_csv("../data/bokete/train.csv")
test_df = pd.read_csv("../data/bokete/test.csv")
train_df_test = train_df.sample(100)

### 画像を英語テキストに変換
時間かかる

In [6]:
imcap = my_catr.my_catr()
img_texts_en = []
for file_path in tqdm(train_df_test["odai_photo_file_name"]):
    file_path = "../data/bokete/train/" + file_path
    temp_img_text1 = imcap.fit(file_path)
    img_texts_en.append(temp_img_text1)
tr = Translator()
train_df_test["img_texts_en"] = img_texts_en
img_texts_jp = [tr.translate(text, src="en", dest="ja").text for text in tqdm(train_df_test["img_texts_en"])]
train_df_test["img_texts_jp"] = img_texts_jp

Using cache found in C:\Users\takah/.cache\torch\hub\saahiluppal_catr_master


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

### 途中から始める場合はここから

In [7]:
# import pandas  as pd
# import numpy as np
# import io
# from PIL import Image
# import zipfile
# import os
# import warnings
# from googletrans import Translator
# from tqdm.notebook import tqdm
# from sklearn.model_selection import train_test_split
# warnings.simplefilter('ignore')
# train_df_test = pd.read_csv("../data/bokete/train_test.csv")

### 学習用、評価用に分割

In [8]:
train_df_test, eval_df_test = train_test_split(train_df_test)
train_df_test = train_df_test.reset_index(drop=True)
eval_df_test = eval_df_test.reset_index(drop=True)

### BERT関係のダウンロード

In [9]:
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

### BERT用にデータを作成

In [10]:
train_labels =train_df_test["is_laugh"].tolist()
eval_labels = eval_df_test["is_laugh"].tolist()
train_test_doc = [[train_df_test["img_texts_jp"].loc[i:i].values[0],train_df_test["text"].loc[i:i].values[0]] for i in range(len(train_df_test))]
eval_test_doc = [[eval_df_test["img_texts_jp"].loc[i:i].values[0],eval_df_test["text"].loc[i:i].values[0]] for i in range(len(eval_df_test))]
train_encodings = tokenizer(train_test_doc, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
test_encodings = tokenizer(eval_test_doc, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)

### データセットの作成

In [11]:
import torch

class JpSentiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_test_dataset = JpSentiDataset(train_encodings, train_labels)
eval_test_dataset = JpSentiDataset(test_encodings, eval_labels)

### 評価用の関数定義

In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### ファインチューニング

In [13]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=1,
    dataloader_pin_memory=False,
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    logging_steps=50,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_dataset,
    eval_dataset=eval_test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()

***** Running training *****
  Num examples = 75
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 190


  0%|          | 0/190 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 25
  Batch size = 8


{'loss': 0.7002, 'learning_rate': 5e-06, 'epoch': 2.63}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.710249125957489, 'eval_accuracy': 0.4, 'eval_f1': 0.3487684729064039, 'eval_precision': 0.6166666666666667, 'eval_recall': 0.4, 'eval_runtime': 0.2644, 'eval_samples_per_second': 94.57, 'eval_steps_per_second': 15.131, 'epoch': 2.63}


***** Running Evaluation *****
  Num examples = 25
  Batch size = 8


{'loss': 0.5973, 'learning_rate': 1e-05, 'epoch': 5.26}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.7831984162330627, 'eval_accuracy': 0.6, 'eval_f1': 0.5935897435897437, 'eval_precision': 0.8222222222222223, 'eval_recall': 0.6, 'eval_runtime': 0.2646, 'eval_samples_per_second': 94.476, 'eval_steps_per_second': 15.116, 'epoch': 5.26}


***** Running Evaluation *****
  Num examples = 25
  Batch size = 8


{'loss': 0.2408, 'learning_rate': 1.5e-05, 'epoch': 7.89}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.6641982793807983, 'eval_accuracy': 0.56, 'eval_f1': 0.5744499178981939, 'eval_precision': 0.6330769230769231, 'eval_recall': 0.56, 'eval_runtime': 0.2687, 'eval_samples_per_second': 93.024, 'eval_steps_per_second': 14.884, 'epoch': 7.89}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 40.9778, 'train_samples_per_second': 18.303, 'train_steps_per_second': 4.637, 'train_loss': 0.4159835275850798, 'epoch': 10.0}


TrainOutput(global_step=190, training_loss=0.4159835275850798, metrics={'train_runtime': 40.9778, 'train_samples_per_second': 18.303, 'train_steps_per_second': 4.637, 'train_loss': 0.4159835275850798, 'epoch': 10.0})

### モデルを評価

In [14]:
trainer.evaluate(eval_dataset=eval_test_dataset)

***** Running Evaluation *****
  Num examples = 25
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 2.8265466690063477,
 'eval_accuracy': 0.52,
 'eval_f1': 0.534025974025974,
 'eval_precision': 0.6088311688311688,
 'eval_recall': 0.52,
 'eval_runtime': 0.2656,
 'eval_samples_per_second': 94.141,
 'eval_steps_per_second': 15.063,
 'epoch': 10.0}