### パッケージ、自作関数のインポート

In [None]:
import pandas  as pd
import numpy as np
import datetime
import torch
import io
from PIL import Image
import zipfile
import os
import warnings
from googletrans import Translator
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, Trainer, TrainingArguments, EarlyStoppingCallback, AdamW
import my_catr
warnings.simplefilter('ignore')

### 必要なファイルをコピー

In [None]:
# Fetch the catr repository into ./catr (a later cell installs its requirements.txt).
!git clone https://github.com/saahiluppal/catr.git

### 必要なパッケージをインストール

In [None]:
# Install the packages listed in the cloned repository's requirements.txt.
# The working directory is switched into ./catr for the install and moved
# back to the parent directory afterwards.
os.chdir("./catr")
!pip install -q -q -q -r requirements.txt
os.chdir("../")

### デバイスの設定

In [None]:
# Use the first CUDA device when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# Bare expression so the notebook displays the chosen device.
device

### データの読み込み

In [None]:
# Main train / test tables.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/bokete/train.csv", "../data/bokete/test.csv")
)
# Per-image text tables; they are joined onto the main tables on
# "odai_photo_file_name" in the next cell.
# NOTE(review): these are read from ./data/ while the tables above come from
# ../data/bokete/ — confirm both locations are intended.
train_img_encoding, test_img_encoding = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train_img_text.csv", "./data/test_img_text.csv")
)

In [None]:
# Attach the per-image text columns to each row via the photo file name.
# A left join keeps every row of the main table, even when the image-text
# table has no matching file name (those rows get missing values).
train_df = train_df.merge(train_img_encoding, how="left", on="odai_photo_file_name")
test_df = test_df.merge(test_img_encoding, how="left", on="odai_photo_file_name")

### 学習用、評価用に分割

In [None]:
# Hold out part of the labelled data for evaluation (train_test_split's
# default split ratio is used).
# A fixed random_state makes the split reproducible, so evaluation metrics and
# early-stopping behaviour can be compared between runs.
train_df, eval_df = train_test_split(train_df, random_state=42)
# Drop the shuffled original index so both frames are indexed 0..n-1.
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)

### BERT関係のダウンロード

In [None]:
# Checkpoint shared by the classifier and the tokenizer.
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

# Sequence classifier with two output labels, placed on the selected device.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# NOTE(review): this checkpoint is presumably meant to be loaded with
# BertJapaneseTokenizer (MeCab-based word splitting); loading it through the
# generic BertTokenizer may tokenize Japanese text differently from
# pre-training — confirm before relying on the scores.
tokenizer = BertTokenizer.from_pretrained(model_name)

### BERT用にデータを作成

In [None]:
# Target labels as plain Python lists.
train_labels = train_df["is_laugh"].tolist()
eval_labels = eval_df["is_laugh"].tolist()

# One [img_texts_jp, text] pair per row, in row order. Passing two-element
# lists makes the tokenizer encode each row as a sentence pair.
train_test_doc = train_df[["img_texts_jp", "text"]].values.tolist()
eval_test_doc = eval_df[["img_texts_jp", "text"]].values.tolist()

# Tokenize to padded / truncated PyTorch tensors (at most 128 tokens) and
# move the resulting encodings to the selected device.
train_encodings = tokenizer(
    train_test_doc, return_tensors='pt', padding=True, truncation=True, max_length=128
).to(device)
eval_encodings = tokenizer(
    eval_test_doc, return_tensors='pt', padding=True, truncation=True, max_length=128
).to(device)

### データセットの作成

In [None]:
import torch

class JpSentiDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping (anything exposing ``items()``) from field name to
            an indexable batch of values, e.g. tensors or nested lists.
        labels: Indexable sequence of labels, one per example.

    Each item is a dict holding one entry per encoding field plus a
    ``'labels'`` entry, all as tensors.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _copy_as_tensor(value):
        """Return ``value`` as an independent tensor.

        ``torch.tensor`` on an existing tensor emits a copy-construct
        UserWarning; ``clone().detach()`` is the warning-free equivalent.
        Non-tensor values (lists, ints, ...) still go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {
            key: self._copy_as_tensor(val[idx])
            for key, val in self.encodings.items()
        }
        item['labels'] = self._copy_as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap the tokenized inputs and their labels for the Trainer.
train_test_dataset = JpSentiDataset(encodings=train_encodings, labels=train_labels)
eval_test_dataset = JpSentiDataset(encodings=eval_encodings, labels=eval_labels)

### 評価用の関数定義

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 use ``average='weighted'``
        with ``zero_division=0``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth returned value (support) is not needed.
    weighted_scores = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    weighted_precision, weighted_recall, weighted_f1 = weighted_scores[:3]
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

### ファインチューニング

In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    # Output locations; save_total_limit caps how many checkpoints are kept.
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    dataloader_pin_memory=False,
    # Evaluate on a step schedule (rather than once per epoch) and reload the
    # best model when training ends.
    # NOTE(review): eval_steps / save_steps are not set here; presumably the
    # library defaults apply — confirm they line up as load_best_model_at_end
    # expects.
    evaluation_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
)

# Trainer with early stopping (patience of 5 evaluations).
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_test_dataset,
    eval_dataset=eval_test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

### モデルを評価

In [None]:
# Evaluate on the held-out split; the returned value is shown as the cell output.
trainer.evaluate(eval_dataset=eval_test_dataset)

### 提出用に予測

In [None]:
# Load the sample submission; its "is_laugh" column is overwritten with the
# model's predictions further down.
sub_df = pd.read_csv("../data/bokete/sample_submission.csv")

In [None]:
# One [img_texts_jp, text] pair per test row, in row order.
test_doc = test_df[["img_texts_jp", "text"]].values.tolist()
# Tokenize to padded / truncated PyTorch tensors (at most 128 tokens) and
# move the resulting encodings to the selected device.
test_encodings = tokenizer(
    test_doc, return_tensors='pt', padding=True, truncation=True, max_length=128
).to(device)

In [None]:
import torch

class JpSentiDataset2(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings without labels.

    Args:
        encodings: Mapping (anything exposing ``items()`` and key lookup)
            from field name to an indexable batch of values. It must contain
            an ``"input_ids"`` entry, which determines the dataset length.

    Each item is a dict holding one tensor per encoding field.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _copy_as_tensor(value):
        """Return ``value`` as an independent tensor.

        ``torch.tensor`` on an existing tensor emits a copy-construct
        UserWarning; ``clone().detach()`` is the warning-free equivalent.
        Non-tensor values (lists, ints, ...) still go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {
            key: self._copy_as_tensor(val[idx])
            for key, val in self.encodings.items()
        }
        return item

    def __len__(self):
        """Number of examples, i.e. the number of ``input_ids`` rows."""
        # Key lookup (instead of attribute access) matches the mapping
        # interface used in __getitem__ and also works for plain dicts.
        return len(self.encodings["input_ids"])

# Label-free dataset over the tokenized test inputs.
test_dataset = JpSentiDataset2(encodings=test_encodings)

In [None]:
# Run the fine-tuned model over the test set; out.predictions holds the raw
# per-class scores.
out = trainer.predict(test_dataset)
out_tensor = torch.tensor(out.predictions)
# Normalise the scores over the class axis. dim is given explicitly because
# relying on Softmax's implicit dimension is deprecated.
# (Despite the name `sig`, this is a softmax, not a sigmoid.)
sig = torch.nn.Softmax(dim=-1)
# Keep the probability of class index 1 and store it in the submission frame.
pred = sig(out_tensor).numpy()[:, 1]
sub_df["is_laugh"] = pred

In [11]:
# Timestamped file name inside ./data/. The "/" after "./data" is required:
# without it the timestamp is glued onto "data" and the file is written to
# the current directory instead of the data folder.
file_name = "./data/" + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M%S') + "_sub.csv"
# index=False keeps pandas' row index out of the file, so the written columns
# match the ones read from the sample submission.
sub_df.to_csv(file_name, index=False)