<a href="https://colab.research.google.com/github/4mami/PreStudy/blob/main/PreStudy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install fugashi
!pip install ipadic

In [None]:
import torch
from transformers import BertJapaneseTokenizer

In [None]:
# Hugging Face model id of the pretrained Japanese BERT checkpoint.
# Only its tokenizer is loaded here; the classifier defined in this notebook
# uses the tokenizer's vocabulary to size its own embedding table.
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

In [None]:
class ClassifyTwitterUserToMultiLabel(torch.nn.Module):
    """Bag-of-embeddings multi-label classifier for a user's tweets.

    Every token id is embedded, the embeddings are averaged over the token
    axis and then over the tweet axis, and the resulting single vector per
    user is projected to ``num_labels`` logits.
    """

    def __init__(self, embedding_dim, num_labels, vocab_size=None):
        """Create the embedding table and the output projection.

        Args:
            embedding_dim: Size of each token embedding vector.
            num_labels: Number of output logits (one per label).
            vocab_size: Number of rows of the embedding table. When omitted,
                the size of the notebook-level ``tokenizer`` vocabulary is
                used, which is the original behaviour.
        """
        super().__init__()
        if vocab_size is None:
            # Fall back to the global tokenizer so existing callers that pass
            # only (embedding_dim, num_labels) keep working unchanged.
            vocab_size = len(tokenizer.vocab)
        # Index 0 is treated as padding: its embedding is a fixed zero vector.
        # NOTE(review): assumes the tokenizer's [PAD] id is 0 - confirm.
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.linear = torch.nn.Linear(embedding_dim, num_labels)

    def forward(self, input_tensor):
        """Return logits for a batch of users.

        Args:
            input_tensor: Integer tensor of token ids. The callers in this
                notebook pass shape (1, num_tweets, num_tokens).

        Returns:
            Tensor of shape (batch, num_labels) holding raw logits.
        """
        tensor = self.embedding(input_tensor)  # (batch, tweets, tokens, embedding_dim)
        # NOTE: this is a plain (unmasked) mean, so padded positions still
        # count in the denominator even though they contribute zero vectors.
        tensor = tensor.mean(2)  # average over tokens -> (batch, tweets, embedding_dim)
        tensor = tensor.mean(1)  # average over tweets -> (batch, embedding_dim)

        return self.linear(tensor)

In [None]:
# Build data_list from the JSON training file: a list of
# (label list, list of token-id sequences for the user's tweets) tuples.
import json

# Use a context manager so the file handle is closed deterministically, and
# read as UTF-8 explicitly instead of relying on the platform default.
with open('drive/MyDrive/Data/TrainingData.json', encoding='utf-8') as fp:
    data = json.load(fp)

data_list = []

for datum in data:
    label = datum["label"]
    tweets = datum["tweets"]
    # padding="longest" pads every tweet of this user to the user's longest
    # tweet, so the id lists form a rectangular (tweets x tokens) array.
    tweets_ids = tokenizer(tweets, padding="longest")["input_ids"]

    data_list.append((label, tweets_ids))

# Sanity check: show the first converted sample.
print(data_list[0])

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
def train(train_data_list):
    """Run one training epoch, updating the model after every sample.

    Relies on the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``criterion``, ``device`` and ``CAT_SIZE``.

    Args:
        train_data_list: Sequence of samples where ``sample[0]`` is the label
            list and ``sample[1]`` the list of token-id sequences.

    Returns:
        Tuple ``(mean loss, mean accuracy)`` over the samples, where a
        sample's accuracy is the fraction of its labels predicted correctly.
    """
    loss_total = 0
    acc_total = 0

    for sample in train_data_list:
        labels, token_ids = sample[0], sample[1]
        # Add a leading batch dimension of size 1 to both tensors.
        input_tensor = torch.tensor(token_ids).unsqueeze(0).to(device)
        label_tensor = torch.tensor(labels).unsqueeze(0).to(device)

        optimizer.zero_grad()
        logits = model(input_tensor)
        loss = criterion(logits, label_tensor.float())
        loss.backward()
        optimizer.step()

        loss_total += loss.item()

        # A logit above 0 corresponds to a sigmoid probability above 0.5.
        hits = (logits > 0).int().eq(label_tensor).int().sum().item()
        acc_total += hits / CAT_SIZE

    # The learning-rate schedule advances once per epoch, not per sample.
    scheduler.step()

    n_samples = len(train_data_list)
    return loss_total / n_samples, acc_total / n_samples

In [None]:
def validation(valid_data_list):
    """Evaluate the current model on a validation set without updating it.

    Relies on the notebook-level ``model``, ``criterion``, ``device`` and
    ``CAT_SIZE``.

    Args:
        valid_data_list: Sequence of samples where ``sample[0]`` is the label
            list and ``sample[1]`` the list of token-id sequences.

    Returns:
        Tuple ``(mean loss, mean accuracy, divided_acc)``. ``divided_acc`` is
        a list with one entry per label holding the number of samples for
        which that label was predicted correctly.
    """
    valid_loss = 0
    valid_acc = 0
    # One counter per label; sized from CAT_SIZE rather than a literal so it
    # stays consistent with the accuracy computation below.
    divided_acc = [0] * CAT_SIZE

    for data in valid_data_list:
        # Add a leading batch dimension of size 1 to both tensors.
        input_tensor = torch.tensor(data[1]).unsqueeze(0).to(device)
        label_tensor = torch.tensor(data[0]).unsqueeze(0).to(device)

        with torch.no_grad():
            output_tensor = model(input_tensor)
            loss = criterion(output_tensor, label_tensor.float())
            valid_loss += loss.item()

            # A logit above 0 corresponds to a sigmoid probability above 0.5.
            label_predicted = (output_tensor > 0).int()
            # Fetch the per-label hit flags in a single transfer instead of
            # calling .item() once per label.
            is_correct = (label_predicted == label_tensor)[0].tolist()

            valid_acc += sum(is_correct) / CAT_SIZE

            for i in range(CAT_SIZE):
                if is_correct[i]:
                    divided_acc[i] += 1

    return valid_loss / len(valid_data_list), valid_acc / len(valid_data_list), divided_acc

In [None]:
import time
import random
# Experiment settings for the k-fold cross-validation run.
NUM_EPOCH = 80
NUM_FOLD = 5
CAT_SIZE = 6
EMBEDDED_DIM = 768
criterion = torch.nn.BCEWithLogitsLoss().to(device)

# Running means (over folds) of the last-epoch accuracies.
mean_train_acc = 0.0
mean_valid_acc = 0.0

start_time = time.time()
for fold in range(NUM_FOLD):
    # To measure model performance fairly, whatever was learned in the
    # previous fold must be discarded, so model, optimizer and scheduler are
    # re-created at the start of every fold.
    model = ClassifyTwitterUserToMultiLabel(EMBEDDED_DIM, CAT_SIZE).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    # Each fold holds out one consecutive slice of 20 samples for validation
    # and trains on everything else.
    valid_begin = fold * 20
    valid_end = valid_begin + 20
    valid_data_list = data_list[valid_begin:valid_end]
    train_data_list = data_list[:valid_begin] + data_list[valid_end:]

    for epoch in range(NUM_EPOCH):
        random.shuffle(train_data_list)
        train_loss, train_acc = train(train_data_list)
        valid_loss, valid_acc, divided_acc = validation(valid_data_list)

        # Wall-clock time since the whole run started (not since this fold).
        mins, secs = divmod(int(time.time() - start_time), 60)

        print('Fold: %d' %(fold + 1), 'Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
        print(f'\t\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)\t|\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)', "各ラベルごとの正解数：", divided_acc)

    # Only the final epoch of each fold contributes to the reported means.
    mean_train_acc += train_acc / NUM_FOLD
    mean_valid_acc += valid_acc / NUM_FOLD
    print("-----------------------------------------------------------------------------------------------------------------------")

print(f"Mean Train Acc: {mean_train_acc * 100:.1f}%")
print(f"Mean Valid Acc: {mean_valid_acc * 100:.1f}%")

In [None]:
# Print the per-label totals found in the data, one line per consecutive
# chunk of 20 samples (the totals are reset after each chunk is printed).
num_labels = 6
chunk_size = 20
label_totals = [0] * num_labels

for i, data in enumerate(data_list):
    # data[0] is the sample's label list; accumulate each label column.
    for k in range(num_labels):
        label_totals[k] += data[0][k]

    if (i + 1) % chunk_size == 0:
        # Joining with " , " reproduces the spacing print() gives when the
        # totals and "," are passed as separate arguments.
        print(i + 1, "個までの各ラベル合計数：", " , ".join(map(str, label_totals)))
        label_totals = [0] * num_labels
