# 前処理

## GPU info

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Feb 19 14:37:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             40W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## Google Drive マウント

In [None]:
# Mount Google Drive at /content/drive. The dataset, cached-package and result paths
# used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## module インストール (初回のみ)

In [None]:
# One-time setup, intentionally left commented out.
# Step 1: install the tokenizer dependencies (MeCab bindings, dictionary, fugashi).
# Step 2: copy the installed packages to Google Drive so that later sessions can
#         restore them with `cp` instead of reinstalling.
# NOTE(review): the paths hard-code python3.10 and exact package versions; confirm they
# still match the runtime before re-enabling these lines.
# ! pip install mecab-python3 unidic-lite #MeCab
# ! pip install fugashi

# !cp -r /usr/local/lib/python3.10/dist-packages/fugashi /content/drive/MyDrive/colab-packages/
# !cp -r /usr/local/lib/python3.10/dist-packages/fugashi-1.3.2.dist-info /content/drive/MyDrive/colab-packages/
# !cp -r /usr/local/lib/python3.10/dist-packages/fugashi.libs /content/drive/MyDrive/colab-packages/
# !cp -r /usr/local/lib/python3.10/dist-packages/mecab_python3-1.0.9.dist-info /content/drive/MyDrive/colab-packages/
# !cp -r /usr/local/lib/python3.10/dist-packages/mecab_python3.libs /content/drive/MyDrive/colab-packages/
# !cp -r /usr/local/lib/python3.10/dist-packages/unidic_lite /content/drive/MyDrive/colab-packages/
# !cp -r /usr/local/lib/python3.10/dist-packages/unidic_lite-1.0.8.dist-info /content/drive/MyDrive/colab-packages/

# import

In [None]:
# Restore the previously installed packages from the Google Drive cache into this
# runtime's dist-packages directory (requires Drive to be mounted first).
# NOTE(review): `cp -r SRC DST` copies SRC *into* DST when DST already exists, so running
# this cell twice in the same runtime presumably creates nested folders — verify.
# NOTE(review): the destination hard-codes python3.10; confirm it matches the runtime's
# Python version, otherwise the copied packages will not be importable.
!cp -r /content/drive/MyDrive/colab-packages/fugashi /usr/local/lib/python3.10/dist-packages/fugashi
!cp -r /content/drive/MyDrive/colab-packages/fugashi-1.3.2.dist-info /usr/local/lib/python3.10/dist-packages/fugashi-1.3.2.dist-info
!cp -r /content/drive/MyDrive/colab-packages/fugashi.libs /usr/local/lib/python3.10/dist-packages/fugashi.libs
!cp -r /content/drive/MyDrive/colab-packages/mecab_python3-1.0.9.dist-info /usr/local/lib/python3.10/dist-packages/mecab_python3-1.0.9.dist-info
!cp -r /content/drive/MyDrive/colab-packages/mecab_python3.libs /usr/local/lib/python3.10/dist-packages/mecab_python3.libs
!cp -r /content/drive/MyDrive/colab-packages/unidic_lite /usr/local/lib/python3.10/dist-packages/unidic_lite
!cp -r /content/drive/MyDrive/colab-packages/unidic_lite-1.0.8.dist-info /usr/local/lib/python3.10/dist-packages/unidic_lite-1.0.8.dist-info

In [None]:
from pathlib import Path
from datetime import datetime
import pytz
import copy
import os
import time
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import json
import random
from collections.abc import Iterable
from collections import defaultdict
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from torch.utils.data import DataLoader, Dataset
from torch import FloatTensor, LongTensor
from tqdm import tqdm, trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    PreTrainedModel,
)
from transformers.modeling_outputs import SequenceClassifierOutput,BaseModelOutputWithPast
from transformers.optimization import get_linear_schedule_with_warmup
from transformers.tokenization_utils import BatchEncoding, PreTrainedTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys
sys.path.append('/content/drive/MyDrive/ex2024/')

# colab 関連
from google.colab import files

# 乱数シードの設定
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU, plus all CUDA devices if present).

    Args:
        seed: integer seed passed unchanged to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# utility 関数

In [None]:
def load_jsonl(path: Path | str) -> pd.DataFrame:
    path = Path(path)
    return pd.read_json(path, lines=True)


def load_json(path: Path | str) -> dict:
    path = Path(path)
    with path.open() as f:
        data = json.load(f)
    return data

def make_dir(path):
    """Create directory `path` together with any missing parents.

    Does nothing when the directory already exists.
    """
    os.makedirs(name=path, exist_ok=True)

# Args (実験に使う定数などなど〜)

In [None]:
# -*- coding: utf-8 -*-
class Args():
    """Settings for one experiment run: output locations, model, dataset paths, hyperparameters.

    Instantiating the class has side effects: it creates a timestamped experiment
    directory under ``base_dir`` and prints its location.
    """
    def __init__(self):
        split_names = ('train', 'val', 'test')

        # ---- Output locations ------------------------------------------------------
        # Current time in Asia/Tokyo, used to give each run its own folder name.
        tokyo = pytz.timezone('Asia/Tokyo')
        self.current_time = datetime.now(pytz.utc).astimezone(tokyo).strftime('%Y-%m-%d_%H-%M-%S')

        # Base directory that collects the results of all experiments.
        self.base_dir = Path('/content/drive/MyDrive/ex2024/results/')

        # Per-run directory; missing parent directories are created as well.
        self.experiment_dir = self.base_dir / f'experiment_{self.current_time}'
        self.experiment_dir.mkdir(parents=True, exist_ok=True)
        print(f"Experiment directory created at: {self.experiment_dir}")

        # File the trained model weights are written to.
        self.output_model_dir: Path = self.experiment_dir / 'model.bin'

        # Per-split metric tables (CSV) and confusion matrices (JSON).
        self.result_csv_dir = {
            split: self.experiment_dir / f'{split}_result.csv' for split in split_names
        }
        self.result_confusion_matrix_dir = {
            split: self.experiment_dir / f'{split}_cm.json' for split in split_names
        }
        # Learning-curve and pooling-parameter plots.
        self.result_accuracy_dir = self.experiment_dir / 'train_val_accuracy.png'
        self.result_loss_dir = self.experiment_dir / 'train_val_loss.png'
        self.result_param_dir = self.experiment_dir / 'train_param.png'

        # ---- Model -----------------------------------------------------------------
        # Name of the pretrained BERT checkpoint.
        self.bert_pretrained_model_name = "cl-tohoku/bert-base-japanese-v3"
        # Maximum input sequence length for BERT.
        self.max_seq_len = 512

        # ---- Data ------------------------------------------------------------------
        # Paths to the livedoor news corpus splits and to their summaries.
        self.dataset_dir: Path = Path('/content/drive/MyDrive/ex2024/livedoor/datasets/livedoor/')
        self.summary_dir: Path = Path('/content/drive/MyDrive/ex2024/livedoor/datasets/livedoor/summary/')

        # ---- Training --------------------------------------------------------------
        self.batch_size: int = 16
        self.epochs: int = 20
        self.lr: float = 3e-5
        self.n_class: int = 9
        self.seed: int = 42
        self.trial: int = 5

        # Optional per-split subsampling: on/off switch and sampled fraction.
        self.sampling_flag = {'train': False, 'val': False, 'test': False}
        self.sampling_rate = {'train': 0.05, 'val': 0.1, 'test': 0.1}

# Net

In [None]:
class BertPretrainedNet(nn.Module):
    """Pretrained encoder loaded with ``AutoModel.from_pretrained`` and optionally frozen.

    Args:
        _model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        _fine_tuning_last: when the encoder is frozen, re-enable gradients for the
            last encoder layer only.
        _fine_tuning_all: leave every parameter trainable. Freezing happens only when
            this is exactly ``False``.
    """
    def __init__(self, _model_name='cl-tohoku/bert-base-japanese-v3', _fine_tuning_last=True, _fine_tuning_all=False):
        super().__init__()
        self.bert = AutoModel.from_pretrained(_model_name)
        # Hidden size as reported by the checkpoint's config.
        self.hidden_size: int = self.bert.config.hidden_size
        self.fine_tuning_last = _fine_tuning_last
        self.fine_tuning_all = _fine_tuning_all

        if self.fine_tuning_all is not False:
            # Full fine-tuning requested: leave requires_grad untouched.
            return

        # Freeze the whole encoder first ...
        for weight in self.bert.parameters():
            weight.requires_grad = False
        # ... then unfreeze only the last encoder layer when requested.
        if self.fine_tuning_last:
            for weight in self.bert.encoder.layer[-1].parameters():
                weight.requires_grad = True

    def forward(self, x_input_ids, x_attention_mask):
        """Run the encoder and return its output object unchanged."""
        return self.bert(x_input_ids, x_attention_mask)

class SimpleClassifierNet(nn.Module):
    """Classification head made of one linear layer with bias.

    Args:
        _in_dim: size of the input feature vector.
        _out_dim: number of output logits.
    """
    def __init__(self, _in_dim=768, _out_dim=9):
        super().__init__()
        self.in_dim, self.out_dim = _in_dim, _out_dim
        self.classifier = nn.Linear(in_features=self.in_dim, out_features=self.out_dim, bias=True)

    def forward(self, x):
        """Map features ``x`` to logits with the linear layer."""
        logits = self.classifier(x)
        return logits

class Yamato_Pooling(nn.Module):
    """Learnable weighted sum of two vectors.

    Two scalars ``p`` and ``q`` are stored. The effective weights are
    ``p**2 / (p**2 + q**2)`` and ``q**2 / (p**2 + q**2)``, so they are non-negative
    and sum to 1.

    Args:
        init_weights: optional sequence ``[p, q]`` used as given (pass ``nn.Parameter``
            objects to make them trainable). When omitted, each instance gets its own
            fresh parameters initialised to 0.5.
    """
    def __init__(self, init_weights=None):
        super().__init__()
        if init_weights is None:
            # Build the defaults per instance. A default list in the signature is
            # evaluated only once, so every instance would otherwise share (and keep
            # training) the very same Parameter objects.
            init_weights = [nn.Parameter(torch.tensor(0.5)) for _ in range(2)]
        self.p = init_weights[0]
        self.q = init_weights[1]

    def get_param(self):
        """Return the current normalised weights as ``{'p': ..., 'q': ...}`` NumPy values."""
        raw_p = self.p.detach()
        raw_q = self.q.detach()
        norm = raw_p ** 2 + raw_q ** 2
        p = raw_p ** 2 / norm
        q = raw_q ** 2 / norm
        return {'p': p.cpu().numpy(), 'q': q.cpu().numpy()}

    def forward(self, x_cls, x_avg):
        """Return ``q * x_cls + p * x_avg`` using the normalised weights.

        Note that ``p`` weights ``x_avg`` and ``q`` weights ``x_cls``.
        """
        norm = self.p ** 2 + self.q ** 2
        p = self.p ** 2 / norm
        q = self.q ** 2 / norm
        return q * x_cls + p * x_avg

class Takayama_Pooling(nn.Module):
    """Learnable weighted sum of three vectors.

    Three scalars ``p``, ``q`` and ``r`` are stored. Each effective weight is the
    square of its scalar divided by the sum of all three squares, so the weights are
    non-negative and sum to 1.

    Args:
        init_weights: optional sequence ``[p, q, r]`` used as given (pass
            ``nn.Parameter`` objects to make them trainable). When omitted, each
            instance gets its own fresh parameters initialised to 1/3.
    """
    def __init__(self, init_weights=None):
        super().__init__()
        if init_weights is None:
            # Build the defaults per instance. A default list in the signature is
            # evaluated only once, so every instance would otherwise share (and keep
            # training) the very same Parameter objects.
            init_weights = [nn.Parameter(torch.tensor(1/3.0)) for _ in range(3)]
        self.p = init_weights[0]
        self.q = init_weights[1]
        self.r = init_weights[2]

    def get_param(self):
        """Return the current normalised weights as ``{'p', 'q', 'r'}`` NumPy values."""
        raw_p = self.p.detach()
        raw_q = self.q.detach()
        raw_r = self.r.detach()
        norm = raw_p ** 2 + raw_q ** 2 + raw_r ** 2
        p = raw_p ** 2 / norm
        q = raw_q ** 2 / norm
        r = raw_r ** 2 / norm
        return {'p': p.cpu().numpy(), 'q': q.cpu().numpy(), 'r': r.cpu().numpy()}

    def forward(self, x_cls, x_avg, s_cls):
        """Return ``p * x_cls + q * x_avg + r * s_cls`` using the normalised weights."""
        norm = self.p ** 2 + self.q ** 2 + self.r ** 2
        p = self.p ** 2 / norm
        q = self.q ** 2 / norm
        r = self.r ** 2 / norm
        return p * x_cls + q * x_avg + r * s_cls

In [None]:
class Takayama_Pooling_pqrs(nn.Module):
    """Learnable weighted sum of four vectors.

    Four scalars ``p``, ``q``, ``r`` and ``s`` are stored. Each effective weight is
    the square of its scalar divided by the sum of all four squares, so the weights
    are non-negative and sum to 1.

    Args:
        init_weights: optional sequence ``[p, q, r, s]`` used as given (pass
            ``nn.Parameter`` objects to make them trainable). When omitted, each
            instance gets its own fresh parameters initialised to 1/4.
    """
    def __init__(self, init_weights=None):
        super().__init__()
        if init_weights is None:
            # Build the defaults per instance. A default list in the signature is
            # evaluated only once, so every instance would otherwise share (and keep
            # training) the very same Parameter objects.
            init_weights = [nn.Parameter(torch.tensor(1/4.0)) for _ in range(4)]
        self.p = init_weights[0]
        self.q = init_weights[1]
        self.r = init_weights[2]
        self.s = init_weights[3]

    def get_param(self):
        """Return the current normalised weights as ``{'p', 'q', 'r', 's'}`` NumPy values."""
        raw_p = self.p.detach()
        raw_q = self.q.detach()
        raw_r = self.r.detach()
        raw_s = self.s.detach()
        norm = raw_p ** 2 + raw_q ** 2 + raw_r ** 2 + raw_s ** 2
        p = raw_p ** 2 / norm
        q = raw_q ** 2 / norm
        r = raw_r ** 2 / norm
        s = raw_s ** 2 / norm
        return {'p': p.cpu().numpy(), 'q': q.cpu().numpy(), 'r': r.cpu().numpy(), 's': s.cpu().numpy()}

    def forward(self, x_cls, x_avg, s_cls, s_avg):
        """Return ``p * x_cls + q * x_avg + r * s_cls + s * s_avg`` with normalised weights."""
        norm = self.p ** 2 + self.q ** 2 + self.r ** 2 + self.s ** 2
        p = self.p ** 2 / norm
        q = self.q ** 2 / norm
        r = self.r ** 2 / norm
        s = self.s ** 2 / norm
        return p * x_cls + q * x_avg + r * s_cls + s * s_avg

In [None]:
class ExperimentNet_2024_0812(nn.Module):
    """Baseline classifier: encoder output at position 0 fed to a linear head."""
    def __init__(self, _args: Args):
        super().__init__()
        self.bert = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.fc = SimpleClassifierNet()

    def forward(self, x, x_attention_mask=None):
        """Classify a batch of token ids.

        Args:
            x: input token ids passed to the encoder.
            x_attention_mask: optional attention mask for ``x``. It is forwarded
                because ``BertPretrainedNet.forward`` requires two positional
                arguments; calling it with ``x`` alone raised ``TypeError``.
                NOTE(review): with ``None`` the wrapped Hugging Face model is expected
                to attend to every position, padding included — pass the mask for
                padded batches.

        Returns:
            Logits produced by the linear head.
        """
        bert_out = self.bert(x, x_attention_mask)[0]  # last hidden states
        out = self.fc(bert_out[:, 0, :])  # only the position-0 ([CLS]) vector is used
        return out

class ExperimentNet_2024_poster_ex1(nn.Module):
    """Classifier on a learnable mix of the [CLS] vector and the mean of the other tokens."""
    def __init__(self, _args: Args):
        super().__init__()
        self.bert = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.pooling = Yamato_Pooling()
        self.classifier = SimpleClassifierNet()

    def forward(self, x_input_ids, x_attention_mask):
        """Return class logits for a batch of token ids and its attention mask."""
        hidden_states = self.bert(x_input_ids, x_attention_mask)[0]
        # Position 0 holds the [CLS] vector.
        cls_vec = hidden_states[:, 0, :]
        # Plain mean over every remaining position.
        # NOTE(review): the attention mask is not applied here, so padded positions
        # contribute to the mean — confirm this is intended.
        avg_vec = hidden_states[:, 1:, :].mean(dim=1)
        # Learnable weighted sum of both vectors, then the linear head.
        return self.classifier(self.pooling(cls_vec, avg_vec))

class ExperimentNet_2024_poster_ex2(nn.Module):
    """Classifier mixing article [CLS], article masked mean and summary [CLS] vectors.

    The article and the summary are encoded by two separate encoders.
    """
    def __init__(self, _args: Args):
        super().__init__()
        self.bert = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.bert_summary = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.pooling = Takayama_Pooling()
        self.classifier = SimpleClassifierNet()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Mean of the hidden states over positions 1.. where the mask is non-zero."""
        tokens = hidden_states[:, 1:, :]
        mask = attention_mask[:, 1:].unsqueeze(-1).expand(tokens.size())
        weighted = tokens * mask
        # Count of unmasked tokens per example, clamped to avoid division by zero.
        counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        return weighted.sum(dim=1) / counts

    def forward(self, x_input_ids, x_attention_mask, s_input_ids, s_attention_mask):
        """Return class logits from the article tokens and the summary tokens."""
        article_states = self.bert(x_input_ids, x_attention_mask)[0]
        cls_vec = article_states[:, 0, :]
        avg_vec = self._masked_mean(article_states, x_attention_mask)
        # Only the position-0 vector of the summary is used.
        summary_cls_vec = self.bert_summary(s_input_ids, s_attention_mask)[0][:, 0, :]
        pooled = self.pooling(cls_vec, avg_vec, summary_cls_vec)
        return self.classifier(pooled)

In [None]:
class ExperimentNet_2024_poster_ex2_pqrs(nn.Module):
    """Classifier mixing four vectors: [CLS] and masked mean of the article and of the summary.

    The article and the summary are encoded by two separate encoders.
    """
    def __init__(self, _args: Args):
        super().__init__()
        self.bert = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.bert_summary = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.pooling = Takayama_Pooling_pqrs()
        self.classifier = SimpleClassifierNet()

    @staticmethod
    def _cls_and_masked_mean(hidden_states, attention_mask):
        """Split encoder output into the position-0 vector and the masked mean of the rest."""
        cls_vec = hidden_states[:, 0, :]
        tokens = hidden_states[:, 1:, :]
        mask = attention_mask[:, 1:].unsqueeze(-1).expand(tokens.size())
        weighted = tokens * mask
        # Count of unmasked tokens per example, clamped to avoid division by zero.
        counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        return cls_vec, weighted.sum(dim=1) / counts

    def forward(self, x_input_ids, x_attention_mask, s_input_ids, s_attention_mask):
        """Return class logits from the article tokens and the summary tokens.

        The same pooling routine is applied to both encoder outputs; previously the
        masked-mean code was duplicated inline for each of them.
        """
        article_states = self.bert(x_input_ids, x_attention_mask)[0]
        cls_vec, avg_vec = self._cls_and_masked_mean(article_states, x_attention_mask)

        summary_states = self.bert_summary(s_input_ids, s_attention_mask)[0]
        summary_cls_vec, summary_avg_vec = self._cls_and_masked_mean(summary_states, s_attention_mask)

        # Learnable weighted sum of the four vectors, then the linear head.
        weighted_sum = self.pooling(cls_vec, avg_vec, summary_cls_vec, summary_avg_vec)
        return self.classifier(weighted_sum)

In [None]:
class ExperimentNet_2024_poster_ex4(nn.Module):
    """Classifier on re-weighted token vectors mixed by a learnable [CLS]/mean pooling."""
    def __init__(self, _args: Args):
        super().__init__()
        self.bert = BertPretrainedNet(_model_name=_args.bert_pretrained_model_name, _fine_tuning_last=True, _fine_tuning_all=False)
        self.pooling = Yamato_Pooling()
        self.classifier = SimpleClassifierNet()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Mean of the hidden states over positions 1.. where the mask is non-zero."""
        tokens = hidden_states[:, 1:, :]
        mask = attention_mask[:, 1:].unsqueeze(-1).expand(tokens.size())
        weighted = tokens * mask
        # Count of unmasked tokens per example, clamped to avoid division by zero.
        counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        return weighted.sum(dim=1) / counts

    def forward(self, x_input_ids, x_attention_mask, weights):
        """Return class logits.

        Args:
            x_input_ids: token ids passed to the encoder.
            x_attention_mask: attention mask passed to the encoder and used for the mean.
            weights: per-token factors multiplied into every hidden state, position 0
                included. Assumes shape (batch, seq_len) matching the encoder output —
                TODO confirm against the caller.
        """
        scaled_states = self.bert(x_input_ids, x_attention_mask)[0] * weights.unsqueeze(-1)
        cls_vec = scaled_states[:, 0, :]
        avg_vec = self._masked_mean(scaled_states, x_attention_mask)
        pooled = self.pooling(cls_vec, avg_vec)
        return self.classifier(pooled)

# Experiment

In [None]:
class Experiment():
    def __init__(self, _args: Args):
        """Build everything one experiment needs: tokenizer, model, data loaders, optimizer.

        Args:
            _args: experiment settings (model name, paths, batch size, learning rate, ...).

        Side effects: loads the tokenizer and the model, reads the dataset files for
        every phase and moves the model to the selected device.
        """
        # Experiment settings
        self.args = _args
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.phase = ['train', 'val', 'test']
        self.mode = 'train'
        # key: category-id, value: weights. collate_fn passes the looked-up values to
        # torch.stack, so real entries are presumably tensors — verify where it is filled.
        self.tf_idf_dic = defaultdict(lambda: defaultdict(list)) #key: category-id value: weights
        self.tf_idf_a = 5.0
        self.b = 0.1 * self.tf_idf_a + 1.0
        # Tokenizer setup
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.args.bert_pretrained_model_name,
            model_max_length = self.args.max_seq_len,
        )
        # Model (Net); starts in eval mode, train() switches the mode per phase
        self.net = ExperimentNet_2024_poster_ex4(_args=self.args).eval().to(self.device)
        # Data loaders, one per phase
        self.dataloader = defaultdict(lambda: defaultdict(list))
        # Datasets (DataFrame), filled by load_dataset
        self.dataset = defaultdict(lambda: defaultdict(list))
        for p in self.phase:
            # Only the training loader is shuffled.
            self.dataloader[p] = self.load_dataset(phase = p, shuffle = (p=='train'), sampling_flag = self.args.sampling_flag[p], sampling_rate = self.args.sampling_rate[p])
        # Loss function and optimizer
        self.criterion = torch.nn.CrossEntropyLoss()
        # self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr)
        # One parameter group per component, all with the same learning rate.
        self.optimizer = torch.optim.Adam([
            {'params':self.net.bert.parameters(), 'lr':self.args.lr},
            {'params':self.net.pooling.p, 'lr':self.args.lr},
            {'params':self.net.pooling.q, 'lr':self.args.lr},
            {'params':self.net.classifier.parameters(), 'lr':self.args.lr}
        ])
        # Result storage: one list of metric dicts per phase, plus the pooling parameters
        self.results = defaultdict(lambda: defaultdict(list))
        self.results['train'] = []
        self.results['val'] = []
        self.results['test'] = []
        self.results['param'] = []
        # Best validation result so far; the initial accuracy is slightly below zero so
        # that any accuracy of 0.0 or more replaces it.
        self.best_model_state = {
            'epoch': 0,
            'accuracy': 0.0 - 1e-5,
            'best_model_state_dict': None
        }



    def results_update(self, phase, new_eval_metrics):
        self.results[phase].append(new_eval_metrics)
        if phase == 'val':
            if self.best_model_state['accuracy'] < new_eval_metrics['accuracy']:
                print('\nbest score updated :{0}'.format(new_eval_metrics['accuracy']))
                self.best_model_state = {
                    'epoch': new_eval_metrics['epoch'],
                    'accuracy': new_eval_metrics['accuracy'],
                    'best_model_state_dict': self.net.state_dict().copy()
                }

    def load_dataset(self, phase, shuffle: bool = False, sampling_flag = False, sampling_rate = 1.0):
        """Read one split, attach the summaries and wrap the records in a DataLoader.

        Args:
            phase: split name; selects ``{phase}.jsonl`` and the matching summary JSON.
            shuffle: passed through to the DataLoader.
            sampling_flag: when equal to ``True``, subsample each label group.
            sampling_rate: fraction kept per label group when sampling is on.

        Returns:
            DataLoader over the list of record dicts, batched with ``self.collate_fn``.

        Side effect: stores the split as a DataFrame in ``self.dataset[phase]``.
        """
        jsonl_path = self.args.dataset_dir / f'{phase}.jsonl'
        summary_by_id = load_json(self.args.summary_dir / f'livedoor_summary_plamo_beta_{phase}.json')
        articles = load_jsonl(jsonl_path)
        # New 'summary' column, looked up by category-id.
        articles['summary'] = articles['category-id'].map(summary_by_id)

        if sampling_flag == True:
            # Sample per label so the class proportions are kept.
            def _sample_group(group):
                return group.sample(frac=sampling_rate, random_state=self.args.seed)

            grouped = articles.groupby('label', group_keys=False)
            articles = grouped.apply(_sample_group).reset_index(drop=True)

        records = articles.to_dict(orient="records")
        self.dataset[phase] = pd.DataFrame(list(records))
        return DataLoader(
            records,
            collate_fn=self.collate_fn,
            batch_size=self.args.batch_size,
            shuffle=shuffle,
            num_workers=2,
            pin_memory=True,
        )




    # カスタムcollate_fnの定義
    def collate_fn(self, batch):
        texts = [data['title'] + " [SEP] " + data['body'] for data in batch]
        labels = torch.tensor([data['label'] for data in batch])
        encoding = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        encoding['labels'] = labels
        #category-idから元のデータを参照できるようにする
        category_ids = [data['category-id'] for data in batch]
        tf_idf_weights = torch.stack([self.tf_idf_dic[data['category-id']] for data in batch])

        input_ids = encoding['input_ids']

        return {'input_ids': input_ids, 'attention_mask': encoding['attention_mask'], 'labels': labels, 'category-ids': category_ids, 'tf_idf_weights':tf_idf_weights}

    def category_id2article(self, phase, category_id):
        return self.dataset[phase][self.dataset[phase]['category-id'] == category_id]


    def train(self):
        """Run the training/validation loop, then save the best model and the plots.

        For every epoch the 'train' phase (with backpropagation) and the 'val' phase
        (without gradients) are executed. Metrics of each phase are appended through
        ``results_update``, which also tracks the best validation model. The pooling
        parameters are recorded once before training and once per epoch at the start
        of the validation phase.

        Returns:
            0 after ``save_model`` and ``visualize`` have run.
        """
        self.results_update('param', self.net.pooling.get_param())
        for epoch in range(self.args.epochs):
            time_start = time.time()
            print('Epoch {}/{}'.format(epoch + 1, self.args.epochs))
            print('-------------')
            print('p:{0}, q:{1}\n'.format(self.results['param'][-1]['p'], self.results['param'][-1]['q']))

            for phase in ['train', 'val']:
                self.mode = phase
                # train(False) is what eval() does.
                self.net.train(phase == 'train')
                if phase == 'val':
                    self.results_update('param', self.net.pooling.get_param())
                dataloader_size = len(self.dataloader[phase])
                # Per-epoch accumulators
                running_loss = 0.0
                all_preds = []
                all_labels = []

                for batch_idx, batch in enumerate(self.dataloader[phase]):
                    print("{}/{}".format(batch_idx+1,dataloader_size))

                    # Move the batch to the device
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['labels'].to(self.device)
                    tf_idf_weights = batch['tf_idf_weights'].to(self.device)

                    self.optimizer.zero_grad()
                    with torch.set_grad_enabled(phase == 'train'):
                        y_pred = self.net(input_ids, attention_mask, tf_idf_weights)
                        _, predicted = torch.max(y_pred.data, 1)
                        # Compute and accumulate the loss
                        loss = self.criterion(y_pred, labels)

                        running_loss += loss.item()
                        all_preds.extend(predicted.cpu().numpy())
                        all_labels.extend(labels.cpu().numpy())

                        # Backpropagation only while training
                        if phase == 'train':
                            loss.backward()
                            self.optimizer.step()

                mean_loss = running_loss / dataloader_size
                accuracy = accuracy_score(all_labels, all_preds)
                precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
                cm = confusion_matrix(all_labels, all_preds)
                new_eval_metrics = {
                    'accuracy': accuracy,
                    'recall': recall,
                    'precision': precision,
                    'f1': f1,
                    'loss': mean_loss,
                    'confusion_matrix': cm,
                    'epoch': epoch
                }
                self.results_update(phase, new_eval_metrics)
                print(new_eval_metrics['confusion_matrix'])

                # Phase summary
                if phase == 'val':
                    print("---Validation---")
                else:
                    print("---TRAIN---")
                print("Acc : %.4f" % accuracy)
                print("loss : {}".format(mean_loss))

            time_finish = time.time() - time_start
            print("====================================")
            # Remaining time = duration of this epoch x epochs still to run. `epoch` is
            # zero-based, so epoch + 1 epochs are done (the old estimate counted the
            # finished epoch as well).
            print("残り時間 : {0}".format(time_finish * (self.args.epochs - epoch - 1)))

        self.save_model()
        self.visualize()
        return 0

    def save_model(self):
        """Write the best model's state dict to ``self.args.output_model_dir``."""
        best_weights = self.best_model_state['best_model_state_dict']
        torch.save(best_weights, self.args.output_model_dir)

    def save_log(self):
        for phase in self.phase:
            #混合行列以外の acc や loss の推移を dataframe にして csv 保存
            filtered_data = [{k: v for k, v in d.items() if k != 'confusion_matrix'} for d in self.results[phase]]
            df = pd.DataFrame(filtered_data)
            df.to_csv(self.args.result_csv_dir[phase], index=False)
            if phase == 'test':
                display(df)
            #混合行列をjsonに保存
            cm_list = [d['confusion_matrix'].tolist() for d in self.results[phase]]
            with open(self.args.result_confusion_matrix_dir[phase], 'w') as f:
                json.dump(cm_list, f)


    def visualize(self):
        #学習曲線
        epochs = list(range(1, self.args.epochs + 1))  # 1からepochsまでのエポック
        train_acc = [d['accuracy'] for d in self.results['train']]
        val_acc = [d['accuracy'] for d in self.results['val']]
        train_loss = [d['loss'] for d in self.results['train']]
        val_loss = [d['loss'] for d in self.results['val']]
        p_data = [d['p'] for d in self.results['param']]
        q_data = [d['q'] for d in self.results['param']]

        self.make_plot(epochs, [{'data':train_acc, 'label':'Train Accuracy'}, {'data':val_acc, 'label':'Validation Accuracy'}],
                                title='Train and Validation Accuracy',
                                xlabel='Epoch', ylabel='Accuracy', path=self.args.result_accuracy_dir)
        self.make_plot(epochs, [{'data':train_loss, 'label':'Train Loss'}, {'data':val_loss, 'label':'Validation Loss'}],
                                title='Train and Validation Loss',
                                xlabel='Epoch', ylabel='Loss', path=self.args.result_loss_dir)
        self.make_plot(list(range(0, self.args.epochs + 1)), [{'data':p_data, 'label':'p'}, {'data':q_data, 'label':'q'}],
                                title='p, q',
                                xlabel='Epoch', ylabel='Value', path=self.args.result_param_dir)
        #paramの変位をグラフ化
        pass


    def make_plot(self, x_data, y_data, title, xlabel, ylabel, path):
        """Draw one line chart, save it as an image and show it.

        Args:
            x_data: x values shared by every curve.
            y_data: list of ``{'data': y_values, 'label': legend_text}`` dicts.
            title: chart title.
            xlabel: x-axis label.
            ylabel: y-axis label.
            path: file the figure is saved to.

        The figure is created explicitly and closed afterwards. The previous
        ``plt.clf()`` + ``plt.figure()`` sequence created an extra empty figure when
        no figure was current, and no figure was ever closed.
        """
        fig, ax = plt.subplots(figsize=(10, 6))
        for curve in y_data:
            ax.plot(x_data, curve['data'], label=curve['label'])
        # Title and axis labels
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        # Legend
        ax.legend()
        # Save as an image file, then display
        fig.savefig(path)
        plt.show()
        # Release the figure so that repeated calls do not pile up open figures.
        plt.close(fig)


    def test(self):
        """Evaluate the best validation model on the test split and save all logs.

        Loads the stored best state dict into the network, predicts every test batch
        without gradients, prints the category-id of each misclassified record, then
        records accuracy / precision / recall / F1 / confusion matrix under 'test' and
        calls ``save_log``.
        """
        # Load the best model for evaluation on the test data.
        self.net.load_state_dict(self.best_model_state['best_model_state_dict'])
        self.net.eval()
        self.mode = 'test'
        all_preds = []
        all_labels = []

        last_param = self.results['param'][-1]
        print('final p:{0}, q:{1}\n'.format(last_param['p'], last_param['q']))

        with torch.no_grad():
            for batch in self.dataloader['test']:
                # Move the batch to the device
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)
                category_ids = batch['category-ids']
                tf_idf_weights = batch['tf_idf_weights'].to(self.device)

                y_pred = self.net(input_ids, attention_mask, tf_idf_weights)
                predicted = torch.max(y_pred.data, 1)[1]

                predicted_np = predicted.cpu().numpy()
                labels_np = labels.cpu().numpy()
                all_preds.extend(predicted_np)
                all_labels.extend(labels_np)

                # Report which records were misclassified.
                for idx, prediction in enumerate(predicted_np):
                    if prediction != labels_np[idx]:
                        print(category_ids[idx])

        # Evaluation metrics on the test data
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        new_eval_metrics = {
            'accuracy': accuracy,
            'recall': recall,
            'precision': precision,
            'f1': f1,
            'loss': None,
            'confusion_matrix': confusion_matrix(all_labels, all_preds),
            'epoch': None
        }
        self.results_update('test', new_eval_metrics)
        print(new_eval_metrics['confusion_matrix'])

        self.save_log()

    def run(self):
        """Run the whole experiment: print the label distribution, train, then test."""
        print("ex start")
        print(self.device)
        # NOTE(review): set_tf_idf is disabled here, while collate_fn stacks the values of
        # self.tf_idf_dic — confirm the dictionary is filled elsewhere before training.
        # self.set_tf_idf()
        self.label_distribution()
        self.train()
        self.test()

        # display(self.dataset['train'])

    def label_distribution(self):
        # ラベルごとのデータ数を集計
        for p in self.phase:
            label_counts = self.dataset[p]['label'].value_counts()
            print(p, label_counts)

    def load_confusion_matrix(self, path):
        """Read a JSON list of matrices from `path` and return them as a list of NumPy arrays."""
        with open(path, 'r') as f:
            nested_lists = json.load(f)
        # One array per stored matrix.
        return list(map(np.array, nested_lists))

    def set_tf_idf(self):
        # 2文字以上のトークンのみを保持するトークナイザ
        def custom_tokenizer(text):
            return [token for token in text.split(' ') if len(token) >= 2]

        def pad_or_truncate(tensor, n):
            if tensor.numel() < n:
                # 0.0 で埋める
                padding = torch.zeros(n - tensor.numel())
                return torch.cat([tensor, padding])
            else:
                # truncate する
                return tensor[:n]

        stop_words = ["が", "の", "は", "に", "と", "も", "で", "を", "から", "なら", "そして", "しかし", "だから", "ので", "また", "です", "ます", "より", "まし", "この", "その", "あの", "どの"]
        new_stop_words = ['。', '、', '[', ']', '「', '」', '「', '『', '』', '!', '?', '！', '？', '】', '【', '・', '■', ':', ';', '—', '"', '(', ')', '<', '>', '/',' ','[CLS]','[SEP]','[PAD]','しかし', 'ため', 'へん', 'せい', '回', '段', '自体', '品', '器', 'そちら', 'に', 'か', '全部', 'なかっ', 'うち', '名前', 'たい', 'なり', 'だっ', 'その後', '万', '十', '分', 'きっかけ', '線', 'こう', 'あな', 'なっ', '年', '通', '円', '六', 'ながら', 'られる', 'かつて', 'など', 'その', 'とも', 'し', '子', '近く', 'おまえ', 'だけ', '春', 'もの', '私', 'なん', '書', 'そう', 'だ', 'かけ', 'ヵ所', '多く', '火', '週', 'かたち', 'こ', '略', 'べき', '作', 'おり', '何', '関係', 'ま', '地', '伸', '今', 'まし', 'カ所', 'にて', 'しよう', '都', 'くせ', '課', '以前', '頃', '力', 'こと', 'あちら', '箇所', '違い', 'ほぼ', '百', '半ば', '誌', '係', 'すぐ', '法', '彼', '下記', '者', '家', 'ぬ', 'それぞれ', 'ね', 'ん', 'ここ', '以上', 'つけ', 'すべて', 'ただし', 'なか', '例', '外', 'ヶ月', 'より', 'それ', 'この', 'い', 'ぶり', 'もう', '境', 'え', '手', 'レ', '簿', 'る', 'みつ', 'たら', 'ず', 'すか', 'せる', '新た', 'まま', 'しかた', 'よそ', 'ほか', '毎日', 'さらい', '秒', '系', 'はず', 'どこ', '箇月', 'どちら', '度', '中', '第', 'が', 'これ', 'はるか', 'ちゃ', 'ほど', '士', 'あそこ', 'も', 'から', '二', 'ない', 'そこ', '匹', 'は', 'ヵ月', '彼女', '先', '目', '千', 'つい', 'よう', '間', 'あと', '毎', '土', 'ご', '喜', 'つ', '冬', '男', 'ごと', 'です', 'どれ', '台', '金', '国', 'の', '怒', '様', 'あっ', 'のみ', '町', 'これら', '化', 'いくつ', 'また', '形', '哀', 'みなさん', 'もん', '紀', '場合', 'さまざま', '所', 'ある', '水', '列', 'および', 'できる', 'よ', 'どっか', 'なら', '県', 'かなり', 'で', 'わたし', '府', '本当', 'まで', '店', '的', '室', 'すね', 'らしい', '時間', 'おけ', 'する', '右', 'のち', 'れる', '左', 'なお', '字', '席', '上', '時点', '前', '一', 'みんな', 'あるいは', '以下', 'ぜんぶ', 'かつ', '束', '方法', 'こっち', 'あり', 'くる', 'どこか', 'れ', '見', '結局', '区', 'わけ', 'まとも', '気', '等', 'もっ', 'いう', '点', '何人', '口', 'ほう', 'まさ', '名', '部', 'たび', 'いわ', '類', 'もと', 'さ', 'やっ', '俺', 'べつ', 'よく', '他', 'あまり', 'いく', '集', '下', 'つつ', 'てん', 'たり', 'そっち', '自分', 'こちら', '式', '際', 'しまっ', '前回', '兆', 'ひと', '屋', 'ち', 'を', 'おおまか', 'さん', 'いずれ', '校', '会', 'ば', '我々', 'られ', 'なる', 'おれ', '行', 'なに', '首', 'みたい', '九', 'きた', 'あっち', 'へ', '界', 'ら', 'とおり', 'かく', 'がら', 'あなた', '面', '体', '奴', '木', '女', 'み', '村', '手段', 'す', 'さらに', '個', 
'論', 'ます', 'ぺん', '楽', '五', 'はじめ', 'なけれ', '文', 'せ', 'なく', '場', '扱い', '話', 'それなり', 'おら', 'いっ', 'あ', '連', '次', '高', 'いい', 'たくさん', '秋', '一つ', '達', '四', 'た', 'おい', 'ちゃん', 'と', '感', '内', '方', '感じ', '後', 'しか', '各', 'くん', '数', 'や', '枚', '確か', 'だめ', 'き', '七', '以降', 'どっち', 'そして', '用', '玉', 'いる', 'でき', 'がい', '八', 'あれ', 'とき', '様々', 'カ月', '同じ', 'なかば', '婦', 'いろいろ', '億', 'お', '元', '誰', 'ふく', 'いま', '歳', '性', '年生', 'いつ', '市', 'ヶ所', '別', 'そで', 'あたり', 'かやの', '事', 'よれ', 'ところ', '未満', '時', 'な', 'ごっちゃ', '員', 'ずつ', '三', 'ハイ', '道', '情', '日', 'て', 'ごろ', 'いや', '以後', '輪', 'しまう', '人', '歴', '上記', 'どう', '向こう', 'なし', 'ひとつ', '今回', 'とっ', 'よる', '観', 'ほとんど', 'よっ', '幾つ', '月', 'やつ', '夏', 'たち']
        new_stop_words_without = ['。', '、', '[', ']', '「', '」', '「', '『', '』', '!', '?', '！', '？', '】', '【', '・', '■', ':', ';', '—', '"', '(', ')', '<', '>', '/',' ','[PAD]','しかし', 'ため', 'へん', 'せい', '回', '段', '自体', '品', '器', 'そちら', 'に', 'か', '全部', 'なかっ', 'うち', '名前', 'たい', 'なり', 'だっ', 'その後', '万', '十', '分', 'きっかけ', '線', 'こう', 'あな', 'なっ', '年', '通', '円', '六', 'ながら', 'られる', 'かつて', 'など', 'その', 'とも', 'し', '子', '近く', 'おまえ', 'だけ', '春', 'もの', '私', 'なん', '書', 'そう', 'だ', 'かけ', 'ヵ所', '多く', '火', '週', 'かたち', 'こ', '略', 'べき', '作', 'おり', '何', '関係', 'ま', '地', '伸', '今', 'まし', 'カ所', 'にて', 'しよう', '都', 'くせ', '課', '以前', '頃', '力', 'こと', 'あちら', '箇所', '違い', 'ほぼ', '百', '半ば', '誌', '係', 'すぐ', '法', '彼', '下記', '者', '家', 'ぬ', 'それぞれ', 'ね', 'ん', 'ここ', '以上', 'つけ', 'すべて', 'ただし', 'なか', '例', '外', 'ヶ月', 'より', 'それ', 'この', 'い', 'ぶり', 'もう', '境', 'え', '手', 'レ', '簿', 'る', 'みつ', 'たら', 'ず', 'すか', 'せる', '新た', 'まま', 'しかた', 'よそ', 'ほか', '毎日', 'さらい', '秒', '系', 'はず', 'どこ', '箇月', 'どちら', '度', '中', '第', 'が', 'これ', 'はるか', 'ちゃ', 'ほど', '士', 'あそこ', 'も', 'から', '二', 'ない', 'そこ', '匹', 'は', 'ヵ月', '彼女', '先', '目', '千', 'つい', 'よう', '間', 'あと', '毎', '土', 'ご', '喜', 'つ', '冬', '男', 'ごと', 'です', 'どれ', '台', '金', '国', 'の', '怒', '様', 'あっ', 'のみ', '町', 'これら', '化', 'いくつ', 'また', '形', '哀', 'みなさん', 'もん', '紀', '場合', 'さまざま', '所', 'ある', '水', '列', 'および', 'できる', 'よ', 'どっか', 'なら', '県', 'かなり', 'で', 'わたし', '府', '本当', 'まで', '店', '的', '室', 'すね', 'らしい', '時間', 'おけ', 'する', '右', 'のち', 'れる', '左', 'なお', '字', '席', '上', '時点', '前', '一', 'みんな', 'あるいは', '以下', 'ぜんぶ', 'かつ', '束', '方法', 'こっち', 'あり', 'くる', 'どこか', 'れ', '見', '結局', '区', 'わけ', 'まとも', '気', '等', 'もっ', 'いう', '点', '何人', '口', 'ほう', 'まさ', '名', '部', 'たび', 'いわ', '類', 'もと', 'さ', 'やっ', '俺', 'べつ', 'よく', '他', 'あまり', 'いく', '集', '下', 'つつ', 'てん', 'たり', 'そっち', '自分', 'こちら', '式', '際', 'しまっ', '前回', '兆', 'ひと', '屋', 'ち', 'を', 'おおまか', 'さん', 'いずれ', '校', '会', 'ば', '我々', 'られ', 'なる', 'おれ', '行', 'なに', '首', 'みたい', '九', 'きた', 'あっち', 'へ', '界', 'ら', 'とおり', 'かく', 'がら', 'あなた', '面', '体', '奴', '木', '女', 'み', '村', '手段', 'す', 'さらに', '個', '論', 
'ます', 'ぺん', '楽', '五', 'はじめ', 'なけれ', '文', 'せ', 'なく', '場', '扱い', '話', 'それなり', 'おら', 'いっ', 'あ', '連', '次', '高', 'いい', 'たくさん', '秋', '一つ', '達', '四', 'た', 'おい', 'ちゃん', 'と', '感', '内', '方', '感じ', '後', 'しか', '各', 'くん', '数', 'や', '枚', '確か', 'だめ', 'き', '七', '以降', 'どっち', 'そして', '用', '玉', 'いる', 'でき', 'がい', '八', 'あれ', 'とき', '様々', 'カ月', '同じ', 'なかば', '婦', 'いろいろ', '億', 'お', '元', '誰', 'ふく', 'いま', '歳', '性', '年生', 'いつ', '市', 'ヶ所', '別', 'そで', 'あたり', 'かやの', '事', 'よれ', 'ところ', '未満', '時', 'な', 'ごっちゃ', '員', 'ずつ', '三', 'ハイ', '道', '情', '日', 'て', 'ごろ', 'いや', '以後', '輪', 'しまう', '人', '歴', '上記', 'どう', '向こう', 'なし', 'ひとつ', '今回', 'とっ', 'よる', '観', 'ほとんど', 'よっ', '幾つ', '月', 'やつ', '夏', 'たち']
        len(new_stop_words)
        # ラベルごとのデータ数を集計
        for p in self.phase:
            data = self.dataset[p]
            for i in range(len(data)):
                # print(data['title'][i])
                text1 = [data['title'][i] + " [SEP] " + data['body'][i]]
                t1 = self.tokenizer.convert_ids_to_tokens(self.tokenizer(text1, return_tensors='pt', padding=False, truncation=False)['input_ids'][0], skip_special_tokens=False)
                # encoding_1 = ' '.join(map(str, self.tokenizer(text1, return_tensors='pt', padding=False, truncation=False)['input_ids'][0].tolist()))
                encoding_1 = ' '.join(map(str, t1))
                # print(encoding_1)

                text2 = [data['summary'][i]]
                t2 = self.tokenizer.convert_ids_to_tokens(self.tokenizer(text2, return_tensors='pt', padding=False, truncation=False)['input_ids'][0], skip_special_tokens=False)
                # encoding_2 = ' '.join(map(str, self.tokenizer(text2, return_tensors='pt', padding=False, truncation=False)['input_ids'][0].tolist()))
                encoding_2 = ' '.join(map(str, t2))
                # print(encoding_2)

                # TF-IDF ベクトル化
                # vectorizer = TfidfVectorizer(stop_words=new_stop_words, lowercase=False, token_pattern=r"(?u)\b\w{2,}\b")
                vectorizer = TfidfVectorizer(stop_words=new_stop_words,token_pattern=None, preprocessor=None, lowercase=False, tokenizer=custom_tokenizer)

                tfidf_matrix = vectorizer.fit_transform([encoding_1,encoding_2])
                feature_names = vectorizer.get_feature_names_out()
                # print(feature_names)

                # TF-IDF スコア取得
                tfidf_scores_2 = dict(zip(feature_names, tfidf_matrix.toarray()[1]))
                # print(tfidf_scores_2)
                # new_dict = {self.tokenizer.convert_ids_to_tokens(int(k)): v for k, v in tfidf_scores_1.items()}
                # print(new_dict)
                top_n = 10
                top_words1 = sorted(tfidf_scores_2.items(), key=lambda x: x[1], reverse=True)[:top_n]
                # print(top_words1)

                #重みを求める
                self.tf_idf_dic[data['category-id'][i]] = pad_or_truncate(torch.tensor([-self.tf_idf_a if key in new_stop_words_without else (self.tf_idf_a * float(tfidf_scores_2.get(key, 0.0)) + 1.0) for key in encoding_1.split(" ")]), self.args.max_seq_len)[0:512]

In [None]:
!pip install fugashi
!pip install unidic_lite

Collecting fugashi
  Downloading fugashi-1.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Downloading fugashi-1.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (698 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/698.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m698.0/698.0 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fugashi
Successfully installed fugashi-1.4.0
Collecting unidic_lite
  Downloading unidic-lite-1.0.8.tar.gz (47.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unidic_lite
  Building wheel for unidic_lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic_lite: filename=unidic_lite-1.0.8-py3-none-any.whl size=47658818 sha256

# Main

In [None]:
# Build the experiment from a default-constructed Args object, then call
# set_tf_idf() on it here, ahead of ex.run() in the following cell.
# NOTE(review): set_tf_idf presumably fills the TF-IDF weight table used later — confirm.
ex = Experiment(_args=Args())
ex.set_tf_idf()


Experiment directory created at: /content/drive/MyDrive/ex2024/results/experiment_2025-02-19_23-39-51


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Run the experiment created in the previous cell; the marker line is
# printed only after ex.run() has returned.
ex.run()
print("---finish---")