### BERT分類モデル特徴量作成ファイル

※実行毎にRestart, 変数指定をしてください。<br>

【このファイルでの変数指定順序】<br>
スパンモデル用特徴量を作成してください。<br>
(365, 2-3hour)<br>
① span: 365 , col: title , model: whole<br>
② span: 365 , col: story , model: whole<br>
③ span: 365 , col: keyword , model: whole<br>
④ span: 365 , col: title , model: v2<br>
⑤ span: 365 , col: story , model: v2<br>
⑥ span: 365 , col: keyword , model: v2<br>
(730, 2-3hour)<br>
⑦ span: 730 , col: title , model: whole<br>
⑧ span: 730 , col: story , model: whole<br>
⑨ span: 730 , col: keyword , model: whole<br>
⑩ span: 730 , col: title , model: v2<br>
⑪ span: 730 , col: story , model: v2<br>
⑫ span: 730 , col: keyword , model: v2<br>

メインモデル用特徴量を作成してください。<br>
(0, 3-4hour)<br>
⑬ span: 0 , col: title , model: whole<br>
⑭ span: 0 , col: story , model: whole<br>
⑮ span: 0 , col: keyword , model: whole<br>
⑯ span: 0 , col: title , model: v2<br>
⑰ span: 0 , col: story , model: v2<br>
⑱ span: 0 , col: keyword , model: v2<br>

### 目次

・ライブラリインストール<br>
・ランダム指定<br>
・変数指定<br>
・定数<br>
・データ読み込み・前処理<br>
・最大トークン数決定（参考）<br>
・関数<br>
・BERT訓練〜特徴量作成<br>

### ライブラリインストール

In [1]:
import datetime
import pickle
import re
import unicodedata
import warnings

import GPUtil
import numpy as np
import pandas as pd
import regex
import scipy as sp
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertJapaneseTokenizer, BertForSequenceClassification, AdamW
  
warnings.filterwarnings('ignore')

### ランダム指定

In [2]:
# Seed NumPy and PyTorch (CPU and CUDA) with one shared value so that
# repeated runs of this notebook are comparable.
SEED = 42
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)
# Set cuDNN's deterministic flag.
torch.backends.cudnn.deterministic = True

### 変数指定

In [3]:
# Data directory: same directory as B_main. File names are appended to this
# string directly, so a non-empty value must end with a path separator.
# BASE_DIR = r'D:/Data/Nishika/Extending_bookmarks/data/'
BASE_DIR = r''
# Pretrained checkpoint to fine-tune; keep exactly one MODEL_NAME line active.
MODEL_NAME = r'cl-tohoku/bert-base-japanese-whole-word-masking'
# MODEL_NAME = r'cl-tohoku/bert-base-japanese-v2'
span = 0 # 0: all data, 365: most recent 365 days (1 year), 730: most recent 730 days (2 years)
col = 'keyword' # text column to encode: 'title', 'story' or 'keyword'

### 定数

In [4]:
# Patterns removed from every text before tokenization (see bert_prep).
URL_PATTERN = re.compile(r'http[\w:./\d]+') # URL: 'http' followed by word characters, ':', '.', '/'
DATE_PATTERN = re.compile(r'\d+/\d+/\d+') # DATE: digits/digits/digits

### データ読み込み・前処理

In [5]:
# Load the raw training and test tables from BASE_DIR.
raw = pd.read_csv(BASE_DIR + 'train.csv')
raw_test = pd.read_csv(BASE_DIR + 'test.csv')
# Replace missing keywords with empty strings. The result is assigned back
# instead of calling fillna(inplace=True) on `raw.keyword`: that chained
# in-place form acts on an intermediate object and does not update the frame
# once pandas Copy-on-Write is active.
raw['keyword'] = raw['keyword'].fillna('')
raw_test['keyword'] = raw_test['keyword'].fillna('')

In [6]:
def firstup_prep(df, base_date=None):
    """Replace ``general_firstup`` with an ``elapsed_days`` column.

    Args:
        df: DataFrame with a ``general_firstup`` column of strings whose first
            whitespace-separated part is a ``YYYY-MM-DD`` date.
        base_date: ``datetime.datetime`` the elapsed days are counted back
            from. Defaults to 2021-09-29, the day the data was downloaded.

    Returns:
        A copy of ``df`` without ``general_firstup`` and with an integer
        ``elapsed_days`` column. The input frame is not modified.
    """
    if base_date is None:
        # The day the data was downloaded is the reference point.
        base_date = datetime.datetime(2021, 9, 29)

    def elapsed_days(timestamp):
        # Only the date part is used; any time-of-day part is ignored.
        first_up = datetime.datetime.strptime(timestamp.split()[0], '%Y-%m-%d')
        return (base_date - first_up).days

    df = df.copy()
    df['elapsed_days'] = df.general_firstup.apply(elapsed_days)
    return df.drop('general_firstup', axis=1)

In [7]:
# Training table with `general_firstup` converted to `elapsed_days`.
df_train = firstup_prep(raw)

In [8]:
# Span models only: keep the rows whose elapsed_days is less than `span` days
# above the smallest elapsed_days value, then drop the helper column and
# renumber the rows from 0.
if span in (365, 730):
    days = df_train.elapsed_days.min() + span
    df_train = (
        df_train.loc[df_train.elapsed_days < days]
        .drop(columns='elapsed_days')
        .reset_index(drop=True)
    )

In [9]:
# Row count of the training frame after the optional span filter.
len(df_train)

40000

### 最大token数決定（参考）

最大token数は訓練データの90%分位点で決めました。

In [None]:
# 【title】
# token数を確認 (train)-> mean:10, 中央値:7, 第三四分位点:12, max:100, 90:20
# token数を確認(test) -> mean:10, 中央値:7, 第三四分位点:12, max:72, 90:22 
# 最大token数: 12(第三四分位点)+2で設定 -> 20(90%分位点)+2に変更

# 【story】
# token数を確認(train) -> mean:86, 中央値:54, 第三四分位点:115, max:735, 90:202
# token数を確認(test) -> mean:63, 中央値:38, 第三四分位点:82, max:999, 90:146
# 最大token数: 115(第三四分位点)+2で設定 -> 202(90%分位点)+2に変更

# 【keyword】
# token数を確認(train) -> mean:14, 中央値:11, 第三四分位点:19, max:117, 90:28
# token数を確認(test) -> mean:14, 中央値:9, 第三四分位点:16, max:78, 90:25
# 最大token数: 19(第三四分位点)+2で設定 -> 28(90%分位点)+2に変更

In [16]:
# # 最大token数を確認
# def show_tokendata(df, col):
#     len_list = []
#     for i in range(len(df)):
#         text = text_normalize(df[col][i])
#         text = URL_PATTERN.sub('', text)
#         text = DATE_PATTERN.sub('', text)
#         text = text.translate(str.maketrans({"\n":"", "\t":"", "\r":""}))
#         len_token = len(tokenizer.tokenize(text))
#         len_list.append(len_token)
#     len_list = np.array(len_list)    
#     print('mean:', np.mean(len_list).round(), 'max:', np.max(len_list))
#     print('percentile(50,75,90): ', np.percentile(len_list, [50, 75, 90]))

In [1]:
# show_tokendata(raw, 'title')

### 関数

In [10]:
def text_normalize(text):
    """Return ``text`` converted to Unicode NFKC normal form."""
    normalized = unicodedata.normalize('NFKC', text)
    return normalized


def sharp_del(token_list):
    """Merge ``##`` continuation tokens into the token before them.

    A token that starts with ``##`` is appended, without the prefix, to the
    preceding token. Tokens that only contain ``##`` somewhere else are left
    alone, and a ``##`` token at the very start of the list is kept as is
    because there is nothing to attach it to.

    Args:
        token_list: list of token strings; it is updated in place.

    Returns:
        The same list object, with continuation tokens merged.
    """
    merged = []
    for token in token_list:
        if merged and token.startswith('##'):
            merged[-1] += token[2:]
        else:
            merged.append(token)
    # Write back through slice assignment so callers holding a reference to
    # the original list still see the merged result.
    token_list[:] = merged
    return token_list


def bert_prep(df, col):
    """Turn one text column into fixed-length BERT input tensors.

    Every text is NFKC-normalized, has URLs, dates and newline/tab/carriage
    return characters removed, and is then encoded with the notebook-level
    ``tokenizer``, padded or truncated to a length that depends on ``col``.
    A running row counter is printed as progress.

    Args:
        df: DataFrame containing the text column ``col``. Rows are read in
            positional order, so the index does not need to be 0..n-1.
        col: 'title', 'story' or 'keyword'.

    Returns:
        ``(input_ids, attention_masks, labels)`` when ``df`` has a
        ``fav_novel_cnt_bin`` column (training data), otherwise
        ``(input_ids, attention_masks)`` (test data).

    Raises:
        ValueError: if ``col`` is not one of the supported columns.
    """
    # Maximum token length per column: 90th percentile of the training token
    # counts + 2 (presumably room for the two special tokens -- TODO confirm).
    max_length_by_col = {'title': 22, 'story': 204, 'keyword': 30}
    if col not in max_length_by_col:
        raise ValueError(
            f"unsupported col {col!r}; expected one of {sorted(max_length_by_col)}"
        )
    max_length = max_length_by_col[col]
    # Built once: strips newlines, tabs and carriage returns.
    strip_table = str.maketrans({"\n": "", "\t": "", "\r": ""})

    input_ids = []
    attention_masks = []

    for count, raw_text in enumerate(df[col], start=1):
        text = text_normalize(raw_text)
        text = URL_PATTERN.sub('', text)
        text = DATE_PATTERN.sub('', text)
        text = text.translate(strip_table)

        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # add special tokens
            max_length=max_length,  # fixed maximum length
            padding='max_length',  # pad every sequence up to max_length
            return_attention_mask=True,  # build the attention mask
            return_tensors='pt',  # return PyTorch tensors
            truncation=True,  # cut off anything beyond max_length
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

        print(f'\r{count}', end='')
    print()

    # Stack the per-row tensors along the first dimension.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    if 'fav_novel_cnt_bin' in df.columns:
        labels = torch.tensor(df.fav_novel_cnt_bin.values)
        return input_ids, attention_masks, labels
    # Test data has no label column.
    return input_ids, attention_masks


def train():
    """Run one training pass over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``train_dataloader``
    and ``device``.

    Returns:
        The sum of the per-batch losses.
    """
    model.train()
    epoch_loss = 0
    for batch in train_dataloader:
        # Batch layout: (input ids, attention mask, labels).
        ids, masks, targets = (batch[k].to(device) for k in range(3))
        optimizer.zero_grad()
        loss = model(ids, attention_mask=masks, labels=targets).loss
        loss.backward()
        # Clip the gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        epoch_loss += loss.cpu().item()
    return epoch_loss


def one_hot(labels_list):
    """Return a ``(len(labels_list), 5)`` float array with a 1 at each label's column."""
    encoded = np.zeros((len(labels_list), 5))
    # Each `row` is a view into `encoded`, so writing to it fills the matrix.
    for row, label in zip(encoded, labels_list):
        row[label] = 1
    return encoded


def validation():
    """Evaluate the current model on ``validation_dataloader``.

    Uses the notebook-level ``model``, ``validation_dataloader`` and
    ``device``.

    Returns:
        ``(val_loss, logloss, acc, df_logits)`` where ``val_loss`` is the sum
        of the per-batch losses, ``logloss`` and ``acc`` are computed from the
        softmax of the logits, and ``df_logits`` is a DataFrame of those
        softmax probabilities (one row per validation sample in dataloader
        order, index 0..n-1).
    """
    model.eval()
    val_loss = 0
    logits_chunks = []
    labels_list = []

    for batch_data in validation_dataloader:
        labels_list += list(batch_data[2].numpy().reshape(-1))
        input_ids = batch_data[0].to(device)
        input_masks = batch_data[1].to(device)
        labels = batch_data[2].to(device)
        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=input_masks,
                            labels=labels)
        val_loss += outputs.loss.cpu().item()
        logits_chunks.append(outputs.logits.cpu().numpy())

    # The last batch may be shorter than the others; concatenating along the
    # row axis handles that without the removed DataFrame.append.
    logits = np.concatenate(logits_chunks, axis=0)
    # Apply softmax to the plain array and wrap it explicitly, so the result
    # is a DataFrame regardless of what scipy returns for DataFrame input.
    df_logits = pd.DataFrame(sp.special.softmax(logits, axis=1))

    # logloss, acc
    logloss = log_loss(one_hot(labels_list), df_logits)
    acc = accuracy_score(labels_list, df_logits.values.argmax(axis=1))

    return val_loss, logloss, acc, df_logits


def make_train_features(skf_logits_list, skf_val_index_list, col):
    """Assemble the out-of-fold predictions into the training feature frame.

    Args:
        skf_logits_list: one DataFrame of class probabilities per fold, in
            fold order.
        skf_val_index_list: validation row indices of all folds, concatenated
            in the same order as the rows of ``skf_logits_list``.
        col: text column name, used in the feature column names.

    Returns:
        DataFrame indexed by the original row index (sorted). Columns are
        named ``bert_{col}_{i}`` when the notebook-level ``span`` is 0 and
        ``bert_{span}_{col}_{i}`` when it is 365 or 730; for any other
        ``span`` the columns are left unrenamed.
    """
    # The folds can differ in length, so they are stacked row-wise as frames.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    df = pd.concat(skf_logits_list)
    df.index = skf_val_index_list
    df = df.sort_index()
    if span == 0:
        df.columns = [f'bert_{col}_{i}' for i in range(5)]
    elif span in (365, 730):
        df.columns = [f'bert_{span}_{col}_{i}' for i in range(5)]
    return df


def make_test_features(df, model_list, col):
    """Build test-set features by averaging the fold models' predictions.

    Uses the notebook-level ``batch_size``, ``device`` and ``span``.

    Args:
        df: test DataFrame containing the text column ``col``.
        model_list: fine-tuned models, one per fold.
        col: text column name, used for preprocessing and column naming.

    Returns:
        DataFrame with one row per test sample holding the mean softmax
        probability per class over all models. Columns are named
        ``bert_test_{col}_{i}`` when ``span`` is 0 and
        ``bert_{span}_test_{col}_{i}`` when it is 365 or 730.
    """
    input_ids, attention_masks = bert_prep(df, col)
    test_dataset = TensorDataset(input_ids, attention_masks)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size,
    )

    # Predict with every fold model in turn.
    model_probs = []
    for model in model_list:
        # Follow the globally selected device instead of assuming CUDA.
        model.to(device)
        model.eval()
        logits_chunks = []
        for batch_data in test_dataloader:
            batch_ids = batch_data[0].to(device)
            batch_masks = batch_data[1].to(device)
            with torch.no_grad():
                preds = model(batch_ids,
                              attention_mask=batch_masks)
            logits_chunks.append(preds.logits.cpu().numpy())

        # The last batch may be shorter, so concatenate along the row axis.
        logits = np.concatenate(logits_chunks, axis=0)
        model_probs.append(sp.special.softmax(logits, axis=1))
        # Move the model back to the CPU before the next one is loaded.
        model.cpu()

    # Average the per-model probabilities in a single vectorized step.
    df_test_bert = pd.DataFrame(np.mean(model_probs, axis=0))
    if span == 0:
        df_test_bert.columns = [f'bert_test_{col}_{i}' for i in range(5)]
    elif span in (365, 730):
        df_test_bert.columns = [f'bert_{span}_test_{col}_{i}' for i in range(5)]

    return df_test_bert

### BERT訓練〜特徴量作成

In [11]:
%%time
# Tokenizer for the selected checkpoint; bert_prep reads it as a global.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# StratifiedKFold: the validation predictions of each fold become the
# training features, and the fold models are kept for the test features.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
skf_val_index_list = []
skf_model_list = []
skf_logits_list = []
df_bert = pd.DataFrame()
df_test_bert = pd.DataFrame()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Batch size (smaller for 'story', which uses the longest sequences).
if col in ('title', 'keyword'):
    batch_size = 32
elif col == 'story':
    batch_size = 16
else:
    raise ValueError(f"unsupported col {col!r}; expected 'title', 'story' or 'keyword'")

# Number of epochs; it does not change between folds, so it is chosen once.
if MODEL_NAME == 'cl-tohoku/bert-base-japanese-whole-word-masking':
    max_epoch = 1
elif MODEL_NAME == 'cl-tohoku/bert-base-japanese-v2':
    max_epoch = 1 if col == 'story' else 2
else:
    raise ValueError(f"unsupported MODEL_NAME {MODEL_NAME!r}")

# Preprocessing
input_ids, attention_masks, labels = bert_prep(df_train, col)

for train_index, val_index in skf.split(input_ids, labels):

    # Remember which rows were validated, to restore row order later.
    skf_val_index_list += val_index.tolist()

    # Datasets
    train_dataset = TensorDataset(input_ids[train_index],
                                  attention_masks[train_index],
                                  labels[train_index])
    val_dataset = TensorDataset(input_ids[val_index],
                                attention_masks[val_index],
                                labels[val_index])

    # Data loaders
    train_dataloader = DataLoader(
        train_dataset,
        sampler = RandomSampler(train_dataset),
        batch_size = batch_size
    )
    validation_dataloader = DataLoader(
        val_dataset,
        sampler = SequentialSampler(val_dataset),
        batch_size = batch_size
    )

    # Load a fresh pretrained model for this fold.
    clf = BertForSequenceClassification
    model = clf.from_pretrained(
        MODEL_NAME,
        num_labels = 5,
        output_attentions = False,
        output_hidden_states = False
    )

    # NOTE(review): newer transformers releases deprecate transformers.AdamW;
    # check defaults such as weight_decay before switching to torch.optim.AdamW.
    optimizer = AdamW(model.parameters(), lr=2e-5) # ▲1e-5

    train_loss_list = []
    val_loss_list = []
    model_logits_list = []

    # Follow the selected device instead of assuming CUDA is available.
    model.to(device)

    for epoch in range(max_epoch):
        print('==========')
        print(f'{epoch+1} epoch')
        print('==========')

        # Training
        train_loss = train()
        train_loss_list.append(train_loss)

        # Validation
        val_loss, logloss, acc, df_logits = validation()
        val_loss_list.append(val_loss)
        # Only the final epoch's predictions are kept (they become features).
        if epoch+1 == max_epoch:
            skf_logits_list.append(df_logits)

        print('train_loss:', round(train_loss, 2), end='')
        print(' || val_loss:', round(val_loss, 2),
              '| logloss:', round(logloss, 4),
              '| accuracy:', round(acc, 2))

    model.cpu()
    skf_model_list.append(model)
    # Show GPU memory usage before and after releasing the model.
    print('train@')
    GPUtil.showUtilization()
    del model
    torch.cuda.empty_cache()
    print('del@')
    GPUtil.showUtilization()

#     break # for quick testing

# Features for the training data
df_bert = pd.concat([df_bert, make_train_features(skf_logits_list, skf_val_index_list, col)], axis=1)

# Features for the test data
df_test_bert = pd.concat([df_test_bert, make_test_features(raw_test, skf_model_list, col)], axis=1)

torch.cuda.empty_cache()
print('Done!')

40000


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

1 epoch
train_loss: 984.5 || val_loss: 232.43 | logloss: 0.9297 | accuracy: 0.58
train@
| ID | GPU | MEM |
------------------
|  0 |  2% | 33% |
del@
| ID | GPU | MEM |
------------------
|  0 |  0% | 20% |


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

1 epoch
train_loss: 980.15 || val_loss: 234.32 | logloss: 0.9373 | accuracy: 0.58
train@
| ID | GPU | MEM |
------------------
|  0 |  0% | 33% |
del@
| ID | GPU | MEM |
------------------
|  0 |  1% | 21% |


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

1 epoch
train_loss: 979.59 || val_loss: 235.31 | logloss: 0.9413 | accuracy: 0.57
train@
| ID | GPU | MEM |
------------------
|  0 | 17% | 33% |
del@
| ID | GPU | MEM |
------------------
|  0 |  0% | 20% |


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

1 epoch
train_loss: 985.69 || val_loss: 232.21 | logloss: 0.9288 | accuracy: 0.57
train@
| ID | GPU | MEM |
------------------
|  0 | 29% | 33% |
del@
| ID | GPU | MEM |
------------------
|  0 |  0% | 22% |


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

1 epoch
train_loss: 986.05 || val_loss: 232.07 | logloss: 0.9283 | accuracy: 0.57
train@
| ID | GPU | MEM |
------------------
|  0 | 18% | 33% |
del@
| ID | GPU | MEM |
------------------
|  0 |  1% | 21% |
8522
Done!
Wall time: 13min 8s


In [14]:
# Tag for the feature names and file names: '_v2_' for the v2 checkpoint,
# empty for the whole-word-masking checkpoint.
if MODEL_NAME == 'cl-tohoku/bert-base-japanese-v2':
    model_name = '_v2_'
elif MODEL_NAME == 'cl-tohoku/bert-base-japanese-whole-word-masking':
    model_name = ''

In [15]:
# Insert the model tag just before the last character of every column name.
# Re-running this cell inserts the tag again.
df_bert.columns = [name[:-1] + model_name[1:] + name[-1] for name in df_bert.columns]
df_test_bert.columns = [name[:-1] + model_name[1:] + name[-1] for name in df_test_bert.columns]

In [16]:
# Display both feature frames for a quick visual check.
df_bert, df_test_bert

(       bert_keyword_0  bert_keyword_1  bert_keyword_2  bert_keyword_3  \
 0            0.254550        0.573901        0.141917        0.024596   
 1            0.193701        0.486782        0.243959        0.058443   
 2            0.236993        0.559316        0.161463        0.034647   
 3            0.175567        0.560525        0.232173        0.026142   
 4            0.163415        0.535045        0.221831        0.064523   
 ...               ...             ...             ...             ...   
 39995        0.240248        0.546653        0.168535        0.034037   
 39996        0.369958        0.471106        0.129718        0.021965   
 39997        0.423704        0.468815        0.091772        0.012945   
 39998        0.316698        0.452462        0.183636        0.039967   
 39999        0.483522        0.360493        0.123212        0.024471   
 
        bert_keyword_4  
 0            0.005035  
 1            0.017115  
 2            0.007581  
 3        

In [17]:
# Pickle the training features; the file name carries the span (when it is
# not 0), the text column and the model tag.
if span == 0:
    filename = BASE_DIR + f'df_bert_{col}{model_name[:-1]}.pkl'
elif span in (365, 730):
    filename = BASE_DIR + f'df_bert_{span}_{col}{model_name[:-1]}.pkl'
with open(filename, 'wb') as f:
    pickle.dump(df_bert, f)

In [18]:
# Pickle the test features under the same naming scheme as the training ones.
if span == 0:
    test_filename = BASE_DIR + f'df_test_bert_{col}{model_name[:-1]}.pkl'
elif span in (365, 730):
    test_filename = BASE_DIR + f'df_test_bert_{span}_{col}{model_name[:-1]}.pkl'
with open(test_filename, 'wb') as f:
    pickle.dump(df_test_bert, f)