# 準備

In [None]:
# Mount Google Drive; the project directory used below lives under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!apt install aptitude swig

In [None]:
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y

In [None]:
!pip install mecab-python3

## ライブラリ、データセットの読み込み

In [None]:
import pandas as pd
import numpy as np
# from matplotlib import pyplot as plt

import time
from tqdm import tqdm
from pprint import pprint
import datetime
import pytz
import os

In [None]:
# Timestamp in Japan Standard Time, captured once and reused below when
# building the output directory name and the submission file name.
dt = datetime.datetime.now(pytz.timezone('Asia/Tokyo'))
# Show it as YYYYMMDDhhmm (month/day/hour/minute are zero-padded to 2 digits).
print(f"{dt.year}{str(dt.month).zfill(2)}{str(dt.day).zfill(2)}{str(dt.hour).zfill(2)}{str(dt.minute).zfill(2)}")

20221015


In [None]:
# Base directory of the project (on the mounted Google Drive)
BASE_DIR = "/content/drive/My Drive/hate-speech-detection-nishika/"

DATA_PATH = os.path.join(BASE_DIR, "data/raw")    # directory that holds the input CSV files
MODEL_PATH = os.path.join(BASE_DIR, f"data/predicted/submission_{dt.year}{str(dt.month).zfill(2)}{str(dt.day).zfill(2)}_BERT_TensorFlow")  # dated output directory (the submission CSV is written here; model saving is currently commented out)

In [None]:
# Read the train / test CSV files from DATA_PATH.
df_train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))
# Name of the target (label) column.
target_column = "label"

In [None]:
# Merge train and test into a single frame.
# Test rows get NaN in the target column; later cells tell train and test apart
# by whether the label is null.
df_test[target_column] = np.nan
df = pd.concat([df_train, df_test], ignore_index=True, sort=False)

# Quick shape / column check of the three frames
print(df_train.shape)
print(df_test.shape)
print(df.shape)
print(df.columns)
df.head(3)

(5256, 4)
(3223, 4)
(8479, 4)
Index(['id', 'source', 'text', 'label'], dtype='object')


Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0.0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0
2,c535f5613,livejupiter,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ\n甘えるな,1.0


# 前処理
- 改行文字等を削除

In [None]:
def text_preprocess(df):
  """Remove line feeds and full-width spaces (U+3000) from the ``text`` column.

  Every occurrence of either character is removed, wherever it appears in the
  string. The ``text`` column of ``df`` is overwritten in place and the same
  frame object is returned.
  """
  df["text"] = [
      raw.replace("\n", "").replace("\u3000", "")
      for raw in df["text"]
  ]
  return df

In [None]:
# Clean the text column of the merged frame (text_preprocess also modifies df in place).
df = text_preprocess(df)
df.head(3)

Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんてそんな大事にするものか？,0.0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0.0
2,c535f5613,livejupiter,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ甘えるな,1.0


# BERTの実装

In [None]:
# !pip install -q silence_tensorflow

In [None]:
# from silence_tensorflow import silence_tensorflow
# silence_tensorflow()

import tensorflow as tf
import tensorflow.keras.layers as kl

## Tokenizerの準備
import transformers
from transformers import BertJapaneseTokenizer
from transformers import logging
logging.set_verbosity_error()

In [None]:
!pip install fugashi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load pre-trained tokenizer
# Hugging Face checkpoint name; also used by train_bert when composing a weights file name.
pretrained_model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_name)

In [None]:
# Sanity-check the tokenizer on the first sample

sample_text = df["text"][0]
print(sample_text)

# Plain tokenization into sub-word strings
token_words = tokenizer.tokenize(sample_text)
print(token_words)

# Full encoding, padded / truncated to a fixed length of 22 ids
encode_token = tokenizer(sample_text, padding="max_length", max_length=22, truncation=True)
pprint(encode_token)

# Decode the ids back to text to see what the encoding added
print(tokenizer.decode(encode_token["input_ids"]))

まともに相手されてない人との関係なんてそんな大事にするものか？
['まとも', 'に', '相手', 'さ', 'れ', 'て', 'ない', '人', 'と', 'の', '関係', 'なんて', 'そんな', '大事', 'に', 'する', 'もの', 'か', '?']
{'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    0],
 'input_ids': [2,
               23135,
               7,
               1879,
               26,
               20,
               16,
               80,
               53,
               13,
               5,
               633,
               15060,
               4799,
               15872,
               7,
               34,
               120,


In [None]:
# All texts (train and test) as an array
sentences = df.text.values

# Number of tokens in each sentence, special tokens not included
max_len = [len(tokenizer.tokenize(sent)) for sent in sentences]

# Report the longest sentence
print('最大単語数: ', max(max_len))
print('上記の最大単語数にSpecial token（[CLS], [SEP]）の+2をした値が最大単語数')

# Sequence length fed to the model: longest sentence + 2 for [CLS] / [SEP], capped at 512
sequence_max_length = min(max(max_len) + 2, 512)

最大単語数:  97
上記の最大単語数にSpecial token（[CLS], [SEP]）の+2をした値が最大単語数


# モデルの作成

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [None]:
# Learning rate handed to the stand-alone build_model(...) calls in the following cells.
# NOTE(review): train_bert builds its own model and applies its own per-epoch
# schedule (1e-5 .. 2e-5), so within the visible notebook this value does not
# influence the reported results. It looks very large compared with that
# schedule - confirm before training a model built with it.
learning_rate = 0.1

In [None]:
def build_model(learning_rate, is_print=False):
  """Build the BERT-based binary classifier and a companion prediction model.

  Relies on the notebook globals ``pretrained_model_name``, ``tokenizer`` and
  ``sequence_max_length``.

  Args:
    learning_rate: Learning rate given to the AdamWeightDecay optimizer.
    is_print: When True, pretty-print the input tensors and the model summary.

  Returns:
    A tuple ``(model_train, model_pred)``. ``model_train`` is compiled with
    binary cross-entropy and outputs the sigmoid probability. ``model_pred``
    shares the same layers and outputs ``[probability, first-token hidden
    state]`` so the hidden state can be reused as a feature vector.
  """
  # Load the pre-trained encoder from the same checkpoint as the tokenizer.
  # No num_labels is passed: this is the bare encoder, the classification head
  # is the single sigmoid unit added below.
  bert_model = transformers.TFAutoModel.from_pretrained(pretrained_model_name)

  # One int32 input of length sequence_max_length per tokenizer output
  # (e.g. input_ids, token_type_ids, attention_mask), keyed by name.
  inputs_by_name = {
      name: kl.Input(shape=(sequence_max_length,), dtype=tf.int32, name=name)
      for name in tokenizer.model_input_names
  }
  inputs = list(inputs_by_name.values())

  if is_print:
    pprint(inputs)

  # Feed the encoder by name. A positional list is interpreted in the model's
  # call-signature order (input_ids, attention_mask, token_type_ids), which is
  # not the order of tokenizer.model_input_names, so a list could swap the
  # attention mask and the segment ids depending on the transformers version.
  # outputs[0] (last_hidden_state): output of the last layer
  # outputs[1] (pooler_output)    : pooled state of the classification token
  outputs = bert_model(inputs_by_name)

  # Hidden state at position 0 ([CLS]) is the sentence representation.
  cls_embedding = outputs[0][:, 0, :]

  # Classification head, modelled after TFBertForSequenceClassification:
  # dropout followed by one sigmoid unit.
  head = kl.Dropout(0.1)(cls_embedding)
  head = kl.Dense(
      1,
      activation='sigmoid',
      kernel_initializer=transformers.modeling_tf_utils.get_initializer(0.02),
  )(head)
  model_train = tf.keras.Model(inputs=inputs, outputs=head)

  # Prediction-only model that additionally exposes the raw [CLS] embedding
  # so it can be used as a feature.
  model_pred = tf.keras.Model(inputs=inputs, outputs=[head, cls_embedding])

  # Adam with weight decay
  optimizer = transformers.AdamWeightDecay(learning_rate=learning_rate)
  model_train.compile(optimizer, loss="binary_crossentropy", metrics=["acc"])
  #model_train.compile(optimizer, loss="categorical_crossentropy", metrics=["acc"])  # for a softmax head
  if is_print:
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model_train.summary()

  return model_train, model_pred

# Trial run: build the models once and print the inputs / summary.
# The returned models are not assigned, they only show up as the cell output.
build_model(learning_rate, is_print=True)

[<KerasTensor: shape=(None, 99) dtype=int32 (created by layer 'input_ids')>,
 <KerasTensor: shape=(None, 99) dtype=int32 (created by layer 'token_type_ids')>,
 <KerasTensor: shape=(None, 99) dtype=int32 (created by layer 'attention_mask')>]
Model: "model_34"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 99)]         0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 99)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 99)]         0           []                               
                                                

(<keras.engine.functional.Functional at 0x7f0f89f08f10>,
 <keras.engine.functional.Functional at 0x7f1022f02a90>)

In [None]:
# Detect which accelerator the notebook runs on; result is "TPU", "GPU" or "CPU".
runtime_type = ""

try:
    # Use the Colab-provided TPU address when the environment variable exists,
    # otherwise let the resolver look for a TPU on its own.
    if "COLAB_TPU_ADDR" in os.environ:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver('grpc://' + os.environ['COLAB_TPU_ADDR'])
    else:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()

    #--- TPU
    print('Running on TPU ', resolver.cluster_spec().as_dict()['worker'])
    runtime_type = "TPU"

    # This is the TPU initialization code that has to be at the beginning.
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # tpu_strategy is only defined on this path; later cells guard its use
    # with runtime_type == "TPU".
    tpu_strategy = tf.distribute.TPUStrategy(resolver)

    tf.keras.backend.clear_session()
    print("All devices: ", tf.config.list_logical_devices('TPU'))

except ValueError:
    # A ValueError from the block above is treated as "no TPU available";
    # fall back to GPU if one is reported, else CPU.

    if tf.test.gpu_device_name() != "":
        #--- GPU
        runtime_type = "GPU"
    else:
        runtime_type = "CPU"

print("runtime_type: ", runtime_type)

runtime_type:  GPU


In [None]:
# Build the global model pair.
# NOTE(review): train_bert builds its own models, so in the visible notebook
# these two globals are only referenced by commented-out code.
if runtime_type == "TPU":
    # On TPU the model has to be created inside tpu_strategy.scope()
    with tpu_strategy.scope():
        model_train, model_pred = build_model(learning_rate)
else:
    model_train, model_pred = build_model(learning_rate)

In [None]:
import sklearn.metrics
def train_bert(
        df_train,       # training data
        text_column,    # name of the text column
        target_column,  # name of the target column
        df_valid=None,  # validation data
        df_pred_list=None,     # data frames to predict on
        model_file_prefix="",  # identifier used in the weights file name
        epochs=20,
        batch_size=8,
    ):
    """Fine-tune a freshly built BERT classifier and predict on the given frames.

    Relies on the notebook globals ``build_model``, ``tokenizer``,
    ``sequence_max_length``, ``pretrained_model_name``, ``runtime_type`` and,
    on TPU, ``tpu_strategy``.

    Args:
        df_train: Frame with the training rows.
        text_column: Column holding the input text.
        target_column: Column holding the 0/1 label.
        df_valid: Optional frame used as Keras validation data and for the
            accuracy reported after training.
        df_pred_list: Optional list of frames to run prediction on
            (``None`` is treated as an empty list).
        model_file_prefix: Prefix of the weights file name. Only relevant if
            the commented-out weight caching is re-enabled.
        epochs: Number of training epochs.
        batch_size: Batch size for training, validation and prediction.

    Returns:
        ``(metric, pred_y_list, emb_list)``. ``metric`` is the validation
        accuracy (0 when ``df_valid`` is None). ``pred_y_list[i]`` is the 1-D
        array of predicted probabilities for ``df_pred_list[i]`` and
        ``emb_list[i]`` the matching array of [CLS] embeddings.
    """
    if df_pred_list is None:
        df_pred_list = []

    #--------------------
    # Learning-rate schedule (one value per epoch)
    #--------------------
    lr0 = 0.000005  # rate the optimizer is created with; the scheduler overrides it every epoch
    lr_schedule = [
        0.00001,
        0.00002,
    ]
    if epochs - len(lr_schedule) > 0:
        # Linear decay over the remaining epochs.
        # NOTE(review): np.linspace includes the end point, so with two or more
        # remaining epochs the last epoch runs with a learning rate of 0.
        lr_schedule.extend(np.linspace(0.00002, 0, epochs - len(lr_schedule)))

    def lr_scheduler(epoch):
        return lr_schedule[epoch]
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)


    #--------------------
    # Weights file (only used by the commented-out caching below)
    #--------------------
    # The checkpoint name contains "/", which would otherwise be read as a
    # sub-directory when the file is written.
    model_path = "{}_{}.h5".format(
        model_file_prefix,
        pretrained_model_name.replace("/", "_"),
    )


    #--------------------
    # Model
    #--------------------
    if runtime_type == "TPU":
        with tpu_strategy.scope():
            model_train, model_pred = build_model(lr0)
    else:
        model_train, model_pred = build_model(lr0)


    #-----------------------------
    # Helpers that turn a frame into model inputs / targets
    #-----------------------------
    def _build_x_from_df(df):
        # Series -> list of strings
        texts = df[text_column].tolist()

        # Tokenize to fixed-length tensors
        encoded = tokenizer(texts, padding="max_length", max_length=sequence_max_length,
            truncation=True, return_tensors="tf")

        # BatchEncoding -> plain dict keyed by input name
        return dict(encoded)

    def _build_y_from_df(df):
        return df[target_column]
        #return tf.keras.utils.to_categorical(df[target_column], num_classes=2)  # for a softmax head


    #-------------------
    # Validation dataset
    #-------------------
    if df_valid is not None:
        valid_x = _build_x_from_df(df_valid)
        valid_y = _build_y_from_df(df_valid)
        valid_dataset = (
            tf.data.Dataset.from_tensor_slices((valid_x, valid_y))
            .batch(batch_size)
            .cache()
        )
    else:
        valid_dataset = None


    #-------------------
    # Training
    #-------------------
    # if os.path.isfile(model_path):
    #     # load already trained weights
    #     print(model_path)
    #     model_train.load_weights(model_path)
    # else:
    train_x = _build_x_from_df(df_train)
    train_y = _build_y_from_df(df_train)
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((train_x, train_y))
        # The buffer must cover the number of samples. len(train_x) would be
        # the number of keys of the tokenizer dict, i.e. almost no shuffling.
        .shuffle(len(df_train), seed=1234)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)  # prepare the next batches while the accelerator is busy
    )

    model_train.fit(train_dataset, epochs=epochs, validation_data=valid_dataset, callbacks=[lr_callback])
    # model_train.save_weights(model_path)

    #-------------------
    # Evaluation
    #-------------------
    if df_valid is not None:
        print("valid")
        pred_y = model_train.predict(valid_dataset, verbose=1)

        # Accuracy at a 0.5 threshold; flatten (n, 1) -> (n,) to match valid_y
        pred_y_label = np.where(pred_y < 0.5, 0, 1).reshape((-1,))
        metric = sklearn.metrics.accuracy_score(valid_y, pred_y_label)
        print("acc", metric)
    else:
        metric = 0

    #-------------------
    # Prediction
    #-------------------
    print("pred")
    pred_y_list = []
    emb_list = []
    for df_pred in df_pred_list:

        pred_x = _build_x_from_df(df_pred)
        pred_dataset = (
            tf.data.Dataset.from_tensor_slices((pred_x,))
            .batch(batch_size)
            .cache()
        )

        # model_pred returns [probabilities, embeddings]
        pred_output = model_pred.predict(pred_dataset, verbose=1)

        # probabilities
        pred_y = pred_output[0].reshape((-1,))  # (-1,1) -> (-1)
        #pred_y = pred_y[0][:,1]  # for a softmax head
        pred_y_list.append(pred_y)

        # embeddings
        emb_list.append(pred_output[1])

    return metric, pred_y_list, emb_list

#--- Run once as a smoke test
# NOTE: the validation frame is the first 10 labelled rows, which are also part
# of the training frame, so the accuracy printed here is not a held-out score.
metric, pred_y_list, emb_list = train_bert(
    df_train=df[df["label"].notnull()],  # training data: all labelled rows
    text_column="text",
    target_column="label",
    df_valid=df[df["label"].notnull()][:10],  # validation data
    df_pred_list=[df[df["label"].isnull()]],  # prediction data: all unlabelled rows
    epochs=4,
)
print(metric)
print(pred_y_list[0].shape)
print(emb_list[0].shape)

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4
valid
acc 1.0
pred
1.0
(3223,)
(3223, 768)


In [None]:
import sklearn.model_selection

def train_cv(df, text_column, target_column, n_splits, epochs=20, batch_size=8, random_state=1234):
    """Run stratified K-fold training with train_bert and combine the fold outputs.

    Rows of ``df`` whose target is non-null are used for cross-validation;
    rows whose target is null are predicted by every fold model.

    Args:
        df: Frame holding both labelled and unlabelled rows.
        text_column: Column holding the input text.
        target_column: Column holding the label (null for rows to predict).
        n_splits: Number of folds.
        epochs: Epochs per fold, forwarded to train_bert.
        batch_size: Batch size, forwarded to train_bert.
        random_state: Seed of the fold shuffling.

    Returns:
        ``(mean_metric, pred_y, df_emb)``.
        ``mean_metric`` is the mean validation accuracy over the folds.
        ``pred_y`` is an array aligned with ``df``: the out-of-fold probability
        for labelled rows, the fold-averaged probability for unlabelled rows.
        ``df_emb`` is a frame indexed like ``df`` holding the out-of-fold
        embedding for labelled rows and the fold-averaged embedding for
        unlabelled rows.
    """
    df_train = df[df[target_column].notnull()]
    df_test = df[df[target_column].isnull()]

    df_train_idx = df_train.index

    # Result holders, indexed like df
    df_pred = pd.DataFrame(df.index, columns=["index"]).set_index("index")
    df_emb = pd.DataFrame(df.index, columns=["index"]).set_index("index")
    df_emb_pred = None  # running sum of the unlabelled-row embeddings over folds
    metric_list = []

    #----------------
    # cross validation
    #----------------
    kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for i, (train_idx, test_idx) in enumerate(kf.split(df_train, df_train[target_column])):
        df_train_sub = df_train.iloc[train_idx]
        df_test_sub = df_train.iloc[test_idx]

        # Predict the held-out fold first, then the unlabelled rows
        df_pred_list = [df_test_sub, df_test]

        model_file_prefix = "cv_{}".format(i)

        # train
        metric, pred_y_list, emb_list = train_bert(
            df_train=df_train_sub,
            text_column=text_column,
            target_column=target_column,
            df_valid=df_test_sub,
            df_pred_list=df_pred_list,
            model_file_prefix=model_file_prefix,
            epochs=epochs,
            batch_size=batch_size,
        )
        metric_list.append(metric)

        # One prediction column per fold: held-out rows and unlabelled rows
        # are filled, the fold's own training rows stay NaN.
        result_name = "result_{}".format(i)
        df_pred.loc[df_train_idx[test_idx], result_name] = pred_y_list[0]
        df_pred.loc[df_test.index, result_name] = pred_y_list[1]

        # Out-of-fold embeddings of the held-out rows
        fold_emb = pd.DataFrame(emb_list[0], index=df_train_idx[test_idx])
        df_emb = df_emb.combine_first(fold_emb)

        # Accumulate the unlabelled-row embeddings; averaged after the loop
        if df_emb_pred is None:
            df_emb_pred = pd.DataFrame(emb_list[1], index=df_test.index)
        else:
            df_emb_pred += emb_list[1]


    # Row-wise mean skips NaN, so labelled rows keep their single out-of-fold
    # value and unlabelled rows get the average over all folds.
    pred_y = df_pred.mean(axis=1)

    df_emb_pred /= n_splits
    df_emb = df_emb.combine_first(df_emb_pred)

    return np.mean(metric_list), pred_y.values, df_emb

#--- Get the cross-validated predictions and the embedding features
metric, pred_y, df_emb = train_cv(df, "text", "label", n_splits=3)
print(metric)
print(pred_y.shape)
print(df_emb.shape)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
valid
acc 0.9389269406392694
pred
Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
valid
acc 0.9571917808219178
pred
Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
valid
acc 0.954337899543379
pred
0.9501522070015221
(8479,)
(8479, 768)


# 予測の出力

In [None]:
# Attach the cross-validated probabilities and the labels obtained with a 0.5 threshold
df["BERT"] = pred_y
df["BERT_label"] = np.where(pred_y < 0.5, 0, 1)

# Accuracy on the rows that have a ground-truth label
has_label = df["label"].notnull()
_df = df[has_label]
print(sklearn.metrics.accuracy_score(_df["label"], _df["BERT_label"]))

# Create the output directory if it does not exist yet
os.makedirs(MODEL_PATH, exist_ok=True)

# Write the predictions for the unlabelled rows as the submission CSV
_df = df[~has_label]
df_submit = _df[["id", "BERT_label"]].rename(columns={"BERT_label": "label"})
submit_path = os.path.join(MODEL_PATH, f'submit_{dt.year}{str(dt.month).zfill(2)}{str(dt.day).zfill(2)}{str(dt.hour).zfill(2)}{str(dt.minute).zfill(2)}_BERT_TensorFlow.csv')
df_submit.to_csv(submit_path, header=True, index=False)

# Save the model (disabled)
# model_train.save(
#     os.path.join(MODEL_PATH, f'model_{dt.year}{str(dt.month).zfill(2)}{str(dt.day).zfill(2)}_BERT_TensorFlow.h5')
#     )

0.950152207001522
