# バートのポジネガ

## パッケージのインポート

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from transformers import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from datetime import datetime
import collections
import numpy as np
from transformers import Trainer, TrainingArguments
from IPython.display import HTML

## データの読み込み、成型

In [2]:
df = pd.read_csv("./data/torikizoku_twitter.csv")
df = df.reindex()
df["label"] = 0


In [3]:
df["date"] = [datetime.strptime(i, '%Y/%m/%d') for i in df["date"]]
df.loc[(datetime(2020,4,7)<=(df["date"])) & (df["date"]<datetime(2020,5,25)),["label"]] = 1
df.loc[(datetime(2021,1,7)<=df["date"]) & (df["date"]<datetime(2021,3,18)),["label"]] = 1
df.loc[(datetime(2021,4,23)<=df["date"]) & (df["date"]<datetime(2021,6,21)),["label"]] = 1
df.loc[(datetime(2021,4,23)<=df["date"]) & (df["date"]<datetime(2021,6,21)),["label"]] = 1
df.loc[(datetime(2021,7,12)<=df["date"]) & (df["date"]<datetime(2021,9,30)),["label"]] = 1
len(df["tweet"])


46315

In [4]:
df_anno=pd.DataFrame({"text":df["tweet"],"label":df["label"]})
df_anno = df_anno.dropna()
df_anno = df_anno.sample(5000)
df_train,df_test = train_test_split(df_anno)
df_anno = df_anno.dropna()
df_anno.head()

Unnamed: 0,text,label
36853,そうです。たまにライブ行ってる者です。前に池袋で客寄せしてる小西さんに水差し入れた者です笑\...,1
37249,21日部屋空いてたら出るたぶん！！！！！！！！鳥貴族よろぴこ〜,1
23301,鳥貴族でハイボール飲んだらくじ引けて、400円当たった。ラッキー,0
6973,鳥貴族に行きたいです！鳥貴族でおいしい酒とおすすめメニューを教えてください！元カノにした事を...,0
34648,早く鳥貴族でハイボール飲みながらﾄﾞﾝﾁｬﾝ騒ぎしたいよ～！！！ただの一度も居酒屋に行かない...,0


In [5]:
train_docs = df_train["text"].tolist()
train_labels = df_train["label"].tolist()
test_docs = df_test["text"].tolist()
test_labels = df_test["label"].tolist()
len(train_docs)

75

## GPU が利用できる場合は GPU を利用する


In [6]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cpu'

## bert modelの実行

In [7]:
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
# model_name = "cl-tohoku/bert-base-japanese-v2"

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2,output_hidden_states=True, output_attentions=True)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

### 分かち書き

In [8]:
train_encodings = tokenizer(train_docs, return_tensors='pt', padding=True, truncation=True, max_length=80).to(device)
test_encodings = tokenizer(test_docs, return_tensors='pt', padding=True, truncation=True, max_length=80).to(device)

### ID化用の関数定義

In [9]:
class JpSentiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JpSentiDataset(train_encodings, train_labels)
test_dataset = JpSentiDataset(test_encodings, test_labels)

### 評価用の関数定義

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions[0].argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

### bertの実行

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    save_total_limit=1,              # limit the total amount of checkpoints. Deletes the older checkpoints.
    dataloader_pin_memory=False,  # Whether you want to pin memory in data loaders or not. Will default to True
    # evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    evaluation_strategy="steps",
    logging_steps=2500,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics  # The function that will be used to compute metrics at evaluation
)

trainer.train()

***** Running training *****
  Num examples = 75
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=0.7453908920288086, metrics={'train_runtime': 14.3255, 'train_samples_per_second': 5.235, 'train_steps_per_second': 0.349, 'total_flos': 3083332680000.0, 'train_loss': 0.7453908920288086, 'epoch': 1.0})

### モデルの評価

In [12]:
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.7266618609428406,
 'eval_accuracy': 0.36,
 'eval_f1': 0.3476923076923077,
 'eval_precision': 0.5536842105263158,
 'eval_recall': 0.36,
 'eval_runtime': 1.0002,
 'eval_samples_per_second': 24.996,
 'eval_steps_per_second': 2.0,
 'epoch': 1.0}

In [13]:
from transformers import pipeline
torikizoku_analyzer = pipeline("sentiment-analysis", model=model.to("cpu"), tokenizer=model_name)

loading configuration file https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json from cache at /home/yuuuuutaro/.cache/huggingface/transformers/573af37b6c39d672f2df687c06ad7d556476cbe43e5bf7771097187c45a3e7bf.abeb707b5d79387dd462e8bfb724637d856e98434b6931c769b8716c6f287258
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading file https://huggingface.co/cl-tohoku/bert-b

In [14]:
torikizoku_analyzer(df["tweet"][5669])[0]["label"]

'LABEL_1'

In [15]:
# HTMLを作成する関数を実装
def highlight(word, attn):
    html_color = '#%02X%02X%02X' % (
        255, int(255*(1 - attn)), int(255*(1 - attn)))
    return '<span style="background-color: {}"> {}</span>'.format(html_color, word)

def mk_html(index):
    inputs = tokenizer(df["tweet"][index], return_tensors="pt")
    labels = torch.tensor([df["label"][index]]).unsqueeze(0)  # Batch size 1
    outputs = model(**inputs, labels=labels)


    # indexの結果を抽出
    sentence = [int(i) for i in list(inputs["input_ids"][0])]  # 文章
    label = df["label"][index]  # ラベル
    pred = torikizoku_analyzer(df["tweet"][index])[0]["label"]  # 予測

    # ラベルと予測結果を文字に置き換え
    if label == 0:
        label_str = "Negative"
    else:
        label_str = "Positive"

    if pred == "LABEL_0":
        pred_str = "Negative"
    else:
        pred_str = "Positive"

    # 表示用のHTMLを作成する
    html = '正解ラベル：{}<br>推論ラベル：{}<br><br>'.format(label_str, pred_str)
    attention=[]
    for i in outputs.attentions[11][0]:
        attention.append([float(j) for j in i[0]])

    # Self-Attentionの重みを可視化。Multi-Headが12個なので、12種類のアテンションが存在
    for i in range(12):
        attens = [j/max(attention[i]) for j in attention[i]]
        html += '[BERTのAttentionを可視化_' + str(i+1) + ']<br>'
        for word, attn in zip(sentence, attens):

            # 単語が[SEP]の場合は文章が終わりなのでbreak
            if tokenizer.convert_ids_to_tokens([word])[0] == "[SEP]":
                break

            # 関数highlightで色をつける、関数tokenizer_bert.convert_ids_to_tokensでIDを単語に戻す
            html += highlight(tokenizer.convert_ids_to_tokens([word])[0], attn)
        html += "<br><br>"

    # 12種類のAttentionの平均を求める。最大値で規格化
    all_attens = []  # all_attensという変数を作成する
    for i in range(12):
        all_attens += attention[i]
    all_attens = [i/max(all_attens) for i in all_attens]


    html += '[BERTのAttentionを可視化_ALL]<br>'
    for word, attn in zip(sentence, all_attens):

        # 単語が[SEP]の場合は文章が終わりなのでbreak
        if tokenizer.convert_ids_to_tokens([word])[0] == "[SEP]":
            break

        # 関数highlightで色をつける、関数tokenizer_bert.convert_ids_to_tokensでIDを単語に戻す
        html += highlight(tokenizer.convert_ids_to_tokens([word])[0], attn)
    html += "<br><br>"

    return html

In [16]:

index = 5669  # 出力させたいデータ
html_output = mk_html(index)  # HTML作成
HTML(html_output)  # HTML形式で出力