In [3]:

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install transformers[ja]

Collecting transformers[ja]
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 6.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 26.1MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K   

In [5]:
df = pd.read_csv('drive/MyDrive/tweet_data_cleaned.csv')
df = df[['text', 'emo']]
train_df, test_df = train_test_split(df, test_size = 0.2)
train_df.to_csv('drive/MyDrive/train.tsv', sep='\t', index=False, header=None)
test_df.to_csv('drive/MyDrive/test.tsv', sep='\t', index=False, header=None)

In [14]:

from transformers import BertModel
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')


MAX_LENGTH = 512
def bert_tokenizer(text):
    return tokenizer.encode(text, max_length=MAX_LENGTH, truncation=True, return_tensors='pt')[0]

TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=bert_tokenizer, use_vocab=False, lower=False,
                            include_lengths=True, batch_first=True, fix_length=MAX_LENGTH, pad_token=0)
LABEL = torchtext.legacy.data.Field(sequential=False, use_vocab=False)

train_data, test_data = torchtext.legacy.data.TabularDataset.splits(
    path='drive/MyDrive', train='train.tsv', test='test.tsv', format='tsv', fields=[('Text', TEXT), ('Label', LABEL)])


BATCH_SIZE = 32
train_iter, test_iter = torchtext.legacy.data.Iterator.splits((train_data, test_data), batch_sizes=(BATCH_SIZE, BATCH_SIZE), repeat=False, sort=False)

In [29]:

class BertClassifier(nn.Module):
    def __init__(self):
        super(BertClassifier, self).__init__()


        self.bert = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking',
                                              output_attentions=True,
                                              output_hidden_states=True)


        self.linear = nn.Linear(768*4, 3)


        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def _get_cls_vec(self, vec):
        return vec[:,0,:].view(-1, 768)

    def forward(self, input_ids):

        output = self.bert(input_ids)
        attentions = output['attentions']
        hidden_states = output['hidden_states']


        vec1 = self._get_cls_vec(hidden_states[-1])
        vec2 = self._get_cls_vec(hidden_states[-2])
        vec3 = self._get_cls_vec(hidden_states[-3])
        vec4 = self._get_cls_vec(hidden_states[-4])


        vec = torch.cat([vec1, vec2, vec3, vec4], dim=1)


        out = self.linear(vec)

        return F.log_softmax(out, dim=1), attentions

classifier = BertClassifier()

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:


for param in classifier.parameters():
    param.requires_grad = False


for param in classifier.bert.encoder.layer[-1].parameters():
    param.requires_grad = True

for param in classifier.bert.encoder.layer[-2].parameters():
    param.requires_grad = True

for param in classifier.bert.encoder.layer[-3].parameters():
    param.requires_grad = True

for param in classifier.bert.encoder.layer[-4].parameters():
    param.requires_grad = True


for param in classifier.linear.parameters():
    param.requires_grad = True


optimizer = optim.Adam([
    {'params': classifier.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
    {'params': classifier.bert.encoder.layer[-2].parameters(), 'lr': 5e-5},
    {'params': classifier.bert.encoder.layer[-3].parameters(), 'lr': 5e-5},
    {'params': classifier.bert.encoder.layer[-4].parameters(), 'lr': 5e-5},
    {'params': classifier.linear.parameters(), 'lr': 1e-4}
])

In [31]:


loss_function = nn.NLLLoss()


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


classifier.to(device)
losses = []
epoch = 20

for epoch in range(epoch):


    all_loss = 0

    for idx, batch in enumerate(train_iter):

      classifier.zero_grad()

      input_ids = batch.Text[0].to(device)
      label_ids = batch.Label.to(device)

      out, _ = classifier(input_ids)

      batch_loss = loss_function(out, label_ids)
      batch_loss.backward()

      optimizer.step()

      all_loss += batch_loss.item()

    print("epoch", epoch, "\t" , "loss", all_loss)



epoch 0 	 loss 609.8599146604538
epoch 1 	 loss 417.34804433584213
epoch 2 	 loss 344.3662644326687
epoch 3 	 loss 260.9938557544956
epoch 4 	 loss 154.42577229253948
epoch 5 	 loss 78.00698536528216
epoch 6 	 loss 72.48712145430545
epoch 7 	 loss 45.1747447671944
epoch 8 	 loss 33.422685147073935
epoch 9 	 loss 35.1499112306758
epoch 10 	 loss 29.4158189808295
epoch 11 	 loss 30.049728129495634
epoch 12 	 loss 26.719187616454292
epoch 13 	 loss 26.19840474901457
epoch 14 	 loss 27.402458135446068
epoch 15 	 loss 21.397829050030055
epoch 16 	 loss 21.015190969090327
epoch 17 	 loss 22.124865675563342
epoch 18 	 loss 21.15882081655309
epoch 19 	 loss 20.39051566930901


In [32]:

answer = []
prediction = []

with torch.no_grad():
    for batch in test_iter:

        text_tensor = batch.Text[0].to(device)
        label_tensor = batch.Label.to(device)

        score, _ = classifier(text_tensor)
        _, pred = torch.max(score, 1)

        prediction += list(pred.cpu().numpy())
        answer += list(label_tensor.cpu().numpy())

print(classification_report(prediction, answer))


              precision    recall  f1-score   support

           0       0.72      0.84      0.77      2045
           1       0.87      0.74      0.80      2751
           2       0.82      0.84      0.83      2389

    accuracy                           0.80      7185
   macro avg       0.80      0.81      0.80      7185
weighted avg       0.81      0.80      0.80      7185



In [33]:
torch.save(classifier.state_dict(), 'drive/MyDrive/bert_tweet_gpu_20_2.pth')
torch.save(classifier.to('cpu').state_dict(), 'drive/MyDrive/bert_tweet_cpu_20_2.pth')
