In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

In [2]:
import pandas as pd
import torch
import re
# from transformers.modeling_bert import BertModel
from transformers import BertTokenizer, BertForPreTraining, BertModel,BertForSequenceClassification
from sklearn.metrics import precision_score
import torch
from torch import nn
import json
import torchtext
import string
import random
import sys

In [3]:
df_train = pd.read_csv('/content/drive/My Drive/data/train1.csv')
df_test = pd.read_csv('/content/drive/My Drive/data/test1.csv')

In [4]:
df_train['jobflag'] = df_train['jobflag'] -1
df_train['jobflag'].value_counts()

2    1376
0     624
3     583
1     348
Name: jobflag, dtype: int64

In [5]:
df_train.drop_duplicates(subset='description', keep='last', inplace=True)
df_train = df_train.reset_index(drop=True)

## tokenizer and preprocessing

In [6]:
tokenizer_bert = BertTokenizer(vocab_file='/content/drive/My Drive/vocab/bert-base-uncased-vocab.txt')

def preprocessing(text):
    return text


def token_same_len(token):
    token.insert(0,'[CLS]')
    if len(token) < 512:
        while len(token) != 512:
            token.insert(512,'[PAD]')
    token.insert(128,['SEP'])
    return token

def token_and_prepro(text, tokenizer = tokenizer_bert):
    text = preprocessing(text)
    token = tokenizer.tokenize(text)
    token = token_same_len(token)
    ids = tokenizer.convert_tokens_to_ids(token[:128])
    return ids

## dataset(torchtext)

In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer,token_same_len):
        self.df = df
        self.tokenizer = tokenizer
        self.token_same_len = token_same_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, input_ids):
        text = self.df['description'][input_ids]
        label = self.df['jobflag'][input_ids]
        
        token = self.tokenizer(text)
        
        tensor_token = torch.tensor(token)
        tensor_label = torch.tensor(label)
        
        return tensor_token, tensor_label

In [8]:
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer,token_same_len):
        self.df = df
        self.tokenizer = tokenizer
        self.token_same_len = token_same_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, input_ids):
        text = self.df['description'][input_ids]
        
        token = self.tokenizer(text)
        
        tensor_token = torch.tensor(token)
        
        return tensor_token

In [9]:

# del df_test['id']
dataset = Dataset(df = df_train, tokenizer = token_and_prepro,token_same_len=token_same_len)
test_dataset = TestDataset(df = df_test, tokenizer = token_and_prepro,token_same_len=token_same_len)
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), int(len(dataset)*0.2)])

In [10]:
from torch.utils.data.sampler import BatchSampler
from torch.utils.data import DataLoader
import numpy as np
class BalancedBatchSampler(BatchSampler):
    """
    BatchSampler - from a MNIST-like dataset, samples n_classes and within these classes samples n_samples.
    Returns batches of size n_classes * n_samples
    """

    def __init__(self, dataset, n_classes, n_samples):
        loader = DataLoader(dataset)
        self.labels_list = []
        for _, label in loader:
            self.labels_list.append(label)
        self.labels = torch.LongTensor(self.labels_list)
        self.labels_set = list(set(self.labels.numpy()))
        self.label_to_indices = {label: np.where(self.labels.numpy() == label)[0]
                                 for label in self.labels_set}
        for l in self.labels_set:
            np.random.shuffle(self.label_to_indices[l])
        self.used_label_indices_count = {label: 0 for label in self.labels_set}
        self.count = 0
        self.n_classes = n_classes
        self.n_samples = n_samples
        self.dataset = dataset
        self.batch_size = self.n_samples * self.n_classes

    def __iter__(self):
        self.count = 0
        while self.count + self.batch_size < len(self.dataset):
            classes = np.random.choice(self.labels_set, self.n_classes, replace=False)
            indices = []
            for class_ in classes:
                indices.extend(self.label_to_indices[class_][
                               self.used_label_indices_count[class_]:self.used_label_indices_count[
                                                                         class_] + self.n_samples])
                self.used_label_indices_count[class_] += self.n_samples
                if self.used_label_indices_count[class_] + self.n_samples > len(self.label_to_indices[class_]):
                    np.random.shuffle(self.label_to_indices[class_])
                    self.used_label_indices_count[class_] = 0
            yield indices
            self.count += self.n_classes * self.n_samples

    def __len__(self):
        return len(self.dataset) // self.batch_size

In [11]:
train_sampler = BalancedBatchSampler(train_dataset, 4, 4)
val_sampler = BalancedBatchSampler(val_dataset, 4, 4)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_sampler=train_sampler)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_sampler=val_sampler)
load_dict = {"train":train_dataloader, 'val':val_dataloader}

## dataloader

In [12]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= 16, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size= 16, shuffle=False)
load_dict = {"train":train_dataloader, 'val':val_dataloader}

In [12]:
dl_test = torch.utils.data.DataLoader(test_dataset, batch_size = 16, shuffle = False)

## model構築

In [13]:
model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')
# model1 =BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path='bert-base-uncased',num_labels=4)
# print(model1)

In [14]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert = model
        # self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(in_features=768, out_features=4)
#         self.dropout = nn.Dropout(0.3)
        
        nn.init.normal_(self.classifier.weight, std=0.02)
        nn.init.normal_(self.classifier.bias, 0)
        
    def forward(self, inputs):
        output = self.bert(inputs)
        vec_0 = output[0]  
        vec_0 = vec_0[:, 0, :] 
        vec_0 = vec_0.view(-1, 768)
        
        result = self.classifier(vec_0)
        
        return result

In [None]:
bert_model = BERT()
bert_model.train()
# print(bert_model)

### 勾配の計算場所

In [16]:
# 1.勾配計算Falseにする（ALl）
for param in bert_model.parameters():
    param.requires_grad = False

# 2. BertLayer[12]そう目
for param in bert_model.bert.encoder.layer[-1].parameters():
    param.requires_grad = True

# 3. label
for param in bert_model.classifier.parameters():
    param.requires_grad = True


In [17]:
df_train['jobflag'].value_counts()

2    1364
0     622
3     579
1     340
Name: jobflag, dtype: int64

### oprimzer and loss

In [19]:
import torch.optim as optim


# BERTの元の部分はファインチューニング
optimizer = optim.Adam([
    {'params': bert_model.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
    {'params': bert_model.classifier.parameters(), 'lr': 1e-4}
])

# 損失関数の設定
# weights = torch.tensor([2.2,4.0,1.0,2.35]).cuda()
# criterion = nn.CrossEntropyLoss(weight=weights)
criterion = nn.CrossEntropyLoss()

In [22]:
import sys
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    net.to(device)
    # ネットワークがある程度固定であれば、高速化させる
    torch.backends.cudnn.benchmark = True

    # ミニバッチのサイズ
    # batch_size = dataloaders_dict["train"].batch_size
    batch_size = 16

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
            else:
                net.eval()

            epoch_loss = 0.0
            f1_batch =0
            epoch_corrects = 0
            iteration = 1

            # データローダーからミニバッチを取り出すループ
            for batch in (dataloaders_dict[phase]):
                # batchはTextとLableの辞書型変数

                # GPUが使えるならGPUにデータを送る
                inputs = batch[0].to(device)  # 文章
                labels = batch[1].to(device)  # ラベル

                # optimizerを初期化
                optimizer.zero_grad()

                # 順伝搬（forward）計算
                with torch.set_grad_enabled(phase == 'train'):

                    # BERTに入力
                    outputs = net(inputs)
                    
                    loss = criterion(outputs, labels)  # 損失を計算

                    _, preds = torch.max(outputs, 1)  # ラベルを予測

                    # 訓練時はバックプロパゲーション
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                        if (iteration % 25 == 0):  # 10iterに1度、lossを表示
                            acc = (torch.sum(preds == labels.data)
                                   ).double()/batch_size
                            f1 = precision_score(preds.cpu().numpy(),
                                                 labels.data.cpu().numpy(),
                                                 average='macro')
                            print(' All / batch　{} || Loss: {:.4f} | ACC：{}　| F1 :{}'.format(
                                iteration, loss.item(),  acc, f1))
                    
                    

                    iteration += 1

                    # 損失と正解数の合計を更新
                    epoch_loss += loss.item() * batch_size
                    epoch_corrects += torch.sum(preds == labels.data)
                    # f1_batch += precision_score(preds.cpu().numpy(),
                    #                         labels.data.cpu().numpy(),
                    #                         average='macro')
                    # print(f1_batch)
                        
            # epochごとのlossと正解率
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double(
            ) / len(dataloaders_dict[phase].dataset)
            # epoch_f1 = f1_batch/len(dataloaders_dict[phase].dataset)
            ##########lineに送信##########################
            if phase != 'train':
                url = "https://notify-api.line.me/api/notify"
                token = 'tQSMjodBp3mHEA6wGscIofzVyDUquKliy6diNv5eP78'
                headers = {"Authorization" : "Bearer "+ token}
                message =  'Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs,
                                                                            phase, epoch_loss, epoch_acc)
                payload = {"message" :  message}
                r = requests.post(url ,headers = headers ,params=payload)
            ###############################################
            print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs,
                                                                           phase, epoch_loss, epoch_acc))

    return net

In [48]:
import requests
num_epochs = 1
net_trained = train_model(bert_model, load_dict,
                          criterion, optimizer, num_epochs=num_epochs)

 All / batch　25 || Loss: 0.8428 | ACC：0.625　| F1 :0.625
 All / batch　50 || Loss: 0.6833 | ACC：0.625　| F1 :0.625
 All / batch　75 || Loss: 1.1528 | ACC：0.375　| F1 :0.375
 All / batch　100 || Loss: 0.7495 | ACC：0.625　| F1 :0.625
 All / batch　125 || Loss: 0.9930 | ACC：0.5　| F1 :0.5
Epoch 1/1 | train |  Loss: 0.8628 Acc: 0.6377
Epoch 1/1 |  val  |  Loss: 1.0991 Acc: 0.5284


In [27]:
from tqdm import tqdm
preds_list = []
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

bert_model.eval()
bert_model.to(device)


for batch in tqdm(dl_test): 
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    inputs = batch.to(device)  
    
    with torch.set_grad_enabled(False):
        outputs = net_trained(inputs)
        # loslogits = outputs
        _, preds = torch.max(outputs, 1)  # ラベルを予測
        preds_list.append(preds)

100%|██████████| 109/109 [00:15<00:00,  7.03it/s]


In [45]:
import collections
def submit_csv(test_id, preds,device):
    label_list = []
    if device == 'cpu':
        preds = list(map(lambda x: x+1, preds))
        submit = pd.DataFrame({'id':test_id,'pred':preds})
        submit.to_csv('submit.csv',index=False, header=False)
        print('完了cpu')
    else:
        for  i in preds_list:
            labels = i.to('cpu').detach().numpy().copy()
            label_list.extend(labels)
        label_list = list(map(lambda x: x+1, label_list))
        submit = pd.DataFrame({'id':test_id,'pred':label_list})
        submit.to_csv('submit1.csv',index=False, header=False)
        c = collections.Counter(label_list)
        print(c)
        print('完了GPU')


In [46]:
test_id = df_test['id'].to_list()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
label_list = submit_csv(test_id, preds,device)

Counter({3: 670, 4: 499, 1: 421, 2: 153})
完了GPU


In [31]:
# ストップワード
# scoreと文字の長さをlgb

Counter({3: 670, 4: 499, 1: 421, 2: 153})
