In [1]:
import pandas as pd
import numpy as np

from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn, cuda
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler, TensorDataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from bert_op import *


RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
inputpath = 'D:/project/OA_paper/DATA/Combine/'
savepath = ''

df = pd.read_csv(inputpath + 'labeling_merge.csv', encoding='cp949')

x = list(df['rejectionContentDetail'])
y = list(map(lambda x: x.split(', '), df['label']))

In [3]:
x = x[:1000]
y = y[:1000]

In [13]:
y

[['외관유사'],
 ['발음유사', '관념유사', '외관유사'],
 ['식별력'],
 ['발음유사'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['발음유사', '관념유사'],
 ['발음유사', '관념유사'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사'],
 ['외관유사'],
 ['발음유사', '관념유사', '식별력'],
 ['상품 불명확'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사', '관념유사', '상품 불명확'],
 ['발음유사'],
 ['발음유사'],
 ['발음유사'],
 ['발음유사', '관념유사', '외관유사'],
 ['상품 불명확'],
 ['발음유사'],
 ['발음유사', '관념유사'],
 ['발음유사'],
 ['식별력'],
 ['발음유사', '상품 불명확'],
 ['발음유사', '관념유사', '상품 불명확'],
 ['상품 불명확'],
 ['기타'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['발음유사'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['관념유사', '외관유사', '상품 불명확'],
 ['관념유사', '식별력'],
 ['발음유사'],
 ['발음유사', '관념유사', '외관유사'],
 ['발음유사', '관념유사', '외관유사'],
 ['발음유사', '외관유사'],
 ['발음유사', '외관유사'],
 ['발음유사', '외관유사'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['발음유사'],
 ['발음유사', '관념유사', '상품 불명확'],
 ['발음유사'],
 ['발음유사', '관념유사'],
 ['식별력', '상품 불명확'],
 ['식별력', '상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['발음유사'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사'],

In [4]:
mlb = MultiLabelBinarizer()
yt = mlb.fit_transform(y)
print(mlb.classes_)

['관념유사' '기타' '발음유사' '상품 불명확' '식별력' '외관유사']


In [5]:
# Next split Train in to training and validation
x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

In [16]:
y_tr

[['기타'],
 ['발음유사'],
 ['발음유사', '관념유사'],
 ['기타'],
 ['외관유사'],
 ['발음유사', '관념유사', '외관유사'],
 ['발음유사', '외관유사'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사', '관념유사'],
 ['발음유사', '관념유사', '외관유사', '상품 불명확'],
 ['식별력', '상품 불명확'],
 ['식별력', '상품 불명확'],
 ['식별력'],
 ['식별력'],
 ['발음유사', '관념유사', '외관유사'],
 ['발음유사', '식별력'],
 ['관념유사'],
 ['상품 불명확'],
 ['식별력'],
 ['발음유사'],
 ['발음유사'],
 ['발음유사', '관념유사'],
 ['발음유사'],
 ['식별력'],
 ['식별력'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사', '관념유사'],
 ['발음유사', '상품 불명확'],
 ['상품 불명확'],
 ['발음유사', '관념유사'],
 ['발음유사'],
 ['발음유사', '식별력'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['상품 불명확'],
 ['발음유사'],
 ['발음유사', '관념유사', '상품 불명확'],
 ['상품 불명확'],
 ['발음유사'],
 ['발음유사', '식별력'],
 ['식별력'],
 ['발음유사', '관념유사', '식별력'],
 ['발음유사', '외관유사'],
 ['발음유사', '관념유사', '외관유사'],
 ['발음유사'],
 ['발음유사'],
 ['발음유사', '관념유사'],
 ['식별력'],
 ['발음유사', '관념유사'],
 ['발음유사', '상품 불명확'],
 ['발음유사'],
 ['발음유사'],
 ['발음유사'],
 ['발음유사'],
 ['식별력'],
 ['식별력'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사'],
 ['발음유사'],
 ['상품 불명확'],
 ['발음유사', '관념유사'],
 ['발음유사', '상품 불명확'],
 ['발음유사'],
 ['상품 불명확']

In [6]:
# Initialize the Bert tokenizer
BERT_MODEL_NAME = "bert-base-multilingual-cased"
Bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

max_word_cnt = 512
content_cnt = 0

contents = x

# For every sentence...
for content in contents:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = Bert_tokenizer.encode(content, add_special_tokens=True)

    # Update the maximum sentence length.
    if len(input_ids) > max_word_cnt:
        content_cnt += 1


Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors


In [7]:
# Initialize the parameters that will be use for training
N_EPOCHS = 3
BATCH_SIZE = 8
MAX_LEN = 512
LR = 2e-05

In [8]:
# Instantiate and set up the data_module
OAdata_module = TrainOADataModule(x_tr=x_tr, y_tr=y_tr, x_val=x_val, y_val=y_val, tokenizer=Bert_tokenizer,
                                   batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
OAdata_module.setup()

In [9]:
# Instantiate the classifier model
steps_per_epoch = len(x_tr)//BATCH_SIZE
model = OAClassifier(n_classes=6, steps_per_epoch=steps_per_epoch,n_epochs=N_EPOCHS,lr=LR)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# saves a file like: input/OA-epoch=02-val_loss=0.32.ckpt
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',                         # monitored quantity
    filename='OA-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,                               # save the top 3 models
    mode='min',                                 # mode of the monitored quantity for optimization
)

In [11]:
# Instantiate the Model Trainer
trainer = pl.Trainer(max_epochs=N_EPOCHS, gpus=1, callbacks=[checkpoint_callback], progress_bar_refresh_rate=30)

# Train the Classifier Model
trainer.fit(model, OAdata_module)

# Evaluate the model performance on the test dataset
trainer.test(model, datamodule=OAdata_module)

# Retreive the checkpoint path for best model
model_path = checkpoint_callback.best_model_path

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 177 M 
1 | classifier | Linear            | 4.6 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.432   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



ValueError: too many dimensions 'str'