In [1]:
import pandas as pd
import numpy as np

from torch.utils.data import TensorDataset
from torchsummary import summary

from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from bert_op import *

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
inputpath = 'D:/project/OA_paper/DATA/Part/'
savepath = 'D:/project/OA_paper/output/'

df = pd.read_csv(inputpath + 'Train.csv', encoding='cp949')

x = list(df['rejectionContentDetail'])
y = list(map(lambda x: x.split(', '), df['label']))

In [3]:
x = x[:1000]
y = y[:1000]

In [4]:
mlb = MultiLabelBinarizer()
yt = mlb.fit_transform(y)

In [5]:
# Getting a sense of how the tags data looks like
print(yt[0])
print(mlb.inverse_transform(yt[0].reshape(1,-1)))
print(mlb.classes_)

[1 0 1 1 0 0]
[('관념유사', '발음유사', '상품 불명확')]
['관념유사' '기타' '발음유사' '상품 불명확' '식별력' '외관유사']


In [6]:
# compute no. of words in each content
contents = x
word_cnt = [len(content.split()) for content in contents]

def bigger(x):
    return x>512
def smaller(x):
    return x<513

# word_cnt_bigger = list(filter(bigger, word_cnt))
word_cnt_smaller = list(filter(smaller, word_cnt))

# print(len(word_cnt))
# print(len(word_cnt_bigger))

In [None]:
# Plot the distribution
plt.figure(figsize=[8,5])
plt.hist(word_cnt_smaller, bins = 40)
plt.xlabel('Word Count/Content')
plt.ylabel('# of Occurences')
plt.title("Frequency of Word Counts/sentence")
plt.show()

### Split the dataset into training ,validation and test set.

In [7]:
# Split Train in to training and validation
x_tr, x_val, y_tr, y_val = train_test_split(x, yt, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

In [8]:
print(x_tr[:10])

[' 본원상표는 지정상품 &"칫솔&"과 관련하여 볼 때 타인의 선등록 400287475 (캐디악)호와 1요부의 칭호, 외관, 관념이 동일유사하므로 상표법 제7조 제1항 제7호에 해당하여 상표등록을 받을 수 없습니다.다만, 당해 지정상품을 삭제 보정하면 다른 거절이유가 없는 한 본원상표는 등록을 받을 수 있습니다.', '   상표법 제10조 제1항 이 출원상표는 아래와 같이 지정상품의 명칭 및 상품류가 불명확하므로 등록을 받을 수 없습니다.  다만, 아래 제시된 상품명을 삭제하는 보정을 하거나 상표법 시행규칙 [별표1]의 상품류 구분표에 예시된 상품명칭을 참고하여 명확한 지정상품명으로 수정하면 그러하지 아니합니다. ○ 지정상품 : 가죽 악세사리 예시) 가죽제 액세서리 - 제26류. 끝.', '  이 출원서비스표 “1++ 한우전문점”은 지정서비스업 “전부”와 관련하여 볼 때,‘최고등급의 한우를 전문으로 판매하는 식당’을 의미하여 서비스업의 성질(품질, 제공내용 등)을 직접적으로 표시한 것으로서 식별력이 없을 뿐만아니라 일반수요자들이 누구의 업무와 관련된 서비스업을 나타내는지 식별할 수 없는 서비스표이므로 상표법 제6조 제1항 제3호 및 제7호에 해당하여 서비스표등록을 받을 수 없습니다. 다만, 지적된 지정서비스업을 삭제하거나 분할하여 출원하는 등의 적법한 조치를 하면 다른 거절이유가 없는 한 이 출원서비스표는 등록받을 수 있습니다. 끝.', ' 본원상표는 &"환상적인, 멋진등의 뜻으로서 지정상품 중 &"애완동물용 장난감,크리스마스트리용 받침대,크리스마스트리용 인공눈(雪),크리스마스트리용 촛대,크리스마스트리용벨,합성수지제크리스마스트리,가면,고무제 완구,금속제 완구,깜짝상자,등제 완구,딸랑이,리모트콘트롤을 이용한 작동완구,마스코트인형,모빌,목제(木製) 완구,세트완구,손가락인형,어린이용 걸음마차,어린이용 모형승용차,어린이용 삼륜차,어린이용 흔들목마,완구악기,완구용 공,완구용 블록,완구용 스쿠터,완구용 원반,장난감권총,지제(紙製) 완구,포제(布製)완구,플라스틱제 완구,다

### Preparing the Dataset and DataModule

In [8]:
# Initialize the Bert tokenizer
BERT_MODEL_NAME = "bert-base-multilingual-cased"
Bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [9]:
# Initialize the parameters that will be use for training
N_EPOCHS = 1
BATCH_SIZE = 8
MAX_LEN = 300
LR = 2e-05

In [10]:
# Instantiate and set up the data_module
OAdata_module = TrainOADataModule(x_tr=x_tr, y_tr=y_tr, x_val=x_val, y_val=y_val, tokenizer=Bert_tokenizer,
                                  batch_size=BATCH_SIZE, max_token_len=MAX_LEN)
OAdata_module.setup()

In [11]:
# Instantiate the classifier model
steps_per_epoch = len(x_tr)//BATCH_SIZE
model = OAClassifier(n_classes=6, steps_per_epoch=steps_per_epoch,n_epochs=N_EPOCHS,lr=LR)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
summary(model, (1, 28, 28))

TypeError: forward() missing 1 required positional argument: 'attn_mask'

In [13]:
#Initialize Pytorch Lightning callback for Model checkpointing

# saves a file like: input/OA-epoch=02-val_loss=0.32.ckpt
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',                         # monitored quantity
    filename='OA-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,                               # save the top 3 models
    mode='min',                                 # mode of the monitored quantity  for optimization
)

In [14]:
# Instantiate the Model Trainer
trainer = pl.Trainer(max_epochs=N_EPOCHS, gpus=1, callbacks=[checkpoint_callback], progress_bar_refresh_rate=30)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [15]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [16]:
# Train the Classifier Model
trainer.fit(model, OAdata_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 177 M 
1 | classifier | Linear            | 4.6 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.432   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [18]:
# Evaluate the model performance on the validation dataset
trainer.validate(model,datamodule=OAdata_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_loss': 0.7495506405830383}
--------------------------------------------------------------------------------


[{'val_loss': 0.7495506405830383}]

In [19]:
# Retreive the checkpoint path for best model
model_path = checkpoint_callback.best_model_path

In [20]:
# Tokenize all contents in x_test
input_ids = []
attention_masks = []

for content in x_val:
    encoded_con = Bert_tokenizer.encode_plus(
        content,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_token_type_ids=False,
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    # Add the input_ids from encoded content to the list.
    input_ids.append(encoded_con['input_ids'])
    # Add its attention mask
    attention_masks.append(encoded_con['attention_mask'])

In [21]:
# Now convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y_val)

In [37]:
# Create the DataLoader.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=BATCH_SIZE)

In [38]:
flat_pred_outs = 0
flat_true_labels = 0

In [39]:
# Put model in evaluation mode
model = model.to(device)
model.eval()

OAClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [40]:
# Tracking variables
pred_outs, true_labels = [], []

# Predict
for batch in pred_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device).long() for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_attn_mask, b_labels = batch

    with torch.no_grad():
        # Forward pass, calculate logit predictions
        pred_out = model(b_input_ids, b_attn_mask)
        pred_out = torch.sigmoid(pred_out)
        # Move predicted output and labels to CPU
        pred_out = pred_out.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
    # Store predictions and true labels
    pred_outs.append(pred_out)
    true_labels.append(label_ids)

In [41]:
# Combine the results across all batches.
flat_pred_outs = np.concatenate(pred_outs, axis=0)

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [42]:
# define candidate threshold values
threshold = np.arange(0.4, 0.51, 0.01)

In [43]:
# convert probabilities into 0 or 1 based on a threshold value
def classify(pred_prob, thresh):
    y_pred = []
    for tag_label_row in pred_prob:
        temp = []
        for tag_label in tag_label_row:
            if tag_label >= thresh:
                temp.append(1)  # Infer tag value as 1 (present)
            else:
                temp.append(0)  # Infer tag value as 0 (absent)
        y_pred.append(temp)
    return y_pred

In [44]:
scores = []  # Store the list of f1 scores for prediction on each threshold

# convert labels to 1D array
y_true = flat_true_labels.ravel()
y_score = flat_pred_outs.ravel()

In [45]:
for thresh in threshold:
    # classes for each threshold
    pred_bin_label = classify(flat_pred_outs, thresh)

    # convert to 1D array
    y_pred = np.array(pred_bin_label).ravel()

    scores.append(metrics.f1_score(y_true, y_pred))

In [46]:
# find and save the optimal threshold
opt_thresh = threshold[scores.index(max(scores))]
# f = open(savepath + "opt_thresh.txt", 'w')
# f.write(str(opt_thresh))
# f.close()
print(f'Optimal Threshold Value = {opt_thresh}')

Optimal Threshold Value = 0.4


In [47]:
# predictions for optimal threshold
y_pred_labels = classify(flat_pred_outs, opt_thresh)
y_pred = np.array(y_pred_labels).ravel()  # Flatten

In [48]:
# report 작성
report = metrics.classification_report(y_true=y_true, y_pred=y_pred, output_dict=True)
report = pd.DataFrame(report).transpose()
print(report)

              precision    recall  f1-score  support
0              0.800000  0.013937  0.027397   861.00
1              0.283544  0.991150  0.440945   339.00
accuracy       0.290000  0.290000  0.290000     0.29
macro avg      0.541772  0.502544  0.234171  1200.00
weighted avg   0.654101  0.290000  0.144224  1200.00


In [49]:
y_pred = mlb.inverse_transform(np.array(y_pred_labels))
y_act = mlb.inverse_transform(flat_true_labels)

In [None]:
# 결과 저장용 output
# output = pd.DataFrame({'Body':x_val,'Actual labels':y_act,'Predicted labels':y_pred})