### 0. Loading data
#### We've attached the raw data. To use this, load the data by `pandas.read_excel()`.

In [None]:
import pandas as pd
import os

# root_dir = '/content/drive/My Drive/QIA2023'
root_dir = os.getcwd() + '/data/'
questions_dir = 'Question.xlsx'
train_dir = 'p1_train.csv'
test_dir = 'p1_test.csv'

model_IE_dir = 'model_IE.pt'
model_SN_dir = 'model_SN.pt'
model_TF_dir = 'model_TF.pt'
model_JP_dir = 'model_JP.pt'

questions = pd.read_excel(os.path.join(root_dir, questions_dir))
questions.drop(['index', 'index.1'], axis='columns', inplace=True)
display(questions.head())

Mounted at /content/drive


Unnamed: 0,Question
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.


### 1. Preprocessing


#### 1-1. Encoding MBTI Labels
Since the lable is given as String type (e.g. ISTP), we have to convert it to vectors.

In [None]:
def MBTI_to_vec(mbti):
  if len(mbti) != 4:
    raise Exception(f"Not valid: {mbti}")

  if mbti[0] == 'I':
    fst = [1, 0]
  elif mbti[0] == 'E':
    fst = [0, 1]
  else:
    raise Exception(f"Not valid: {mbti}")

  if mbti[1] == 'S':
    snd = [1, 0]
  elif mbti[1] == 'N':
    snd = [0, 1]
  else:
    raise Exception(f"Not valid: {mbti}")

  if mbti[2] == 'T':
    trd = [1, 0]
  elif mbti[2] == 'F':
    trd = [0, 1]
  else:
    raise Exception(f"Not valid: {mbti}")
  
  if mbti[3] == 'J':
    fth = [1, 0]
  elif mbti[3] == 'P':
    fth = [0, 1]
  else:
    raise Exception(f"Not valid: {mbti}")
  
  return [fst, snd, trd, fth]

# Unit Test
assert(MBTI_to_vec("ISTJ") == [[1, 0], [1, 0], [1, 0], [1, 0]])
assert(MBTI_to_vec("ENFP") == [[0, 1], [0, 1], [0, 1], [0, 1]])

### 2. Model Proposal
#### BERT-based model
Please refer to the report!

### 3. Implementation of the BERT-based Model
We decide to use pre-trained BERT model as the base model, so let's implement it!

#### 3-0. Data Setup
First of all, we should organize our data with some organized logics.  
In this section, I will load three datasets and aggregate them into two datasets: Questions, Training Data and Test Data.

In [None]:
questions = pd.read_excel(os.path.join(root_dir, questions_dir))
questions.drop(['index', 'index.1'], axis='columns', inplace=True)
display(questions.head())

Unnamed: 0,Question
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.


In [None]:
# Retrieve the original question in String, using the question number
def retrival_q(q_num):
  return questions.loc[q_num - 1]['Question']

# Unit Test
assert(retrival_q(1) == "주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁금해요.")


training = pd.read_csv(os.path.join(root_dir, train_dir), encoding="CP949")

# We will only use the question-answer pair, at this time.
training.drop(['Data_ID', 'User_ID', 'Gender', 'Age'], axis='columns', inplace=True)

# Retreieve the original question!
training['Question'] = training['Q_number'].apply(retrival_q)
training.drop('Q_number', axis='columns', inplace=True)

# Reordering
training = training[['Question', 'Answer', 'MBTI']]

training.head()

Unnamed: 0,Question,Answer,MBTI
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,<아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...,INFP
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,<중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하...,INFP
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,<그렇다> 감정 이입이 잘되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...,INFP
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,<중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다. 일의 변수가 생길 수 있고...,INFP
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,<아니다> 평정심을 유지 못 하는 편입니다. 머릿속은 백지화가 된 상태로 말도 제대...,INFP


In [None]:
testing = pd.read_csv(os.path.join(root_dir, test_dir), encoding="CP949")

# Again, we will only use the question-answer pair, at this moment.
testing.drop(['Gender', 'Age'], axis='columns', inplace=True)

# Retreieve the original question!
testing['Question'] = testing['Q_number'].apply(retrival_q)
testing.drop('Q_number', axis='columns', inplace=True)

# Reordering
testing = testing[['Question', 'Answer']]

testing.head()

Unnamed: 0,Question,Answer
0,마감 기한을 지키기가 힘든가요? 경험을 이야기해보아요.,<아니다> 저는 모든 일은 정해진 시간을 지켜서 해야 된다고 생각되어서 마감 기한을...
1,조용하고 사적인 장소보다는 사람들로 붐비고 떠들썩한 장소를 좋아하나요? 답변에 대한...,<중립> 저는 조용하고 사적인 장소도 좋아하고 사람들로 붐비고 떠들썩한 장소도 좋아...
2,단계를 건너뛰는 일 없이 절차대로 일을 완수하는 편인가요? 그러한 최근 경험은 어떤...,<그렇다> 저는 규칙을 잘 지키고 매뉴얼 대로 일하는 사람입니다. 그래서 데이터 라...
3,일이 원하는 대로 진행될 것이라는 자신감이 있나요? 그렇게 된 계기나 이유가 있나요.,<그렇다> 저는 항상 긍정적인 사고방식을 가지고 살려고 노력하고 있습니다. 이유는 ...
4,대부분의 시간을 혼자서 일할 수 있는 직업을 원하나요? 이유도 말씀해주세요.,<중립> 혼자서 일하는 것도 좋고 함께 일하는 것도 모두 좋은데 같이 의논하는 일도...


#### 3-1. Preprocessing
For utilizing BERT, we have to satisify its own preprocessing requirements.  
For example, it requires speical tokens such as SEP and CLS in the input.  
But don't worry, since these requirements can be simply satisfied if we use the BERT Tokenizer.


In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for train_sentence in training['Question']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

for train_sentence in training['Answer']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 206 in tokens, so set max_length as 256, safely
max_len = 256

input_ids = []
att_masks = []
labels_IE = []
labels_SN = []
labels_TF = []
labels_JP = []

# Preprocessing
for idx in training.index:
  question = training['Question'][idx]
  answer = training['Answer'][idx]
  mbti = training['MBTI'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  # Convert MBTI of string to list of integers.
  mbti = MBTI_to_vec(mbti)

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  labels_IE.append(torch.tensor([mbti[0]]))
  labels_SN.append(torch.tensor([mbti[1]]))
  labels_TF.append(torch.tensor([mbti[2]]))
  labels_JP.append(torch.tensor([mbti[3]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
labels_IE = torch.cat(labels_IE, dim=0)
labels_SN = torch.cat(labels_SN, dim=0)
labels_TF = torch.cat(labels_TF, dim=0)
labels_JP = torch.cat(labels_JP, dim=0)

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

206


In [None]:
print(input_ids[0])
print(att_masks[0])
print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

print(len(input_ids), len(att_masks), len(labels_IE))

tensor([    2, 25753, 14567, 28897, 18069, 14526,  2033, 19742, 22742,  8082,
        31724,  3463, 32771,  8061, 19773, 16941, 24296,  8055,  2016,     3,
         2030, 15345,  2032, 18430,  3463,  5724,  8423, 26850, 20699, 14204,
        15916, 17729, 25878, 18895, 14045, 27024,  8107, 28669,  8120,  6266,
        24832,  2016,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
import random
import numpy as np

# Fix the seeds
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#### 3-2. Data Split
Currently, we do not have the answers for testing dataset, so we must split the training data to evaluate our model. (8:1:1)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, labels_IE)
dataset_SN = TensorDataset(input_ids, att_masks, labels_SN)
dataset_TF = TensorDataset(input_ids, att_masks, labels_TF)
dataset_JP = TensorDataset(input_ids, att_masks, labels_JP)

train_size = int(0.9 * len(dataset_IE))
val_size = int(0.05 * len(dataset_IE))
test_size = len(dataset_IE) - train_size - val_size

print(f"lengths are {train_size}:{val_size}:{test_size}")

# Split into train dataset, validation dataset and test dataset.
train_dataset_IE, val_dataset_IE, test_dataset_IE = random_split(dataset_IE, [train_size, val_size, test_size])
train_dataset_SN, val_dataset_SN, test_dataset_SN = random_split(dataset_SN, [train_size, val_size, test_size])
train_dataset_TF, val_dataset_TF, test_dataset_TF = random_split(dataset_TF, [train_size, val_size, test_size])
train_dataset_JP, val_dataset_JP, test_dataset_JP = random_split(dataset_JP, [train_size, val_size, test_size])

batch_size = 16 # 16 or 32

# Define dataloaders
train_dataloader_IE = DataLoader(
    train_dataset_IE,
    sampler = RandomSampler(train_dataset_IE),
    batch_size = batch_size
)

val_dataloader_IE = DataLoader (
    val_dataset_IE,
    sampler = SequentialSampler(val_dataset_IE),
    batch_size = batch_size
)

test_dataloader_IE = DataLoader (
    test_dataset_IE,
    sampler = SequentialSampler(test_dataset_IE),
    batch_size = batch_size
)

train_dataloader_SN = DataLoader(
    train_dataset_SN,
    sampler = RandomSampler(train_dataset_SN),
    batch_size = batch_size
)

val_dataloader_SN = DataLoader (
    val_dataset_SN,
    sampler = SequentialSampler(val_dataset_SN),
    batch_size = batch_size
)

test_dataloader_SN = DataLoader (
    test_dataset_SN,
    sampler = SequentialSampler(test_dataset_SN),
    batch_size = batch_size
)

train_dataloader_TF = DataLoader(
    train_dataset_TF,
    sampler = RandomSampler(train_dataset_TF),
    batch_size = batch_size
)

val_dataloader_TF = DataLoader (
    val_dataset_TF,
    sampler = SequentialSampler(val_dataset_TF),
    batch_size = batch_size
)

test_dataloader_TF = DataLoader (
    test_dataset_TF,
    sampler = SequentialSampler(test_dataset_TF),
    batch_size = batch_size
)

train_dataloader_JP = DataLoader(
    train_dataset_JP,
    sampler = RandomSampler(train_dataset_JP),
    batch_size = batch_size
)

val_dataloader_JP = DataLoader (
    val_dataset_JP,
    sampler = SequentialSampler(val_dataset_JP),
    batch_size = batch_size
)

test_dataloader_JP = DataLoader (
    test_dataset_JP,
    sampler = SequentialSampler(test_dataset_JP),
    batch_size = batch_size
)

lengths are 10368:576:576


#### 3-3. Model Definition
Now we require a model written in Torch package.  
Because we decided to **fine-tune** the pretrained BERT model, I'm going to add just one layer on top of the pretrained model.

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

class MBTIClassifier(nn.Module):
  def __init__ (self,
                bert,
                hidden_size=768,
                num_classes=2,
                dr_rate=None):
    super(MBTIClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.bert = bert
    self.lin = nn.Linear(hidden_size, num_classes)
    self.classifier = nn.Softmax(dim=1)

    # Model 0.
    # self.classifier = nn.Linear(hidden_size, num_classes)

    # Model 1.
    # self.lin = nn.Linear(hidden_size, 256)
    # self.lin2 = nn.Linear(256, 128)
    # self.lin3 = nn.Linear(128, 64)
    # self.classifier = nn.Linear(64, num_classes)

    # Model 2.
    # self.lin = nn.Linear(hidden_size, 256)
    # self.lin2 = nn.Linear(256, 128)
    # self.lin3 = nn.Linear(128, 64)
    # self.lin4 = nn.Linear(64, num_classes)
    # self.classifier = nn.Softmax(dim=0)

    # Model 3.
    # self.lin = nn.Linear(hidden_size, 512)
    # self.lin2 = nn.Linear(512, 256)
    # self.lin3 = nn.Linear(256, 64)
    # self.classifier = nn.Linear(64, num_classes)

    # Model 4.
    # self.lin = nn.Linear(hidden_size, 256)
    # self.sig = nn.Sigmoid()
    # self.lin2 = nn.Linear(256, 64)
    # self.sig2 = nn.Sigmoid()
    # self.classifier = nn.Linear(64, num_classes)

    # Model 5.
    # self.lin = nn.Linear(hidden_size, 256)
    # self.relu = nn.ReLU()
    # self.lin2 = nn.Linear(256, 64)
    # self.relu2 = nn.ReLU()
    # self.lin3 = nn.Linear(64, num_classes)
    # self.classifier = nn.Sigmoid()

    # Model 6.
    # self.lin = nn.Linear(hidden_size, 128)
    # self.relu = nn.ReLU()
    # self.lin2 = nn.Linear(128, num_classes)
    # self.classifier = nn.Sigmoid()

    # Model 7.
    # self.lin = nn.Linear(hidden_size, num_classes)
    # self.classifier = nn.Sigmoid()

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)
  
  def forward(self, input_ids, att_masks):
    bert_output = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    
    if self.dr_rate:
      dr_output = self.dropout(bert_output)
    else:
      dr_output = bert_output

    lin_output = self.lin(dr_output)
    return self.classifier(lin_output)

    # Model 0.
    # return self.classifier(dr_output)

    # Model 1.
    # lin_output = self.lin(dr_output)
    # lin2_output = self.lin2(lin_output)
    # lin3_output = self.lin3(lin2_output)
    # return self.classifier(lin3_output)

    # Model 2.
    # lin_output = self.lin(dr_output)
    # lin2_output = self.lin2(lin_output)
    # lin3_output = self.lin3(lin2_output)
    # lin4_output = self.lin4(lin3_output)
    # return self.classifier(lin4_output)

    # Model 3.
    # lin_output = self.lin(dr_output)
    # lin2_output = self.lin2(lin_output)
    # lin3_output = self.lin3(lin2_output)
    # return self.classifier(lin3_output)

    # Model 4.
    # lin_output = self.lin(dr_output)
    # sig_output = self.sig(lin_output)
    # lin2_output = self.lin2(sig_output)
    # sig2_output = self.sig2(lin2_output)
    # return self.classifier(lin2_output)

    # Model 5.
    # lin_output = self.lin(dr_output)
    # relu_output = self.relu(lin_output)
    # lin2_output = self.lin2(relu_output)
    # relu2_output = self.relu2(lin2_output)
    # lin3_output = self.lin3(relu2_output)
    # return self.classifier(lin3_output)

    # Model 6.
    # lin_output = self.lin(dr_output)
    # relu_output = self.relu(lin_output)
    # lin2_output = self.lin2(relu_output)
    # return self.classifier(lin2_output)

    # Model 7.
    # lin_output = self.lin(dr_output)
    # return self.classifier(lin_output)

model_bert_IE = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP = BertModel.from_pretrained("kykim/bert-kor-base")
  
model_IE = MBTIClassifier(model_bert_IE, dr_rate = 0.3)
model_SN = MBTIClassifier(model_bert_SN, dr_rate = 0.3)
model_TF = MBTIClassifier(model_bert_TF, dr_rate = 0.3)
model_JP = MBTIClassifier(model_bert_JP, dr_rate = 0.3)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters_IE = [
    {'params': [p for n, p in model_IE.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_IE.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_SN = [
    {'params': [p for n, p in model_SN.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_SN.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_TF = [
    {'params': [p for n, p in model_TF.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_TF.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_JP = [
    {'params': [p for n, p in model_JP.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_JP.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

learning_rate = 2e-5

optimizer_IE = AdamW(optimizer_grouped_parameters_IE,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_SN = AdamW(optimizer_grouped_parameters_SN,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_TF = AdamW(optimizer_grouped_parameters_TF,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_JP = AdamW(optimizer_grouped_parameters_JP,
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 4 # 2 or 4

total_steps = len(train_dataloader_IE) * epochs
warmup_steps = int(total_steps * 0.1)

scheduler_IE = get_cosine_schedule_with_warmup(optimizer_IE, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_SN = get_cosine_schedule_with_warmup(optimizer_SN, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_TF = get_cosine_schedule_with_warmup(optimizer_TF, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_JP = get_cosine_schedule_with_warmup(optimizer_JP, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.

#### 3-4. Training
Finally, we can do train our model!  
Let's see how accurate our model is :)

In [None]:
import numpy as np

device = torch.device("cuda")

model_IE.cuda()
model_SN.cuda()
model_TF.cuda()
model_JP.cuda()

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  preds = preds.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = np.argmax(labels, axis=1).flatten()
  return np.sum(preds_flat == labels_flat) / len(labels_flat)

torch.cuda.empty_cache()

##### 3-4-1. I vs. E

In [None]:
for epoch in range(epochs):
  print(f"\n======= I/E : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_IE.train()

  for step, batch in enumerate(train_dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].float().to(device)

    optimizer_IE.zero_grad()

    b_out = model_IE(b_input_id, b_input_mask)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_IE.parameters(), 1.0)

    optimizer_IE.step()
    scheduler_IE.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  print("\n======= I/E : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_IE.eval()
  for step, batch in enumerate(val_dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].to(device)

    b_out = model_IE(b_input_id, b_input_mask)
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_IE.eval()
preds = []
labels = []

for batch in test_dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= I/E : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")




  return self.classifier(lin_output)


epoch 1 batch id 1 loss 0.6359407901763916 train acc 0.75
epoch 1 batch id 129 loss 0.6607937812805176 train acc 0.49660852713178294
epoch 1 batch id 257 loss 0.6595480442047119 train acc 0.498784046692607
epoch 1 batch id 385 loss 0.713675856590271 train acc 0.5128246753246753
epoch 1 batch id 513 loss 0.6880520582199097 train acc 0.5237573099415205
epoch 1 batch id 641 loss 0.64157634973526 train acc 0.5350039001560063

epoch 1 train acc 0.5347222222222222
epoch 1 validation acc 0.6197916666666666

epoch 2 batch id 1 loss 0.7934859991073608 train acc 0.4375
epoch 2 batch id 129 loss 0.6510183811187744 train acc 0.6017441860465116
epoch 2 batch id 257 loss 0.6619061231613159 train acc 0.6116245136186771
epoch 2 batch id 385 loss 0.4731528162956238 train acc 0.6233766233766234
epoch 2 batch id 513 loss 0.6018325090408325 train acc 0.6270711500974658
epoch 2 batch id 641 loss 0.6540859341621399 train acc 0.6312402496099844

epoch 2 train acc 0.6318479938271605
epoch 2 validation acc 0.6

모델0: pooler_output -> Linear  
: epochs = 4, Test Accuracy: 0.6154513888888888

모델1: pooler_output -> Linear(256) -> Linear(128) -> Linear(64) -> Linear  
: epochs = 4, Test Accuracy: 0.6362847222222222

모델2: pooler_output -> Linear(256) -> Linear(128) -> Linear(64) -> Linear(2) -> Softmax  
: epochs = 4, Test Accuracy: 0.6085069444444444

모델3: pooler_output -> Linear(512) -> Linear(256) -> Linear(64) -> Linear  
: epochs = 4, Test Accuracy: 0.6345486111111112  

모델4: pooler_output -> Linear(256) -> Sigmoid -> Linear(64) -> Sigmoid -> Linear  
: epochs = 4, Test Accuracy: 0.625

모델5: pooler_output -> Linear(256) -> ReLU -> Linear(64) -> ReLU -> Linear(2) -> Sigmoid  
: epochs = 4, Test Accuracy: 0.6145833333333334

모델6: pooler_output -> Linear(128) -> ReLU -> Linear(2) -> Sigmoid  
: epochs = 4, Test Accuracy: 0.4869791666666667

모델7: pooler_output -> Linear(2) -> Sigmoid  
: epochs = 4, Test Accuracy: 0.6137152777777778

In [None]:
torch.save(model_IE, model_IE_dir)

##### 3-4-2. S vs. N

In [None]:
import gc

del model_IE
gc.collect()
torch.cuda.empty_cache()

NameError: ignored

In [None]:
for epoch in range(epochs):
  print(f"\n======= S/N : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_SN.train()

  for step, batch in enumerate(train_dataloader_SN):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].float().to(device)

    optimizer_SN.zero_grad()

    b_out = model_SN(b_input_id, b_input_mask)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_SN.parameters(), 1.0)

    optimizer_SN.step()
    scheduler_SN.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  print("\n======= S/N : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_SN.eval()
  for step, batch in enumerate(val_dataloader_SN):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].to(device)

    b_out = model_SN(b_input_id, b_input_mask)
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_SN.eval()
preds = []
labels = []

for batch in test_dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= S/N : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")




  return self.classifier(lin_output)


epoch 1 batch id 1 loss 0.6561090350151062 train acc 0.5625
epoch 1 batch id 129 loss 0.7160962820053101 train acc 0.5116279069767442
epoch 1 batch id 257 loss 0.6556985378265381 train acc 0.5235894941634242
epoch 1 batch id 385 loss 0.6088603734970093 train acc 0.5314935064935065
epoch 1 batch id 513 loss 0.6628450155258179 train acc 0.5389863547758285
epoch 1 batch id 641 loss 0.6282668113708496 train acc 0.5482644305772231

epoch 1 train acc 0.5482253086419753
epoch 1 validation acc 0.5902777777777778

epoch 2 batch id 1 loss 0.7458827495574951 train acc 0.5625
epoch 2 batch id 129 loss 0.7295388579368591 train acc 0.6201550387596899
epoch 2 batch id 257 loss 0.6125571727752686 train acc 0.6288910505836576
epoch 2 batch id 385 loss 0.6020779609680176 train acc 0.6293831168831169
epoch 2 batch id 513 loss 0.5244085192680359 train acc 0.628167641325536
epoch 2 batch id 641 loss 0.6077920198440552 train acc 0.6310452418096724

epoch 2 train acc 0.6299189814814815
epoch 2 validation acc

In [None]:
torch.save(model_SN, model_SN_dir)

##### 3-4-3. T vs. F

In [None]:
import gc

del model_SN
gc.collect()
torch.cuda.empty_cache()

In [None]:
for epoch in range(epochs):
  print(f"\n======= T/F : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_TF.train()

  for step, batch in enumerate(train_dataloader_TF):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].float().to(device)

    optimizer_TF.zero_grad()

    b_out = model_TF(b_input_id, b_input_mask)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_TF.parameters(), 1.0)

    optimizer_TF.step()
    scheduler_TF.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  print("\n======= T/F : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_TF.eval()
  for step, batch in enumerate(val_dataloader_TF):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].to(device)

    b_out = model_TF(b_input_id, b_input_mask)
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_TF.eval()
preds = []
labels = []

for batch in test_dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= T/F : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")




  return self.classifier(lin_output)


epoch 1 batch id 1 loss 0.7232190370559692 train acc 0.4375
epoch 1 batch id 129 loss 0.7479486465454102 train acc 0.499515503875969
epoch 1 batch id 257 loss 0.6741865277290344 train acc 0.5133754863813229
epoch 1 batch id 385 loss 0.8030481934547424 train acc 0.5194805194805194
epoch 1 batch id 513 loss 0.6884487867355347 train acc 0.5281432748538012
epoch 1 batch id 641 loss 0.7721391320228577 train acc 0.5357839313572543

epoch 1 train acc 0.5372299382716049
epoch 1 validation acc 0.5954861111111112

epoch 2 batch id 1 loss 0.6030490398406982 train acc 0.6875
epoch 2 batch id 129 loss 0.5946832895278931 train acc 0.6172480620155039
epoch 2 batch id 257 loss 0.6368663907051086 train acc 0.604328793774319
epoch 2 batch id 385 loss 0.5814990997314453 train acc 0.610551948051948
epoch 2 batch id 513 loss 0.5211884379386902 train acc 0.6193957115009746
epoch 2 batch id 641 loss 0.750889778137207 train acc 0.6216848673946958

epoch 2 train acc 0.6206597222222222
epoch 2 validation acc 0.

In [None]:
torch.save(model_TF, model_TF_dir)

##### 3-4-4. J vs. P

In [None]:
import gc

del model_TF
gc.collect()
torch.cuda.empty_cache()

In [None]:
for epoch in range(epochs):
  print(f"\n======= J/P : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_JP.train()

  for step, batch in enumerate(train_dataloader_JP):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].float().to(device)

    optimizer_JP.zero_grad()

    b_out = model_JP(b_input_id, b_input_mask)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_JP.parameters(), 1.0)

    optimizer_JP.step()
    scheduler_JP.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  print("\n======= J/P : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_JP.eval()
  for step, batch in enumerate(val_dataloader_JP):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].to(device)

    b_out = model_JP(b_input_id, b_input_mask)
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_JP.eval()
preds = []
labels = []

for batch in test_dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= J/P : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")


epoch 1 batch id 1 loss 0.7527980804443359 train acc 0.375
epoch 1 batch id 129 loss 0.7059968709945679 train acc 0.4791666666666667
epoch 1 batch id 257 loss 0.7406200170516968 train acc 0.49464980544747084
epoch 1 batch id 385 loss 0.7142237424850464 train acc 0.5003246753246753
epoch 1 batch id 513 loss 0.6580074429512024 train acc 0.5043859649122807
epoch 1 batch id 641 loss 0.7139447927474976 train acc 0.5084828393135725

epoch 1 train acc 0.5086805555555556
epoch 1 validation acc 0.5017361111111112

epoch 2 batch id 1 loss 0.7277507185935974 train acc 0.5
epoch 2 batch id 129 loss 0.6368276476860046 train acc 0.5406976744186046
epoch 2 batch id 257 loss 0.6004140377044678 train acc 0.5510700389105059
epoch 2 batch id 385 loss 0.6674277782440186 train acc 0.5589285714285714
epoch 2 batch id 513 loss 0.6724637746810913 train acc 0.5645711500974658
epoch 2 batch id 641 loss 0.712556004524231 train acc 0.5733229329173167

epoch 2 train acc 0.5734953703703703
epoch 2 validation acc 0

In [None]:
torch.save(model_JP, model_JP_dir)

##### 3-4-5. Total Test

In [None]:
import gc

del model_JP
gc.collect()
torch.cuda.empty_cache()

In [None]:
model_IE = torch.load(model_IE_dir).cuda()
model_SN = torch.load(model_SN_dir).cuda()
model_TF = torch.load(model_TF_dir).cuda()
model_JP = torch.load(model_JP_dir).cuda()

# Test: I vs. E
model_IE.eval()
preds = []
labels = []

for batch in test_dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= I/E : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

# Test: S vs. N
model_SN.eval()
preds = []
labels = []

for batch in test_dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= S/N : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

# Test: T vs. F
model_TF.eval()
preds = []
labels = []

for batch in test_dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= T/F : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

# Test: J vs. P
model_JP.eval()
preds = []
labels = []

for batch in test_dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= J/P : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

  return self.classifier(lin_output)



Test Accuracy: 0.6267361111111112

Test Accuracy: 0.6215277777777778

Test Accuracy: 0.6319444444444444

Test Accuracy: 0.6423611111111112


#### 3-5. Export Results
Using our trained model, produce the output for real test inputs (variable`testing`)

In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for test_sentence in testing['Question']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

for test_sentence in testing['Answer']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 105 in tokens, but set max_length as 256, as before
max_len = 256

input_ids = []
att_masks = []

# Preprocessing
for idx in testing.index:
  question = testing['Question'][idx]
  answer = testing['Answer'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)

150


In [None]:
# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks)
dataset_SN = TensorDataset(input_ids, att_masks)
dataset_TF = TensorDataset(input_ids, att_masks)
dataset_JP = TensorDataset(input_ids, att_masks)

dataloader_IE = DataLoader (
    dataset_IE,
    sampler = SequentialSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = SequentialSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = SequentialSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = SequentialSampler(dataset_JP),
    batch_size = batch_size
)

In [None]:
model_IE.eval()
model_SN.eval()
model_TF.eval()
model_JP.eval()

preds_IE = []
preds_prob_IE = []
preds_SN = []
preds_prob_SN = []
preds_TF = []
preds_prob_TF = []
preds_JP = []
preds_prob_JP = []

# Predict I vs. E
for batch in dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_IE = preds_prob_IE + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_IE = preds_IE + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_IE = np.array(preds_IE)

# Predict S vs. N
for batch in dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_SN = preds_prob_SN + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_SN = preds_SN + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_SN = np.array(preds_SN)

# Predict T vs. F
for batch in dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_TF = preds_prob_TF + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_TF = preds_TF + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_TF = np.array(preds_TF)

# Predict J vs. P
for batch in dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_JP = preds_prob_JP + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_JP = preds_JP + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_JP = np.array(preds_JP)

  return self.classifier(lin_output)


In [None]:
idx = range(1, len(preds_IE) + 1)
# preds = {'idx': idx,'I/E': preds_IE.tolist(), 'S/N':preds_SN.tolist(), 'T/F':preds_TF.tolist(), 'J/P':preds_JP.tolist()}
preds = {'idx': idx,'I/E': preds_IE, 'S/N':preds_SN, 'T/F':preds_TF, 'J/P':preds_JP}
preds = pd.DataFrame(data=preds)
preds = preds.set_index('idx')
preds.to_csv('result.csv')

preds_prob = {'idx': idx,'I/E': preds_prob_IE, 'S/N':preds_prob_SN, 'T/F':preds_prob_TF, 'J/P':preds_prob_JP}
preds_prob = pd.DataFrame(data=preds_prob)
preds_prob = preds_prob.set_index('idx')
preds_prob.to_csv('result_prob.csv')

### 3. Implementation of the BERT-based Model (Question number)
Rather than using the question as string, just put the Question number!

#### 3-0. Data Setup
First of all, we should organize our data with some organized logics.  
In this section, I will load three datasets and aggregate them into two datasets: Questions, Training Data and Test Data.

In [None]:
training = pd.read_csv(os.path.join(root_dir, train_dir), encoding="CP949")

# We will only use the question-answer pair, at this time.
training.drop(['Data_ID', 'User_ID', 'Gender', 'Age'], axis='columns', inplace=True)

# Reordering
training = training[['Q_number', 'Answer', 'MBTI']]

training.head()

Unnamed: 0,Q_number,Answer,MBTI
0,1,<아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...,INFP
1,2,<중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하...,INFP
2,3,<그렇다> 감정 이입이 잘되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...,INFP
3,4,<중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다. 일의 변수가 생길 수 있고...,INFP
4,5,<아니다> 평정심을 유지 못 하는 편입니다. 머릿속은 백지화가 된 상태로 말도 제대...,INFP


In [None]:
testing = pd.read_csv(os.path.join(root_dir, test_dir), encoding="CP949")

# Again, we will only use the question-answer pair, at this moment.
testing.drop(['Gender', 'Age'], axis='columns', inplace=True)

# Reordering
testing = testing[['Q_number', 'Answer']]

testing.head()

Unnamed: 0,Q_number,Answer
0,59,<아니다> 저는 모든 일은 정해진 시간을 지켜서 해야 된다고 생각되어서 마감 기한을...
1,53,<중립> 저는 조용하고 사적인 장소도 좋아하고 사람들로 붐비고 떠들썩한 장소도 좋아...
2,56,<그렇다> 저는 규칙을 잘 지키고 매뉴얼 대로 일하는 사람입니다. 그래서 데이터 라...
3,60,<그렇다> 저는 항상 긍정적인 사고방식을 가지고 살려고 노력하고 있습니다. 이유는 ...
4,51,<중립> 혼자서 일하는 것도 좋고 함께 일하는 것도 모두 좋은데 같이 의논하는 일도...


#### 3-1. Preprocessing
For utilizing BERT, we have to satisify its own preprocessing requirements.  
For example, it requires speical tokens such as SEP and CLS in the input.  
But don't worry, since these requirements can be simply satisfied if we use the BERT Tokenizer.


In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for train_sentence in training['Answer']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 206 in tokens, so set max_length as 256, safely
max_len = 256

input_ids = []
att_masks = []
q_nums = []
labels_IE = []
labels_SN = []
labels_TF = []
labels_JP = []

# Preprocessing
for idx in training.index:
  q_num = training['Q_number'][idx]
  answer = training['Answer'][idx]
  mbti = training['MBTI'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  # Convert MBTI of string to list of integers.
  mbti = MBTI_to_vec(mbti)

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  q_nums.append(torch.tensor([[q_num]]))
  labels_IE.append(torch.tensor([mbti[0]]))
  labels_SN.append(torch.tensor([mbti[1]]))
  labels_TF.append(torch.tensor([mbti[2]]))
  labels_JP.append(torch.tensor([mbti[3]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
q_nums = torch.cat(q_nums, dim=0)
labels_IE = torch.cat(labels_IE, dim=0)
labels_SN = torch.cat(labels_SN, dim=0)
labels_TF = torch.cat(labels_TF, dim=0)
labels_JP = torch.cat(labels_JP, dim=0)

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

206


In [None]:
print(input_ids[0])
print(att_masks[0])
print(q_nums[0])
print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

print(len(input_ids), len(att_masks), len(q_nums), len(labels_IE))

tensor([    2,  2030, 15345,  2032, 18430,  3463,  5724,  8423, 26850, 20699,
        14204, 15916, 17729, 25878, 18895, 14045, 27024,  8107, 28669,  8120,
         6266, 24832,  2016,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
import random
import numpy as np

# Fix the seeds
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#### 3-2. Data Split
Currently, we do not have the answers for testing dataset, so we must split the training data to evaluate our model. (8:1:1)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, q_nums, labels_IE)
dataset_SN = TensorDataset(input_ids, att_masks, q_nums, labels_SN)
dataset_TF = TensorDataset(input_ids, att_masks, q_nums, labels_TF)
dataset_JP = TensorDataset(input_ids, att_masks, q_nums, labels_JP)

train_size = int(0.9 * len(dataset_IE))
val_size = int(0.05 * len(dataset_IE))
test_size = len(dataset_IE) - train_size - val_size

print(f"lengths are {train_size}:{val_size}:{test_size}")

# Split into train dataset, validation dataset and test dataset.
train_dataset_IE, val_dataset_IE, test_dataset_IE = random_split(dataset_IE, [train_size, val_size, test_size])
train_dataset_SN, val_dataset_SN, test_dataset_SN = random_split(dataset_SN, [train_size, val_size, test_size])
train_dataset_TF, val_dataset_TF, test_dataset_TF = random_split(dataset_TF, [train_size, val_size, test_size])
train_dataset_JP, val_dataset_JP, test_dataset_JP = random_split(dataset_JP, [train_size, val_size, test_size])

batch_size = 16 # 16 or 32

# Define dataloaders
train_dataloader_IE = DataLoader(
    train_dataset_IE,
    sampler = RandomSampler(train_dataset_IE),
    batch_size = batch_size
)

val_dataloader_IE = DataLoader (
    val_dataset_IE,
    sampler = SequentialSampler(val_dataset_IE),
    batch_size = batch_size
)

test_dataloader_IE = DataLoader (
    test_dataset_IE,
    sampler = SequentialSampler(test_dataset_IE),
    batch_size = batch_size
)

train_dataloader_SN = DataLoader(
    train_dataset_SN,
    sampler = RandomSampler(train_dataset_SN),
    batch_size = batch_size
)

val_dataloader_SN = DataLoader (
    val_dataset_SN,
    sampler = SequentialSampler(val_dataset_SN),
    batch_size = batch_size
)

test_dataloader_SN = DataLoader (
    test_dataset_SN,
    sampler = SequentialSampler(test_dataset_SN),
    batch_size = batch_size
)

train_dataloader_TF = DataLoader(
    train_dataset_TF,
    sampler = RandomSampler(train_dataset_TF),
    batch_size = batch_size
)

val_dataloader_TF = DataLoader (
    val_dataset_TF,
    sampler = SequentialSampler(val_dataset_TF),
    batch_size = batch_size
)

test_dataloader_TF = DataLoader (
    test_dataset_TF,
    sampler = SequentialSampler(test_dataset_TF),
    batch_size = batch_size
)

train_dataloader_JP = DataLoader(
    train_dataset_JP,
    sampler = RandomSampler(train_dataset_JP),
    batch_size = batch_size
)

val_dataloader_JP = DataLoader (
    val_dataset_JP,
    sampler = SequentialSampler(val_dataset_JP),
    batch_size = batch_size
)

test_dataloader_JP = DataLoader (
    test_dataset_JP,
    sampler = SequentialSampler(test_dataset_JP),
    batch_size = batch_size
)

lengths are 10368:576:576


#### 3-3. Model Definition
Now we require a model written in Torch package.  
Because we decided to **fine-tune** the pretrained BERT model, I'm going to add just one layer on top of the pretrained model.

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

class MBTIClassifier(nn.Module):
  def __init__ (self,
                bert,
                hidden_size=768,
                num_classes=2,
                dr_rate=None):
    super(MBTIClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.bert = bert
    self.lin = nn.Linear(hidden_size + 1, 256)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(256, num_classes)
    self.classifier = nn.Softmax(dim=1)

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)
  
  def forward(self, input_ids, att_masks, q_nums):
    bert_output = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    bert_output = torch.cat((bert_output, q_nums), dim=1)
    
    if self.dr_rate:
      dr_output = self.dropout(bert_output)
    else:
      dr_output = bert_output

    lin_output = self.lin(dr_output)
    relu_output = self.relu(lin_output)
    lin2_output = self.lin2(relu_output)
    return self.classifier(lin2_output)

model_bert_IE = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP = BertModel.from_pretrained("kykim/bert-kor-base")
  
model_IE = MBTIClassifier(model_bert_IE, dr_rate = 0.3)
model_SN = MBTIClassifier(model_bert_SN, dr_rate = 0.3)
model_TF = MBTIClassifier(model_bert_TF, dr_rate = 0.3)
model_JP = MBTIClassifier(model_bert_JP, dr_rate = 0.3)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters_IE = [
    {'params': [p for n, p in model_IE.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_IE.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_SN = [
    {'params': [p for n, p in model_SN.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_SN.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_TF = [
    {'params': [p for n, p in model_TF.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_TF.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_JP = [
    {'params': [p for n, p in model_JP.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_JP.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

learning_rate = 2e-5

optimizer_IE = AdamW(optimizer_grouped_parameters_IE,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_SN = AdamW(optimizer_grouped_parameters_SN,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_TF = AdamW(optimizer_grouped_parameters_TF,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_JP = AdamW(optimizer_grouped_parameters_JP,
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 4 # 2 or 4

total_steps = len(train_dataloader_IE) * epochs
warmup_steps = int(total_steps * 0.1)

scheduler_IE = get_cosine_schedule_with_warmup(optimizer_IE, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_SN = get_cosine_schedule_with_warmup(optimizer_SN, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_TF = get_cosine_schedule_with_warmup(optimizer_TF, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_JP = get_cosine_schedule_with_warmup(optimizer_JP, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.

#### 3-4. Training
Finally, we can do train our model!  
Let's see how accurate our model is :)

In [None]:
import numpy as np

device = torch.device("cuda")

model_IE.cuda()
model_SN.cuda()
model_TF.cuda()
model_JP.cuda()

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  preds = preds.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = np.argmax(labels, axis=1).flatten()
  return np.sum(preds_flat == labels_flat) / len(labels_flat)

torch.cuda.empty_cache()

##### 3-4-1. I vs. E

In [None]:
for epoch in range(epochs):
  print(f"\n======= I/E : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_IE.train()

  for step, batch in enumerate(train_dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].float().to(device)

    optimizer_IE.zero_grad()

    b_out = model_IE(b_input_id, b_input_mask, b_q_num)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_IE.parameters(), 1.0)

    optimizer_IE.step()
    scheduler_IE.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  print("\n======= I/E : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_IE.eval()
  for step, batch in enumerate(val_dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].to(device)

    with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask, b_q_num)
    
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_IE.eval()
preds = []
labels = []

for batch in test_dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= I/E : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")


epoch 1 batch id 1 loss 0.6961251497268677 train acc 0.5
epoch 1 batch id 129 loss 0.6874635219573975 train acc 0.48207364341085274
epoch 1 batch id 257 loss 0.7024554014205933 train acc 0.5002431906614786
epoch 1 batch id 385 loss 0.7227259278297424 train acc 0.5060064935064935
epoch 1 batch id 513 loss 0.693671703338623 train acc 0.5158382066276803
epoch 1 batch id 641 loss 0.6802080869674683 train acc 0.5273010920436817

epoch 1 train acc 0.5283564814814815
epoch 1 validation acc 0.5850694444444444

epoch 2 batch id 1 loss 0.6020957827568054 train acc 0.75
epoch 2 batch id 129 loss 0.7003815174102783 train acc 0.5935077519379846
epoch 2 batch id 257 loss 0.6623760461807251 train acc 0.5994649805447471
epoch 2 batch id 385 loss 0.7972474098205566 train acc 0.6058441558441559
epoch 2 batch id 513 loss 0.6040309071540833 train acc 0.6092836257309941
epoch 2 batch id 641 loss 0.6477915644645691 train acc 0.6120319812792512

epoch 2 train acc 0.6128472222222222
epoch 2 validation acc 0.

In [None]:
torch.save(model_IE, model_IE_dir)

##### 3-4-2. S vs. N

In [None]:
import gc

del model_IE
gc.collect()
torch.cuda.empty_cache()

In [None]:
for epoch in range(epochs):
  print(f"\n======= S/N : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_SN.train()

  for step, batch in enumerate(train_dataloader_SN):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].float().to(device)

    optimizer_SN.zero_grad()

    b_out = model_SN(b_input_id, b_input_mask, b_q_num)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_SN.parameters(), 1.0)

    optimizer_SN.step()
    scheduler_SN.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  print("\n======= S/N : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_SN.eval()
  for step, batch in enumerate(val_dataloader_SN):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].to(device)

    with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask, b_q_num)
    
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_SN.eval()
preds = []
labels = []

for batch in test_dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= S/N : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")


epoch 1 batch id 1 loss 0.6920459270477295 train acc 0.5625
epoch 1 batch id 129 loss 0.7073368430137634 train acc 0.5266472868217055
epoch 1 batch id 257 loss 0.6653560400009155 train acc 0.5313715953307393
epoch 1 batch id 385 loss 0.6951838135719299 train acc 0.5387987012987013
epoch 1 batch id 513 loss 0.7430845499038696 train acc 0.5483674463937622
epoch 1 batch id 641 loss 0.6899130344390869 train acc 0.5530421216848674

epoch 1 train acc 0.5520833333333334
epoch 1 validation acc 0.6180555555555556

epoch 2 batch id 1 loss 0.5626250505447388 train acc 0.875
epoch 2 batch id 129 loss 0.6293864846229553 train acc 0.6206395348837209
epoch 2 batch id 257 loss 0.5490229725837708 train acc 0.6208657587548638
epoch 2 batch id 385 loss 0.7460149526596069 train acc 0.6227272727272727
epoch 2 batch id 513 loss 0.5601366758346558 train acc 0.6269493177387915
epoch 2 batch id 641 loss 0.7160625457763672 train acc 0.6304602184087363

epoch 2 train acc 0.630883487654321
epoch 2 validation acc

In [None]:
torch.save(model_SN, model_SN_dir)

##### 3-4-3. T vs. F

In [None]:
import gc

del model_SN
gc.collect()
torch.cuda.empty_cache()

In [None]:
for epoch in range(epochs):
  print(f"\n======= T/F : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_TF.train()

  for step, batch in enumerate(train_dataloader_TF):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].float().to(device)

    optimizer_TF.zero_grad()

    b_out = model_TF(b_input_id, b_input_mask, b_q_num)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_TF.parameters(), 1.0)

    optimizer_TF.step()
    scheduler_TF.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  print("\n======= T/F : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_TF.eval()
  for step, batch in enumerate(val_dataloader_TF):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].to(device)

    with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask, b_q_num)
    
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_TF.eval()
preds = []
labels = []

for batch in test_dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= T/F : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")




NameError: ignored

In [None]:
torch.save(model_TF, model_TF_dir)

##### 3-4-4. J vs. P

In [None]:
import gc

del model_TF
gc.collect()
torch.cuda.empty_cache()

In [None]:
for epoch in range(epochs):
  print(f"\n======= J/P : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_JP.train()

  for step, batch in enumerate(train_dataloader_JP):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].float().to(device)

    optimizer_JP.zero_grad()

    b_out = model_JP(b_input_id, b_input_mask, b_q_num)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_JP.parameters(), 1.0)

    optimizer_JP.step()
    scheduler_JP.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  print("\n======= J/P : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_JP.eval()
  for step, batch in enumerate(val_dataloader_JP):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_q_num = batch[2].to(device)
    b_label = batch[3].to(device)

    with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask, b_q_num)
    
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_JP.eval()
preds = []
labels = []

for batch in test_dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= J/P : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")


epoch 1 batch id 1 loss 0.6965028047561646 train acc 0.4375
epoch 1 batch id 129 loss 0.701758623123169 train acc 0.5130813953488372
epoch 1 batch id 257 loss 0.6813562512397766 train acc 0.4995136186770428
epoch 1 batch id 385 loss 0.7434478402137756 train acc 0.499512987012987
epoch 1 batch id 513 loss 0.6941266059875488 train acc 0.5001218323586745
epoch 1 batch id 641 loss 0.6772091388702393 train acc 0.5049726989079563

epoch 1 train acc 0.5055941358024691
epoch 1 validation acc 0.5260416666666666

epoch 2 batch id 1 loss 0.6907752752304077 train acc 0.5625
epoch 2 batch id 129 loss 0.7362107038497925 train acc 0.5818798449612403
epoch 2 batch id 257 loss 0.6972082853317261 train acc 0.5707684824902723
epoch 2 batch id 385 loss 0.6782873868942261 train acc 0.5696428571428571
epoch 2 batch id 513 loss 0.6424216032028198 train acc 0.5713937621832359
epoch 2 batch id 641 loss 0.744220495223999 train acc 0.5747854914196567

epoch 2 train acc 0.5752314814814815
epoch 2 validation acc 

In [None]:
torch.save(model_JP, model_JP_dir)

##### 3-4-5. Total Test

In [None]:
import gc

del model_JP
gc.collect()
torch.cuda.empty_cache()

In [None]:
model_IE = torch.load(model_IE_dir).cuda()
model_SN = torch.load(model_SN_dir).cuda()
model_TF = torch.load(model_TF_dir).cuda()
model_JP = torch.load(model_JP_dir).cuda()

# Test: I vs. E
model_IE.eval()
preds = []
labels = []

for batch in test_dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= I/E : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

# Test: S vs. N
model_SN.eval()
preds = []
labels = []

for batch in test_dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= S/N : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

# Test: T vs. F
model_TF.eval()
preds = []
labels = []

for batch in test_dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= T/F : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

# Test: J vs. P
model_JP.eval()
preds = []
labels = []

for batch in test_dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num, b_label = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= J/P : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")


Test Accuracy: 0.6510416666666666

Test Accuracy: 0.6267361111111112

Test Accuracy: 0.6163194444444444

Test Accuracy: 0.6180555555555556


#### 3-5. Export Results
Using our trained model, produce the output for real test inputs (variable`testing`)

In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for test_sentence in testing['Answer']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 105 in tokens, but set max_length as 256, as before
max_len = 256

input_ids = []
att_masks = []
q_nums = []

# Preprocessing
for idx in testing.index:
  q_num = testing['Q_number'][idx]
  answer = testing['Answer'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  q_nums.append(torch.tensor([[q_num]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
q_nums = torch.cat(q_nums, dim=0)

150


In [None]:
# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, q_nums)
dataset_SN = TensorDataset(input_ids, att_masks, q_nums)
dataset_TF = TensorDataset(input_ids, att_masks, q_nums)
dataset_JP = TensorDataset(input_ids, att_masks, q_nums)

dataloader_IE = DataLoader (
    dataset_IE,
    sampler = SequentialSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = SequentialSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = SequentialSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = SequentialSampler(dataset_JP),
    batch_size = batch_size
)

In [None]:
model_IE.eval()
model_SN.eval()
model_TF.eval()
model_JP.eval()

preds_IE = []
preds_prob_IE = []
preds_SN = []
preds_prob_SN = []
preds_TF = []
preds_prob_TF = []
preds_JP = []
preds_prob_JP = []

# Predict I vs. E
for batch in dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_IE = preds_prob_IE + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_IE = preds_IE + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_IE = np.array(preds_IE)

# Predict S vs. N
for batch in dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_SN = preds_prob_SN + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_SN = preds_SN + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_SN = np.array(preds_SN)

# Predict T vs. F
for batch in dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_TF = preds_prob_TF + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_TF = preds_TF + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_TF = np.array(preds_TF)

# Predict J vs. P
for batch in dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_q_num = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask, b_q_num)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_JP = preds_prob_JP + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_JP = preds_JP + np.argmax(b_out_np, axis=1).flatten().tolist()

# preds_JP = np.array(preds_JP)

In [None]:
idx = range(1, len(preds_IE) + 1)
# preds = {'idx': idx,'I/E': preds_IE.tolist(), 'S/N':preds_SN.tolist(), 'T/F':preds_TF.tolist(), 'J/P':preds_JP.tolist()}
preds = {'idx': idx,'I/E': preds_IE, 'S/N':preds_SN, 'T/F':preds_TF, 'J/P':preds_JP}
preds = pd.DataFrame(data=preds)
preds = preds.set_index('idx')
preds.to_csv('result.csv')

preds_prob = {'idx': idx,'I/E': preds_prob_IE, 'S/N':preds_prob_SN, 'T/F':preds_prob_TF, 'J/P':preds_prob_JP}
preds_prob = pd.DataFrame(data=preds_prob)
preds_prob = preds_prob.set_index('idx')
preds_prob.to_csv('result_prob.csv')

### 3. Implementation of the BERT-based Model (separated encoding)
But this time, I will encode the question and the answer separately, instead of using \<SEP\> token.

#### 3-0. Data Setup
First of all, we should organize our data with some organized logics.  
In this section, I will load three datasets and aggregate them into two datasets: Questions, Training Data and Test Data.

In [None]:
questions = pd.read_excel(os.path.join(root_dir, questions_dir))
questions.drop(['index', 'index.1'], axis='columns', inplace=True)
display(questions.head())

Unnamed: 0,Question
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.


In [None]:
# Retrieve the original question in String, using the question number
def retrival_q(q_num):
  return questions.loc[q_num - 1]['Question']

# Unit Test
assert(retrival_q(1) == "주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁금해요.")


training = pd.read_csv(os.path.join(root_dir, train_dir), encoding="CP949")

# We will only use the question-answer pair, at this time.
training.drop(['Data_ID', 'User_ID', 'Gender', 'Age'], axis='columns', inplace=True)

# Retreieve the original question!
training['Question'] = training['Q_number'].apply(retrival_q)
training.drop('Q_number', axis='columns', inplace=True)

# Reordering
training = training[['Question', 'Answer', 'MBTI']]

training.head()

Unnamed: 0,Question,Answer,MBTI
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,<아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...,INFP
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,<중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하...,INFP
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,<그렇다> 감정 이입이 잘되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...,INFP
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,<중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다. 일의 변수가 생길 수 있고...,INFP
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,<아니다> 평정심을 유지 못 하는 편입니다. 머릿속은 백지화가 된 상태로 말도 제대...,INFP


In [None]:
testing = pd.read_csv(os.path.join(root_dir, test_dir), encoding="CP949")

# Again, we will only use the question-answer pair, at this moment.
testing.drop(['Gender', 'Age'], axis='columns', inplace=True)

# Retreieve the original question!
testing['Question'] = testing['Q_number'].apply(retrival_q)
testing.drop('Q_number', axis='columns', inplace=True)

# Reordering
testing = testing[['Question', 'Answer']]

testing.head()

Unnamed: 0,Question,Answer
0,마감 기한을 지키기가 힘든가요? 경험을 이야기해보아요.,<아니다> 저는 모든 일은 정해진 시간을 지켜서 해야 된다고 생각되어서 마감 기한을...
1,조용하고 사적인 장소보다는 사람들로 붐비고 떠들썩한 장소를 좋아하나요? 답변에 대한...,<중립> 저는 조용하고 사적인 장소도 좋아하고 사람들로 붐비고 떠들썩한 장소도 좋아...
2,단계를 건너뛰는 일 없이 절차대로 일을 완수하는 편인가요? 그러한 최근 경험은 어떤...,<그렇다> 저는 규칙을 잘 지키고 매뉴얼 대로 일하는 사람입니다. 그래서 데이터 라...
3,일이 원하는 대로 진행될 것이라는 자신감이 있나요? 그렇게 된 계기나 이유가 있나요.,<그렇다> 저는 항상 긍정적인 사고방식을 가지고 살려고 노력하고 있습니다. 이유는 ...
4,대부분의 시간을 혼자서 일할 수 있는 직업을 원하나요? 이유도 말씀해주세요.,<중립> 혼자서 일하는 것도 좋고 함께 일하는 것도 모두 좋은데 같이 의논하는 일도...


#### 3-1. Preprocessing
For utilizing BERT, we have to satisify its own preprocessing requirements.  
For example, it requires speical tokens such as SEP and CLS in the input.  
But don't worry, since these requirements can be simply satisfied if we use the BERT Tokenizer.


In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for train_sentence in training['Question']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

for train_sentence in training['Answer']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 206 in tokens, so set max_length as 256, safely
max_len = 256

input_ids_Q = []
att_masks_Q = []

input_ids_A = []
att_masks_A = []

labels_IE = []
labels_SN = []
labels_TF = []
labels_JP = []

# Preprocessing
for idx in training.index:
  question = training['Question'][idx]
  answer = training['Answer'][idx]
  mbti = training['MBTI'][idx]

  # Encode with the tokenizer.
  encodings_Q = tokenizer_bert(
      question,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  encodings_A = tokenizer_bert(
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  # Convert MBTI of string to list of integers.
  mbti = MBTI_to_vec(mbti)

  input_ids_Q.append(encodings_Q['input_ids'])
  att_masks_Q.append(encodings_Q['attention_mask'])

  input_ids_A.append(encodings_A['input_ids'])
  att_masks_A.append(encodings_A['attention_mask'])

  labels_IE.append(torch.tensor([mbti[0]]))
  labels_SN.append(torch.tensor([mbti[1]]))
  labels_TF.append(torch.tensor([mbti[2]]))
  labels_JP.append(torch.tensor([mbti[3]]))

# Convert to tensors.
input_ids_Q = torch.cat(input_ids_Q, dim=0)
att_masks_Q = torch.cat(att_masks_Q, dim=0)

input_ids_A = torch.cat(input_ids_A, dim=0)
att_masks_A = torch.cat(att_masks_A, dim=0)

labels_IE = torch.cat(labels_IE, dim=0)
labels_SN = torch.cat(labels_SN, dim=0)
labels_TF = torch.cat(labels_TF, dim=0)
labels_JP = torch.cat(labels_JP, dim=0)

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

206


In [None]:
print(input_ids_Q[0])
print(att_masks_Q[0])

print(input_ids_A[0])
print(att_masks_A[0])

print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

print(len(input_ids_Q), len(att_masks_Q), len(input_ids_A), len(input_ids_A), len(labels_IE))

tensor([    2, 25753, 14567, 28897, 18069, 14526,  2033, 19742, 22742,  8082,
        31724,  3463, 32771,  8061, 19773, 16941, 24296,  8055,  2016,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
import random
import numpy as np

# Fix the seeds
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#### 3-2. Data Split
To see the training progress, so I'm going to split some of our training data to evaluate our model. (9:1)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets
dataset_IE = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A, labels_IE)
dataset_SN = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A, labels_SN)
dataset_TF = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A, labels_TF)
dataset_JP = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A, labels_JP)

# train_size = int(0.9 * len(dataset_IE))
# val_size = len(dataset_IE) - train_size

# print(f"lengths are {train_size}:{val_size}")

# # Split into train dataset, validation dataset and test dataset.
# train_dataset_IE, val_dataset_IE = random_split(dataset_IE, [train_size, val_size])
# train_dataset_SN, val_dataset_SN = random_split(dataset_SN, [train_size, val_size])
# train_dataset_TF, val_dataset_TF = random_split(dataset_TF, [train_size, val_size])
# train_dataset_JP, val_dataset_JP = random_split(dataset_JP, [train_size, val_size])

batch_size = 16 # 16 or 32

# Define dataloaders
dataloader_IE = DataLoader(
    dataset_IE,
    sampler = RandomSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader(
    dataset_SN,
    sampler = RandomSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader(
    dataset_TF,
    sampler = RandomSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader(
    dataset_JP,
    sampler = RandomSampler(dataset_JP),
    batch_size = batch_size
)

# # Define dataloaders
# train_dataloader_IE = DataLoader(
#     train_dataset_IE,
#     sampler = RandomSampler(train_dataset_IE),
#     batch_size = batch_size
# )

# val_dataloader_IE = DataLoader (
#     val_dataset_IE,
#     sampler = SequentialSampler(val_dataset_IE),
#     batch_size = batch_size
# )

# train_dataloader_SN = DataLoader(
#     train_dataset_SN,
#     sampler = RandomSampler(train_dataset_SN),
#     batch_size = batch_size
# )

# val_dataloader_SN = DataLoader (
#     val_dataset_SN,
#     sampler = SequentialSampler(val_dataset_SN),
#     batch_size = batch_size
# )

# train_dataloader_TF = DataLoader(
#     train_dataset_TF,
#     sampler = RandomSampler(train_dataset_TF),
#     batch_size = batch_size
# )

# val_dataloader_TF = DataLoader (
#     val_dataset_TF,
#     sampler = SequentialSampler(val_dataset_TF),
#     batch_size = batch_size
# )

# train_dataloader_JP = DataLoader(
#     train_dataset_JP,
#     sampler = RandomSampler(train_dataset_JP),
#     batch_size = batch_size
# )

# val_dataloader_JP = DataLoader (
#     val_dataset_JP,
#     sampler = SequentialSampler(val_dataset_JP),
#     batch_size = batch_size
# )

#### 3-3. Model Definition
Now we require a model written in Torch package.  
Because we decided to **fine-tune** the pretrained BERT model, I'm going to add just few layers on top of the pretrained model.

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import Adam

class MBTIClassifier(nn.Module):
  def __init__ (self,
                bertq,
                berta,
                hidden_size=768,
                num_classes=2,
                dr_rate=None):
    super(MBTIClassifier, self).__init__()
    self.dr_rate = dr_rate

    # Pretrained BERT model
    self.bertq = bertq
    self.berta = berta

    # Fine-tuning
    self.linq = nn.Linear(hidden_size, int(hidden_size / 4))
    self.reluq = nn.ReLU()
    self.lina = nn.Linear(hidden_size, hidden_size - int(hidden_size / 4))
    self.relua = nn.ReLU()

    self.lin = nn.Linear(hidden_size, 256)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(256, 256)
    self.relu2  = nn.ReLU()
    self.classifier = nn.Linear(256, num_classes)

    # Drop out
    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)
  
  def forward(self, input_ids_Q, att_masks_Q, input_ids_A, att_masks_A):
    bert_output_Q = self.bertq(input_ids_Q, token_type_ids=None, attention_mask=att_masks_Q).pooler_output
    bert_output_A = self.berta(input_ids_A, token_type_ids=None, attention_mask=att_masks_A).pooler_output

    if self.dr_rate:
      dr_output_Q = self.dropout(bert_output_Q)
      dr_output_A = self.dropout(bert_output_A)
    else:
      dr_output_Q = bert_output_Q
      dr_output_A = bert_output_A
    
    # Aggregation
    lin_output_Q = self.linq(dr_output_Q)
    relu_output_Q = self.reluq(lin_output_Q)

    lin_output_A = self.lina(dr_output_A)
    relu_output_A = self.relua(lin_output_A)

    concated = torch.cat((relu_output_Q, relu_output_A), dim=1)

    lin_output = self.lin(concated)
    relu_output = self.relu(lin_output)
    lin2_output = self.lin2(relu_output)
    relu2_output = self.relu2(lin2_output)
    return self.classifier(relu2_output)

model_bert_IE_Q = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_IE_A = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN_Q = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN_A = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF_Q = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF_A = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP_Q = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP_A = BertModel.from_pretrained("kykim/bert-kor-base")
  
model_IE = MBTIClassifier(model_bert_IE_Q, model_bert_IE_A, dr_rate = 0.3)
model_SN = MBTIClassifier(model_bert_SN_Q, model_bert_SN_A, dr_rate = 0.3)
model_TF = MBTIClassifier(model_bert_TF_Q, model_bert_TF_A, dr_rate = 0.3)
model_JP = MBTIClassifier(model_bert_JP_Q, model_bert_JP_A, dr_rate = 0.3)

learning_rate = 5e-6

optimizer_IE = Adam(model_IE.parameters(),
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_SN = Adam(model_SN.parameters(),
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_TF = Adam(model_TF.parameters(),
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_JP = Adam(model_JP.parameters(),
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 10 # 2 or 4

total_steps = len(dataloader_IE) * epochs
warmup_steps = int(total_steps * 0.1)

scheduler_IE = get_cosine_schedule_with_warmup(optimizer_IE, num_warmup_steps = warmup_steps, num_training_steps = total_steps) 
scheduler_SN = get_cosine_schedule_with_warmup(optimizer_SN, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_TF = get_cosine_schedule_with_warmup(optimizer_TF, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_JP = get_cosine_schedule_with_warmup(optimizer_JP, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.

#### 3-4. Training
Finally, we can do train our model!  
Let's see how accurate our model is :)

In [None]:
import numpy as np

device = torch.device("cuda")

model_IE.cuda()
model_SN.cuda()
model_TF.cuda()
model_JP.cuda()

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  preds = preds.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = np.argmax(labels, axis=1).flatten()
  return np.sum(preds_flat == labels_flat) / len(labels_flat)

##### 3-4-1. I vs. E

In [None]:
torch.cuda.empty_cache()
for epoch in range(epochs):
  print(f"\n======= I/E : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_IE.train()

  for step, batch in enumerate(dataloader_IE):
    b_input_id_Q = batch[0].to(device)
    b_input_mask_Q = batch[1].to(device)
    b_input_id_A = batch[2].to(device)
    b_input_mask_A = batch[3].to(device)
    b_label = batch[4].float().to(device)

    optimizer_IE.zero_grad()

    b_out = model_IE(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_IE.parameters(), 1.0)

    optimizer_IE.step()
    scheduler_IE.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))

  # print("\n======= I/E : Validation =======")
  # print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  # model_IE.eval()
  # for step, batch in enumerate(val_dataloader_IE):
  #   b_input_id_Q = batch[0].to(device)
  #   b_input_mask_Q = batch[1].to(device)
  #   b_input_id_A = batch[2].to(device)
  #   b_input_mask_A = batch[3].to(device)
  #   b_label = batch[4].to(device)

  #   with torch.no_grad():
  #     b_out = model_IE(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
  #   val_acc += calc_accuracy(b_out, b_label)
  # print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

torch.save(model_IE, model_IE_dir)
del(model_IE)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6916688680648804 train acc 0.5
epoch 1 batch id 129 loss 0.6978166103363037 train acc 0.498546511627907
epoch 1 batch id 257 loss 0.685836911201477 train acc 0.5034046692607004
epoch 1 batch id 385 loss 0.694135308265686 train acc 0.5
epoch 1 batch id 513 loss 0.6947845220565796 train acc 0.49951267056530213
epoch 1 batch id 641 loss 0.6909545660018921 train acc 0.501170046801872

epoch 2 batch id 1 loss 0.6867105960845947 train acc 0.6875
epoch 2 batch id 129 loss 0.6913474798202515 train acc 0.5528100775193798
epoch 2 batch id 257 loss 0.6633660197257996 train acc 0.555204280155642
epoch 2 batch id 385 loss 0.7088248133659363 train acc 0.5603896103896104
epoch 2 batch id 513 loss 0.7164874076843262 train acc 0.5651803118908382
epoch 2 batch id 641 loss 0.6617848873138428 train acc 0.5728354134165367

epoch 3 batch id 1 loss 0.6355686187744141 train acc 0.75
epoch 3 batch id 129 loss 0.779559314250946 train acc 0.6424418604651163
epoch 3 batch id 257 loss 0.

##### 3-4-2. S vs. N

In [None]:
for epoch in range(epochs):
  print(f"\n======= S/N : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_SN.train()

  for step, batch in enumerate(dataloader_SN):
    b_input_id_Q = batch[0].to(device)
    b_input_mask_Q = batch[1].to(device)
    b_input_id_A = batch[2].to(device)
    b_input_mask_A = batch[3].to(device)
    b_label = batch[4].float().to(device)

    optimizer_SN.zero_grad()

    b_out = model_SN(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_SN.parameters(), 1.0)

    optimizer_SN.step()
    scheduler_SN.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  # print("\n======= S/N : Validation =======")
  # print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  # model_SN.eval()
  # for step, batch in enumerate(val_dataloader_SN):
  #   b_input_id_Q = batch[0].to(device)
  #   b_input_mask_Q = batch[1].to(device)
  #   b_input_id_A = batch[2].to(device)
  #   b_input_mask_A = batch[3].to(device)
  #   b_label = batch[4].to(device)

  #   with torch.no_grad():
  #     b_out = model_SN(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
  #   val_acc += calc_accuracy(b_out, b_label)
  # print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

torch.save(model_SN, model_SN_dir)
del(model_SN)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6934932470321655 train acc 0.4375
epoch 1 batch id 129 loss 0.6960813403129578 train acc 0.4946705426356589
epoch 1 batch id 257 loss 0.6902273893356323 train acc 0.5077821011673151
epoch 1 batch id 385 loss 0.6975000500679016 train acc 0.5097402597402597
epoch 1 batch id 513 loss 0.6904122233390808 train acc 0.515229044834308
epoch 1 batch id 641 loss 0.6761630773544312 train acc 0.5217433697347894

epoch 2 batch id 1 loss 0.6965057849884033 train acc 0.5
epoch 2 batch id 129 loss 0.6623672246932983 train acc 0.5828488372093024
epoch 2 batch id 257 loss 0.639825701713562 train acc 0.5916828793774319
epoch 2 batch id 385 loss 0.7330557107925415 train acc 0.589935064935065
epoch 2 batch id 513 loss 0.6399783492088318 train acc 0.5884502923976608
epoch 2 batch id 641 loss 0.6962334513664246 train acc 0.5900936037441498

epoch 3 batch id 1 loss 0.6622306108474731 train acc 0.5625
epoch 3 batch id 129 loss 0.681347131729126 train acc 0.6308139534883721
epoch 3 ba

##### 3-4-3. T vs. F

In [None]:
for epoch in range(epochs):
  print(f"\n======= T/F : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_TF.train()

  for step, batch in enumerate(dataloader_TF):
    b_input_id_Q = batch[0].to(device)
    b_input_mask_Q = batch[1].to(device)
    b_input_id_A = batch[2].to(device)
    b_input_mask_A = batch[3].to(device)
    b_label = batch[4].float().to(device)

    optimizer_TF.zero_grad()

    b_out = model_TF(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_TF.parameters(), 1.0)

    optimizer_TF.step()
    scheduler_TF.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  # print("\n======= T/F : Validation =======")
  # print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  # model_TF.eval()
  # for step, batch in enumerate(val_dataloader_TF):
  #   b_input_id_Q = batch[0].to(device)
  #   b_input_mask_Q = batch[1].to(device)
  #   b_input_id_A = batch[2].to(device)
  #   b_input_mask_A = batch[3].to(device)
  #   b_label = batch[4].to(device)

  #   with torch.no_grad():
  #     b_out = model_TF(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
  #   val_acc += calc_accuracy(b_out, b_label)
  # print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

torch.save(model_TF, model_TF_dir)
del(model_TF)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6950359344482422 train acc 0.375
epoch 1 batch id 129 loss 0.6957096457481384 train acc 0.5092054263565892
epoch 1 batch id 257 loss 0.6945512294769287 train acc 0.5080252918287937
epoch 1 batch id 385 loss 0.6828789114952087 train acc 0.5159090909090909
epoch 1 batch id 513 loss 0.6837758421897888 train acc 0.515229044834308
epoch 1 batch id 641 loss 0.688495934009552 train acc 0.516575663026521

epoch 2 batch id 1 loss 0.6804612278938293 train acc 0.625
epoch 2 batch id 129 loss 0.6863966584205627 train acc 0.5106589147286822
epoch 2 batch id 257 loss 0.6868886947631836 train acc 0.5155642023346303
epoch 2 batch id 385 loss 0.6883941888809204 train acc 0.5301948051948052
epoch 2 batch id 513 loss 0.6990481615066528 train acc 0.5346003898635477
epoch 2 batch id 641 loss 0.7167924642562866 train acc 0.5408541341653667

epoch 3 batch id 1 loss 0.6528586149215698 train acc 0.75
epoch 3 batch id 129 loss 0.7005738019943237 train acc 0.5833333333333334
epoch 3 ba

##### 3-4-4. J vs. P

In [None]:
for epoch in range(epochs):
  print(f"\n======= J/P : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_JP.train()

  for step, batch in enumerate(dataloader_JP):
    b_input_id_Q = batch[0].to(device)
    b_input_mask_Q = batch[1].to(device)
    b_input_id_A = batch[2].to(device)
    b_input_mask_A = batch[3].to(device)
    b_label = batch[4].float().to(device)

    optimizer_JP.zero_grad()

    b_out = model_JP(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_JP.parameters(), 1.0)

    optimizer_JP.step()
    scheduler_JP.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  # print("\n======= J/P : Validation =======")
  # print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  # model_JP.eval()
  # for step, batch in enumerate(val_dataloader_JP):
  #   b_input_id_Q = batch[0].to(device)
  #   b_input_mask_Q = batch[1].to(device)
  #   b_input_id_A = batch[2].to(device)
  #   b_input_mask_A = batch[3].to(device)
  #   b_label = batch[4].to(device)

  #   with torch.no_grad():
  #     b_out = model_JP(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)
  #   val_acc += calc_accuracy(b_out, b_label)
  # print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

torch.save(model_JP, model_JP_dir)
del(model_JP)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6873605847358704 train acc 0.625
epoch 1 batch id 129 loss 0.6820535659790039 train acc 0.5048449612403101
epoch 1 batch id 257 loss 0.691504716873169 train acc 0.5065661478599222
epoch 1 batch id 385 loss 0.692514955997467 train acc 0.500974025974026
epoch 1 batch id 513 loss 0.6873648166656494 train acc 0.5015838206627681
epoch 1 batch id 641 loss 0.687645673751831 train acc 0.5013650546021841

epoch 2 batch id 1 loss 0.6804591417312622 train acc 0.6875
epoch 2 batch id 129 loss 0.694011926651001 train acc 0.5387596899224806
epoch 2 batch id 257 loss 0.6792239546775818 train acc 0.5474221789883269
epoch 2 batch id 385 loss 0.6815439462661743 train acc 0.5423701298701299
epoch 2 batch id 513 loss 0.7429770827293396 train acc 0.5423976608187134
epoch 2 batch id 641 loss 0.7678124308586121 train acc 0.5497269890795632

epoch 3 batch id 1 loss 0.6022226810455322 train acc 0.875
epoch 3 batch id 129 loss 0.6706632375717163 train acc 0.5968992248062015
epoch 3 ba

#### 3-5. Export Results
Using our trained model, produce the output for real test inputs (variable`testing`)

In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for test_sentence in testing['Question']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

for test_sentence in testing['Answer']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 105 in tokens, but set max_length as 256, as before
max_len = 256

input_ids_Q = []
att_masks_Q = []

input_ids_A = []
att_masks_A = []

# Preprocessing
for idx in testing.index:
  question = testing['Question'][idx]
  answer = testing['Answer'][idx]

  # Encode with the tokenizer.
  encodings_Q = tokenizer_bert(
      question,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  encodings_A = tokenizer_bert(
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  input_ids_Q.append(encodings_Q['input_ids'])
  att_masks_Q.append(encodings_Q['attention_mask'])

  input_ids_A.append(encodings_A['input_ids'])
  att_masks_A.append(encodings_A['attention_mask'])

# Convert to tensors.
input_ids_Q = torch.cat(input_ids_Q, dim=0)
att_masks_Q = torch.cat(att_masks_Q, dim=0)

input_ids_A = torch.cat(input_ids_A, dim=0)
att_masks_A = torch.cat(att_masks_A, dim=0)

150


In [None]:
# Construct datasets
dataset_IE = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A)
dataset_SN = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A)
dataset_TF = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A)
dataset_JP = TensorDataset(input_ids_Q, att_masks_Q, input_ids_A, att_masks_A)

dataloader_IE = DataLoader (
    dataset_IE,
    sampler = SequentialSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = SequentialSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = SequentialSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = SequentialSampler(dataset_JP),
    batch_size = batch_size
)

In [None]:
model_IE = torch.load(model_IE_dir)
model_SN = torch.load(model_SN_dir)
model_TF = torch.load(model_TF_dir)
model_JP = torch.load(model_JP_dir)

model_IE.eval()
model_SN.eval()
model_TF.eval()
model_JP.eval()

preds_IE = []
preds_prob_IE = []
preds_SN = []
preds_prob_SN = []
preds_TF = []
preds_prob_TF = []
preds_JP = []
preds_prob_JP = []

# Predict I vs. E
for batch in dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_IE = preds_prob_IE + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_IE = preds_IE + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict S vs. N
for batch in dataloader_SN:
  batch = tuple(t.to(device) for t in batch)

  b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A = batch
  with torch.no_grad():
      b_out = model_SN(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_SN = preds_prob_SN + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_SN = preds_SN + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict T vs. F
for batch in dataloader_TF:
  batch = tuple(t.to(device) for t in batch)

  b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A = batch
  with torch.no_grad():
      b_out = model_TF(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_TF = preds_prob_TF + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_TF = preds_TF + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict J vs. P
for batch in dataloader_JP:
  batch = tuple(t.to(device) for t in batch)

  b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A = batch
  with torch.no_grad():
      b_out = model_JP(b_input_id_Q, b_input_mask_Q, b_input_id_A, b_input_mask_A)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_JP = preds_prob_JP + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_JP = preds_JP + np.argmax(b_out_np, axis=1).flatten().tolist()

In [None]:
idx = range(1, len(preds_IE) + 1)
preds = {'idx': idx,'I/E': preds_IE, 'S/N':preds_SN, 'T/F':preds_TF, 'J/P':preds_JP}
preds = pd.DataFrame(data=preds)
preds = preds.set_index('idx')
preds.to_csv('result.csv')

preds_prob = {'idx': idx,'I/E': preds_prob_IE, 'S/N':preds_prob_SN, 'T/F':preds_prob_TF, 'J/P':preds_prob_JP}
preds_prob = pd.DataFrame(data=preds_prob)
preds_prob = preds_prob.set_index('idx')
preds_prob.to_csv('result_prob.csv')

#### Google?

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

# Experiment: measure the maximum number of tokens.
max_len = 0
for train_sentence in training['Question']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

for train_sentence in training['Answer']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 465 in tokens, so set max_length as 600, safely
max_len = 500

input_ids = []
att_masks = []
labels_IE = []
labels_SN = []
labels_TF = []
labels_JP = []

# Preprocessing
for idx in training.index:
  question = training['Question'][idx]
  answer = training['Answer'][idx]
  mbti = training['MBTI'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  # Convert MBTI of string to list of integers.
  mbti = MBTI_to_vec(mbti)

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  labels_IE.append(torch.tensor([mbti[0]]))
  labels_SN.append(torch.tensor([mbti[1]]))
  labels_TF.append(torch.tensor([mbti[2]]))
  labels_JP.append(torch.tensor([mbti[3]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
labels_IE = torch.cat(labels_IE, dim=0)
labels_SN = torch.cat(labels_SN, dim=0)
labels_TF = torch.cat(labels_TF, dim=0)
labels_JP = torch.cat(labels_JP, dim=0)

In [None]:
print(input_ids[0])
print(att_masks[0])
print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

print(len(input_ids), len(att_masks), len(labels_IE))

In [None]:
import random
import numpy as np

# Fix the seeds
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, labels_IE)
dataset_SN = TensorDataset(input_ids, att_masks, labels_SN)
dataset_TF = TensorDataset(input_ids, att_masks, labels_TF)
dataset_JP = TensorDataset(input_ids, att_masks, labels_JP)

train_size = int(0.8 * len(dataset_IE))
val_size = int(0.1 * len(dataset_IE))
test_size = len(dataset_IE) - train_size - val_size

print(f"lengths are {train_size}:{val_size}:{test_size}")

# Split into train dataset, validation dataset and test dataset.
train_dataset_IE, val_dataset_IE, test_dataset_IE = random_split(dataset_IE, [train_size, val_size, test_size])
train_dataset_SN, val_dataset_SN, test_dataset_SN = random_split(dataset_SN, [train_size, val_size, test_size])
train_dataset_TF, val_dataset_TF, test_dataset_TF = random_split(dataset_TF, [train_size, val_size, test_size])
train_dataset_JP, val_dataset_JP, test_dataset_JP = random_split(dataset_JP, [train_size, val_size, test_size])

batch_size = 16 # 16 or 32

# Define dataloaders
train_dataloader_IE = DataLoader(
    train_dataset_IE,
    sampler = RandomSampler(train_dataset_IE),
    batch_size = batch_size
)

val_dataloader_IE = DataLoader (
    val_dataset_IE,
    sampler = SequentialSampler(val_dataset_IE),
    batch_size = batch_size
)

test_dataloader_IE = DataLoader (
    test_dataset_IE,
    sampler = SequentialSampler(test_dataset_IE),
    batch_size = batch_size
)

train_dataloader_SN = DataLoader(
    train_dataset_SN,
    sampler = RandomSampler(train_dataset_SN),
    batch_size = batch_size
)

val_dataloader_SN = DataLoader (
    val_dataset_SN,
    sampler = SequentialSampler(val_dataset_SN),
    batch_size = batch_size
)

test_dataloader_SN = DataLoader (
    test_dataset_SN,
    sampler = SequentialSampler(test_dataset_SN),
    batch_size = batch_size
)

train_dataloader_TF = DataLoader(
    train_dataset_TF,
    sampler = RandomSampler(train_dataset_TF),
    batch_size = batch_size
)

val_dataloader_TF = DataLoader (
    val_dataset_TF,
    sampler = SequentialSampler(val_dataset_TF),
    batch_size = batch_size
)

test_dataloader_TF = DataLoader (
    test_dataset_TF,
    sampler = SequentialSampler(test_dataset_TF),
    batch_size = batch_size
)

train_dataloader_JP = DataLoader(
    train_dataset_JP,
    sampler = RandomSampler(train_dataset_JP),
    batch_size = batch_size
)

val_dataloader_JP = DataLoader (
    val_dataset_JP,
    sampler = SequentialSampler(val_dataset_JP),
    batch_size = batch_size
)

test_dataloader_JP = DataLoader (
    test_dataset_JP,
    sampler = SequentialSampler(test_dataset_JP),
    batch_size = batch_size
)

In [None]:
from transformers import BertConfig, get_linear_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

class MBTIClassifier(nn.Module):
  def __init__ (self,
                bert,
                hidden_size=768,
                num_classes=2,
                dr_rate=None):
    super(MBTIClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.bert = bert
    self.lin = nn.Linear(hidden_size, 512)
    self.lin2 = nn.Linear(512, 256)
    self.lin3 = nn.Linear(256, 64)
    self.classifier = nn.Linear(64, num_classes)

    # Model 0.
    # self.classifier = nn.Linear(hidden_size, num_classes)

    # Model 1.
    # self.lin = nn.Linear(hidden_size, 256)
    # self.lin2 = nn.Linear(256, 128)
    # self.lin3 = nn.Linear(128, 64)
    # self.classifier = nn.Linear(64, num_classes)

    # Model 2.
    # self.lin = nn.Linear(hidden_size, 256)
    # self.lin2 = nn.Linear(256, 128)
    # self.lin3 = nn.Linear(128, 64)
    # self.lin4 = nn.Linear(64, num_classes)
    # self.classifier = nn.Softmax(dim=0)

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)
  
  def forward(self, input_ids, att_masks):
    bert_output = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    
    if self.dr_rate:
      dr_output = self.dropout(bert_output)
    else:
      dr_output = bert_output
    
    lin_output = self.lin(dr_output)
    lin2_output = self.lin2(lin_output)
    lin3_output = self.lin3(lin2_output)
    return self.classifier(lin3_output)

    # Model 0.
    # return self.classifier(dr_output)

    # Model 1.
    # lin_output = self.lin(dr_output)
    # lin2_output = self.lin2(lin_output)
    # lin3_output = self.lin3(lin2_output)
    # return self.classifier(lin3_output)

    # Model 2.
    # lin_output = self.lin(dr_output)
    # lin2_output = self.lin2(lin_output)
    # lin3_output = self.lin3(lin2_output)
    # lin4_output = self.lin4(lin3_output)
    # return self.classifier(lin4_output)

model_bert = BertModel.from_pretrained("bert-base-multilingual-uncased")
  
model_IE = MBTIClassifier(model_bert, dr_rate = 0.3)
model_SN = MBTIClassifier(model_bert, dr_rate = 0.3)
model_TF = MBTIClassifier(model_bert, dr_rate = 0.3)
model_JP = MBTIClassifier(model_bert, dr_rate = 0.3)

optimizer_IE = AdamW(model_IE.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )
optimizer_SN = AdamW(model_SN.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )
optimizer_TF = AdamW(model_TF.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )
optimizer_JP = AdamW(model_JP.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

epochs = 4 # 2 or 4

total_steps = len(train_dataloader_IE) * epochs

scheduler_IE = get_linear_schedule_with_warmup(optimizer_IE, num_warmup_steps = 0, num_training_steps = total_steps)
scheduler_SN = get_linear_schedule_with_warmup(optimizer_SN, num_warmup_steps = 0, num_training_steps = total_steps)
scheduler_TF = get_linear_schedule_with_warmup(optimizer_TF, num_warmup_steps = 0, num_training_steps = total_steps)
scheduler_JP = get_linear_schedule_with_warmup(optimizer_JP, num_warmup_steps = 0, num_training_steps = total_steps)

In [None]:
import numpy as np

device = torch.device("cuda")

model_IE.cuda()
model_SN.cuda()
model_TF.cuda()
model_JP.cuda()

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  preds = preds.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = np.argmax(labels, axis=1).flatten()
  return np.sum(preds_flat == labels_flat) / len(labels_flat)

# def calc_accuracy(X,Y):
#     max_vals, max_indices = torch.max(X, 1)
#     train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
#     return train_acc

In [None]:
for epoch in range(epochs):
  print(f"\n======= I/E : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_IE.train()

  for step, batch in enumerate(train_dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].float().to(device)

    optimizer_IE.zero_grad()

    b_out = model_IE(b_input_id, b_input_mask)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_IE.parameters(), 1.0)

    optimizer_IE.step()
    scheduler_IE.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  print("\n======= I/E : Validation =======")
  print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
  model_IE.eval()
  for step, batch in enumerate(val_dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_label = batch[2].to(device)

    b_out = model_IE(b_input_id, b_input_mask)
    val_acc += calc_accuracy(b_out, b_label)
  print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# Test
model_IE.eval()
preds = []
labels = []

for batch in test_dataloader_IE:
  batch = tuple(t.to(device) for t in batch)

  b_input_id, b_input_mask, b_label = batch
  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask)

  b_out_np = b_out.detach().cpu().numpy()
  b_label_np = b_label.detach().cpu().numpy()
  preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

preds = np.array(preds)
labels = np.array(labels)

print("\n======= I/E : Test =======")
print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

### 4. BASELINE + AGE (Submitted Model!)
This time, add 'age' as the input

#### 4-0. Data Setup
First of all, we should organize our data with some organized logics.  
In this section, I will load three datasets and aggregate them into two datasets: Questions, Training Data and Test Data.

In [None]:
questions = pd.read_excel(os.path.join(root_dir, questions_dir))
questions.drop(['index', 'index.1'], axis='columns', inplace=True)
display(questions.head())

Unnamed: 0,Question
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.


In [None]:
# Retrieve the original question in String, using the question number
def retrival_q(q_num):
  return questions.loc[q_num - 1]['Question']

# Unit Test
assert(retrival_q(1) == "주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁금해요.")


training = pd.read_csv(os.path.join(root_dir, train_dir), encoding="CP949")

# We will only use the question-answer pair, at this time.
training.drop(['Data_ID', 'User_ID', 'Gender'], axis='columns', inplace=True)

# Retreieve the original question!
training['Question'] = training['Q_number'].apply(retrival_q)
training.drop('Q_number', axis='columns', inplace=True)

# Reordering
training = training[['Question', 'Answer', 'Age', 'MBTI']]

training.head()

Unnamed: 0,Question,Answer,Age,MBTI
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,<아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...,30,INFP
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,<중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하...,30,INFP
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,<그렇다> 감정 이입이 잘되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...,30,INFP
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,<중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다. 일의 변수가 생길 수 있고...,30,INFP
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,<아니다> 평정심을 유지 못 하는 편입니다. 머릿속은 백지화가 된 상태로 말도 제대...,30,INFP


In [None]:
testing = pd.read_csv(os.path.join(root_dir, test_dir), encoding="CP949")

# Again, we will only use the question-answer pair, at this moment.
testing.drop(['Gender'], axis='columns', inplace=True)

# Retreieve the original question!
testing['Question'] = testing['Q_number'].apply(retrival_q)
testing.drop('Q_number', axis='columns', inplace=True)

# Reordering
testing = testing[['Question', 'Answer', 'Age']]

testing.head()

Unnamed: 0,Question,Answer,Age
0,마감 기한을 지키기가 힘든가요? 경험을 이야기해보아요.,<아니다> 저는 모든 일은 정해진 시간을 지켜서 해야 된다고 생각되어서 마감 기한을...,30
1,조용하고 사적인 장소보다는 사람들로 붐비고 떠들썩한 장소를 좋아하나요? 답변에 대한...,<중립> 저는 조용하고 사적인 장소도 좋아하고 사람들로 붐비고 떠들썩한 장소도 좋아...,40
2,단계를 건너뛰는 일 없이 절차대로 일을 완수하는 편인가요? 그러한 최근 경험은 어떤...,<그렇다> 저는 규칙을 잘 지키고 매뉴얼 대로 일하는 사람입니다. 그래서 데이터 라...,40
3,일이 원하는 대로 진행될 것이라는 자신감이 있나요? 그렇게 된 계기나 이유가 있나요.,<그렇다> 저는 항상 긍정적인 사고방식을 가지고 살려고 노력하고 있습니다. 이유는 ...,40
4,대부분의 시간을 혼자서 일할 수 있는 직업을 원하나요? 이유도 말씀해주세요.,<중립> 혼자서 일하는 것도 좋고 함께 일하는 것도 모두 좋은데 같이 의논하는 일도...,30


#### 4-1. Preprocessing
For utilizing BERT, we have to satisify its own preprocessing requirements.  
For example, it requires speical tokens such as SEP and CLS in the input.  
But don't worry, since these requirements can be simply satisfied if we use the BERT Tokenizer.


In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for train_sentence in training['Question']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

for train_sentence in training['Answer']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 206 in tokens, so set max_length as 256, safely
max_len = 256

input_ids = []
att_masks = []
ages = []
labels_IE = []
labels_SN = []
labels_TF = []
labels_JP = []

# Preprocessing
for idx in training.index:
  question = training['Question'][idx]
  answer = training['Answer'][idx]
  age = training['Age'][idx]
  mbti = training['MBTI'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  # Convert MBTI of string to list of integers.
  mbti = MBTI_to_vec(mbti)

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  ages.append(torch.tensor([[age]]))
  labels_IE.append(torch.tensor([mbti[0]]))
  labels_SN.append(torch.tensor([mbti[1]]))
  labels_TF.append(torch.tensor([mbti[2]]))
  labels_JP.append(torch.tensor([mbti[3]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
ages = torch.cat(ages, dim=0)
labels_IE = torch.cat(labels_IE, dim=0)
labels_SN = torch.cat(labels_SN, dim=0)
labels_TF = torch.cat(labels_TF, dim=0)
labels_JP = torch.cat(labels_JP, dim=0)

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

206


In [None]:
print(input_ids[0])
print(att_masks[0])
print(ages[0])
print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

print(len(input_ids), len(att_masks), len(ages), len(labels_IE))

tensor([    2, 25753, 14567, 28897, 18069, 14526,  2033, 19742, 22742,  8082,
        31724,  3463, 32771,  8061, 19773, 16941, 24296,  8055,  2016,     3,
         2030, 15345,  2032, 18430,  3463,  5724,  8423, 26850, 20699, 14204,
        15916, 17729, 25878, 18895, 14045, 27024,  8107, 28669,  8120,  6266,
        24832,  2016,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
import random
import numpy as np

# Fix the seeds
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#### 4-2. Data Split
Currently, we do not have the answers for testing dataset, so we must split the training data to evaluate our model. (18:1:1)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, ages, labels_IE)
dataset_SN = TensorDataset(input_ids, att_masks, ages, labels_SN)
dataset_TF = TensorDataset(input_ids, att_masks, ages, labels_TF)
dataset_JP = TensorDataset(input_ids, att_masks, ages, labels_JP)

batch_size = 16 # 16 or 32

# Define dataloaders
dataloader_IE = DataLoader (
    dataset_IE,
    sampler = RandomSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = RandomSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = RandomSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = RandomSampler(dataset_JP),
    batch_size = batch_size
)

# ""train_size = int(0.9 * len(dataset_IE))
# val_size = int(0.05 * len(dataset_IE))
# test_size = len(dataset_IE) - train_size - val_size

# print(f"lengths are {train_size}:{val_size}:{test_size}")

# # Split into train dataset, validation dataset and test dataset.
# train_dataset_IE, val_dataset_IE, test_dataset_IE = random_split(dataset_IE, [train_size, val_size, test_size])
# train_dataset_SN, val_dataset_SN, test_dataset_SN = random_split(dataset_SN, [train_size, val_size, test_size])
# train_dataset_TF, val_dataset_TF, test_dataset_TF = random_split(dataset_TF, [train_size, val_size, test_size])
# train_dataset_JP, val_dataset_JP, test_dataset_JP = random_split(dataset_JP, [train_size, val_size, test_size])

# batch_size = 16 # 16 or 32

# # Define dataloaders
# train_dataloader_IE = DataLoader(
#     train_dataset_IE,
#     sampler = RandomSampler(train_dataset_IE),
#     batch_size = batch_size
# )

# val_dataloader_IE = DataLoader (
#     val_dataset_IE,
#     sampler = SequentialSampler(val_dataset_IE),
#     batch_size = batch_size
# )

# test_dataloader_IE = DataLoader (
#     test_dataset_IE,
#     sampler = SequentialSampler(test_dataset_IE),
#     batch_size = batch_size
# )

# train_dataloader_SN = DataLoader(
#     train_dataset_SN,
#     sampler = RandomSampler(train_dataset_SN),
#     batch_size = batch_size
# )

# val_dataloader_SN = DataLoader (
#     val_dataset_SN,
#     sampler = SequentialSampler(val_dataset_SN),
#     batch_size = batch_size
# )

# test_dataloader_SN = DataLoader (
#     test_dataset_SN,
#     sampler = SequentialSampler(test_dataset_SN),
#     batch_size = batch_size
# )

# train_dataloader_TF = DataLoader(
#     train_dataset_TF,
#     sampler = RandomSampler(train_dataset_TF),
#     batch_size = batch_size
# )

# val_dataloader_TF = DataLoader (
#     val_dataset_TF,
#     sampler = SequentialSampler(val_dataset_TF),
#     batch_size = batch_size
# )

# test_dataloader_TF = DataLoader (
#     test_dataset_TF,
#     sampler = SequentialSampler(test_dataset_TF),
#     batch_size = batch_size
# )

# train_dataloader_JP = DataLoader(
#     train_dataset_JP,
#     sampler = RandomSampler(train_dataset_JP),
#     batch_size = batch_size
# )

# val_dataloader_JP = DataLoader (
#     val_dataset_JP,
#     sampler = SequentialSampler(val_dataset_JP),
#     batch_size = batch_size
# )

# test_dataloader_JP = DataLoader (
#     test_dataset_JP,
#     sampler = SequentialSampler(test_dataset_JP),
#     batch_size = batch_size
# )""

#### 4-3. Model Definition
Now we require a model written in Torch package.  
Because we decided to **fine-tune** the pretrained BERT model, I'm going to add just one layer on top of the pretrained model.

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

class MBTIClassifier(nn.Module):
  def __init__ (self,
                bert,
                hidden_size=768,
                num_classes=2,
                dr_rate=None):
    super(MBTIClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.bert = bert
    self.linstr = nn.Linear(hidden_size, (int)(hidden_size * (127 / 128)))
    self.linage = nn.Linear(1, hidden_size - (int)(hidden_size * (127 / 128)))
    self.lin = nn.Linear(hidden_size, 128)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(128, num_classes)
    self.classifier = nn.Softmax(dim = 1)

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)
  
  def forward(self, input_ids, att_masks, ages):
    bert_output = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    
    if self.dr_rate:
      dr_output = self.dropout(bert_output)
    else:
      dr_output = bert_output

    linstr_output = self.linstr(dr_output)
    linage_output = self.linage(ages)

    age_added = torch.cat((linstr_output, linage_output), dim=1)

    lin_output = self.lin(age_added)
    relu_output = self.relu(lin_output)
    lin2_output = self.lin2(relu_output)
    
    return self.classifier(lin2_output)

model_bert_IE = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP = BertModel.from_pretrained("kykim/bert-kor-base")
  
model_IE = MBTIClassifier(model_bert_IE, dr_rate = 0.3)
model_SN = MBTIClassifier(model_bert_SN, dr_rate = 0.3)
model_TF = MBTIClassifier(model_bert_TF, dr_rate = 0.3)
model_JP = MBTIClassifier(model_bert_JP, dr_rate = 0.3)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters_IE = [
    {'params': [p for n, p in model_IE.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_IE.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_SN = [
    {'params': [p for n, p in model_SN.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_SN.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_TF = [
    {'params': [p for n, p in model_TF.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_TF.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_JP = [
    {'params': [p for n, p in model_JP.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_JP.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

learning_rate = 2e-5

optimizer_IE = AdamW(optimizer_grouped_parameters_IE,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_SN = AdamW(optimizer_grouped_parameters_SN,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_TF = AdamW(optimizer_grouped_parameters_TF,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_JP = AdamW(optimizer_grouped_parameters_JP,
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 4 # 2 or 4

total_steps = len(dataloader_IE) * epochs
warmup_steps = int(total_steps * 0.1)

scheduler_IE = get_cosine_schedule_with_warmup(optimizer_IE, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_SN = get_cosine_schedule_with_warmup(optimizer_SN, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_TF = get_cosine_schedule_with_warmup(optimizer_TF, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_JP = get_cosine_schedule_with_warmup(optimizer_JP, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relation

#### 4-4. Training
Finally, we can do train our model!  
Let's see how accurate our model is :)

In [None]:
import numpy as np

device = torch.device("cuda")

model_IE.cuda()
model_SN.cuda()
model_TF.cuda()
model_JP.cuda()

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  preds = preds.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = np.argmax(labels, axis=1).flatten()
  return np.sum(preds_flat == labels_flat) / len(labels_flat)

##### 4-4-1. I vs. E

In [None]:
for epoch in range(epochs):
  print(f"\n======= I/E : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_IE.train()

  for step, batch in enumerate(dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_age = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_IE.zero_grad()

    b_out = model_IE(b_input_id, b_input_mask, b_age)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_IE.parameters(), 1.0)

    optimizer_IE.step()
    scheduler_IE.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))

torch.save(model_IE, model_IE_dir)
del(model_IE)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.7194489240646362 train acc 0.3125
epoch 1 batch id 129 loss 0.6795001029968262 train acc 0.5310077519379846
epoch 1 batch id 257 loss 0.6759179830551147 train acc 0.5445038910505836
epoch 1 batch id 385 loss 0.6805075407028198 train acc 0.5573051948051948
epoch 1 batch id 513 loss 0.7062772512435913 train acc 0.5666423001949318
epoch 1 batch id 641 loss 0.60877525806427 train acc 0.5750780031201248

epoch 2 batch id 1 loss 0.6210188269615173 train acc 0.75
epoch 2 batch id 129 loss 0.6021895408630371 train acc 0.6778100775193798
epoch 2 batch id 257 loss 0.6020365357398987 train acc 0.6629377431906615
epoch 2 batch id 385 loss 0.6168845295906067 train acc 0.6618506493506493
epoch 2 batch id 513 loss 0.5124933123588562 train acc 0.6648391812865497
epoch 2 batch id 641 loss 0.5566591620445251 train acc 0.6646840873634945

epoch 3 batch id 1 loss 0.5883232951164246 train acc 0.75
epoch 3 batch id 129 loss 0.46246060729026794 train acc 0.751453488372093
epoch 3 b

##### 4-4-2. S vs. N

In [None]:
for epoch in range(epochs):
  print(f"\n======= S/N : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_SN.train()

  for step, batch in enumerate(dataloader_SN):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_age = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_SN.zero_grad()

    b_out = model_SN(b_input_id, b_input_mask, b_age)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_SN.parameters(), 1.0)

    optimizer_SN.step()
    scheduler_SN.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
  #   print("\n======= S/N : Validation =======")
  #   print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
    
  #   model_SN.eval()
  #   for step, batch in enumerate(val_dataloader_SN):
  #     b_input_id = batch[0].to(device)
  #     b_input_mask = batch[1].to(device)
  #     b_age = batch[2].to(device)
  #     b_label = batch[3].to(device)

  #     b_out = model_IE(b_input_id, b_input_mask, b_age)
  #     val_acc += calc_accuracy(b_out, b_label)
  #   print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

  # # Test
  # model_SN.eval()
  # preds = []
  # labels = []

  # for batch in test_dataloader_SN:
  #   batch = tuple(t.to(device) for t in batch)

  #   b_input_id, b_input_mask, b_age, b_label = batch
  #   with torch.no_grad():
  #       b_out = model_IE(b_input_id, b_input_mask, b_age)

  #   b_out_np = b_out.detach().cpu().numpy()
  #   b_label_np = b_label.detach().cpu().numpy()
  #   preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
  #   labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

  # preds = np.array(preds)
  # labels = np.array(labels)

  # print("\n======= S/N : Test =======")
  # print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

torch.save(model_SN, model_SN_dir)
del(model_SN)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.687963604927063 train acc 0.5
epoch 1 batch id 129 loss 0.7039042115211487 train acc 0.5067829457364341
epoch 1 batch id 257 loss 0.6926409006118774 train acc 0.5175097276264592
epoch 1 batch id 385 loss 0.7238081693649292 train acc 0.5188311688311689
epoch 1 batch id 513 loss 0.6786636710166931 train acc 0.5142543859649122
epoch 1 batch id 641 loss 0.6723315715789795 train acc 0.5141380655226209

epoch 2 batch id 1 loss 0.6336004137992859 train acc 0.75
epoch 2 batch id 129 loss 0.671259880065918 train acc 0.6027131782945736
epoch 2 batch id 257 loss 0.7547507286071777 train acc 0.601896887159533
epoch 2 batch id 385 loss 0.6676262021064758 train acc 0.5998376623376623
epoch 2 batch id 513 loss 0.7391057014465332 train acc 0.6048976608187134
epoch 2 batch id 641 loss 0.6585123538970947 train acc 0.6054017160686428

epoch 3 batch id 1 loss 0.6320949196815491 train acc 0.625
epoch 3 batch id 129 loss 0.5564478635787964 train acc 0.6957364341085271
epoch 3 batc

##### 4-4-3. T vs. F

In [None]:
for epoch in range(epochs):
  print(f"\n======= T/F : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_TF.train()

  for step, batch in enumerate(dataloader_TF):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_age = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_TF.zero_grad()

    b_out = model_TF(b_input_id, b_input_mask, b_age)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_TF.parameters(), 1.0)

    optimizer_TF.step()
    scheduler_TF.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
#   print("\n======= T/F : Validation =======")
#   print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
#   model_TF.eval()
#   for step, batch in enumerate(val_dataloader_TF):
#     b_input_id = batch[0].to(device)
#     b_input_mask = batch[1].to(device)
#     b_age = batch[2].to(device)
#     b_label = batch[3].to(device)

#     b_out = model_TF(b_input_id, b_input_mask, b_age)
#     val_acc += calc_accuracy(b_out, b_label)
#   print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# # Test
# model_TF.eval()
# preds = []
# labels = []

# for batch in test_dataloader_TF:
#   batch = tuple(t.to(device) for t in batch)

#   b_input_id, b_input_mask, b_age, b_label = batch
#   with torch.no_grad():
#       b_out = model_TF(b_input_id, b_input_mask, b_age)

#   b_out_np = b_out.detach().cpu().numpy()
#   b_label_np = b_label.detach().cpu().numpy()
#   preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
#   labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

# preds = np.array(preds)
# labels = np.array(labels)

# print("\n======= T/F : Test =======")
# print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

torch.save(model_TF, model_TF_dir)
del(model_TF)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6931931376457214 train acc 0.4375
epoch 1 batch id 129 loss 0.6895471811294556 train acc 0.5092054263565892
epoch 1 batch id 257 loss 0.6907410621643066 train acc 0.5080252918287937
epoch 1 batch id 385 loss 0.668999969959259 train acc 0.5178571428571429
epoch 1 batch id 513 loss 0.6740896701812744 train acc 0.533625730994152
epoch 1 batch id 641 loss 0.6554639339447021 train acc 0.5405616224648986

epoch 2 batch id 1 loss 0.7444543838500977 train acc 0.4375
epoch 2 batch id 129 loss 0.5511295795440674 train acc 0.6182170542635659
epoch 2 batch id 257 loss 0.661710262298584 train acc 0.6164883268482491
epoch 2 batch id 385 loss 0.7881341576576233 train acc 0.6196428571428572
epoch 2 batch id 513 loss 0.6297006011009216 train acc 0.6207358674463938
epoch 2 batch id 641 loss 0.6734533905982971 train acc 0.6229524180967239

epoch 3 batch id 1 loss 0.6386224031448364 train acc 0.75
epoch 3 batch id 129 loss 0.6151292324066162 train acc 0.7059108527131783
epoch 3 

##### 4-4-4. J vs. P

In [None]:
for epoch in range(epochs):
  print(f"\n======= J/P : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_JP.train()

  for step, batch in enumerate(dataloader_JP):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_age = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_JP.zero_grad()

    b_out = model_JP(b_input_id, b_input_mask, b_age)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_JP.parameters(), 1.0)

    optimizer_JP.step()
    scheduler_JP.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
#   print("\n======= J/P : Validation =======")
#   print("epoch {} train acc {}".format(epoch + 1, train_acc / (step + 1)))
  
#   model_JP.eval()
#   for step, batch in enumerate(val_dataloader_SN):
#     b_input_id = batch[0].to(device)
#     b_input_mask = batch[1].to(device)
#     b_age = batch[2].to(device)
#     b_label = batch[3].to(device)

#     b_out = model_JP(b_input_id, b_input_mask, b_age)
#     val_acc += calc_accuracy(b_out, b_label)
#   print("epoch {} validation acc {}".format(epoch + 1, val_acc / (step + 1)))

# # Test
# model_JP.eval()
# preds = []
# labels = []

# for batch in test_dataloader_JP:
#   batch = tuple(t.to(device) for t in batch)

#   b_input_id, b_input_mask, b_age, b_label = batch
#   with torch.no_grad():
#       b_out = model_JP(b_input_id, b_input_mask, b_age)

#   b_out_np = b_out.detach().cpu().numpy()
#   b_label_np = b_label.detach().cpu().numpy()
#   preds = preds + np.argmax(b_out_np, axis=1).flatten().tolist()
#   labels = labels + np.argmax(b_label_np, axis=1).flatten().tolist()

# preds = np.array(preds)
# labels = np.array(labels)

# print("\n======= J/P : Test =======")
# print(f"Test Accuracy: {np.sum(preds == labels) / len(labels)}")

torch.save(model_JP, model_JP_dir)
del(model_JP)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6815391778945923 train acc 0.6875
epoch 1 batch id 129 loss 0.7205455303192139 train acc 0.4844961240310077
epoch 1 batch id 257 loss 0.6941466331481934 train acc 0.48662451361867703
epoch 1 batch id 385 loss 0.708249568939209 train acc 0.5030844155844156
epoch 1 batch id 513 loss 0.6934715509414673 train acc 0.5120614035087719
epoch 1 batch id 641 loss 0.7067888975143433 train acc 0.5218408736349454

epoch 2 batch id 1 loss 0.6871119141578674 train acc 0.4375
epoch 2 batch id 129 loss 0.5505716800689697 train acc 0.6061046511627907
epoch 2 batch id 257 loss 0.6656162738800049 train acc 0.6077334630350194
epoch 2 batch id 385 loss 0.6202939748764038 train acc 0.6092532467532468
epoch 2 batch id 513 loss 0.7046873569488525 train acc 0.6122076023391813
epoch 2 batch id 641 loss 0.7349381446838379 train acc 0.6135920436817472

epoch 3 batch id 1 loss 0.6387790441513062 train acc 0.6875
epoch 3 batch id 129 loss 0.5102549195289612 train acc 0.7122093023255814
epo

#### 4-5. Export Results
Using our trained model, produce the output for real test inputs (variable`testing`)

In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for test_sentence in testing['Question']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

for test_sentence in testing['Answer']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 105 in tokens, but set max_length as 256, as before
max_len = 256

input_ids = []
att_masks = []
ages = []

# Preprocessing
for idx in testing.index:
  question = testing['Question'][idx]
  answer = testing['Answer'][idx]
  age = training['Age'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  ages.append(torch.tensor([[age]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
ages = torch.cat(ages, dim=0)

150


In [None]:
# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, ages)
dataset_SN = TensorDataset(input_ids, att_masks, ages)
dataset_TF = TensorDataset(input_ids, att_masks, ages)
dataset_JP = TensorDataset(input_ids, att_masks, ages)

dataloader_IE = DataLoader (
    dataset_IE,
    sampler = SequentialSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = SequentialSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = SequentialSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = SequentialSampler(dataset_JP),
    batch_size = batch_size
)

In [None]:
model_IE = torch.load(model_IE_dir)
model_SN = torch.load(model_SN_dir)
model_TF = torch.load(model_TF_dir)
model_JP = torch.load(model_JP_dir)

model_IE.eval()
model_SN.eval()
model_TF.eval()
model_JP.eval()

preds_IE = []
preds_prob_IE = []
preds_SN = []
preds_prob_SN = []
preds_TF = []
preds_prob_TF = []
preds_JP = []
preds_prob_JP = []

# Predict I vs. E
for batch in dataloader_IE:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_age = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask, b_age)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_IE = preds_prob_IE + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_IE = preds_IE + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict S vs. N
for batch in dataloader_SN:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_age = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask, b_age)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_SN = preds_prob_SN + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_SN = preds_SN + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict T vs. F
for batch in dataloader_TF:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_age = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask, b_age)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_TF = preds_prob_TF + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_TF = preds_TF + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict J vs. P
for batch in dataloader_JP:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_age = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask, b_age)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_JP = preds_prob_JP + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_JP = preds_JP + np.argmax(b_out_np, axis=1).flatten().tolist()

In [None]:
idx = range(1, len(preds_IE) + 1)

preds = {'idx': idx,'I/E': preds_IE, 'S/N':preds_SN, 'T/F':preds_TF, 'J/P':preds_JP}
preds = pd.DataFrame(data=preds)
preds = preds.set_index('idx')
preds.to_csv('result.csv')

preds_prob = {'idx': idx,'I/E': preds_prob_IE, 'S/N':preds_prob_SN, 'T/F':preds_prob_TF, 'J/P':preds_prob_JP}
preds_prob = pd.DataFrame(data=preds_prob)
preds_prob = preds_prob.set_index('idx')
preds_prob.to_csv('result_prob.csv')

### 5. BASELINE + GENDER
This time, add 'gender' as the input

#### 5-0. Data Setup
First of all, we should organize our data with some organized logics.  
In this section, I will load three datasets and aggregate them into two datasets: Questions, Training Data and Test Data.

In [None]:
questions = pd.read_excel(os.path.join(root_dir, questions_dir))
questions.drop(['index', 'index.1'], axis='columns', inplace=True)
display(questions.head())

Unnamed: 0,Question
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.


In [None]:
# Retrieve the original question in String, using the question number
def retrival_q(q_num):
  return questions.loc[q_num - 1]['Question']

# Unit Test
assert(retrival_q(1) == "주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁금해요.")


training = pd.read_csv(os.path.join(root_dir, train_dir), encoding="CP949")

# We will only use the question-answer pair, at this time.
training.drop(['Data_ID', 'User_ID', 'Age'], axis='columns', inplace=True)

# Retreieve the original question!
training['Question'] = training['Q_number'].apply(retrival_q)
training.drop('Q_number', axis='columns', inplace=True)

# Reordering
training = training[['Question', 'Answer', 'Gender', 'MBTI']]

training.head()

Unnamed: 0,Question,Answer,Gender,MBTI
0,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,<아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...,1,INFP
1,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,<중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하...,1,INFP
2,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,<그렇다> 감정 이입이 잘되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...,1,INFP
3,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,<중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다. 일의 변수가 생길 수 있고...,1,INFP
4,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,<아니다> 평정심을 유지 못 하는 편입니다. 머릿속은 백지화가 된 상태로 말도 제대...,1,INFP


In [None]:
testing = pd.read_csv(os.path.join(root_dir, test_dir), encoding="CP949")

# Again, we will only use the question-answer pair, at this moment.
testing.drop(['Age'], axis='columns', inplace=True)

# Retreieve the original question!
testing['Question'] = testing['Q_number'].apply(retrival_q)
testing.drop('Q_number', axis='columns', inplace=True)

# Reordering
testing = testing[['Question', 'Answer', 'Gender']]

testing.head()

Unnamed: 0,Question,Answer,Gender
0,마감 기한을 지키기가 힘든가요? 경험을 이야기해보아요.,<아니다> 저는 모든 일은 정해진 시간을 지켜서 해야 된다고 생각되어서 마감 기한을...,0
1,조용하고 사적인 장소보다는 사람들로 붐비고 떠들썩한 장소를 좋아하나요? 답변에 대한...,<중립> 저는 조용하고 사적인 장소도 좋아하고 사람들로 붐비고 떠들썩한 장소도 좋아...,1
2,단계를 건너뛰는 일 없이 절차대로 일을 완수하는 편인가요? 그러한 최근 경험은 어떤...,<그렇다> 저는 규칙을 잘 지키고 매뉴얼 대로 일하는 사람입니다. 그래서 데이터 라...,1
3,일이 원하는 대로 진행될 것이라는 자신감이 있나요? 그렇게 된 계기나 이유가 있나요.,<그렇다> 저는 항상 긍정적인 사고방식을 가지고 살려고 노력하고 있습니다. 이유는 ...,1
4,대부분의 시간을 혼자서 일할 수 있는 직업을 원하나요? 이유도 말씀해주세요.,<중립> 혼자서 일하는 것도 좋고 함께 일하는 것도 모두 좋은데 같이 의논하는 일도...,1


#### 5-1. Preprocessing
For utilizing BERT, we have to satisify its own preprocessing requirements.  
For example, it requires speical tokens such as SEP and CLS in the input.  
But don't worry, since these requirements can be simply satisfied if we use the BERT Tokenizer.


In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for train_sentence in training['Question']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

for train_sentence in training['Answer']:
  tok = tokenizer_bert.encode(train_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 206 in tokens, so set max_length as 256, safely
max_len = 256

input_ids = []
att_masks = []
genders = []
labels_IE = []
labels_SN = []
labels_TF = []
labels_JP = []

# Preprocessing
for idx in training.index:
  question = training['Question'][idx]
  answer = training['Answer'][idx]
  gender = training['Gender'][idx]
  mbti = training['MBTI'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  # Convert MBTI of string to list of integers.
  mbti = MBTI_to_vec(mbti)

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  genders.append(torch.tensor([[gender]]))
  labels_IE.append(torch.tensor([mbti[0]]))
  labels_SN.append(torch.tensor([mbti[1]]))
  labels_TF.append(torch.tensor([mbti[2]]))
  labels_JP.append(torch.tensor([mbti[3]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
genders = torch.cat(genders, dim=0)
labels_IE = torch.cat(labels_IE, dim=0)
labels_SN = torch.cat(labels_SN, dim=0)
labels_TF = torch.cat(labels_TF, dim=0)
labels_JP = torch.cat(labels_JP, dim=0)

206


In [None]:
print(input_ids[0])
print(att_masks[0])
print(genders[0])
print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

print(len(input_ids), len(att_masks), len(genders), len(labels_IE))

tensor([    2, 25753, 14567, 28897, 18069, 14526,  2033, 19742, 22742,  8082,
        31724,  3463, 32771,  8061, 19773, 16941, 24296,  8055,  2016,     3,
         2030, 15345,  2032, 18430,  3463,  5724,  8423, 26850, 20699, 14204,
        15916, 17729, 25878, 18895, 14045, 27024,  8107, 28669,  8120,  6266,
        24832,  2016,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
import random
import numpy as np

# Fix the seeds
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

#### 5-2. Data Split
Currently, we do not have the answers for testing dataset, so we must split the training data to evaluate our model. (18:1:1)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, genders, labels_IE)
dataset_SN = TensorDataset(input_ids, att_masks, genders, labels_SN)
dataset_TF = TensorDataset(input_ids, att_masks, genders, labels_TF)
dataset_JP = TensorDataset(input_ids, att_masks, genders, labels_JP)

batch_size = 16 # 16 or 32

# Define dataloaders
dataloader_IE = DataLoader (
    dataset_IE,
    sampler = RandomSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = RandomSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = RandomSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = RandomSampler(dataset_JP),
    batch_size = batch_size
)

# ""train_size = int(0.9 * len(dataset_IE))
# val_size = int(0.05 * len(dataset_IE))
# test_size = len(dataset_IE) - train_size - val_size

# print(f"lengths are {train_size}:{val_size}:{test_size}")

# # Split into train dataset, validation dataset and test dataset.
# train_dataset_IE, val_dataset_IE, test_dataset_IE = random_split(dataset_IE, [train_size, val_size, test_size])
# train_dataset_SN, val_dataset_SN, test_dataset_SN = random_split(dataset_SN, [train_size, val_size, test_size])
# train_dataset_TF, val_dataset_TF, test_dataset_TF = random_split(dataset_TF, [train_size, val_size, test_size])
# train_dataset_JP, val_dataset_JP, test_dataset_JP = random_split(dataset_JP, [train_size, val_size, test_size])

# batch_size = 16 # 16 or 32

# # Define dataloaders
# train_dataloader_IE = DataLoader(
#     train_dataset_IE,
#     sampler = RandomSampler(train_dataset_IE),
#     batch_size = batch_size
# )

# val_dataloader_IE = DataLoader (
#     val_dataset_IE,
#     sampler = SequentialSampler(val_dataset_IE),
#     batch_size = batch_size
# )

# test_dataloader_IE = DataLoader (
#     test_dataset_IE,
#     sampler = SequentialSampler(test_dataset_IE),
#     batch_size = batch_size
# )

# train_dataloader_SN = DataLoader(
#     train_dataset_SN,
#     sampler = RandomSampler(train_dataset_SN),
#     batch_size = batch_size
# )

# val_dataloader_SN = DataLoader (
#     val_dataset_SN,
#     sampler = SequentialSampler(val_dataset_SN),
#     batch_size = batch_size
# )

# test_dataloader_SN = DataLoader (
#     test_dataset_SN,
#     sampler = SequentialSampler(test_dataset_SN),
#     batch_size = batch_size
# )

# train_dataloader_TF = DataLoader(
#     train_dataset_TF,
#     sampler = RandomSampler(train_dataset_TF),
#     batch_size = batch_size
# )

# val_dataloader_TF = DataLoader (
#     val_dataset_TF,
#     sampler = SequentialSampler(val_dataset_TF),
#     batch_size = batch_size
# )

# test_dataloader_TF = DataLoader (
#     test_dataset_TF,
#     sampler = SequentialSampler(test_dataset_TF),
#     batch_size = batch_size
# )

# train_dataloader_JP = DataLoader(
#     train_dataset_JP,
#     sampler = RandomSampler(train_dataset_JP),
#     batch_size = batch_size
# )

# val_dataloader_JP = DataLoader (
#     val_dataset_JP,
#     sampler = SequentialSampler(val_dataset_JP),
#     batch_size = batch_size
# )

# test_dataloader_JP = DataLoader (
#     test_dataset_JP,
#     sampler = SequentialSampler(test_dataset_JP),
#     batch_size = batch_size
# )""

#### 5-3. Model Definition
Now we require a model written in Torch package.  
Because we decided to **fine-tune** the pretrained BERT model, I'm going to add just one layer on top of the pretrained model.

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

class MBTIClassifier(nn.Module):
  def __init__ (self,
                bert,
                hidden_size=768,
                num_classes=2,
                dr_rate=None):
    super(MBTIClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.bert = bert
    self.linstr = nn.Linear(hidden_size, (int)(hidden_size * (127 / 128)))
    self.lingen = nn.Linear(1, hidden_size - (int)(hidden_size * (127 / 128)))
    self.lin = nn.Linear(hidden_size, 128)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(128, num_classes)
    self.classifier = nn.Softmax(dim = 1)

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)
  
  def forward(self, input_ids, att_masks, gens):
    bert_output = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    
    if self.dr_rate:
      dr_output = self.dropout(bert_output)
    else:
      dr_output = bert_output

    linstr_output = self.linstr(dr_output)
    lingen_output = self.lingen(gens)

    gen_added = torch.cat((linstr_output, lingen_output), dim=1)

    lin_output = self.lin(gen_added)
    relu_output = self.relu(lin_output)
    lin2_output = self.lin2(relu_output)
    
    return self.classifier(lin2_output)

model_bert_IE = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP = BertModel.from_pretrained("kykim/bert-kor-base")
  
model_IE = MBTIClassifier(model_bert_IE, dr_rate = 0.3)
model_SN = MBTIClassifier(model_bert_SN, dr_rate = 0.3)
model_TF = MBTIClassifier(model_bert_TF, dr_rate = 0.3)
model_JP = MBTIClassifier(model_bert_JP, dr_rate = 0.3)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters_IE = [
    {'params': [p for n, p in model_IE.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_IE.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_SN = [
    {'params': [p for n, p in model_SN.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_SN.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_TF = [
    {'params': [p for n, p in model_TF.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_TF.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_JP = [
    {'params': [p for n, p in model_JP.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_JP.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

learning_rate = 2e-5

optimizer_IE = AdamW(optimizer_grouped_parameters_IE,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_SN = AdamW(optimizer_grouped_parameters_SN,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_TF = AdamW(optimizer_grouped_parameters_TF,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_JP = AdamW(optimizer_grouped_parameters_JP,
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 4 # 2 or 4

total_steps = len(dataloader_IE) * epochs
warmup_steps = int(total_steps * 0.1)

scheduler_IE = get_cosine_schedule_with_warmup(optimizer_IE, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_SN = get_cosine_schedule_with_warmup(optimizer_SN, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_TF = get_cosine_schedule_with_warmup(optimizer_TF, num_warmup_steps = warmup_steps, num_training_steps = total_steps)
scheduler_JP = get_cosine_schedule_with_warmup(optimizer_JP, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.

#### 5-4. Training
Finally, we can do train our model!  
Let's see how accurate our model is :)

In [None]:
import numpy as np

device = torch.device("cuda")

model_IE.cuda()
model_SN.cuda()
model_TF.cuda()
model_JP.cuda()

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  preds = preds.detach().cpu().numpy()
  labels = labels.detach().cpu().numpy()
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = np.argmax(labels, axis=1).flatten()
  return np.sum(preds_flat == labels_flat) / len(labels_flat)

##### 5-4-1. I vs. E

In [None]:
for epoch in range(epochs):
  print(f"\n======= I/E : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_IE.train()

  for step, batch in enumerate(dataloader_IE):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_gender = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_IE.zero_grad()

    b_out = model_IE(b_input_id, b_input_mask, b_gender)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_IE.parameters(), 1.0)

    optimizer_IE.step()
    scheduler_IE.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))

torch.save(model_IE, model_IE_dir)
del(model_IE)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.7001798748970032 train acc 0.4375
epoch 1 batch id 129 loss 0.7016459107398987 train acc 0.5116279069767442
epoch 1 batch id 257 loss 0.6852167844772339 train acc 0.5055933852140078
epoch 1 batch id 385 loss 0.7021611928939819 train acc 0.5209415584415584
epoch 1 batch id 513 loss 0.7122368216514587 train acc 0.5285087719298246
epoch 1 batch id 641 loss 0.6721355319023132 train acc 0.5349063962558502

epoch 2 batch id 1 loss 0.6485157012939453 train acc 0.75
epoch 2 batch id 129 loss 0.6639285087585449 train acc 0.6216085271317829
epoch 2 batch id 257 loss 0.5877779722213745 train acc 0.6223249027237354
epoch 2 batch id 385 loss 0.6351543068885803 train acc 0.6313311688311688
epoch 2 batch id 513 loss 0.7415991425514221 train acc 0.6298732943469786
epoch 2 batch id 641 loss 0.7780039310455322 train acc 0.6333853354134166

epoch 3 batch id 1 loss 0.5490245223045349 train acc 0.8125
epoch 3 batch id 129 loss 0.6687514781951904 train acc 0.7461240310077519
epoch

##### 5-4-2. S vs. N

In [None]:
for epoch in range(epochs):
  print(f"\n======= S/N : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_SN.train()

  for step, batch in enumerate(dataloader_SN):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_age = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_SN.zero_grad()

    b_out = model_SN(b_input_id, b_input_mask, b_age)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_SN.parameters(), 1.0)

    optimizer_SN.step()
    scheduler_SN.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))

torch.save(model_SN, model_SN_dir)
del(model_SN)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6960057020187378 train acc 0.4375
epoch 1 batch id 129 loss 0.688785195350647 train acc 0.5247093023255814
epoch 1 batch id 257 loss 0.6874986290931702 train acc 0.521887159533074
epoch 1 batch id 385 loss 0.7806422710418701 train acc 0.5376623376623376
epoch 1 batch id 513 loss 0.5792698264122009 train acc 0.5455653021442495
epoch 1 batch id 641 loss 0.6319953799247742 train acc 0.5506045241809673

epoch 2 batch id 1 loss 0.7074391841888428 train acc 0.4375
epoch 2 batch id 129 loss 0.5853970050811768 train acc 0.6182170542635659
epoch 2 batch id 257 loss 0.6319385170936584 train acc 0.6303501945525292
epoch 2 batch id 385 loss 0.5918905138969421 train acc 0.6308441558441559
epoch 2 batch id 513 loss 0.4849189817905426 train acc 0.6331627680311891
epoch 2 batch id 641 loss 0.5771105289459229 train acc 0.6341653666146646

epoch 3 batch id 1 loss 0.5119748115539551 train acc 0.875
epoch 3 batch id 129 loss 0.4745030999183655 train acc 0.7286821705426356
epoch 

##### 5-4-3. T vs. F

In [None]:
for epoch in range(epochs):
  print(f"\n======= T/F : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_TF.train()

  for step, batch in enumerate(dataloader_TF):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_age = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_TF.zero_grad()

    b_out = model_TF(b_input_id, b_input_mask, b_age)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_TF.parameters(), 1.0)

    optimizer_TF.step()
    scheduler_TF.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))
  
torch.save(model_TF, model_TF_dir)
del(model_TF)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.6729010343551636 train acc 0.625
epoch 1 batch id 129 loss 0.6967265605926514 train acc 0.5072674418604651
epoch 1 batch id 257 loss 0.6803112626075745 train acc 0.5068093385214008
epoch 1 batch id 385 loss 0.6805212497711182 train acc 0.5076298701298702
epoch 1 batch id 513 loss 0.7193647623062134 train acc 0.518396686159844
epoch 1 batch id 641 loss 0.6367465257644653 train acc 0.5240834633385335

epoch 2 batch id 1 loss 0.6807119250297546 train acc 0.5625
epoch 2 batch id 129 loss 0.7659392952919006 train acc 0.5944767441860465
epoch 2 batch id 257 loss 0.6823577284812927 train acc 0.6070038910505836
epoch 2 batch id 385 loss 0.6202472448348999 train acc 0.6116883116883117
epoch 2 batch id 513 loss 0.5753186941146851 train acc 0.6126949317738791
epoch 2 batch id 641 loss 0.6222316026687622 train acc 0.6143720748829953

epoch 3 batch id 1 loss 0.5949863195419312 train acc 0.75
epoch 3 batch id 129 loss 0.5751014351844788 train acc 0.7078488372093024
epoch 3

##### 5-4-4. J vs. P

In [None]:
for epoch in range(epochs):
  print(f"\n======= J/P : {epoch + 1} / {epochs} =======")
  train_acc = 0.0
  val_acc = 0.0
  test_acc = 0.0

  model_JP.train()

  for step, batch in enumerate(dataloader_JP):
    b_input_id = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_gender = batch[2].float().to(device)
    b_label = batch[3].float().to(device)

    optimizer_JP.zero_grad()

    b_out = model_JP(b_input_id, b_input_mask, b_gender)
    loss = loss_fn(b_out, b_label)
    loss.backward()
        
    torch.nn.utils.clip_grad_norm_(model_JP.parameters(), 1.0)

    optimizer_JP.step()
    scheduler_JP.step()

    train_acc += calc_accuracy(b_out, b_label)

    if step % 128 == 0:
      print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.data.cpu().numpy(), train_acc / (step + 1)))

torch.save(model_JP, model_JP_dir)
del(model_JP)
torch.cuda.empty_cache()


epoch 1 batch id 1 loss 0.7049600481987 train acc 0.1875
epoch 1 batch id 129 loss 0.7053886651992798 train acc 0.49709302325581395
epoch 1 batch id 257 loss 0.7000486850738525 train acc 0.48711089494163423
epoch 1 batch id 385 loss 0.7155138254165649 train acc 0.4943181818181818
epoch 1 batch id 513 loss 0.6768242716789246 train acc 0.496588693957115
epoch 1 batch id 641 loss 0.6982666254043579 train acc 0.49726989079563183

epoch 2 batch id 1 loss 0.7004210948944092 train acc 0.375
epoch 2 batch id 129 loss 0.6860992908477783 train acc 0.5203488372093024
epoch 2 batch id 257 loss 0.6669090986251831 train acc 0.5233463035019456
epoch 2 batch id 385 loss 0.6913437843322754 train acc 0.5287337662337662
epoch 2 batch id 513 loss 0.6948210597038269 train acc 0.5282651072124757
epoch 2 batch id 641 loss 0.7147037982940674 train acc 0.5328588143525741

epoch 3 batch id 1 loss 0.6591122150421143 train acc 0.625
epoch 3 batch id 129 loss 0.7106804847717285 train acc 0.5983527131782945
epoch 

#### 5-5. Export Results
Using our trained model, produce the output for real test inputs (variable`testing`)

In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Loading a tokenizer and a model.
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

# Experiment: measure the maximum number of tokens.
max_len = 0
for test_sentence in testing['Question']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

for test_sentence in testing['Answer']:
  tok = tokenizer_bert.encode(test_sentence)
  max_len = max(max_len, len(tok))

print(max_len)
# Here, the longest sentence's length is 105 in tokens, but set max_length as 256, as before
max_len = 256

input_ids = []
att_masks = []
genders = []

# Preprocessing
for idx in testing.index:
  question = testing['Question'][idx]
  answer = testing['Answer'][idx]
  gender = training['Gender'][idx]

  # Encode with the tokenizer.
  encodings = tokenizer_bert(
      question,
      answer,
      padding = 'max_length',
      max_length = max_len,
      return_tensors = 'pt',
  )

  input_ids.append(encodings['input_ids'])
  att_masks.append(encodings['attention_mask'])
  genders.append(torch.tensor([[gender]]))

# Convert to tensors.
input_ids = torch.cat(input_ids, dim=0)
att_masks = torch.cat(att_masks, dim=0)
genders = torch.cat(genders, dim=0)

150


In [None]:
# Construct datasets
dataset_IE = TensorDataset(input_ids, att_masks, genders)
dataset_SN = TensorDataset(input_ids, att_masks, genders)
dataset_TF = TensorDataset(input_ids, att_masks, genders)
dataset_JP = TensorDataset(input_ids, att_masks, genders)

dataloader_IE = DataLoader (
    dataset_IE,
    sampler = SequentialSampler(dataset_IE),
    batch_size = batch_size
)

dataloader_SN = DataLoader (
    dataset_SN,
    sampler = SequentialSampler(dataset_SN),
    batch_size = batch_size
)

dataloader_TF = DataLoader (
    dataset_TF,
    sampler = SequentialSampler(dataset_TF),
    batch_size = batch_size
)

dataloader_JP = DataLoader (
    dataset_JP,
    sampler = SequentialSampler(dataset_JP),
    batch_size = batch_size
)

In [None]:
model_IE = torch.load(model_IE_dir)
model_SN = torch.load(model_SN_dir)
model_TF = torch.load(model_TF_dir)
model_JP = torch.load(model_JP_dir)

model_IE.eval()
model_SN.eval()
model_TF.eval()
model_JP.eval()

preds_IE = []
preds_prob_IE = []
preds_SN = []
preds_prob_SN = []
preds_TF = []
preds_prob_TF = []
preds_JP = []
preds_prob_JP = []

# Predict I vs. E
for batch in dataloader_IE:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_gender = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_IE(b_input_id, b_input_mask, b_gender)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_IE = preds_prob_IE + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_IE = preds_IE + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict S vs. N
for batch in dataloader_SN:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_gender = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_SN(b_input_id, b_input_mask, b_gender)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_SN = preds_prob_SN + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_SN = preds_SN + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict T vs. F
for batch in dataloader_TF:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_gender = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_TF(b_input_id, b_input_mask, b_gender)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_TF = preds_prob_TF + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_TF = preds_TF + np.argmax(b_out_np, axis=1).flatten().tolist()

# Predict J vs. P
for batch in dataloader_JP:
  b_input_id = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_gender = batch[2].float().to(device)

  with torch.no_grad():
      b_out = model_JP(b_input_id, b_input_mask, b_gender)

  b_out_np = b_out.detach().cpu().numpy()
  preds_prob_JP = preds_prob_JP + list(map(lambda x: x[1], b_out_np.tolist()))
  preds_JP = preds_JP + np.argmax(b_out_np, axis=1).flatten().tolist()

In [None]:
idx = range(1, len(preds_IE) + 1)

preds = {'idx': idx,'I/E': preds_IE, 'S/N':preds_SN, 'T/F':preds_TF, 'J/P':preds_JP}
preds = pd.DataFrame(data=preds)
preds = preds.set_index('idx')
preds.to_csv('result.csv')

preds_prob = {'idx': idx,'I/E': preds_prob_IE, 'S/N':preds_prob_SN, 'T/F':preds_prob_TF, 'J/P':preds_prob_JP}
preds_prob = pd.DataFrame(data=preds_prob)
preds_prob = preds_prob.set_index('idx')
preds_prob.to_csv('result_prob.csv')