In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import *
import os
import sys
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import numpy as np
import re
import pickle
import time
import pandas as pd
from pathlib import Path
import random
from torch.utils.tensorboard import SummaryWriter

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Mount Google Drive so that files stored on it are reachable under /content/gdrive/.
from google.colab import drive
# Project root on the mounted drive; the checkpoint paths defined further down are built from it.
google_drive_path = '/content/gdrive/MyDrive/QIA_hackathon/phase 2/'
# Alternative project root for a different Drive layout:
#google_drive_path = '/content/gdrive/MyDrive/phase 2/'
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Spreadsheet locations, built from google_drive_path (like the checkpoint
# paths below) so that switching the Drive root only requires changing that
# one variable instead of editing every path.
train_data_file = google_drive_path + 'data/train_data.xlsx'
test_data_file = google_drive_path + 'data/test_data.xlsx'
question_file = google_drive_path + 'data/Question_with_type.xlsx'

In [None]:
# Earlier checkpoint locations (directory 'pt/'), kept for reference:
# model_IE_dir = google_drive_path + 'pt/model_IE.pt'
# model_SN_dir = google_drive_path + 'pt/model_SN.pt'
# model_TF_dir = google_drive_path + 'pt/model_TF.pt'
# model_JP_dir = google_drive_path + 'pt/model_JP.pt'

# Checkpoint path per MBTI axis. The training calls save to these paths and the
# export section loads from them, so training overwrites the files in 'pt_best/'.
model_IE_dir = google_drive_path + 'pt_best/model_IE_base.pt'
model_SN_dir = google_drive_path + 'pt_best/model_SN.pt'
model_TF_dir = google_drive_path + 'pt_best/model_TF_age.pt'
model_JP_dir = google_drive_path + 'pt_best/model_JP_base.pt'

In [None]:
# Read the data.
# df1: training answers, df2: test answers, df3: question table (text and type per question).
# The first spreadsheet column becomes the index of each frame.
df1 = pd.read_excel(train_data_file, index_col=0)
df2 = pd.read_excel(test_data_file, index_col=0)
df3 = pd.read_excel(question_file, index_col=0)

In [None]:
# Preview the question table.
df3.head()

Unnamed: 0_level_0,index.1,Question,type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,IE
2,2,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,SN
3,3,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,TF
4,4,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,JP
5,5,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,TF


In [None]:
# Column names, dtypes and non-null counts of the training frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7200 entries, 481 to 14400
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User_ID       7200 non-null   int64 
 1   Gender        7200 non-null   object
 2   Age           7200 non-null   int64 
 3   MBTI          7200 non-null   object
 4   Q_number      7200 non-null   int64 
 5   Short_Answer  7200 non-null   object
 6   Long_Answer   7200 non-null   object
dtypes: int64(3), object(4)
memory usage: 450.0+ KB


In [None]:
def shortAns_to_cat(short_answer):
  """Encode a Korean short answer as an integer category.

  "그렇다" (yes) -> 2, "중립/모르겠다" (neutral / don't know) -> 1,
  "아니다" (no) -> 0.

  Raises:
    Exception: if `short_answer` is none of the three known answers.
  """
  known_answers = (("그렇다", 2), ("중립/모르겠다", 1), ("아니다", 0))
  for text, category in known_answers:
    if short_answer == text:
      return category
  raise Exception(f"Not valid: {short_answer}")

def gender_to_cat(gender):
  """Encode a gender string as an integer: "female" -> 1, "male" -> 0.

  Raises:
    Exception: if `gender` is neither "female" nor "male".
  """
  known_genders = (("female", 1), ("male", 0))
  for text, category in known_genders:
    if gender == text:
      return category
  raise Exception(f"Not valid: {gender}")

# Look up the original question text for a question number.
def retrival_q(q_num):
  """Return the 'Question' entry of the question table `df3` at index label `q_num`."""
  question_row = df3.loc[q_num]
  return question_row['Question']

def retrival_qtype(q_num):
  """Return the 'type' entry of the question table `df3` at index label `q_num`."""
  question_row = df3.loc[q_num]
  return question_row['type']

# Sanity-check both lookups against question 1.
expected_first_question = "주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁금해요."
assert retrival_q(1) == expected_first_question
assert retrival_qtype(1) == "IE"

In [None]:
# Attach the question text and type to every answer, then replace the short
# answer and gender strings by their integer codes.
# NOTE: this overwrites columns of df1, so the cell is not re-runnable: a second
# run passes the integer codes to the converters, which raise on them.
df1['Question'] = df1['Q_number'].apply(retrival_q)
df1['Q_type'] = df1['Q_number'].apply(retrival_qtype)
df1['Short_Answer'] = df1['Short_Answer'].apply(shortAns_to_cat)
df1['Gender'] = df1['Gender'].apply(gender_to_cat)

# Disabled: dropping User_ID (the column is still selected in the reordering cell).
#df1.drop(['User_ID'], axis='columns', inplace=True)

In [None]:
# Reorder the columns: identifiers and question info first, then the answers,
# the demographics and finally the MBTI label.
df1 = df1[['User_ID','Q_number', 'Question', 'Q_type', 'Short_Answer', 'Long_Answer', 'Age', 'Gender','MBTI']]

df1.head()

# Open question from the author: are the question text and Q_number columns needed?

Unnamed: 0_level_0,User_ID,Q_number,Question,Q_type,Short_Answer,Long_Answer,Age,Gender,MBTI
Data_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
481,9,1,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,IE,0,새로운 사람을 만나서 이야기를 나누고 마음이 통하는 친구가 되기까지의 과정은 많은 ...,30,1,INTJ
482,9,2,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,SN,2,자유시간이 생기면 평소에 관심을 두고 있던 분야에 대해 공부하고 탐구하는 시간을 주...,30,1,INTJ
483,9,3,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,TF,2,다른 사람이 울고 있는 모습만 볼 때는 왜 울고 있는지 이유를 몰라 공감 되지 않는...,30,1,INTJ
484,9,4,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,JP,2,모든 일이 마음먹은 대로 진행되는 것은 아니기에 미리 여러 가지 대책을 세워둬야 계...,30,1,INTJ
485,9,5,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,TF,2,퇴사 면담을 하면서 대외적인 이유가 아닌 진짜 퇴사의 이유를 솔직하게 말해 달라는 ...,30,1,INTJ


In [None]:
def MBTI_to_vec(mbti):
  """Convert a four-letter MBTI code into four one-hot pairs.

  Each position is encoded as [1, 0] for the first letter of its axis
  (I, S, T, J) and as [0, 1] for the second one (E, N, F, P).

  Returns:
    A list [I/E pair, S/N pair, T/F pair, J/P pair], e.g. "INTJ" ->
    [[1, 0], [0, 1], [1, 0], [1, 0]].

  Raises:
    Exception: if `mbti` does not have length 4, or a position holds a
      letter that does not belong to its axis.
  """
  if len(mbti) != 4:
    raise Exception(f"Not valid: {mbti}")

  axes = (('I', 'E'), ('S', 'N'), ('T', 'F'), ('J', 'P'))
  encoded = []
  for letter, (first, second) in zip(mbti, axes):
    if letter == first:
      encoded.append([1, 0])
    elif letter == second:
      encoded.append([0, 1])
    else:
      raise Exception(f"Not valid: {mbti}")

  return encoded

## Preprocessing

In [None]:
from transformers import BertTokenizerFast, BertModel
import torch

# Load the fast tokenizer of the "kykim/bert-kor-base" checkpoint.
# (Only the tokenizer is loaded here; the BERT encoders are loaded in the model section.)
tokenizer_bert = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/tokenizer_config.json


Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/config.json
Model config BertConfig {
  "_name_or_path": "kykim/bert-kor-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "embedding_size": 768,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_siz

In [None]:
def preprocess(df1, is_export=False):
  """Tokenize every answer and group the resulting tensors by question type.

  The question and the long answer of each row are encoded as one sentence
  pair, padded to 256 tokens and truncated to that length when the pair is
  longer, so that all rows can be stacked into one tensor.

  Args:
    df1: DataFrame with the columns 'Question', 'Long_Answer', 'Short_Answer',
      'Age', 'Gender' and 'Q_type' (one of 'IE', 'SN', 'TF', 'JP'), plus
      'MBTI' unless `is_export` is True. 'Short_Answer', 'Age' and 'Gender'
      must already be numeric.
    is_export: True for unlabeled data; the 'MBTI' column is then not read.

  Returns:
    A 9-tuple (input_ids, att_masks, short_answers, ages, genders,
    labels_IE, labels_SN, labels_TF, labels_JP). The first five are dicts
    keyed by question type whose values are tensors with one row per answer
    of that type. Each labels_XX holds the one-hot pair of that MBTI axis for
    the rows of the matching type; with `is_export` they are empty lists.

  Raises:
    ValueError: if a row's 'Q_type' is not one of the four known types.
  """
  max_len = 256

  # Position of each question type's axis in the list returned by MBTI_to_vec.
  axis_of = {'IE': 0, 'SN': 1, 'TF': 2, 'JP': 3}

  input_ids = {q_type: [] for q_type in axis_of} # question + long answer
  att_masks = {q_type: [] for q_type in axis_of}
  short_answers = {q_type: [] for q_type in axis_of}
  ages = {q_type: [] for q_type in axis_of}
  genders = {q_type: [] for q_type in axis_of}
  labels = {q_type: [] for q_type in axis_of}

  for idx in df1.index:
    q_type = df1['Q_type'][idx]
    if q_type not in axis_of:
      # Fail with a clear message before any work is done for this row.
      raise ValueError(f"Unknown question type {q_type!r} at index {idx!r}")

    # Without truncation a pair longer than max_len would produce a longer
    # tensor than the others and break the torch.cat below.
    encodings = tokenizer_bert(
        df1['Question'][idx],
        df1['Long_Answer'][idx],
        padding = 'max_length',
        truncation = True,
        max_length = max_len,
        return_tensors = 'pt',
    )

    input_ids[q_type].append(encodings['input_ids'])
    att_masks[q_type].append(encodings['attention_mask'])
    short_answers[q_type].append(torch.tensor([[df1['Short_Answer'][idx]]]))
    ages[q_type].append(torch.tensor([[df1['Age'][idx]]]))
    genders[q_type].append(torch.tensor([[df1['Gender'][idx]]]))

    if not is_export:
      # Keep only the one-hot pair of the axis this question is about.
      mbti = MBTI_to_vec(df1['MBTI'][idx])
      labels[q_type].append(torch.tensor([mbti[axis_of[q_type]]]))

  # Stack the per-row tensors of every question type.
  for q_type in axis_of:
    input_ids[q_type] = torch.cat(input_ids[q_type], dim=0)
    att_masks[q_type] = torch.cat(att_masks[q_type], dim=0)
    short_answers[q_type] = torch.cat(short_answers[q_type], dim=0)
    ages[q_type] = torch.cat(ages[q_type], dim=0)
    genders[q_type] = torch.cat(genders[q_type], dim=0)

  if is_export:
    # Unlabeled data: the label slots stay empty lists.
    return input_ids, att_masks, short_answers, ages, genders, [], [], [], []

  labels_IE = torch.cat(labels['IE'], dim=0)
  labels_SN = torch.cat(labels['SN'], dim=0)
  labels_TF = torch.cat(labels['TF'], dim=0)
  labels_JP = torch.cat(labels['JP'], dim=0)

  return input_ids, att_masks, short_answers, ages, genders, labels_IE, labels_SN, labels_TF, labels_JP

In [None]:
# Tokenize the training frame: features come back grouped per question type, labels per MBTI axis.
input_ids, att_masks, short_answers, ages, genders, labels_IE, labels_SN, labels_TF, labels_JP = preprocess(df1)

In [None]:
# First I/E sample of every feature tensor.
print(input_ids['IE'][0])
print(att_masks['IE'][0])
print(short_answers['IE'][0])
print(ages['IE'][0])
print(genders['IE'][0])
# First one-hot label of each MBTI axis.
print(labels_IE[0])
print(labels_SN[0])
print(labels_TF[0])
print(labels_JP[0])

# Row counts of the I/E tensors (one entry is appended per I/E answer, so they should all be equal).
print(len(input_ids['IE']), len(att_masks['IE']), len(short_answers['IE']), len(ages['IE']), len(labels_IE))
# Number of answers per question type.
print(len(input_ids['IE']), len(input_ids['SN']), len(input_ids['TF']), len(input_ids['JP']))

tensor([    2, 25753, 14567, 28897, 18069, 14526,  2033, 19742, 22742,  8082,
        31724,  3463, 32771,  8061, 19773, 16941, 24296,  8055,  2016,     3,
        14567, 17106, 25135, 15960, 28589, 16058, 33292, 17541, 17603, 23567,
        27842, 14090, 25436, 23299, 14261, 38325, 36272,  2016,  4909, 28897,
        35626, 25718, 15009, 15385, 14644, 30252, 14307, 22022, 14199,  3175,
        26358,  2016,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
# Fix the seeds of every random number generator used below (Python, NumPy,
# PyTorch CPU and all CUDA devices) so that runs are repeatable.
seed_val = 50

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

## Create dataloaders

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Construct datasets.
# "base" datasets (I/E and J/P): (input ids, attention mask, short answer, label).
dataset_IE_base = TensorDataset(input_ids['IE'], att_masks['IE'], short_answers['IE'], labels_IE)
dataset_JP_base = TensorDataset(input_ids['JP'], att_masks['JP'], short_answers['JP'], labels_JP)

# "age" datasets (S/N and T/F): same fields with the age inserted before the label.
dataset_SN_age = TensorDataset(input_ids['SN'], att_masks['SN'], short_answers['SN'], ages['SN'], labels_SN)
dataset_TF_age = TensorDataset(input_ids['TF'], att_masks['TF'], short_answers['TF'], ages['TF'], labels_TF)

In [None]:
def construct_dataloader(dataset_xx, batch_size=16):
  """Return a DataLoader that draws batches of `batch_size` from `dataset_xx` in random order."""
  shuffled_order = RandomSampler(dataset_xx)
  return DataLoader(dataset_xx, sampler=shuffled_order, batch_size=batch_size)

In [None]:
# Shuffled training dataloaders (default batch size), one per MBTI axis.
dataloader_IE_base = construct_dataloader(dataset_IE_base)
dataloader_JP_base = construct_dataloader(dataset_JP_base)

dataloader_SN_age = construct_dataloader(dataset_SN_age)
dataloader_TF_age = construct_dataloader(dataset_TF_age)

## Model & Training

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

# Batches are moved to this device during training and inference; a CUDA GPU is required.
device = torch.device("cuda")

# Loss shared by both training functions. nn.CrossEntropyLoss applies
# log-softmax to its input itself, i.e. it expects unnormalized scores.
loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(preds, labels):
  """Return the fraction of rows whose predicted class equals the labeled class.

  Args:
    preds: tensor of per-class scores, one row per sample.
    labels: tensor of per-class targets (e.g. one-hot), one row per sample.

  Both classes are taken as the arg-max along axis 1.
  """
  predicted_class = np.argmax(preds.detach().cpu().numpy(), axis=1).reshape(-1)
  labeled_class = np.argmax(labels.detach().cpu().numpy(), axis=1).reshape(-1)
  n_correct = np.sum(predicted_class == labeled_class)
  return n_correct / len(labeled_class)

In [None]:
# One separate pretrained BERT encoder per MBTI axis, so that each classifier fine-tunes its own copy.
# NOTE: model_bert_SN and model_bert_TF are loaded again in the S/N & T/F model cell further down.
model_bert_IE = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_SN = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_JP = BertModel.from_pretrained("kykim/bert-kor-base")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "embedding_size": 768,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size"

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/pytorch_model.bin
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassif

### Model: baseline - for I/E & J/P

In [None]:
class MBTIClassifier_base(nn.Module):
  """Classifier for one MBTI axis built on a BERT encoder and the short answer.

  The pooled BERT output is projected to 127/128 of `hidden_size` features and
  the scalar short answer to the remaining ones. The concatenation (which is
  `hidden_size` wide) then goes through Linear -> ReLU -> Linear -> Softmax.
  """

  def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None):
    """
    Args:
      bert: encoder module whose output exposes `pooler_output`.
      hidden_size: width of the pooled output and of the fused feature vector.
      num_classes: number of output classes.
      dr_rate: dropout probability applied to the pooled output; a falsy
        value disables dropout.
    """
    super().__init__()
    text_width = hidden_size * 127 // 128
    short_width = hidden_size - text_width

    self.dr_rate = dr_rate
    self.bert = bert
    self.linstr = nn.Linear(hidden_size, text_width)
    self.linshort = nn.Linear(1, short_width)
    self.lin = nn.Linear(hidden_size, 128)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(128, num_classes)
    self.classifier = nn.Softmax(dim = 1)

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)

  def forward(self, input_ids, att_masks, short_answers):
    """Return the softmax class probabilities, one row per sample.

    Args:
      input_ids: token ids passed to the encoder.
      att_masks: attention mask passed to the encoder.
      short_answers: float tensor with one short-answer value per row.
    """
    pooled = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    if self.dr_rate:
      pooled = self.dropout(pooled)

    # Fuse the text features with the short-answer features.
    fused = torch.cat((self.linstr(pooled), self.linshort(short_answers)), dim=1)

    hidden = self.relu(self.lin(fused))
    return self.classifier(self.lin2(hidden))
  
# Classifiers for the I/E and J/P axes (text + short answer).
model_IE = MBTIClassifier_base(model_bert_IE, dr_rate = 0.3)
model_JP = MBTIClassifier_base(model_bert_JP, dr_rate = 0.3)

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters_IE = [
    {'params': [p for n, p in model_IE.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_IE.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_JP = [
    {'params': [p for n, p in model_JP.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_JP.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

learning_rate = 2e-5

optimizer_IE = AdamW(optimizer_grouped_parameters_IE,
                  lr = learning_rate,
                  eps = 1e-8
                )

optimizer_JP = AdamW(optimizer_grouped_parameters_JP,
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 10

# The schedulers are stepped once per batch, so each schedule has to span the
# number of batches of its own dataloader; otherwise the cosine decay ends too
# early or too late for that model. The first 10% of the steps are warmup.
total_steps = len(dataloader_IE_base) * epochs
warmup_steps = int(total_steps * 0.1)
scheduler_IE = get_cosine_schedule_with_warmup(optimizer_IE, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

total_steps_JP = len(dataloader_JP_base) * epochs
warmup_steps_JP = int(total_steps_JP * 0.1)
scheduler_JP = get_cosine_schedule_with_warmup(optimizer_JP, num_warmup_steps = warmup_steps_JP, num_training_steps = total_steps_JP)


NameError: ignored

In [None]:
def train_model_base(name, model_xx, optimizer_xx, scheduler_xx, dataloader_xx, model_xx_dir):
  """Fine-tune a text + short-answer classifier and save it when training ends.

  Runs the module-level number of `epochs` over `dataloader_xx`, prints the
  loss and the running training accuracy every 16 batches and finally pickles
  the whole model to `model_xx_dir`.

  Args:
    name: label printed at the start of every epoch.
    model_xx: model called as model(input_ids, att_masks, short_answers); it
      must return class probabilities (rows that sum to 1).
    optimizer_xx: optimizer over the model's parameters.
    scheduler_xx: learning-rate scheduler, stepped once per batch.
    dataloader_xx: yields (input_ids, att_masks, short_answers, one-hot labels).
    model_xx_dir: destination path passed to `torch.save`.
  """
  model_xx.to(device)

  for epoch in range(epochs):
    print(name)
    print(f"======= {epoch + 1} / {epochs} =======")
    train_acc = 0.0

    model_xx.train()

    for step, batch in enumerate(dataloader_xx):
      b_input_id = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_short = batch[2].float().to(device)
      b_label = batch[3].float().to(device)

      optimizer_xx.zero_grad()

      b_out = model_xx(b_input_id, b_input_mask, b_short)
      # The model already ends in a softmax, and nn.CrossEntropyLoss applies
      # log-softmax to its input. Feeding the probabilities directly would
      # squash them a second time and flatten the gradients. Log-probabilities
      # already normalise to 1, so the inner log-softmax leaves them unchanged
      # and the result is the true cross-entropy. The clamp keeps log() finite
      # if a probability underflows to 0.
      loss = loss_fn(torch.log(b_out.clamp_min(1e-12)), b_label)
      loss.backward()

      # Limit the gradient norm to 1.0 before the update.
      torch.nn.utils.clip_grad_norm_(model_xx.parameters(), 1.0)

      optimizer_xx.step()
      scheduler_xx.step()

      train_acc += calc_accuracy(b_out, b_label)

      if step % 16 == 0:
        print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.item(), train_acc / (step + 1)))

  # Saved as a whole pickled module; the export section loads it with torch.load.
  torch.save(model_xx, model_xx_dir)
  # Release cached GPU memory that is no longer in use.
  torch.cuda.empty_cache()

In [None]:
# Fine-tune the I/E classifier and save it to model_IE_dir.
train_model_base("I/E", model_IE, optimizer_IE, scheduler_IE, dataloader_IE_base, model_IE_dir)

I/E
epoch 1 batch id 1 loss 0.699505627155304 train acc 0.4375
epoch 1 batch id 17 loss 0.6948754787445068 train acc 0.5551470588235294
epoch 1 batch id 33 loss 0.6928362846374512 train acc 0.5227272727272727
epoch 1 batch id 49 loss 0.6973528265953064 train acc 0.5165816326530612
epoch 1 batch id 65 loss 0.6864979863166809 train acc 0.5125
epoch 1 batch id 81 loss 0.6863939762115479 train acc 0.5092592592592593
epoch 1 batch id 97 loss 0.6892430186271667 train acc 0.5045103092783505
I/E
epoch 2 batch id 1 loss 0.6938775777816772 train acc 0.4375
epoch 2 batch id 17 loss 0.7620671391487122 train acc 0.5294117647058824
epoch 2 batch id 33 loss 0.6845139861106873 train acc 0.5208333333333334
epoch 2 batch id 49 loss 0.7018166780471802 train acc 0.5280612244897959
epoch 2 batch id 65 loss 0.7048094868659973 train acc 0.5240384615384616
epoch 2 batch id 81 loss 0.6755356788635254 train acc 0.5246913580246914
epoch 2 batch id 97 loss 0.6915950775146484 train acc 0.5193298969072165
I/E
epoch

In [None]:
# Fine-tune the J/P classifier and save it to model_JP_dir.
train_model_base("J/P", model_JP, optimizer_JP, scheduler_JP, dataloader_JP_base, model_JP_dir)

J/P
epoch 1 batch id 1 loss 0.6950881481170654 train acc 0.5
epoch 1 batch id 17 loss 0.6911914348602295 train acc 0.47058823529411764
epoch 1 batch id 33 loss 0.6866869926452637 train acc 0.4772727272727273
epoch 1 batch id 49 loss 0.7058669924736023 train acc 0.4897959183673469
epoch 1 batch id 65 loss 0.6928192377090454 train acc 0.5019230769230769
epoch 1 batch id 81 loss 0.6754801869392395 train acc 0.5077160493827161
epoch 1 batch id 97 loss 0.682321310043335 train acc 0.5012886597938144
J/P
epoch 2 batch id 1 loss 0.6934977173805237 train acc 0.6875
epoch 2 batch id 17 loss 0.6561018824577332 train acc 0.5330882352941176
epoch 2 batch id 33 loss 0.6917966604232788 train acc 0.5170454545454546
epoch 2 batch id 49 loss 0.6755691170692444 train acc 0.5165816326530612
epoch 2 batch id 65 loss 0.675352156162262 train acc 0.5173076923076924
epoch 2 batch id 81 loss 0.7408791780471802 train acc 0.5216049382716049
epoch 2 batch id 97 loss 0.6467621922492981 train acc 0.5186855670103093


### Model: baseline + age - for S/N & T/F

In [None]:
from transformers import BertConfig, get_cosine_schedule_with_warmup
from torch import nn
from torch.optim import AdamW

class MBTIClassifier_age(nn.Module):
  """Classifier for one MBTI axis built on a BERT encoder, the age and the short answer.

  The pooled BERT output is projected to 126/128 of `hidden_size` features, the
  scalar age to 1/128 of them and the scalar short answer to the remainder.
  The concatenation (which is `hidden_size` wide) then goes through
  Linear -> ReLU -> Linear -> Softmax.
  """

  def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None):
    """
    Args:
      bert: encoder module whose output exposes `pooler_output`.
      hidden_size: width of the pooled output and of the fused feature vector.
      num_classes: number of output classes.
      dr_rate: dropout probability applied to the pooled output; a falsy
        value disables dropout.
    """
    super().__init__()
    text_width = hidden_size * 126 // 128
    age_width = hidden_size // 128
    short_width = hidden_size - text_width - age_width

    self.dr_rate = dr_rate
    self.bert = bert
    self.linstr = nn.Linear(hidden_size, text_width)
    self.linage = nn.Linear(1, age_width)
    self.linshort = nn.Linear(1, short_width)
    self.lin = nn.Linear(hidden_size, 128)
    self.relu = nn.ReLU()
    self.lin2 = nn.Linear(128, num_classes)
    self.classifier = nn.Softmax(dim = 1)

    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)

  def forward(self, input_ids, att_masks, short_answers, ages):
    """Return the softmax class probabilities, one row per sample.

    Args:
      input_ids: token ids passed to the encoder.
      att_masks: attention mask passed to the encoder.
      short_answers: float tensor with one short-answer value per row.
      ages: float tensor with one age value per row.
    """
    pooled = self.bert(input_ids, token_type_ids=None, attention_mask=att_masks).pooler_output
    if self.dr_rate:
      pooled = self.dropout(pooled)

    # Fuse text, age and short-answer features, in that order.
    fused = torch.cat(
        (self.linstr(pooled), self.linage(ages), self.linshort(short_answers)),
        dim=1,
    )

    hidden = self.relu(self.lin(fused))
    return self.classifier(self.lin2(hidden))

# Fresh pretrained encoders for the two age-aware classifiers.
model_bert_SN = BertModel.from_pretrained("kykim/bert-kor-base")
model_bert_TF = BertModel.from_pretrained("kykim/bert-kor-base")

# Classifiers for the S/N and T/F axes (text + age + short answer).
model_SN = MBTIClassifier_age(model_bert_SN, dr_rate = 0.3)
model_TF = MBTIClassifier_age(model_bert_TF, dr_rate = 0.3)

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']

optimizer_grouped_parameters_SN = [
    {'params': [p for n, p in model_SN.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_SN.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer_grouped_parameters_TF = [
    {'params': [p for n, p in model_TF.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_TF.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]


learning_rate = 2e-5

optimizer_SN = AdamW(optimizer_grouped_parameters_SN,
                  lr = learning_rate,
                  eps = 1e-8
                )
optimizer_TF = AdamW(optimizer_grouped_parameters_TF,
                  lr = learning_rate,
                  eps = 1e-8
                )

epochs = 10

# Only the S/N and T/F schedulers are built here. The I/E and J/P schedulers
# belong to the cell that creates optimizer_IE / optimizer_JP; re-creating them
# in this cell raised a NameError whenever that cell had not been run, and
# otherwise replaced them with schedules sized for the S/N dataloader.
# Each schedule spans the batches of its own dataloader because the scheduler
# is stepped once per batch; the first 10% of the steps are warmup.
total_steps = len(dataloader_SN_age) * epochs
warmup_steps = int(total_steps * 0.1)
scheduler_SN = get_cosine_schedule_with_warmup(optimizer_SN, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

total_steps_TF = len(dataloader_TF_age) * epochs
warmup_steps_TF = int(total_steps_TF * 0.1)
scheduler_TF = get_cosine_schedule_with_warmup(optimizer_TF, num_warmup_steps = warmup_steps_TF, num_training_steps = total_steps_TF)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "embedding_size": 768,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size"

Downloading pytorch_model.bin:   0%|          | 0.00/476M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--kykim--bert-kor-base/snapshots/1779cc0982ada0216dd6de0dd4e86fb78201926d/pytorch_model.bin
Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassif

NameError: ignored

In [None]:
def train_model_age(name, model_xx, optimizer_xx, scheduler_xx, dataloader_xx, model_xx_dir):
  """Fine-tune a text + short-answer + age classifier and save it when training ends.

  Runs the module-level number of `epochs` over `dataloader_xx`, prints the
  loss and the running training accuracy every 16 batches and finally pickles
  the whole model to `model_xx_dir`.

  Args:
    name: label printed at the start of every epoch.
    model_xx: model called as model(input_ids, att_masks, short_answers, ages);
      it must return class probabilities (rows that sum to 1).
    optimizer_xx: optimizer over the model's parameters.
    scheduler_xx: learning-rate scheduler, stepped once per batch.
    dataloader_xx: yields (input_ids, att_masks, short_answers, ages,
      one-hot labels).
    model_xx_dir: destination path passed to `torch.save`.
  """
  model_xx.to(device)

  for epoch in range(epochs):
    print(name)
    print(f"======= {epoch + 1} / {epochs} =======")
    train_acc = 0.0

    model_xx.train()

    for step, batch in enumerate(dataloader_xx):
      b_input_id = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_short = batch[2].float().to(device)
      b_age = batch[3].float().to(device)
      b_label = batch[4].float().to(device)

      optimizer_xx.zero_grad()

      b_out = model_xx(b_input_id, b_input_mask, b_short, b_age)
      # The model already ends in a softmax, and nn.CrossEntropyLoss applies
      # log-softmax to its input. Feeding the probabilities directly would
      # squash them a second time and flatten the gradients. Log-probabilities
      # already normalise to 1, so the inner log-softmax leaves them unchanged
      # and the result is the true cross-entropy. The clamp keeps log() finite
      # if a probability underflows to 0.
      loss = loss_fn(torch.log(b_out.clamp_min(1e-12)), b_label)
      loss.backward()

      # Limit the gradient norm to 1.0 before the update.
      torch.nn.utils.clip_grad_norm_(model_xx.parameters(), 1.0)

      optimizer_xx.step()
      scheduler_xx.step()

      train_acc += calc_accuracy(b_out, b_label)

      if step % 16 == 0:
        print("epoch {} batch id {} loss {} train acc {}".format(epoch + 1, step + 1, loss.item(), train_acc / (step + 1)))

  # Saved as a whole pickled module; the export section loads it with torch.load.
  torch.save(model_xx, model_xx_dir)
  # Release cached GPU memory that is no longer in use.
  torch.cuda.empty_cache()

In [None]:
# Fine-tune the S/N classifier and save it to model_SN_dir.
train_model_age("S/N", model_SN, optimizer_SN, scheduler_SN, dataloader_SN_age, model_SN_dir)

S/N
epoch 1 batch id 1 loss 0.6758502721786499 train acc 0.5625
epoch 1 batch id 17 loss 0.6902005672454834 train acc 0.5257352941176471
epoch 1 batch id 33 loss 0.7307713627815247 train acc 0.5321969696969697
epoch 1 batch id 49 loss 0.7019630670547485 train acc 0.5114795918367347
epoch 1 batch id 65 loss 0.7030199766159058 train acc 0.5115384615384615
epoch 1 batch id 81 loss 0.680959939956665 train acc 0.5123456790123457
epoch 1 batch id 97 loss 0.6791056990623474 train acc 0.5199742268041238
epoch 1 batch id 113 loss 0.6913493871688843 train acc 0.5110619469026548
S/N
epoch 2 batch id 1 loss 0.6924874782562256 train acc 0.625
epoch 2 batch id 17 loss 0.7003716230392456 train acc 0.5
epoch 2 batch id 33 loss 0.7332204580307007 train acc 0.4962121212121212
epoch 2 batch id 49 loss 0.6883540749549866 train acc 0.49362244897959184
epoch 2 batch id 65 loss 0.70102858543396 train acc 0.5153846153846153
epoch 2 batch id 81 loss 0.6771023273468018 train acc 0.5277777777777778
epoch 2 batch

In [None]:
# Fine-tune the T/F classifier and save it to model_TF_dir.
train_model_age("T/F", model_TF, optimizer_TF, scheduler_TF, dataloader_TF_age, model_TF_dir)

T/F
epoch 1 batch id 1 loss 0.659927487373352 train acc 0.625
epoch 1 batch id 17 loss 0.7287081480026245 train acc 0.4742647058823529
epoch 1 batch id 33 loss 0.7445829510688782 train acc 0.49053030303030304
epoch 1 batch id 49 loss 0.741357147693634 train acc 0.49107142857142855
epoch 1 batch id 65 loss 0.7212130427360535 train acc 0.48846153846153845
epoch 1 batch id 81 loss 0.6897526979446411 train acc 0.4861111111111111
epoch 1 batch id 97 loss 0.6946068406105042 train acc 0.4729381443298969
epoch 1 batch id 113 loss 0.7103577852249146 train acc 0.47345132743362833
T/F
epoch 2 batch id 1 loss 0.6781226992607117 train acc 0.625
epoch 2 batch id 17 loss 0.6957057118415833 train acc 0.47058823529411764
epoch 2 batch id 33 loss 0.6802890300750732 train acc 0.4602272727272727
epoch 2 batch id 49 loss 0.722913384437561 train acc 0.47066326530612246
epoch 2 batch id 65 loss 0.6736736297607422 train acc 0.4576923076923077
epoch 2 batch id 81 loss 0.679204523563385 train acc 0.475308641975

## Export result

In [None]:
# Preview the raw test frame (it has no MBTI column).
df2.head()

Unnamed: 0_level_0,User_ID,Gender,Age,Q_number,Short_Answer,Long_Answer
Data_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,female,30,1,아니다,친구를 만들 상황에 새로운 친구를 만듭니다. 의도적으로나 꼭 주기적으로 새로운 친구...
2,1,female,30,2,그렇다,일상이 무료한 걸 못 참는 편. 요즘은 리본으로 뜨개질해서 가방 만드는 걸 배우고 ...
3,1,female,30,3,아니다,다른 사람이 울고 있는 모습을 본다고 해도 눈물이 나지는 않습니다. 그 심정은 이해...
4,1,female,30,4,그렇다,진행하고 있는 일이 계획단계일 때 이미 대비책을 함께 세우는 편. 잘못된 순간 바로...
5,1,female,30,5,그렇다,"최근 프로젝트 진행 중 문제가 발생, 마감기한이 얼마 남지 않았고, 주변 사람들도 ..."


In [None]:
# Apply the same encoding as for the training frame: attach question text and
# type, then replace the short answer and gender strings by their integer codes.
# NOTE: this overwrites columns of df2, so the cell is not re-runnable: a second
# run passes the integer codes to the converters, which raise on them.
df2['Question'] = df2['Q_number'].apply(retrival_q)
df2['Q_type'] = df2['Q_number'].apply(retrival_qtype)
df2['Short_Answer'] = df2['Short_Answer'].apply(shortAns_to_cat)
df2['Gender'] = df2['Gender'].apply(gender_to_cat)

# Reorder the columns like the training frame, without the MBTI label.
df2 = df2[['User_ID','Q_number', 'Question', 'Q_type', 'Short_Answer', 'Long_Answer', 'Age', 'Gender']]
df2.head()

Unnamed: 0_level_0,User_ID,Q_number,Question,Q_type,Short_Answer,Long_Answer,Age,Gender
Data_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,주기적으로 새로운 친구를 만드나요? 경험을 비추어봤을 때 어떤지와 그러한 이유가 궁...,IE,0,친구를 만들 상황에 새로운 친구를 만듭니다. 의도적으로나 꼭 주기적으로 새로운 친구...,30,1
2,1,2,자유 시간 중 상당 부분을 다양한 관심사를 탐구하는 데 할애하나요? 요즘 어떤 관심...,SN,2,일상이 무료한 걸 못 참는 편. 요즘은 리본으로 뜨개질해서 가방 만드는 걸 배우고 ...,30,1
3,1,3,다른 사람이 울고 있는 모습을 보면 자신도 울고 싶어질 때가 많나요? 이런 상황에서...,TF,0,다른 사람이 울고 있는 모습을 본다고 해도 눈물이 나지는 않습니다. 그 심정은 이해...,30,1
4,1,4,일이 잘못될 때를 대비해 여러 대비책을 세우는 편인가요? 이유는 무엇인가요.,JP,2,진행하고 있는 일이 계획단계일 때 이미 대비책을 함께 세우는 편. 잘못된 순간 바로...,30,1
5,1,5,압박감이 심한 환경에서도 평정심을 유지하는 편인가요? 최근 경험을 말씀해주세요.,TF,2,"최근 프로젝트 진행 중 문제가 발생, 마감기한이 얼마 남지 않았고, 주변 사람들도 ...",30,1


In [None]:
# Tokenize the test frame. The four label slots are discarded because is_export=True reads no MBTI column.
# NOTE: this rebinds input_ids, att_masks, short_answers, ages and genders, replacing the training tensors of the same names.
input_ids, att_masks, short_answers, ages, genders, _, _, _, _ = preprocess(df2, is_export=True)

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

def construct_dataloader(dataset_xx, batch_size=16, shuffle=False):
  """Wrap a dataset in a DataLoader for the export/inference pass.

  Args:
    dataset_xx: dataset to iterate over.
    batch_size: number of samples per batch.
    shuffle: when True, samples are drawn with a RandomSampler (the previous
      behaviour of this helper); when False (default) the dataset order is
      preserved.

  Returns:
    A DataLoader over `dataset_xx`.
  """
  # The export cells average every `range_xx` consecutive predictions into one
  # per-user score and pair the results with `user_ids` by position, so the
  # predictions must come back in dataset order. Shuffling here would mix
  # answers from different users inside each chunk.
  if shuffle:
    sampler = RandomSampler(dataset_xx)
  else:
    sampler = SequentialSampler(dataset_xx)

  return DataLoader(dataset_xx, sampler=sampler, batch_size=batch_size)

In [None]:
# Feature groups: the "base" models take ids, mask and short answer;
# the "age" models additionally take the age feature.
base_features = (input_ids, att_masks, short_answers)
age_features = base_features + (ages,)

# Construct datasets (one per question type)
dataset_IE_base = TensorDataset(*[feature['IE'] for feature in base_features])
dataset_JP_base = TensorDataset(*[feature['JP'] for feature in base_features])
dataset_SN_age = TensorDataset(*[feature['SN'] for feature in age_features])
dataset_TF_age = TensorDataset(*[feature['TF'] for feature in age_features])

# Construct dataloaders
dataloader_IE_base = construct_dataloader(dataset_IE_base)
dataloader_JP_base = construct_dataloader(dataset_JP_base)
dataloader_SN_age = construct_dataloader(dataset_SN_age)
dataloader_TF_age = construct_dataloader(dataset_TF_age)

In [None]:
def _predict_scores(model, dataloader, device):
  """Run `model` over every batch of `dataloader` and collect its outputs.

  Each batch is unpacked as (input_ids, attention_mask, *extra_features);
  every extra feature is cast to float and passed positionally to the model,
  so the same helper serves the 3-tensor and the 4-tensor (with age) datasets.

  Args:
    model: callable module returning a 2-D output per batch.
    dataloader: iterable of batches of tensors.
    device: torch.device the batch tensors are moved to.

  Returns:
    (labels, class1_scores): the argmax over axis 1 and the value in output
    column 1 for every sample, in the order the dataloader yields them.
  """
  labels = []
  class1_scores = []

  model.eval()
  with torch.no_grad():
    for batch in dataloader:
      b_input_id = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_extras = [feature.float().to(device) for feature in batch[2:]]

      b_out = model(b_input_id, b_input_mask, *b_extras)
      b_out_np = b_out.detach().cpu().numpy()

      # extend() instead of `list = list + ...` avoids re-copying the
      # accumulated list on every batch.
      class1_scores.extend(b_out_np[:, 1].tolist())
      labels.extend(np.argmax(b_out_np, axis=1).flatten().tolist())

  return labels, class1_scores


# Fall back to CPU when no GPU is attached instead of failing on the first .to().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the checkpoints are called directly after loading, so they look
# like whole pickled modules rather than state dicts. torch.load unpickles
# arbitrary objects, so only load trusted files; recent PyTorch releases may
# also require weights_only=False for such checkpoints. TODO confirm.
# map_location puts the weights on `device` regardless of where they were saved.
model_IE = torch.load(model_IE_dir, map_location=device)
model_SN = torch.load(model_SN_dir, map_location=device)
model_TF = torch.load(model_TF_dir, map_location=device)
model_JP = torch.load(model_JP_dir, map_location=device)

# The per-user averaging further down consumes these lists positionally, so
# they are kept in dataloader order.
# Predict I vs. E
preds_IE, preds_prob_IE = _predict_scores(model_IE, dataloader_IE_base, device)

# Predict S vs. N
preds_SN, preds_prob_SN = _predict_scores(model_SN, dataloader_SN_age, device)

# Predict T vs. F
preds_TF, preds_prob_TF = _predict_scores(model_TF, dataloader_TF_age, device)

# Predict J vs. P
preds_JP, preds_prob_JP = _predict_scores(model_JP, dataloader_JP_base, device)

In [None]:
# Number of consecutive predictions that are averaged into one per-user score,
# for each axis (I/E, S/N, T/F, J/P).
range_IE, range_SN, range_TF, range_JP = 13, 16, 17, 14

# Sanity check: each axis must hold exactly 120 chunks of its range.
for axis_probs, axis_range in (
    (preds_prob_IE, range_IE),
    (preds_prob_SN, range_SN),
    (preds_prob_TF, range_TF),
    (preds_prob_JP, range_JP),
):
  assert len(axis_probs) / axis_range == 120


In [None]:
# Spot-check the raw S/N scores that are averaged per user further down.
# NOTE(review): these slices are 14 and 13 items wide, whereas range_SN is 16,
# so they do not line up with the per-user chunks. Confirm the intended window.
print(preds_prob_SN[:14])
print(preds_prob_SN[14:27])

[0.9934601783752441, 0.02405782788991928, 0.9971278309822083, 0.01615426503121853, 0.0018488740315660834, 0.9969909191131592, 0.9925491809844971, 0.9968185424804688, 0.9970554113388062, 0.994439959526062, 0.9904261231422424, 0.995876133441925, 0.9916542768478394, 0.993485152721405]
[0.08536873012781143, 0.33761295676231384, 0.003727381117641926, 0.992825984954834, 0.993502676486969, 0.0069595989771187305, 0.012159456498920918, 0.9929630160331726, 0.05580657720565796, 0.995763897895813, 0.9930174350738525, 0.9970601201057434, 0.9968199729919434]


In [None]:
def _chunk_means(probs, chunk_size):
  """Average every `chunk_size` consecutive values of `probs`.

  Args:
    probs: flat list of per-answer scores.
    chunk_size: number of consecutive scores that form one group.

  Returns:
    A list with one mean per complete chunk. A trailing incomplete chunk
    produces no entry and, unlike a shared running sum, cannot leak into the
    result of another call.
  """
  means = []
  for start in range(0, len(probs) - chunk_size + 1, chunk_size):
    # Plain left-to-right accumulation keeps the floating-point result
    # identical to a running sum over the chunk.
    chunk_total = 0
    for prob in probs[start:start + chunk_size]:
      chunk_total += prob
    means.append(chunk_total / chunk_size)
  return means


# One averaged score per chunk of `range_xx` predictions, for each axis.
mean_pred_prob_IE = _chunk_means(preds_prob_IE, range_IE)
mean_pred_prob_SN = _chunk_means(preds_prob_SN, range_SN)
mean_pred_prob_TF = _chunk_means(preds_prob_TF, range_TF)
mean_pred_prob_JP = _chunk_means(preds_prob_JP, range_JP)

In [None]:
# Keep a User_ID each time it differs from the one on the previous row
# (-1 acts as the "previous" value for the first row).
user_column = df2['User_ID'].tolist()
user_ids = [
    current_id
    for previous_id, current_id in zip([-1] + user_column[:-1], user_column)
    if current_id != previous_id
]

assert len(user_ids) == 120

In [None]:
# Assemble one row per user holding the averaged score of each axis,
# index it by User_ID and write it to Google Drive.
preds_prob = pd.DataFrame(
    data={
        'User_ID': user_ids,
        'I/E': mean_pred_prob_IE,
        'S/N': mean_pred_prob_SN,
        'T/F': mean_pred_prob_TF,
        'J/P': mean_pred_prob_JP,
    }
).set_index('User_ID')
preds_prob.to_csv(google_drive_path + 'result_prob_sort-based_submit_mean.csv')

In [None]:
# Preview the per-user averaged scores (indexed by User_ID) built in the previous cell.
preds_prob.head()

Unnamed: 0_level_0,I/E,S/N,T/F,J/P
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.500045,0.712808,0.524727,0.616331
2,0.601788,0.682467,0.52593,0.574219
3,0.488432,0.744313,0.576563,0.224937
4,0.787117,0.52718,0.475692,0.705915
5,0.670901,0.452349,0.673906,0.408749


In [None]:
# majority_pred_IE = []
# majority_pred_SN = []
# majority_pred_TF = []
# majority_pred_JP = []

# def majority_pred(preds_prob_xx, range_xx):
#   majority_pred_xx = []
#   majority_count = 0
#   max_one = 0
#   min_zero = 0.5
#   for i, prob in enumerate(preds_prob_xx):
#     if prob>0.5: 
#       majority_count += 1
#       max_one = max(max_one, prob)
#     else:
#       min_zero = min(min_zero, prob)
#     if ((i+1) % range_xx == 0):
#       if majority_count > (range_xx - majority_count):
#         majority_pred_xx.append(max_one)
#       else:
#         majority_pred_xx.append(min_zero)
#       majority_count = 0
#       max_one = 0
#       min_zero = 0.5

#   return majority_pred_xx

In [None]:
# majority_pred_IE = majority_pred(preds_prob_IE, range_IE)
# majority_pred_SN = majority_pred(preds_prob_SN, range_SN)
# majority_pred_TF = majority_pred(preds_prob_TF, range_TF)
# majority_pred_JP = majority_pred(preds_prob_JP, range_JP)

In [None]:
# preds_prob_2 = {'User_ID': user_ids,'I/E': majority_pred_IE, 'S/N':majority_pred_SN, 'T/F':majority_pred_TF, 'J/P':majority_pred_JP}
# preds_prob_2 = pd.DataFrame(data=preds_prob_2)
# preds_prob_2 = preds_prob_2.set_index('User_ID')
# preds_prob_2.to_csv(google_drive_path + 'result_prob_sort-based_submit_major.csv')