In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/AES_Linking"

Mounted at /content/drive
/content/drive/MyDrive/AES_Linking


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
Co

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import os
import scipy
import nltk
import random
import re
import codecs
import operator
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import cohen_kappa_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from pandas.core.common import random_state

# Data formatting for running AES

In [4]:
def output(df,output_dir, output_file_name):
    # ディレクトリがないとエラーになるため作成
    dir = Path(output_dir)
    dir.mkdir(parents=True, exist_ok=True)
    # csv形式で出力
    df.to_csv(f'{output_dir}/{output_file_name}', index=False)

#Read focal_theta
focal_theta = pd.read_csv(f'data/2_res_IRT/focal/theta.csv')
focal_theta = focal_theta["theta"]

#Read reference_theta
reference_theta = pd.read_csv(f'data/2_res_IRT/reference/theta.csv')
reference_theta = reference_theta["theta"]

#Read focal_data
focal_data = pd.read_csv(f"data/1_for_IRT/focal/data.csv")

#Read reference_data
reference_data = pd.read_csv(f"data/1_for_IRT/reference/data.csv")

#Transform the data format
focal_data = focal_data.groupby(["essay_id", "essay"], as_index=False).mean()
reference_data = reference_data.groupby(["essay_id", "essay"], as_index=False).mean()

focal_data["theta"] = list(focal_theta)
reference_data["theta"] = list(reference_theta)
focal_data = focal_data[['essay_id', 'essay', 'score', 'theta']]
reference_data = reference_data[['essay_id', 'essay', 'score', 'theta']]

output(focal_data, f"data/3_for_AES/focal/", "data.csv")
output(reference_data, f"data/3_for_AES/reference/", "data.csv")

# Prepare BERT model

In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

class BertAES(nn.Module):
    def __init__(self):
        super(BertAES, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(768, 1)

        self.sigmoid = nn.Sigmoid() #活性化関数

        nn.init.normal_(self.linear.weight, std=0.02)  # 重みの初期化
        nn.init.normal_(self.linear.bias, 0)  # バイアスの初期化

    def forward(self, dataset):
        outputs = self.bert(dataset['input_ids'], attention_mask=dataset['attention_mask'], token_type_ids=dataset['token_type_ids'])
        sequence_output = outputs['last_hidden_state'][:, 0, :]
        theta = self.sigmoid(self.linear(sequence_output))
        return {'theta_est': theta}

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Run Model Training using the data of the reference group

In [None]:
max_length = 512

def get_model_friendly_scores(scores_array):
	low, high = (-3,3)
	scores_array = (scores_array - low) / (high - low)
	return scores_array

def get_model_scores(scores_array):
	low, high = (-3,3)
	scores_array = (scores_array - low) / (high - low)
	return scores_array

class DataSet:
  def __init__(self, X, Y):
    self.input_ids = X['input_ids']
    self.attention_mask = X['attention_mask']
    self.token_type_ids = X['token_type_ids']
    self.labels = Y

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, index):
    return {'input_ids': self.input_ids[index],
            'attention_mask': self.attention_mask[index],
            'token_type_ids': self.token_type_ids[index],
            'labels': self.labels[index]}

def get_dataset(dataf):
	dat_p = dataf
	x = dat_p["essay"].tolist()
	y = get_model_scores(np.array(dat_p["theta"])).tolist()
	encoding = tokenizer(x, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
	labels = torch.tensor(y, dtype=torch.float32)
	return DataSet(encoding, labels)

low, high = (-3,3)
dev_info = pd.DataFrame()

os.makedirs(f"data/4_res_BERT_pred/reference/", exist_ok=True)

base_data = pd.read_csv(f"data/3_for_AES/reference/data.csv", sep=",")
kf = KFold(n_splits=10, shuffle=True, random_state=0)
for train_id, valid_id in kf.split(base_data):
  train_data = base_data.iloc[train_id]
  dev_data = base_data.iloc[valid_id]

train_dataset = get_dataset(train_data)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=True)

dev_dataset = get_dataset(dev_data)
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=8, shuffle=False, drop_last=False)

regressor = BertAES()
regressor.train()

best_mse = np.inf
epoch_num = 10

optimizer = optim.AdamW(regressor.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, 500, epoch_num)
regressor = regressor.cuda()

batch_num = len(train_dataloader)
for epoch in range(epoch_num):
  loss_all = 0
  for data in train_dataloader:
    data = {k: v.cuda() for k, v in data.items()}
    optimizer.zero_grad()
    outputs = regressor(data)
    loss = F.mse_loss(data['labels'].unsqueeze(1), outputs['theta_est'])
    loss_all += loss
    loss.backward()
    optimizer.step()
    scheduler.step()

  dev_true = []
  dev_pred = []
  for dev_data in dev_dataloader:
    d_data = {k: v.cuda() for k, v in dev_data.items()}
    dev_labels = d_data['labels'].unsqueeze(1).to('cpu').detach().numpy().copy()
    dev_labels = dev_labels * (high - low) + low
    dev_true = np.append(dev_true, dev_labels[:,0])
    dev_outputs = regressor(d_data)["theta_est"].to('cpu').detach().numpy().copy()
    dev_outputs = dev_outputs * (high - low) + low
    dev_pred = np.append(dev_pred, dev_outputs[:,0])
  dev_mse = mean_squared_error(dev_true, dev_pred)
  print(f'Epoch:{epoch}, train_Loss: {loss_all/batch_num}, dev_mse: {dev_mse}')

  if dev_mse < best_mse:
        best_mse = dev_mse
        torch.save(regressor.state_dict(), f"data/4_res_BERT_pred/reference/best_bert_model.pth")
        print("best model saved ...")
        dev_info["best_mse"] = [best_mse]
        dev_info["last_mse"] = [dev_mse]

output(dev_info, f"data/4_res_BERT_pred/reference/", "dev_info.csv")

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch:0, train_Loss: 0.042710185050964355, dev_mse: 0.49256745537996166
best model saved ...
Epoch:1, train_Loss: 0.01193182822316885, dev_mse: 0.36590026288105904
best model saved ...


# Predict abilities for the examinees in the focal group

In [None]:
source_data = pd.read_csv(f"data/3_for_AES/focal/data.csv", sep=",")
source_data = source_data[["essay_id", "essay", "score", "theta"]]
source_data["theta_irt"] = source_data["theta"]
source_data["theta"] = 0

source_dataset = get_dataset(source_data)
source_dataloader = torch.utils.data.DataLoader(source_dataset, batch_size=8, shuffle=False, drop_last=False)

regressor = BertAES()
regressor.load_state_dict(torch.load(f"data/4_res_BERT_pred/reference/best_bert_model.pth", map_location = "cpu")) # モデルの読み込み
regressor.eval()
regressor = regressor.cuda()

source_pred = []
with torch.no_grad():
    for eq_data in source_dataloader:
      e_data = {k: v.cuda() for k, v in eq_data.items()}
      source_outputs = regressor(e_data)["theta_est"].to('cpu').detach().numpy().copy()
      source_outputs = source_outputs * (high - low) + low
      source_pred = np.append(source_pred, source_outputs[:,0])

source_data["theta"] = source_pred
output(source_data, f"data/4_res_BERT_pred/focal/", "pred.csv")
