# Mount Google Drive and install libraries

In [None]:
# Mount Google Drive at /content/drive; the corpus is read from it and model
# checkpoints are written to it further down in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers



In [None]:
!pip install SentencePiece



# Data

In [None]:
import pandas as pd

# Tab-separated evaluation corpus, addressed relative to the working directory
# (the Drive mount point's parent).
CORPUS_TSV_PATH = 'drive/MyDrive/corpus/en_to_ko_mqm_evaluation.tsv'

df = pd.read_csv(CORPUS_TSV_PATH, sep='\t')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,type,corpus,en_source,ko_reference,ko_target,annotation,accuracy,fluency,style,total
0,Training,GlobalVoices,To let Japanese buy FT will be the last thing ...,중국은 닛케이가 파이낸셜타임스를 인수하는 걸 절대 바라지 않는다.,중국은 일본 고객의 FT 구매를 최종 선호로 보고 싶지 않을 것입니다.,"Accuracy: 일본 고객의(mistranslation/major), 최종 선호로...",35.0,5.0,0.0,40.0
1,Training,GlobalVoices,If our uncritical engagement with media is any...,미디어와의 이런 안식없는 관계는 우리가 스스로와도 밀접하게 연관되어 있지 못하는 것...,우리가 언론을 세심하게 살피지 않는 걸 보면 우리 일에는 관심조차 없는 것 같다.,Accuracy: 우리가 언론을 세심하게 살피지 않는 걸 보면(mistranslat...,7.0,0.0,0.0,7.0
2,Training,GlobalVoices,China: Rising prices and rooftop gardens · Glo...,중국: 물가상승에 따른 새로운 현상- 옥상텃밭,글로벌 보이스(Global Voices)는 중국의 치솟는 물가와 옥상 정원의 등장에...,"Accuracy: 등장에(addition/minor), (Global Voices)...",3.0,2.0,10.0,15.0
3,Training,GlobalVoices,And one year of a brave stance against great e...,그리고 일 년 간 거대한 악에 대항하는 용기도 볼 수 있었다. 그러나 시리아인들은 ...,"1년 내내 강력한 세력에 맞서는 용기를 보여줬음에도 불구하고, 시리아 시민들은 자유...","Accuracy: 세력에(mistranslation/major), 존경을(mistr...",10.0,1.0,0.0,11.0
4,Training,GlobalVoices,Amina's story deeply touched and outraged Moro...,"모로코 누리꾼들은 아미나의 이야기를 듣고 분노했고, 트위터 해쉬태그 #RIPAmin...",Amina의 이야기는 Twitter에서 해시태그 #RIPAmina를 사용하여 소녀를...,"Accuracy: Amina, Twitter(untranslated text/maj...",10.0,10.0,0.0,20.0


In [None]:
# Fixed positional split of the corpus (no shuffling):
# rows [0, 1000) -> train, rows [1000, 1100) -> validation, remaining rows -> test.
TRAIN_END = 1000
VALID_END = 1100

train_data = df.iloc[:TRAIN_END]
valid_data = df.iloc[TRAIN_END:VALID_END]
test_data = df.iloc[VALID_END:]

In [None]:
# Print the number of rows in each split, one per line: train, validation, test.
for split_frame in (train_data, valid_data, test_data):
    print(len(split_frame))

1000
100
100


# Model

In [None]:
import transformers

In [None]:
# RemBERT
# Tokenizer and encoder are both loaded from the same checkpoint identifier.
REMBERT_CHECKPOINT = "google/rembert"

rembert_tokenizer = transformers.AutoTokenizer.from_pretrained(REMBERT_CHECKPOINT)
# rembert_model_mte = transformers.AutoModel.from_pretrained("google/rembert")
rembert_model_qe = transformers.AutoModel.from_pretrained(REMBERT_CHECKPOINT)

# Preprocessing

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import random
import numpy as np

# set the random seed
def set_seed(seed_value=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``torch.manual_seed`` plus ``torch.cuda.manual_seed_all``), then sets
    cuDNN's ``deterministic`` flag and clears its ``benchmark`` flag.

    Args:
        seed_value: Seed passed unchanged to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# Fix all seeds before the datasets, regression head and shuffling DataLoader
# defined further down are created.
set_seed(42)

class MTEvaluationDataset(Dataset):
    """Reference-based evaluation examples: source + reference + target text with scores.

    Each item joins the source, reference and target strings into a single
    string, separated by the tokenizer's separator token, and tokenizes it to
    a fixed length.

    Args:
        en_source: Sequence of source strings.
        ko_reference: Sequence of reference translation strings.
        ko_target: Sequence of translation strings being scored.
        accuracy: Sequence of accuracy scores.
        fluency: Sequence of fluency scores.
        style: Sequence of style scores.
        total: Sequence of total scores.
        tokenizer: Callable Hugging Face-style tokenizer. Its ``sep_token`` is
            used as the separator; ``'[SEP]'`` is the fallback when it has none.
        max_length: Length every example is padded or truncated to.

    All sequences are indexed with the same ``idx`` and are expected to have
    the same length; ``len()`` reports the length of ``en_source``.
    """

    def __init__(self, en_source, ko_reference, ko_target, accuracy, fluency, style, total, tokenizer, max_length=512):
        self.en_source = en_source
        self.ko_reference = ko_reference
        self.ko_target = ko_target
        self.accuracy = accuracy
        self.fluency = fluency
        self.style = style
        self.total = total
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.en_source)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it with its scores.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and the
            raw ``accuracy``, ``fluency``, ``style`` and ``total`` values.
        """
        # Use the tokenizer's own separator instead of a hard-coded "[SEP]",
        # which is only a separator for tokenizers that spell it that way.
        separator = getattr(self.tokenizer, 'sep_token', None) or '[SEP]'
        text = f' {separator} '.join(
            (self.en_source[idx], self.ko_reference[idx], self.ko_target[idx])
        )
        # NOTE(review): all segments share one truncation budget; if the
        # tokenizer truncates on the right (presumably the default), over-long
        # inputs lose the tail of the target. Confirm inputs fit in max_length.
        # Calling the tokenizer directly replaces the deprecated encode_plus();
        # token_type_ids are no longer requested because they were never used.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a batch dimension of 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'accuracy': self.accuracy[idx],
            'fluency': self.fluency[idx],
            'style': self.style[idx],
            'total': self.total[idx]
        }

class QEDataset(Dataset):
    """Reference-free (quality estimation) examples: source + target text with scores.

    Each item joins the source and target strings into a single string,
    separated by the tokenizer's separator token, and tokenizes it to a fixed
    length.

    Args:
        en_source: Sequence of source strings.
        ko_target: Sequence of translation strings being scored.
        accuracy: Sequence of accuracy scores.
        fluency: Sequence of fluency scores.
        style: Sequence of style scores.
        total: Sequence of total scores.
        tokenizer: Callable Hugging Face-style tokenizer. Its ``sep_token`` is
            used as the separator; ``'[SEP]'`` is the fallback when it has none.
        max_length: Length every example is padded or truncated to.

    All sequences are indexed with the same ``idx`` and are expected to have
    the same length; ``len()`` reports the length of ``en_source``.
    """

    def __init__(self, en_source, ko_target, accuracy, fluency, style, total, tokenizer, max_length=512):
        self.en_source = en_source
        self.ko_target = ko_target
        self.accuracy = accuracy
        self.fluency = fluency
        self.style = style
        self.total = total
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.en_source)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it with its scores.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and the
            raw ``accuracy``, ``fluency``, ``style`` and ``total`` values.
        """
        # Use the tokenizer's own separator instead of a hard-coded "[SEP]",
        # which is only a separator for tokenizers that spell it that way.
        separator = getattr(self.tokenizer, 'sep_token', None) or '[SEP]'
        text = f' {separator} '.join((self.en_source[idx], self.ko_target[idx]))
        # NOTE(review): both segments share one truncation budget; if the
        # tokenizer truncates on the right (presumably the default), over-long
        # inputs lose the tail of the target. Confirm inputs fit in max_length.
        # Calling the tokenizer directly replaces the deprecated encode_plus();
        # token_type_ids are no longer requested because they were never used.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a batch dimension of 1; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'accuracy': self.accuracy[idx],
            'fluency': self.fluency[idx],
            'style': self.style[idx],
            'total': self.total[idx]
        }

# Create datasets
# Column names listed in the positional-argument order of each dataset class.
_MT_COLUMNS = ('en_source', 'ko_reference', 'ko_target', 'accuracy', 'fluency', 'style', 'total')
_QE_COLUMNS = ('en_source', 'ko_target', 'accuracy', 'fluency', 'style', 'total')


def _make_dataset(dataset_cls, frame, columns):
    """Build `dataset_cls` from the given columns of `frame`, using the RemBERT tokenizer."""
    column_values = [frame[name].tolist() for name in columns]
    return dataset_cls(*column_values, rembert_tokenizer)


# For training
mt_train_dataset = _make_dataset(MTEvaluationDataset, train_data, _MT_COLUMNS)
qe_train_dataset = _make_dataset(QEDataset, train_data, _QE_COLUMNS)

# For validation
mt_valid_dataset = _make_dataset(MTEvaluationDataset, valid_data, _MT_COLUMNS)
qe_valid_dataset = _make_dataset(QEDataset, valid_data, _QE_COLUMNS)

# For test
mt_test_dataset = _make_dataset(MTEvaluationDataset, test_data, _MT_COLUMNS)
qe_test_dataset = _make_dataset(QEDataset, test_data, _QE_COLUMNS)

# Cell output: sizes in the order MT/QE train, MT/QE validation, MT/QE test.
tuple(
    len(dataset)
    for dataset in (
        mt_train_dataset, qe_train_dataset,
        mt_valid_dataset, qe_valid_dataset,
        mt_test_dataset, qe_test_dataset,
    )
)

(1000, 1000, 100, 100, 100, 100)

## Modeling

In [None]:
import torch.nn as nn

class MTEvaluationModel(nn.Module):
    """Encoder followed by a linear head that regresses three scores.

    Args:
        base_model: Encoder module exposing ``config.hidden_size``; element 1
            of its forward output is used as the pooled representation.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        hidden_dim = base_model.config.hidden_size
        # One output per predicted score.
        self.regression_head = nn.Linear(hidden_dim, 3)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's pooled representation."""
        encoder_outputs = self.base_model(input_ids, attention_mask=attention_mask)
        return self.regression_head(encoder_outputs[1])

class QEModel(nn.Module):
    """Encoder followed by a linear head that regresses three scores.

    Args:
        base_model: Encoder module exposing ``config.hidden_size``; element 1
            of its forward output is used as the pooled representation.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        hidden_dim = base_model.config.hidden_size
        # One output per predicted score.
        self.regression_head = nn.Linear(hidden_dim, 3)

    def forward(self, input_ids, attention_mask):
        """Return the head's output for the encoder's pooled representation."""
        encoder_outputs = self.base_model(input_ids, attention_mask=attention_mask)
        return self.regression_head(encoder_outputs[1])

# Instantiate models
# Only the QE model is built in this run; the MT-evaluation variant is left commented out.
# mt_evaluation_model = MTEvaluationModel(rembert_model_mte)
qe_model = QEModel(rembert_model_qe)


## Run training

In [None]:
from torch.optim import AdamW
from scipy.stats import spearmanr
from torch.utils.data import DataLoader
import torch
import numpy as np
import random
from scipy.stats import kendalltau
import sys

# Hyperparameters
EPOCHS = 100  # number of passes over the training DataLoader
BATCH_SIZE = 8  # used for the train, validation and test DataLoaders alike
LEARNING_RATE = 2e-6  # AdamW learning rate
# Use the GPU when PyTorch reports one, otherwise run on CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move models to device
# mt_evaluation_model = mt_evaluation_model.float().to(DEVICE)
qe_model = qe_model.float().to(DEVICE)

# Optimizers
# mt_optimizer = AdamW(mt_evaluation_model.parameters(), lr=LEARNING_RATE)
qe_optimizer = AdamW(qe_model.parameters(), lr=LEARNING_RATE)

# Loss function
# Shared by train_model and validate_model, which read it as a global.
loss_fn = nn.MSELoss()

# DataLoaders
# Only the training loader shuffles; validation and test keep dataset order.
# mt_train_dataloader = DataLoader(mt_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# mt_valid_dataloader = DataLoader(mt_valid_dataset, batch_size=BATCH_SIZE)
# mt_test_dataloader = DataLoader(mt_test_dataset, batch_size=BATCH_SIZE)
qe_train_dataloader = DataLoader(qe_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
qe_valid_dataloader = DataLoader(qe_valid_dataset, batch_size=BATCH_SIZE)
qe_test_dataloader = DataLoader(qe_test_dataset, batch_size=BATCH_SIZE)


# path to save the model weights
# The trailing slash matters: checkpoint file names are appended to this string directly.
# MODEL_SAVE_PATH = 'drive/MyDrive/model/ex1/rembert/mte/'
MODEL_SAVE_PATH = 'drive/MyDrive/model/ex1/rembert/qe/'

# Training loop
def train_model(model, optimizer, dataloader):
    """Train `model` for one pass over `dataloader` and return the mean batch loss.

    Reads the module-level `DEVICE` and `loss_fn`.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        optimizer: Optimizer over the model's parameters.
        dataloader: Sized iterable of batches (dicts) with the keys
            ``input_ids``, ``attention_mask``, ``accuracy``, ``fluency`` and
            ``style``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        ids = batch['input_ids'].to(DEVICE)
        mask = batch['attention_mask'].to(DEVICE)
        # Stack the three score vectors column-wise and cast them to float32.
        score_columns = [batch[key] for key in ('accuracy', 'fluency', 'style')]
        targets = torch.stack(score_columns, dim=1).to(DEVICE).float()

        optimizer.zero_grad()
        batch_loss = loss_fn(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

# Validation loop
def validate_model(model, dataloader):
    """Evaluate `model` on `dataloader` without gradients and return the mean batch loss.

    Reads the module-level `DEVICE` and `loss_fn`.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Sized iterable of batches (dicts) with the keys
            ``input_ids``, ``attention_mask``, ``accuracy``, ``fluency`` and
            ``style``.

    Returns:
        Sum of the per-batch losses divided by the number of batches, the same
        averaging train_model uses, so the two numbers are comparable.
    """
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for data in dataloader:
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            # Cast to float32 exactly as train_model does: the scores are
            # collated from Python floats (float64 by default), and the loss
            # should not mix float32 outputs with float64 targets.
            targets = torch.stack(
                [data['accuracy'], data['fluency'], data['style']], dim=1
            ).to(DEVICE).float()

            outputs = model(input_ids, attention_mask)
            total_val_loss += loss_fn(outputs, targets).item()

    return total_val_loss / len(dataloader)

def test_model(model, dataloader):
    model.eval()
    all_preds = []
    all_targets = []
    total_scores = []
    with torch.no_grad():
        for data in dataloader:
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            targets = torch.stack([data['accuracy'], data['fluency'], data['style']], dim=1).to(DEVICE)

            outputs = model(input_ids, attention_mask)
            all_preds.append(outputs.cpu())
            all_targets.append(targets.cpu())
            total_scores.extend(data['total'].tolist())

    all_preds = torch.cat(all_preds, dim=0).numpy()
    all_targets = torch.cat(all_targets, dim=0).numpy()

    kendall_accuracy = kendalltau(all_preds[:, 0], all_targets[:, 0])[0]
    kendall_fluency = kendalltau(all_preds[:, 1], all_targets[:, 1])[0]
    kendall_style = kendalltau(all_preds[:, 2], all_targets[:, 2])[0]

    # Compute accumulated scores from the predictions
    accumulated_preds = all_preds.sum(axis=1)
    kendall_total = kendalltau(accumulated_preds, total_scores)[0]

    return kendall_accuracy, kendall_fluency, kendall_style, kendall_total

# Training and validation execution
import os

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first save.
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

# Lowest validation loss seen so far; infinity makes the first finite loss an improvement.
g_val_loss = float('inf')
# 1-based epoch number of the most recently saved checkpoint (None until one is saved).
best_epoch = None
for epoch in range(EPOCHS):
    # mt_train_loss = train_model(mt_evaluation_model, mt_optimizer, mt_train_dataloader)
    # mt_val_loss = validate_model(mt_evaluation_model, mt_valid_dataloader)
    qe_train_loss = train_model(qe_model, qe_optimizer, qe_train_dataloader)
    qe_val_loss = validate_model(qe_model, qe_valid_dataloader)

    # Save a checkpoint only when the validation loss matches or beats the best so far.
    # Each improving epoch writes its own file; earlier files are not removed.
    if g_val_loss >= qe_val_loss:
        # torch.save(mt_evaluation_model.state_dict(), f"{MODEL_SAVE_PATH}rembert_mte_1000_training1_epoch_{epoch+1}.pt")
        torch.save(qe_model.state_dict(), f"{MODEL_SAVE_PATH}rembert_qe_1000_training1_epoch_{epoch+1}.pt")
        g_val_loss = qe_val_loss
        best_epoch = epoch + 1

    print(f"Epoch {epoch+1}/{EPOCHS}")
    # print(f"MT Evaluation Model - Train Loss: {mt_train_loss:.4f}, Validation Loss: {mt_val_loss:.4f}")
    print(f"QE Model - Train Loss: {qe_train_loss:.4f}, Validation Loss: {qe_val_loss:.4f}")
    print("="*60)

# Report which checkpoint to evaluate instead of leaving it to be read off the log.
print(f"Best checkpoint: epoch {best_epoch} (validation loss {g_val_loss:.4f})")

Epoch 1/100
QE Model - Train Loss: 85.3821, Validation Loss: 62.8862
Epoch 2/100
QE Model - Train Loss: 69.8551, Validation Loss: 59.8068
Epoch 3/100
QE Model - Train Loss: 66.6030, Validation Loss: 58.4071
Epoch 4/100
QE Model - Train Loss: 64.5051, Validation Loss: 56.9902
Epoch 5/100
QE Model - Train Loss: 60.1128, Validation Loss: 55.5602
Epoch 6/100
QE Model - Train Loss: 55.9617, Validation Loss: 54.5779
Epoch 7/100
QE Model - Train Loss: 51.4000, Validation Loss: 55.0315
Epoch 8/100
QE Model - Train Loss: 47.6682, Validation Loss: 54.5992
Epoch 9/100
QE Model - Train Loss: 46.2100, Validation Loss: 51.9353
Epoch 10/100
QE Model - Train Loss: 44.7531, Validation Loss: 61.6753
Epoch 11/100
QE Model - Train Loss: 42.4719, Validation Loss: 54.0371
Epoch 12/100
QE Model - Train Loss: 39.8605, Validation Loss: 52.7883
Epoch 13/100
QE Model - Train Loss: 38.3008, Validation Loss: 51.6513
Epoch 14/100
QE Model - Train Loss: 36.3832, Validation Loss: 49.7097
Epoch 15/100
QE Model - Train

## Test the model

In [None]:
# 1-based epoch number of the checkpoint to evaluate. A file exists only for
# epochs in which the training loop saved one.
i = 97

# path to the saved model weights
# SAVED_MODEL_PATH = f'drive/MyDrive/model/ex1/rembert/mte/rembert_mte_1000_training1_epoch_{i}.pt'
SAVED_MODEL_PATH = f'drive/MyDrive/model/ex1/rembert/qe/rembert_qe_1000_training1_epoch_{i}.pt'

# load the model
# NOTE: this wraps the same `rembert_model_qe` encoder object that `qe_model`
# wraps, so loading the checkpoint below also overwrites that encoder's weights in place.
# loaded_model = MTEvaluationModel(rembert_model_mte)
loaded_model = QEModel(rembert_model_qe)

# Load the saved weights into the model.
# map_location remaps the stored tensors onto the current DEVICE, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
loaded_model.load_state_dict(torch.load(SAVED_MODEL_PATH, map_location=DEVICE))

# Move the model to the device
loaded_model = loaded_model.to(DEVICE)

# Set the model to evaluation mode
loaded_model.eval()

# calculate kendall's tau correlations on the test set
# mt_kendall_accuracy, mt_kendall_fluency, mt_kendall_style, mt_kendall_total = test_model(loaded_model, mt_test_dataloader)
# mt_kendall_average = (mt_kendall_accuracy + mt_kendall_fluency + mt_kendall_style) / 3
# print(f"Epoch {i} >>> Test Kendall Accuracy: {mt_kendall_accuracy:.4f}, Fluency: {mt_kendall_fluency:.4f}, Style: {mt_kendall_style:.4f}, Total: {mt_kendall_total:.4f}, Average: {mt_kendall_average:.4f}")

qe_kendall_accuracy, qe_kendall_fluency, qe_kendall_style, qe_kendall_total = test_model(loaded_model, qe_test_dataloader)
# The average covers the three per-dimension taus only; the total tau is reported separately.
qe_kendall_average = (qe_kendall_accuracy + qe_kendall_fluency + qe_kendall_style) / 3
print(f"Epoch {i} >>> Test Kendall Accuracy: {qe_kendall_accuracy:.4f}, Fluency: {qe_kendall_fluency:.4f}, Style: {qe_kendall_style:.4f}, Total: {qe_kendall_total:.4f}, Average: {qe_kendall_average:.4f}")

Epoch 97 >>> Test Kendall Accuracy: 0.3743, Fluency: 0.3710, Style: 0.2400, Total: 0.3630, Average: 0.3284
