먼저, 이 GitHub 저장소의 다른 코드로 각 모델을 학습(train)시켜 얻은 checkpoint 파일이 필요하다.

In [None]:
# Mount Google Drive at /content/drive so files stored there (e.g. the
# checkpoints loaded further down) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install simpletransformers
!pip install sentencepiece

In [None]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange

from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW,
    ElectraForSequenceClassification,
    AutoTokenizer,
    XLNetForSequenceClassification,
    XLNetTokenizer,
    XLNetModel,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    RobertaModel,
    
)

In [None]:
# Tokenizer for the 'bert-base-uncased' vocabulary; it is the only tokenizer
# used in this notebook to encode the test sentences.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# **test data**

---



In [None]:
import pandas as pd
# Load the test set from the current working directory.
test_df = pd.read_csv('test_no_label.csv')

In [None]:
# NOTE(review): the column named 'Id' is passed to make_id_file_test, which
# lower-cases and tokenizes every entry, so it is treated as the sentence
# text. Confirm this matches the CSV schema.
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset):
    """Encode each test sentence as a space-separated string of token ids.

    Every sentence is lower-cased before it is handed to ``tokenizer.encode``.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns an iterable
            of token ids.
        test_dataset: Iterable of sentence strings.

    Returns:
        A list with one string per sentence, e.g. ``"101 2023 102"``.
    """
    return [
        ' '.join(map(str, tokenizer.encode(sentence.lower())))
        for sentence in test_dataset
    ]

In [None]:
# One space-separated token-id string per test sentence.
test = make_id_file_test(tokenizer, test_dataset)

In [None]:
# Preview the first ten encoded sentences.
test[:10]

In [None]:
class SentimentTestDataset:
    """In-memory dataset of pre-tokenized, unlabeled test sentences.

    Each input string is a whitespace-separated list of integer token ids.
    All strings are parsed once, at construction time.

    Args:
        tokenizer: Stored on the instance as ``self.tokenizer``; this class
            never calls it.
        test: Iterable of token-id strings, one per sentence.
    """

    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = [self._cast_to_int(line.strip().split()) for line in test]

    def _cast_to_int(self, sample):
        """Convert a sequence of id strings into a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the token ids stored at ``index`` as a NumPy array."""
        return np.array(self.data[index])

In [None]:
# Rebinds test_dataset: from here on it is the parsed token-id dataset, not
# the raw DataFrame column.
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    """Collate unlabeled token-id sequences into padded model inputs.

    Sample order is preserved on purpose. Test data carries no labels, so
    predictions can only be matched back to their rows by position; the
    length-based reordering used at training time must not be applied here.

    Args:
        samples: Non-empty sequence of 1-D integer token-id sequences
            (lists or NumPy arrays), one per sentence.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, position_ids)``
        of integer tensors, each shaped ``(len(samples), max_len)``:
        ``input_ids`` is right-padded with 0, ``attention_mask`` is 1 on real
        tokens and 0 on padding, ``token_type_ids`` is all zeros and
        ``position_ids`` counts ``0 .. max_len - 1`` in every row.

    Raises:
        ValueError: If ``samples`` is empty.
    """
    # Lengths must be taken BEFORE padding. The previous version measured the
    # already padded tensor, so every row had length max_len and the mask was
    # all ones, i.e. padding tokens were never masked out.
    lengths = [len(sample) for sample in samples]
    max_len = max(lengths)

    # NOTE(review): pad_sequence pads with 0 by default; presumably that is
    # the pad id of the tokenizer in use - confirm. Padded positions are
    # masked by attention_mask either way.
    input_ids = pad_sequence(
        [torch.as_tensor(sample, dtype=torch.long) for sample in samples],
        batch_first=True,
    )
    attention_mask = torch.tensor(
        [[1] * length + [0] * (max_len - length) for length in lengths]
    )
    token_type_ids = torch.zeros((len(samples), max_len), dtype=torch.long)
    position_ids = torch.arange(max_len).repeat(len(samples), 1)

    return input_ids, attention_mask, token_type_ids, position_ids

In [None]:
# Number of sentences per inference batch.
test_batch_size = 64
# shuffle=False keeps batches in dataset order, so the predictions collected
# below stay aligned with the rows of test_df.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          shuffle=False, collate_fn=collate_fn_style_test,
                                          num_workers=2)

# **1. bert base uncased**

---



In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): model_path is assigned but never read in this cell; the
# checkpoint actually used is the path given to torch.load below.
model_path = '/content/drive/MyDrive/Colab Notebooks/NLP/project/checkpoint_epoch_1.13850_model1.pth'

# Choose the model class / pretrained name that corresponds to the checkpoint.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Path of the checkpoint produced by the earlier training step.
checkpoint1 = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/NLP/project/0/checkpoint_epoch_0.5536.pth',
    map_location=device,
)
model.load_state_dict(checkpoint1['model_state_dict'])
model.to(device)
model.eval()

# Labels predicted by this model, appended batch by batch in test_loader order.
predictions1 = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(
        test_loader, desc='Test', position=1, leave=None
    ):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output1 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )
        logits1 = output1.logits

        # Label 0 only when its logit is strictly larger; anything else,
        # ties included, becomes label 1.
        batch_predictions1 = (~(logits1[:, 0] > logits1[:, 1])).long().tolist()
        predictions1.extend(batch_predictions1)

# **2. bert large**

---



In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): model_path is assigned but never read in this cell; the
# checkpoint actually used is the path given to torch.load below.
model_path = '/content/drive/MyDrive/Colab Notebooks/NLP/project/checkpoint_epoch_1.13850.pth'

# Choose the model class / pretrained name that corresponds to the checkpoint.
model = BertForSequenceClassification.from_pretrained('bert-large-uncased')

# Path of the checkpoint produced by the earlier training step.
# NOTE(review): this is the same file path the bert-base-uncased cell loads.
# Confirm it really holds bert-large weights.
checkpoint2 = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/NLP/project/0/checkpoint_epoch_0.5536.pth',
    map_location=device,
)
model.load_state_dict(checkpoint2['model_state_dict'])
model.to(device)
model.eval()

# Labels predicted by this model, appended batch by batch in test_loader order.
predictions2 = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(
        test_loader, desc='Test', position=1, leave=None
    ):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output2 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )
        logits2 = output2.logits

        # Label 0 only when its logit is strictly larger; anything else,
        # ties included, becomes label 1.
        batch_predictions2 = (~(logits2[:, 0] > logits2[:, 1])).long().tolist()
        predictions2.extend(batch_predictions2)

# **3. Electra**

---



In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): model_path is assigned but never read in this cell; the
# checkpoint actually used is the path given to torch.load below.
model_path = '/content/drive/MyDrive/Colab Notebooks/NLP/project/checkpoint_epoch_1.13850.pth'

# Choose the model class / pretrained name that corresponds to the checkpoint.
# NOTE(review): test_loader yields ids produced by the 'bert-base-uncased'
# tokenizer; confirm this checkpoint was trained on the same encoding.
model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator")

# Path of the checkpoint produced by the earlier training step.
checkpoint3 = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/NLP/project/1/checkpoint_epoch_1.6228.pth',
    map_location=device,
)
model.load_state_dict(checkpoint3['model_state_dict'])
model.to(device)
model.eval()

# Labels predicted by this model, appended batch by batch in test_loader order.
predictions3 = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(
        test_loader, desc='Test', position=1, leave=None
    ):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output3 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )
        logits3 = output3.logits

        # Label 0 only when its logit is strictly larger; anything else,
        # ties included, becomes label 1.
        batch_predictions3 = (~(logits3[:, 0] > logits3[:, 1])).long().tolist()
        predictions3.extend(batch_predictions3)

# **4. XLNet**

---



In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): model_path is assigned but never read in this cell; the
# checkpoint actually used is the path given to torch.load below.
model_path = '/content/drive/MyDrive/Colab Notebooks/NLP/project/checkpoint_epoch_1.13850.pth'

# Choose the model class / pretrained name that corresponds to the checkpoint.
# NOTE(review): test_loader yields ids produced by the 'bert-base-uncased'
# tokenizer; confirm this checkpoint was trained on the same encoding.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

# Path of the checkpoint produced by the earlier training step.
checkpoint4 = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/NLP/project/2/checkpoint_epoch_1.4152.pth',
    map_location=device,
)
model.load_state_dict(checkpoint4['model_state_dict'])
model.to(device)
model.eval()

# Labels predicted by this model, appended batch by batch in test_loader order.
predictions4 = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(
        test_loader, desc='Test', position=1, leave=None
    ):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        # NOTE(review): confirm that the installed transformers version
        # accepts position_ids for XLNet and how it treats the argument.
        output4 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )
        logits4 = output4.logits

        # Label 0 only when its logit is strictly larger; anything else,
        # ties included, becomes label 1.
        batch_predictions4 = (~(logits4[:, 0] > logits4[:, 1])).long().tolist()
        predictions4.extend(batch_predictions4)

# **5. Roberta**

---



In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): model_path is assigned but never read in this cell; the
# checkpoint actually used is the path given to torch.load below.
model_path = '/content/drive/MyDrive/Colab Notebooks/NLP/project/check_roberta/checkpoint_epoch_1.13850.pth'

# Choose the model class / pretrained name that corresponds to the checkpoint.
# NOTE(review): test_loader yields ids produced by the 'bert-base-uncased'
# tokenizer; confirm this checkpoint was trained on the same encoding.
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Path of the checkpoint produced by the earlier training step.
checkpoint5 = torch.load(
    '/content/drive/MyDrive/Colab Notebooks/NLP/project/check_roberta/checkpoint_epoch_1.1730.pth',
    map_location=device,
)
model.load_state_dict(checkpoint5['model_state_dict'])
model.to(device)
model.eval()

# Labels predicted by this model, appended batch by batch in test_loader order.
predictions5 = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(
        test_loader, desc='Test', position=1, leave=None
    ):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output5 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )
        logits5 = output5.logits

        # Label 0 only when its logit is strictly larger; anything else,
        # ties included, becomes label 1.
        batch_predictions5 = (~(logits5[:, 0] > logits5[:, 1])).long().tolist()
        predictions5.extend(batch_predictions5)

# **Ensemble**

---

ensemble 방법으로는 hard-voting을 사용하였다. 각 test 샘플마다 다섯 모델의 예측 값(0 또는 1)을 모두 더한 뒤 모델의 개수(5)로 나누어 평균을 구하였다. 이 평균이 0.5보다 크면 1, 그렇지 않으면 0이라고 판단하였다. (모델이 5개이므로 평균이 정확히 0.5가 되는 경우는 없으며, 결과적으로 다수결과 같다.)
이러한 방법을 통해 최종적인 예측 값을 낼 수 있다.

In [None]:
# Hard voting: average the five 0/1 votes for each test row and predict 1
# when the average exceeds 0.5.
model_votes = (predictions1, predictions2, predictions3, predictions4, predictions5)

# Rows are addressed by index (not zip) so that a prediction list shorter
# than predictions1 raises IndexError instead of being silently truncated.
predictions = [
    1 if sum(int(votes[row]) for votes in model_votes) / 5 > 0.5 else 0
    for row in range(len(predictions1))
]
print(predictions)

# **Final**

---



In [None]:
# Store the ensembled labels in a 'Category' column of the test DataFrame.
test_df['Category'] = predictions

In [None]:
# Write every column of test_df to the submission file, without the index.
test_df.to_csv('submission1.csv', index=False)