In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
from pathlib import Path
logging.basicConfig(level=logging.ERROR)

# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# All input CSVs are read from <root>/.data/sentence-classification
root_dir = Path('.')
data_dir = root_dir / '.data' / 'sentence-classification'

In [3]:
class SentimentData(Dataset):
    """Torch dataset that tokenizes the ``Phrase`` column of a dataframe.

    Args:
        dataframe: frame with a ``Phrase`` column and, when ``is_train`` is
            true, a ``Sentiment`` column holding the labels.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer
            in this notebook).
        max_len: length every encoded sequence is padded or truncated to.
        is_train: when true, each item also carries a ``targets`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_len, is_train):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.is_train = is_train
        if self.is_train:
            self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded tensors (and label, if any) for row ``index``."""
        # Positional lookup: samplers hand out 0..len-1, which only matches
        # label-based indexing when the frame's index has been reset.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace into single spaces before tokenizing.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Fixed-length output is required by the default collate function:
            # pad short phrases and cut long ones to exactly max_len tokens.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }
        if self.is_train:
            # Kept as float for compatibility; the training loop casts to long.
            item['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)
        return item

def get_data_loader(df, tokenizer, max_len, batch_size, is_train, shuffle):
    """Wrap ``df`` in a ``SentimentData`` dataset and return a ``DataLoader`` over it.

    ``tokenizer``, ``max_len`` and ``is_train`` are forwarded to the dataset;
    ``batch_size`` and ``shuffle`` are forwarded to the loader.
    """
    return DataLoader(
        SentimentData(df, tokenizer=tokenizer, max_len=max_len, is_train=is_train),
        batch_size=batch_size,
        shuffle=shuffle,
    )

def get_predictions(model, loader):
    """Return the arg-max class of the model's first output for every example.

    Args:
        model: module whose forward takes ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` and returns a pair; only the first element
            (logits of shape ``(batch, classes)``) is used.
        loader: iterable of batches with ``'ids'``, ``'mask'`` and
            ``'token_type_ids'`` tensors.

    Returns:
        1-D ``torch.long`` tensor on the CPU, one prediction per example in
        loader order (empty when the loader yields nothing).
    """
    model = model.eval()

    # Send batches to wherever the model's weights live rather than assuming
    # "CUDA available" means "model is on CUDA".
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    batch_predictions = []
    with torch.no_grad():
        for batch in loader:
            outputs, _ = model(
                input_ids=batch['ids'].to(target_device),
                attention_mask=batch['mask'].to(target_device),
                token_type_ids=batch['token_type_ids'].to(target_device),
            )
            batch_predictions.append(torch.argmax(outputs, dim=1).cpu())

    if not batch_predictions:
        # torch.cat/stack reject an empty list; return an empty result instead.
        return torch.empty(0, dtype=torch.long)
    return torch.cat(batch_predictions)

In [4]:
class cat_dataloaders():
    """Iterate several dataloaders in lock-step.

    Each step yields a tuple holding one batch from every loader, in the order
    the loaders were given. Iteration ends as soon as any loader is exhausted,
    so the number of steps equals the length of the shortest loader.
    """

    def __init__(self, dataloaders):
        self.dataloaders = list(dataloaders)
        self.loader_iter = []

    def __len__(self):
        """Number of tuples one pass produces (0 when there are no loaders)."""
        return min((len(loader) for loader in self.dataloaders), default=0)

    def __iter__(self):
        # Fresh iterators on every pass so the object can be reused each epoch.
        self.loader_iter = [iter(loader) for loader in self.dataloaders]
        return self

    def __next__(self):
        if not self.loader_iter:
            # With no loaders there is nothing to pair; stop instead of
            # yielding empty tuples forever.
            raise StopIteration
        out = []
        # Explicit loop (not a generator expression) so a StopIteration from
        # an exhausted loader propagates unchanged and ends the iteration.
        for data_iter in self.loader_iter:
            out.append(next(data_iter))
        return tuple(out)

In [5]:
# Key settings for tokenization, batching and the train/validation split
load_model = False  # read again by the model cell to decide whether to restore a saved model

# The tokenizer always comes from the 'roberta-large' checkpoint: the former
# if/else on `load_model` had two identical branches.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

max_len = 256
batch_size = 64
LEARNING_RATE = 2e-05
train_valid_frac = 0.8

# Original dataset. Named `train_raw` rather than `train` so that re-running
# this cell cannot overwrite the `train()` function defined further down.
train_raw = pd.read_csv(data_dir.joinpath('train_plus.csv'))
new_df = train_raw[['Phrase', 'Sentiment']]

# Fixed random_state keeps the train/validation split reproducible.
train_df = new_df.sample(frac=train_valid_frac, random_state=200)
valid_df = new_df.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

# The evaluation file calls its text column 'Sentence'; rename it (without
# in-place mutation of a column slice) so SentimentData can read `Phrase`.
test_df = (
    pd.read_csv(data_dir.joinpath('eval_final_open.csv'))[['Sentence']]
    .rename(columns={'Sentence': 'Phrase'})
)

print('Dataset Configuration')
print('-' * 25)
print(f'Train/Valid = {train_valid_frac:.2f}/{1-train_valid_frac:.2f}')
print(f'Batch size = {batch_size}')
print('-' * 25)
print(f'Train set : {len(train_df)}')
print(f'Valid set : {len(valid_df)}')
print(f'Test set : {len(test_df)}')

training_loader   = get_data_loader(train_df, tokenizer, max_len, batch_size, True, True)
validating_loader = get_data_loader(valid_df, tokenizer, max_len, batch_size, True, True)
# The test set has no labels and must keep file order for the submission.
testing_loader    = get_data_loader(test_df, tokenizer, max_len, batch_size, False, False)

Dataset Configuration
-------------------------
Train/Valid = 0.80/0.20
Batch size = 1
-------------------------
Train set : 69809
Valid set : 17452
Test set : 4311


In [6]:
# Second training corpus; its loader is paired with `training_loader` below
train_IM = pd.read_csv(data_dir / 'train_IM.csv')
training_IM_loader = get_data_loader(
    df=train_IM,
    tokenizer=tokenizer,
    max_len=max_len,
    batch_size=batch_size,
    is_train=True,
    shuffle=True,
)

In [7]:
# Pair the two training loaders so every step yields one batch from each;
# `train()` iterates over this object.
tmp = cat_dataloaders([training_loader,training_IM_loader])
# value= next(iter(tmp))
# Element 0 is the SST-5 batch, element 1 is the SST-2 batch
# print(value)

({'ids': tensor([[    0,  1342,  2399,  8173,  1135,    63,    65,    12,   267,  5361,
         18805,    19,     5, 24149,    14,   390,    31, 21011,   687,     8,
           604,    31, 35899,    64,  5329,   120,   561,   479,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,

In [8]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder shared by two classification heads.

    The first-token hidden state of the encoder feeds two independent
    two-layer heads: one with 5 output classes (``sst5_*``) and one with
    2 output classes (``sst2_*``).

    Args:
        model_name: checkpoint passed to ``RobertaModel.from_pretrained``.
            Defaults to ``"roberta-large"``, the checkpoint this notebook uses.
    """

    def __init__(self, model_name="roberta-large"):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained(model_name)
        # Read the width from the encoder instead of hard-coding 1024
        # (the value for roberta-large).
        hidden_size = self.l1.config.hidden_size

        self.sst5_fc1 = torch.nn.Linear(hidden_size, hidden_size)
        # One dropout module is shared by both heads; it has no parameters,
        # so the second identical assignment was redundant.
        self.dropout  = torch.nn.Dropout(0.1)
        self.sst5_fc2 = torch.nn.Linear(hidden_size, 5)

        self.sst2_fc1 = torch.nn.Linear(hidden_size, hidden_size)
        self.sst2_fc2 = torch.nn.Linear(hidden_size, 2)

    def _head(self, features, fc1, fc2):
        """Apply Linear -> ReLU -> Dropout -> Linear to ``features``."""
        return fc2(self.dropout(torch.relu(fc1(features))))

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(sst5_logits, sst2_logits)`` for a batch of encoded inputs."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Hidden state of the first token of every sequence.
        pooler = hidden_state[:, 0]

        sst5_output = self._head(pooler, self.sst5_fc1, self.sst5_fc2)
        sst2_output = self._head(pooler, self.sst2_fc1, self.sst2_fc2)

        return sst5_output, sst2_output

In [9]:
# Build a fresh model, or restore a previously saved one when `load_model` is set.
if load_model:
    # NOTE(review): torch.load unpickles arbitrary Python objects - only load
    # files you trust. Recent PyTorch releases default to weights_only=True,
    # which rejects a fully pickled module; confirm against the installed version.
    # map_location keeps a GPU-saved model loadable on a CPU-only machine.
    model = torch.load('pytorch_roberta_sentiment.bin', map_location=device)
else:
    # Only instantiate (and load the pretrained encoder) when it will be used.
    model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps

In [10]:
# Loss functions (one per task head), optimizer and learning-rate schedule
loss_fc_sst5 = torch.nn.CrossEntropyLoss()
loss_fc_sst2 = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# After k calls to scheduler.step() the learning rate is LEARNING_RATE * 0.95 ** k.
# `last_epoch=-1` is the default and `verbose` is deprecated in recent PyTorch,
# so neither is passed explicitly.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer=optimizer,
    lr_lambda=lambda epoch: 0.95 ** epoch,
)

In [11]:
def calcuate_accuracy(preds, targets):
    """Count the positions where ``preds`` equals ``targets``.

    Despite the name, this returns the number of matches as a Python number,
    not a ratio; callers divide by the example count themselves.
    """
    hits = preds == targets
    return hits.sum().item()

In [14]:
# Training function: one epoch of alternating updates on the two task heads of the RoBERTa model

def _task_step(batch, head_index, loss_fn):
    """Run one optimizer update for the head at ``head_index`` on ``batch``.

    Returns ``(loss_value, logits, targets)`` with the loss as a Python float
    and the logits detached from the graph.
    """
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
    targets = batch['targets'].to(device, dtype=torch.long)

    # The model returns (sst5_logits, sst2_logits); pick the head being trained.
    logits = model(ids, mask, token_type_ids)[head_index]
    loss = loss_fn(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item(), logits.detach(), targets


def train(epoch):
    """Train for one pass over ``tmp`` (the paired SST-5 / SST-2 loaders).

    Every step performs two optimizer updates: first on the 5-class head with
    the batch at position 0, then on the 2-class head with the batch at
    position 1. Reported loss and accuracy cover the 5-class task only.

    Args:
        epoch: epoch number, used only in the printed summary.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for i, data in tqdm(enumerate(tmp, 0)):
        loss5, outputs5, targets5 = _task_step(data[0], 0, loss_fc_sst5)
        _task_step(data[1], 1, loss_fc_sst2)

        tr_loss += loss5
        _, big_idx = torch.max(outputs5, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets5)

        nb_tr_steps += 1
        nb_tr_examples += targets5.size(0)

        if i % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

    if nb_tr_steps == 0:
        # Nothing was trained; avoid dividing by zero in the summary below.
        print(f'No training batches were produced for epoch {epoch}')
        return

    # Decay the learning rate once per epoch. Stepping the 0.95**k schedule on
    # every batch shrinks the learning rate to ~0 within a few hundred steps.
    scheduler.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Number of full passes over the paired training loaders
EPOCHS = 30
for epoch in range(EPOCHS):
    train(epoch)

1it [00:00,  3.03it/s]

Training Loss per 5000 steps: 1.6031367778778076
Training Accuracy per 5000 steps: 0.0


105it [00:33,  3.15it/s]

In [36]:
def valid(model, testing_loader):
    """Evaluate the 5-class head on a labelled loader.

    Args:
        model: module returning ``(sst5_logits, sst2_logits)``; only the first
            output is scored.
        testing_loader: loader whose batches contain ``'ids'``, ``'mask'``,
            ``'token_type_ids'`` and ``'targets'``. If its dataset was built
            with ``is_train=False`` (no targets), a warning is issued and the
            global ``validating_loader`` is used instead, which is what this
            function previously always iterated.

    Returns:
        Accuracy in percent over all evaluated examples (0.0 if the loader
        yields no batches).
    """
    import warnings

    loader = testing_loader
    if getattr(getattr(loader, 'dataset', None), 'is_train', True) is False:
        warnings.warn(
            "valid() received an unlabelled loader; falling back to validating_loader"
        )
        loader = validating_loader

    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for i, data in tqdm(enumerate(loader, 0)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            outputs, _ = model(ids, mask, token_type_ids)

            loss = loss_fc_sst5(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if i % 100 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")

    if nb_tr_steps == 0:
        # Empty loader: report it instead of dividing by zero.
        print("Validation loader produced no batches")
        return 0.0

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [37]:
# Validation needs labels, so pass the labelled validation loader
# (`testing_loader` is built with is_train=False and carries no targets).
valid(model, validating_loader)

8it [00:00, 38.07it/s]

Validation Loss per 100 steps: 1.0383436679840088
Validation Accuracy per 100 steps: 75.0


109it [00:02, 40.41it/s]

Validation Loss per 100 steps: 1.4367505081809393
Validation Accuracy per 100 steps: 39.35643564356435


204it [00:05, 40.37it/s]

Validation Loss per 100 steps: 1.4502054865087444
Validation Accuracy per 100 steps: 37.81094527363184


244it [00:06, 40.23it/s]


KeyboardInterrupt: 

In [43]:
# Predicted class index from the model's first (5-class) output for every row
# of the unlabelled test loader, which is not shuffled.
predictions = get_predictions(model, testing_loader)
# submission = pd.DataFrame({'Id' : range(len(predictions)), 'Category' : predictions})
# submission.to_csv('submission.csv', index=False)

In [77]:
# Persist the fine-tuned model and the tokenizer vocabulary to the working directory.
# NOTE(review): the model cell loads 'pytorch_roberta_sentiment.bin', which is not
# the file name written here - confirm which checkpoint is meant to be restored.
output_model_file = 'pytorch_roberta_sentiment_plus_multi.pt'
output_vocab_file = './'

# Saves the whole module object (pickled), not just a state_dict.
torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
