In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
from pathlib import Path
from torch.nn import Module
import torch.nn.functional as F
# Only log messages at ERROR level or above.
logging.basicConfig(level=logging.ERROR)

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
from torch import cuda
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

# Fixed seed for the torch and NumPy random number generators,
# so that repeated runs give the same result.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
# All input CSV files are read from <root>/.data/sentence-classification.
root_dir = Path('.')
data_dir = root_dir / '.data' / 'sentence-classification'

In [3]:
class SentimentData(Dataset):
    """Map-style dataset that tokenizes phrases for a sentiment classifier.

    Every item is a dict of tensors with the keys ``ids``, ``mask`` and
    ``token_type_ids``; when ``is_train`` is true the dict also carries a
    ``targets`` entry built from the row's ``Sentiment`` value.

    Args:
        dataframe: Table with a ``Phrase`` column and, when ``is_train`` is
            true, a ``Sentiment`` column. Rows are addressed by position, so
            the frame's index does not need to be 0..n-1.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length each encoded sequence is padded / truncated to.
        is_train: Whether items should include ``targets``.
    """

    def __init__(self, dataframe, tokenizer, max_len, is_train):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.is_train = is_train
        if self.is_train:
            self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional lookup: label-based ``self.text[index]`` raises KeyError
        # (or returns the wrong row) when the frame was sampled / filtered
        # without resetting its index.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        # ``padding`` / ``truncation`` replace the deprecated
        # ``pad_to_max_length`` flag and make truncation explicit, so every
        # item has exactly ``max_len`` tokens and batches can be stacked.
        # NOTE(review): these keyword names need transformers >= 3.0.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }
        if self.is_train:
            # Kept as float for compatibility with existing callers, which
            # cast to long before computing the loss.
            item['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)
        return item

def get_data_loader(df, tokenizer, max_len, batch_size, is_train, shuffle):
    """Build a ``DataLoader`` over a ``SentimentData`` view of ``df``.

    Args:
        df: DataFrame handed to ``SentimentData``.
        tokenizer: Tokenizer handed to ``SentimentData``.
        max_len: Sequence length handed to ``SentimentData``.
        batch_size: Number of items per batch.
        is_train: Forwarded to ``SentimentData``; controls whether items
            carry a ``targets`` entry.
        shuffle: Whether the loader shuffles the items.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    sentiment_dataset = SentimentData(df, tokenizer, max_len, is_train)
    loader_options = {'batch_size': batch_size, 'shuffle': shuffle}
    return DataLoader(sentiment_dataset, **loader_options)

def get_predictions(model, loader):
    """Run ``model`` over every batch of ``loader`` and return class probabilities.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Args:
        model: Callable module accepting ``input_ids``, ``attention_mask``
            and ``token_type_ids`` keyword arguments and returning one row
            of scores per input row.
        loader: Iterable of batches; each batch is a mapping with ``ids``,
            ``mask`` and ``token_type_ids`` tensors.

    Returns:
        A CPU tensor holding the softmax (over dim 1) of the concatenated
        model outputs.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model = model.eval()

    # Send the inputs to wherever the model's weights live. The previous
    # "CUDA is available" check broke when the model itself was kept on CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        target_device = first_param.device
    else:
        # Parameter-free model: fall back to the original heuristic.
        target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    batch_outputs = []
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['ids'].to(target_device),
                attention_mask=batch['mask'].to(target_device),
                token_type_ids=batch['token_type_ids'].to(target_device),
            )
            batch_outputs.append(outputs.detach().cpu())

    if not batch_outputs:
        raise ValueError('loader produced no batches; nothing to predict')

    # Concatenate once instead of round-tripping every row through NumPy.
    return F.softmax(torch.cat(batch_outputs, dim=0), dim=1)

In [4]:
# Defining some key variables that will be used later on in the training
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

max_len = 256
batch_size = 8
# EPOCHS = 1
LEARNING_RATE = 1e-05
train_valid_frac = 0.8

# Original dataset.
# Bound to `train_raw_df` rather than `train`: a later cell defines a function
# called `train`, and re-running this cell used to replace that function with
# a DataFrame.
train_raw_df = pd.read_csv(data_dir.joinpath('train_plus.csv'))
new_df = train_raw_df[['Phrase', 'Sentiment']]

# The train/valid split is currently disabled; the full frame is used for training.
# train_df=new_df.sample(frac=train_valid_frac,random_state=200)
# valid_df=new_df.drop(train_df.index).reset_index(drop=True)
# train_df = train_df.reset_index(drop=True)

# Merged Amazon data, renamed to the Phrase/Sentiment schema used by SentimentData.
train_amazon_df = (
    pd.read_csv(data_dir.joinpath('merged_250742_train_plus.csv'))
    .rename(columns={'Sentence': 'Phrase', 'Category': 'Sentiment'})
)

# Evaluation set: only the text column is kept. `rename` returns a new frame,
# which avoids the in-place rename on a column slice.
test_df = (
    pd.read_csv(data_dir.joinpath('eval_final_open.csv'))[['Sentence']]
    .rename(columns={'Sentence': 'Phrase'})
)

print('Dataset Configuration')
print('-'*25)
# NOTE: the fraction below is only reported; no split is applied while the
# split code above stays commented out.
print(f'Train/Valid = {train_valid_frac:.2f}/{1-train_valid_frac:.2f}')
print(f'Batch size = {batch_size}')
print('-'*25)
# print(f'Train set : {len(train_df)}')
# print(f'Valid set : {len(valid_df)}')
print(f'Test set : {len(test_df)}')

training_amazon_loader = get_data_loader(train_amazon_df, tokenizer, max_len, batch_size, True, True)
training_loader        = get_data_loader(new_df, tokenizer, max_len, batch_size, True, True)
# validating_loader = get_data_loader(valid_df, tokenizer, max_len, batch_size, True, True)
testing_loader         = get_data_loader(test_df, tokenizer, max_len, batch_size, False, False)

Dataset Configuration
-------------------------
Train/Valid = 0.80/0.20
Batch size = 8
-------------------------
Test set : 4311


In [5]:
class ActiveDropout(Module):
    """Dropout layer that is applied on every forward pass.

    ``forward`` never looks at ``self.training``, so units are dropped in
    eval mode as well as in train mode. Kept values are rescaled by
    ``1 / (1 - p)``.

    Args:
        p: Probability of zeroing an element; must lie in ``[0, 1]``.

    Raises:
        ValueError: If ``p`` is outside ``[0, 1]``.
    """

    def __init__(self, p=0.5):
        super().__init__()  # init the base class
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
        self.p = p

    def forward(self, input):
        if self.p == 1:
            # Everything is dropped; the general formula would divide by
            # zero here and yield NaN instead of zeros.
            return torch.zeros_like(input)
        # Keep an element when its uniform draw exceeds p.
        mask = torch.rand_like(input) > self.p
        # ``mask.to(input)`` casts the boolean mask to input's dtype/device.
        return input * mask.to(input) / (1 - self.p)

    def extra_repr(self):
        return f'p={self.p}'

In [6]:
class RobertaClass(torch.nn.Module):
    """``roberta-base`` encoder followed by a small classification head.

    The head is two 768->768 linear layers, each followed by ReLU and
    ``ActiveDropout(0.5)``, and a final 768->5 linear layer.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier1 = torch.nn.Linear(768, 768)
        self.pre_classifier2 = torch.nn.Linear(768, 768)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the 5-way classifier output for a batch of encoded inputs."""
        encoder_outputs = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, position 0 along dim 1
        # (presumably the hidden state of the leading special token).
        features = encoder_outputs[0][:, 0]

        # The activation and dropout are applied without registering
        # submodules in __init__: this notebook saves and loads whole pickled
        # model objects, and unpickling does not re-run __init__.
        for dense in (self.pre_classifier1, self.pre_classifier2):
            features = ActiveDropout(0.5)(torch.relu(dense(features)))

        return self.classifier(features)

In [11]:
# Set to False to start from a freshly initialised RobertaClass instead of the checkpoint.
load_model = True
if load_model:
    # SECURITY NOTE: torch.load unpickles a whole Python object; only load
    # checkpoint files you created yourself or otherwise trust.
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
    model = torch.load('DataPlus_ActDrop_Amazon.pt', map_location=device)
else:
    # Only build (and download) the pretrained encoder when it is actually needed.
    model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [12]:
# Loss: cross-entropy over the classifier outputs.
loss_function = torch.nn.CrossEntropyLoss()

# Optimizer: AdamW over every model parameter.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.01,
)

In [13]:
def calcuate_accuracy(preds, targets):
    """Count the positions at which ``preds`` equals ``targets``.

    Despite the name, this returns the number of matches, not a ratio.
    """
    matches = preds == targets
    return matches.sum().item()

In [14]:
# Defining the training function that runs one epoch of the RoBERTa classifier over a data loader

def train(epoch,data_loader):
    """Train the notebook-level ``model`` for one pass over ``data_loader``.

    Uses the notebook-level ``model``, ``loss_function``, ``optimizer`` and
    ``device``. Prints the running loss / accuracy every 5000 steps and a
    summary at the end of the pass.

    Args:
        epoch: Epoch number; only used in the printed summary.
        data_loader: Iterable of batches; each batch is a mapping with
            ``ids``, ``mask``, ``token_type_ids`` and ``targets`` tensors.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # tqdm wraps the loader itself (not the enumerate object) so it can read
    # the loader's length and show a progress bar with a total.
    for step, data in enumerate(tqdm(data_loader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # detach() instead of the legacy `.data` attribute: the predictions
        # are only used for bookkeeping, not for the backward pass.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if step%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_steps == 0 or nb_tr_examples == 0:
        # An empty loader would otherwise end in a ZeroDivisionError below.
        print(f'No training examples seen in Epoch {epoch}')
        return None

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return None

In [None]:
# Training pass on the merged Amazon data.
EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch=epoch, data_loader=training_amazon_loader)

In [None]:
# Save the whole model object (not just a state_dict) to disk.
output_model_file = 'DataPlus_ActDrop_Amazon.pt'
torch.save(model, output_model_file)

print('All files saved', 'This tutorial is completed', sep='\n')

In [22]:
# Training pass on the original training data (`training_loader`).
EPOCHS = 2
for epoch in range(EPOCHS):
    train(epoch=epoch, data_loader=training_loader)

1it [00:00,  6.39it/s]

Training Loss per 5000 steps: 0.19170083105564117
Training Accuracy per 5000 steps: 100.0


5001it [12:27,  6.69it/s]

Training Loss per 5000 steps: 0.47973216587625084
Training Accuracy per 5000 steps: 80.33643271345731


10001it [24:54,  6.69it/s]

Training Loss per 5000 steps: 0.492323935275128
Training Accuracy per 5000 steps: 79.5982901709829


10908it [27:10,  6.69it/s]
1it [00:00,  6.73it/s]

The Total Accuracy for Epoch 0: 79.44557133198107
Training Loss Epoch: 0.49511401315684506
Training Accuracy Epoch: 79.44557133198107
Training Loss per 5000 steps: 0.737583339214325
Training Accuracy per 5000 steps: 62.5


5001it [12:27,  6.68it/s]

Training Loss per 5000 steps: 0.43238158433120255
Training Accuracy per 5000 steps: 81.99360127974406


10001it [24:54,  6.71it/s]

Training Loss per 5000 steps: 0.44821476513329356
Training Accuracy per 5000 steps: 81.36811318868114


10908it [27:09,  6.69it/s]

The Total Accuracy for Epoch 1: 81.35134825408831
Training Loss Epoch: 0.4503095800556184
Training Accuracy Epoch: 81.35134825408831





In [28]:
# Save the whole model object under a separate file name.
output_model_file = 'DataPlus_ActDrop_Amazon_fine_8.pt'
torch.save(model, output_model_file)

print('All files saved', 'This tutorial is completed', sep='\n')

All files saved
This tutorial is completed


In [29]:
def MC_soft_predict(model,testing_loader,MC_num):
    """Predict classes by averaging softmax outputs over repeated passes.

    ``get_predictions`` is called ``MC_num`` times on the same loader and
    the resulting probabilities are averaged before taking the argmax.
    Repeated passes only differ when the model is stochastic at inference
    time (``RobertaClass`` applies ``ActiveDropout`` on every forward pass).

    Args:
        model: Model handed to ``get_predictions``.
        testing_loader: Loader handed to ``get_predictions``.
        MC_num: Number of passes to average; must be at least 1.

    Returns:
        A tensor with the index of the highest averaged probability for
        each row.

    Raises:
        ValueError: If ``MC_num`` is smaller than 1.
    """
    if MC_num < 1:
        # Previously MC_num == 0 divided the probabilities by zero.
        raise ValueError(f"MC_num must be at least 1, got {MC_num}")

    MC_pred = get_predictions(model, testing_loader)
    for _ in range(MC_num - 1):
        MC_pred += get_predictions(model, testing_loader)
    MC_pred = MC_pred / MC_num

    return torch.argmax(MC_pred, dim=1)

In [30]:
# Average 100 stochastic passes over the test loader and keep the winning class per row.
result = MC_soft_predict(model, testing_loader, MC_num=100)

KeyboardInterrupt: 

In [None]:
# Display the predicted class index for every row of the test set.
result

In [None]:
# `result` is a torch tensor. Convert it to a NumPy array before handing it to
# pandas, so the 'Category' column holds plain integers and does not depend on
# how pandas treats tensor objects.
submission = pd.DataFrame({'Id' : range(len(result)), 'Category' : np.asarray(result)})
submission.to_csv('submission.csv', index=False)

In [None]:
# Save the whole model object to disk.
# NOTE(review): this is the same file name the model-loading cell reads and the
# first save cell writes, so running this cell overwrites that checkpoint with
# the current model. Confirm that this is intended.
output_model_file = 'DataPlus_ActDrop_Amazon.pt'
torch.save(model, output_model_file)

print('All files saved', 'This tutorial is completed', sep='\n')

# train dil — repeatedly re-train on the test phrases, using the model's own predictions as labels

In [None]:
dil_iter = 5     # number of predict -> re-train rounds
MC_iter  = 100   # stochastic passes averaged per prediction
EPOCHS = 10      # training epochs per round

# Initial labels for the test phrases come from the current model.
# `MC_predict` is not defined anywhere in this notebook; `MC_soft_predict`
# is the function with this signature, so it is called here.
result = MC_soft_predict(model,testing_loader,MC_iter)
for i in range(dil_iter):
    # Convert the prediction tensor to NumPy first so pandas gets plain integers.
    result_series = pd.Series(np.asarray(result))
    dil_train = pd.DataFrame(test_df["Phrase"])
    dil_train["Sentiment"] = result_series

    dil_loader   = get_data_loader(dil_train, tokenizer, max_len, batch_size, True, True)

    for epoch in range(EPOCHS):
        train(epoch,dil_loader)

    # Re-predict with the updated model; this feeds the next round and the
    # submission file for this round.
    result = MC_soft_predict(model,testing_loader,MC_iter)

    submission = pd.DataFrame({'Id' : range(len(result)), 'Category' : np.asarray(result)})
    submission.to_csv('submission_T'+str(i)+'.csv', index=False)

In [None]:
# NOTE(review): this cell re-defines MC_soft_predict, which an earlier cell of
# this notebook already defines; one of the two definitions can be removed.
def MC_soft_predict(model,testing_loader,MC_num):
    """Predict classes by averaging softmax outputs over repeated passes.

    ``get_predictions`` is called ``MC_num`` times on the same loader and
    the resulting probabilities are averaged before taking the argmax.

    Args:
        model: Model handed to ``get_predictions``.
        testing_loader: Loader handed to ``get_predictions``.
        MC_num: Number of passes to average; must be at least 1.

    Returns:
        A tensor with the index of the highest averaged probability for
        each row.

    Raises:
        ValueError: If ``MC_num`` is smaller than 1.
    """
    if MC_num < 1:
        # Previously MC_num == 0 divided the probabilities by zero.
        raise ValueError(f"MC_num must be at least 1, got {MC_num}")

    MC_pred = get_predictions(model, testing_loader)
    for _ in range(MC_num - 1):
        MC_pred += get_predictions(model, testing_loader)
    MC_pred = MC_pred / MC_num

    return torch.argmax(MC_pred, dim=1)