In [1]:
!pip3 install torch
!pip3 install torchtext
!pip3 install transformers
!pip3 install tqdm
!pip3 install pathlib

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 8.6MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 48.5MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 50.3MB/s 
Installing c

In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
from pathlib import Path
logging.basicConfig(level=logging.ERROR)

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

root_dir = Path('/content/gdrive/My Drive')
data_dir = Path(root_dir,'dataset', 'sentence-classification')

Mounted at /content/gdrive


In [4]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, is_train):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.is_train = is_train
        if self.is_train:
            self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        if self.is_train:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.float)
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            }

def get_data_loader(df, tokenizer, max_len, batch_size, is_train, shuffle):
    dataset = SentimentData(
        df,
        tokenizer = tokenizer,
        max_len = max_len,
        is_train=is_train,
    )
    
    return DataLoader(
        dataset,
        shuffle = shuffle,
        batch_size=batch_size
    )

def get_predictions(model, loader):
    model = model.eval()
    
    predictions = []
    predictions_probs = []
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['ids']
            attention_mask = batch['mask']
            token_type_ids = batch["token_type_ids"]
            
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
                token_type_ids = token_type_ids.cuda()
                
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                token_type_ids = token_type_ids
            )
            
            _, preds = torch.max(outputs, dim=1)            
            
            predictions.extend(torch.argmax(outputs, dim=1))
    return torch.stack(predictions).cpu()

In [5]:
# Defining some key variables that will be used later on in the training
load_model = False
if load_model:
#     tokenizer = RobertaTokenizer.from_pretrained('./vocab.json',truncation=True, do_lower_case=True)
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)
else:
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

max_len = 256
batch_size = 32
# EPOCHS = 1
LEARNING_RATE = 1e-05
train_valid_frac = 0.8
#원래 데이터셋
train = pd.read_csv(data_dir.joinpath('train_plus.csv'))
new_df = train[['Phrase', 'Sentiment']]

train_df=new_df.sample(frac=train_valid_frac,random_state=200)
valid_df=new_df.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

test_df = pd.read_csv(data_dir.joinpath('eval_final_open.csv'))
test_df = test_df[['Sentence']]
test_df.rename(columns = {'Sentence': 'Phrase'}, inplace = True)



print(f'Dataset Configuration')
print(f'-'*25)
print(f'Train/Valid = {train_valid_frac:.2f}/{1-train_valid_frac:.2f}')
print(f'Batch size = {batch_size}')
print(f'-'*25)
print(f'Train set : {len(train_df)}')
print(f'Valid set : {len(valid_df)}')
print(f'Test set : {len(test_df)}')

training_loader   = get_data_loader(train_df, tokenizer, max_len, batch_size, True, True)
validating_loader = get_data_loader(valid_df, tokenizer, max_len, batch_size, True, True)
testing_loader    = get_data_loader(test_df, tokenizer, max_len, batch_size, False, False)


# training_loader   = DataLoader(training_set, **train_params)
# validating_loader = DataLoader(validating_set, **valid_params)
# testing_loader    = DataLoader(testing_set, **test_params)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…


Dataset Configuration
-------------------------
Train/Valid = 0.80/0.20
Batch size = 32
-------------------------
Train set : 69809
Valid set : 17452
Test set : 4311


In [6]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [7]:
model = RobertaClass()
if load_model:
    model = torch.load('pytorch_roberta_sentiment.bin')
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=501200538.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [8]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [9]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [10]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
#         print(ids.shape)
#         print(mask.shape)
#         print(token_type_ids.shape)
        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%500==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [11]:
EPOCHS = 10
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Loss per 5000 steps: 1.5986244678497314
Training Accuracy per 5000 steps: 15.625


500it [06:23,  1.30it/s]

Training Loss per 5000 steps: 1.1347068961033089
Training Accuracy per 5000 steps: 51.34730538922156


1000it [12:46,  1.30it/s]

Training Loss per 5000 steps: 1.0441551735470227
Training Accuracy per 5000 steps: 55.15422077922078


1500it [19:08,  1.31it/s]

Training Loss per 5000 steps: 1.0063535547272353
Training Accuracy per 5000 steps: 56.543554297135245


2000it [25:31,  1.31it/s]

Training Loss per 5000 steps: 0.980583309620872
Training Accuracy per 5000 steps: 57.62743628185907


2182it [27:50,  1.31it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 0: 57.98679253391396
Training Loss Epoch: 0.9727686630761699
Training Accuracy Epoch: 57.98679253391396
Training Loss per 5000 steps: 0.9222535490989685
Training Accuracy per 5000 steps: 68.75


500it [06:22,  1.31it/s]

Training Loss per 5000 steps: 0.8305612268919003
Training Accuracy per 5000 steps: 64.22779441117764


1000it [12:45,  1.31it/s]

Training Loss per 5000 steps: 0.8290742864082385
Training Accuracy per 5000 steps: 64.1702047952048


1500it [19:08,  1.30it/s]

Training Loss per 5000 steps: 0.8245879569941564
Training Accuracy per 5000 steps: 64.36125916055963


2000it [25:30,  1.30it/s]

Training Loss per 5000 steps: 0.8219589043980179
Training Accuracy per 5000 steps: 64.54272863568215


2182it [27:49,  1.31it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 1: 64.67790686014698
Training Loss Epoch: 0.8203177172502829
Training Accuracy Epoch: 64.67790686014698
Training Loss per 5000 steps: 0.5697252154350281
Training Accuracy per 5000 steps: 78.125


500it [06:22,  1.31it/s]

Training Loss per 5000 steps: 0.7459701458494107
Training Accuracy per 5000 steps: 68.10129740518963


1000it [12:45,  1.31it/s]

Training Loss per 5000 steps: 0.7498748934233225
Training Accuracy per 5000 steps: 68.05382117882118


1500it [19:08,  1.31it/s]

Training Loss per 5000 steps: 0.7483148081035792
Training Accuracy per 5000 steps: 68.18787475016656


2000it [25:31,  1.30it/s]

Training Loss per 5000 steps: 0.7494851565104851
Training Accuracy per 5000 steps: 68.13780609695152


2182it [27:50,  1.31it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 2: 68.1531034680342
Training Loss Epoch: 0.7487892456195641
Training Accuracy Epoch: 68.1531034680342
Training Loss per 5000 steps: 0.8473849296569824
Training Accuracy per 5000 steps: 65.625


500it [06:22,  1.30it/s]

Training Loss per 5000 steps: 0.6761481463314293
Training Accuracy per 5000 steps: 71.44461077844312


1000it [12:45,  1.31it/s]

Training Loss per 5000 steps: 0.6824366076366527
Training Accuracy per 5000 steps: 71.25062437562437


1500it [19:08,  1.31it/s]

Training Loss per 5000 steps: 0.6849802448342912
Training Accuracy per 5000 steps: 71.34410393071286


2000it [25:31,  1.31it/s]

Training Loss per 5000 steps: 0.6894555685372427
Training Accuracy per 5000 steps: 71.14411544227886


2182it [27:50,  1.31it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 3: 71.10974229683852
Training Loss Epoch: 0.6901872931927524
Training Accuracy Epoch: 71.10974229683852
Training Loss per 5000 steps: 0.5731616616249084
Training Accuracy per 5000 steps: 78.125


500it [06:22,  1.31it/s]

Training Loss per 5000 steps: 0.6251093455536398
Training Accuracy per 5000 steps: 74.02694610778443


1000it [12:45,  1.31it/s]

Training Loss per 5000 steps: 0.6332160628639854
Training Accuracy per 5000 steps: 73.84802697302698


1500it [19:08,  1.30it/s]

Training Loss per 5000 steps: 0.6387666884380369
Training Accuracy per 5000 steps: 73.58427714856762


2000it [25:31,  1.30it/s]

Training Loss per 5000 steps: 0.6434100304973656
Training Accuracy per 5000 steps: 73.43047226386807


2182it [27:51,  1.31it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 4: 73.44181982265897
Training Loss Epoch: 0.6433764500125191
Training Accuracy Epoch: 73.44181982265897
Training Loss per 5000 steps: 0.4698637127876282
Training Accuracy per 5000 steps: 81.25


500it [06:23,  1.31it/s]

Training Loss per 5000 steps: 0.5762203229282669
Training Accuracy per 5000 steps: 76.39096806387225


1000it [12:46,  1.31it/s]

Training Loss per 5000 steps: 0.5820744744279645
Training Accuracy per 5000 steps: 76.05207292707293


1500it [19:09,  1.30it/s]

Training Loss per 5000 steps: 0.5895170653307144
Training Accuracy per 5000 steps: 75.77864756828781


2000it [25:33,  1.30it/s]

Training Loss per 5000 steps: 0.5917821562942537
Training Accuracy per 5000 steps: 75.64030484757622


2182it [27:52,  1.30it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 5: 75.61059462246988
Training Loss Epoch: 0.5926309843862636
Training Accuracy Epoch: 75.61059462246988
Training Loss per 5000 steps: 0.44308388233184814
Training Accuracy per 5000 steps: 84.375


500it [06:23,  1.31it/s]

Training Loss per 5000 steps: 0.5378502317650351
Training Accuracy per 5000 steps: 78.1561876247505


1000it [12:46,  1.30it/s]

Training Loss per 5000 steps: 0.5399182770963197
Training Accuracy per 5000 steps: 78.06256243756243


1500it [19:10,  1.30it/s]

Training Loss per 5000 steps: 0.5454988297246283
Training Accuracy per 5000 steps: 77.85851099267155


2000it [25:34,  1.30it/s]

Training Loss per 5000 steps: 0.5518901445742311
Training Accuracy per 5000 steps: 77.53310844577712


2182it [27:53,  1.30it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 6: 77.55017261384636
Training Loss Epoch: 0.551533643972371
Training Accuracy Epoch: 77.55017261384636
Training Loss per 5000 steps: 0.5508717894554138
Training Accuracy per 5000 steps: 71.875


500it [06:23,  1.31it/s]

Training Loss per 5000 steps: 0.49174159955597685
Training Accuracy per 5000 steps: 80.06487025948104


1000it [12:46,  1.31it/s]

Training Loss per 5000 steps: 0.5037417939522645
Training Accuracy per 5000 steps: 79.56418581418582


1500it [19:10,  1.30it/s]

Training Loss per 5000 steps: 0.5073368648820364
Training Accuracy per 5000 steps: 79.40747834776815


2000it [25:33,  1.30it/s]

Training Loss per 5000 steps: 0.5104502599829737
Training Accuracy per 5000 steps: 79.25880809595202


2182it [27:53,  1.30it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 7: 79.20755203483792
Training Loss Epoch: 0.5116706181020501
Training Accuracy Epoch: 79.20755203483792
Training Loss per 5000 steps: 0.41529661417007446
Training Accuracy per 5000 steps: 87.5


500it [06:23,  1.30it/s]

Training Loss per 5000 steps: 0.46045799824054134
Training Accuracy per 5000 steps: 81.4184131736527


1000it [12:47,  1.31it/s]

Training Loss per 5000 steps: 0.46237455855507953
Training Accuracy per 5000 steps: 81.55906593406593


1500it [19:10,  1.31it/s]

Training Loss per 5000 steps: 0.46610841572503103
Training Accuracy per 5000 steps: 81.30204863424383


2000it [25:34,  1.30it/s]

Training Loss per 5000 steps: 0.4710275412670736
Training Accuracy per 5000 steps: 80.87987256371814


2182it [27:53,  1.30it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 8: 80.75033305161226
Training Loss Epoch: 0.47291549591912685
Training Accuracy Epoch: 80.75033305161226
Training Loss per 5000 steps: 0.3341645896434784
Training Accuracy per 5000 steps: 90.625


500it [06:23,  1.30it/s]

Training Loss per 5000 steps: 0.4119511867711644
Training Accuracy per 5000 steps: 83.42689620758483


1000it [12:46,  1.31it/s]

Training Loss per 5000 steps: 0.4169703117468498
Training Accuracy per 5000 steps: 83.08878621378621


1500it [19:10,  1.31it/s]

Training Loss per 5000 steps: 0.42684417270089214
Training Accuracy per 5000 steps: 82.57203530979348


2000it [25:33,  1.30it/s]

Training Loss per 5000 steps: 0.43300804061659687
Training Accuracy per 5000 steps: 82.1885932033983


2182it [27:53,  1.30it/s]

The Total Accuracy for Epoch 9: 82.0682147001103
Training Loss Epoch: 0.43452529173504856
Training Accuracy Epoch: 82.0682147001103





In [12]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(validating_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%100==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [13]:
valid(model,testing_loader)

1it [00:00,  3.71it/s]

Validation Loss per 100 steps: 1.0687732696533203
Validation Accuracy per 100 steps: 71.875


101it [00:26,  3.75it/s]

Validation Loss per 100 steps: 0.9885358444534906
Validation Accuracy per 100 steps: 66.24381188118812


201it [00:53,  3.76it/s]

Validation Loss per 100 steps: 1.0169922753946106
Validation Accuracy per 100 steps: 65.53171641791045


301it [01:20,  3.76it/s]

Validation Loss per 100 steps: 1.0201604673236708
Validation Accuracy per 100 steps: 65.53156146179403


401it [01:46,  3.76it/s]

Validation Loss per 100 steps: 1.0234481690084547
Validation Accuracy per 100 steps: 65.67955112219451


501it [02:13,  3.76it/s]

Validation Loss per 100 steps: 1.0224139360253683
Validation Accuracy per 100 steps: 65.7123253493014


546it [02:25,  3.76it/s]

Validation Loss Epoch: 1.0237794725340366
Validation Accuracy Epoch: 65.71166628466652





65.71166628466652

In [14]:
predictions = get_predictions(model, testing_loader)
submission = pd.DataFrame({'Id' : range(len(predictions)), 'Category' : predictions})
submission.to_csv('submission.csv', index=False)



In [15]:
output_model_file = 'roberta_large_model.pt'
# output_vocab_file = './'

torch.save(model, output_model_file)
# tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed
