In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset

from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup

In [2]:
for dirname, _, filenames in os.walk('./dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./dataset/test.csv
./dataset/train.csv


In [3]:
def preprocessing(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "http", regex=True)
    df[text_field] = df[text_field].str.replace(r"http", "http", regex=True)
    df[text_field] = df[text_field].str.replace(r"@\S+", "@user", regex=True)
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ", regex=True)
    df[text_field] = df[text_field].str.replace(r"@", "at", regex=True)
    df[text_field] = df[text_field].str.lower()
    return df

In [4]:
class TwitterDataset(Dataset):
    def __init__(self, text_list, label_list):
        assert (len(text_list) == len(label_list))

        self.text_list = text_list
        self.label_list = label_list

    def __len__(self):
        return len(self.text_list)
    
    def __getitem__(self, idx):
        return self.text_list[idx], self.label_list[idx]

In [5]:
class Collator():
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        text_list, label_list = zip(*batch)
        
        batch = self.tokenizer(text_list, 
                            max_length=self.max_length, 
                            padding='max_length', 
                            return_tensors='pt')

        output = dict(input_ids=batch['input_ids'],
                      attention_mask=batch['attention_mask'],
                      labels=torch.tensor(label_list, dtype=torch.int64))
        
        return output

In [6]:
def train_model(net, dataloader_dict, criterion, optimizer, num_epochs, scheduler):
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)
    
    torch.backends.cudnn.benchmark = True

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 20)
    
        for phase in ["train", "val"]:
            if phase == 'train':
                net.train()
            else:
                net.eval()

            epoch_loss = 0.0
            epoch_corrects = 0

            if (epoch == 0) and (phase == "train"):
                continue

            for minibatch in dataloader_dict[phase]:
                inputs = minibatch['input_ids'].to(device)
                labels = minibatch['labels'].to(device)
                attention_mask = minibatch['attention_mask'].to(device)
                
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    logits = net(inputs, attention_mask).logits
                    probs = torch.softmax(logits, dim=-1)
                    preds = torch.argmax(probs, dim=-1)

                    loss = criterion(logits, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)

            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

In [7]:
def get_dataloader(train_data, valid_data, batch_size, collate_fn):
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    dataloader_dict = {"train": train_dataloader, "val": valid_dataloader}

    return dataloader_dict

In [8]:
DATA_PATH = './dataset/train.csv'
VALID_RATIO = 0.2
RANDOM_SEED = 119
BATCH_SIZE = 128
NUM_EPOCHS = 10
MAX_LEN = 150
LEARNING_RATE = 5e-5
WARMUP_RATIO = 0.2
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
MODEL_PATH = './model/tweet-bert-base-uncased.pth'

In [9]:
def main():
    data = pd.read_csv(DATA_PATH)
    data = preprocessing(data, 'text')

    data_text, data_labels = data['text'].to_list(), data['target'].to_list()
    train_texts, val_texts, train_labels, val_labels = train_test_split(data_text, data_labels, test_size=VALID_RATIO, random_state=RANDOM_SEED)

    num_labels = len(set(train_labels))

    train_data = TwitterDataset(train_texts, train_labels)
    valid_data = TwitterDataset(val_texts, val_labels)

    num_training_step = (len(train_data)//BATCH_SIZE + 1) * NUM_EPOCHS
    num_warmup_steps = int(num_training_step * WARMUP_RATIO)

    tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL_NAME)

    dataloders_dict = get_dataloader(train_data, valid_data, BATCH_SIZE, Collator(tokenizer, MAX_LEN))

    model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=num_labels)
    optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss()
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_step)

    train_model(model, dataloders_dict, loss_fn, optimizer, NUM_EPOCHS, scheduler)

    torch.save(obj={"bert":model.state_dict(),
                    "tokenizer":tokenizer,
                    "num_labels":num_labels},
                    f=MODEL_PATH)    

In [10]:
main()

  df[text_field] = df[text_field].str.replace(r"http\S+", "http")
  df[text_field] = df[text_field].str.replace(r"@\S+", "@user")
  df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to

Epoch 1/10
--------------------
val Loss: 0.8776 Acc: 0.4386
Epoch 2/10
--------------------
train Loss: 0.6639 Acc: 0.6076
val Loss: 0.4690 Acc: 0.7919
Epoch 3/10
--------------------
train Loss: 0.4076 Acc: 0.8309
val Loss: 0.4157 Acc: 0.8181
Epoch 4/10
--------------------
train Loss: 0.3186 Acc: 0.8814
val Loss: 0.4311 Acc: 0.8227
Epoch 5/10
--------------------
train Loss: 0.2257 Acc: 0.9233
val Loss: 0.4279 Acc: 0.8253
Epoch 6/10
--------------------
train Loss: 0.1615 Acc: 0.9442
val Loss: 0.5412 Acc: 0.8227
Epoch 7/10
--------------------
train Loss: 0.1160 Acc: 0.9617
val Loss: 0.6807 Acc: 0.7984
Epoch 8/10
--------------------
train Loss: 0.0913 Acc: 0.9691
val Loss: 0.6593 Acc: 0.8221
Epoch 9/10
--------------------
train Loss: 0.0659 Acc: 0.9760
val Loss: 0.6902 Acc: 0.8227
Epoch 10/10
--------------------
train Loss: 0.0519 Acc: 0.9800
val Loss: 0.7215 Acc: 0.8168
