In [1]:
import pandas as pd
# /kaggle/input/natural-language-processing-with-disaster-tweets
def path(file):
    return f"../input/natural-language-processing-with-disaster-tweets/{file}.csv"
train = pd.read_csv(path("train"))
valid = pd.read_csv(path("valid"))
test_df = pd.read_csv(path("test"))

In [2]:
train.shape, valid.shape, test_df.shape

((6089, 9), (1524, 9), (3263, 7))

In [3]:
import torch

# Preliminaries
from torchtext.transforms import BERTTokenizer, VocabTransform, ToTensor, Sequential, Truncate, PadTransform
from torchtext.vocab import vocab
from torch.utils.data import Dataset, DataLoader

# Models
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training
import torch.optim as optim
import warnings
warnings.filterwarnings("ignore")

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


<div style="border-radius:10px;
            padding: 15px;
            background-color:#c0deed;">

- Chuẩn bị data và pipeline cho mô hình, gồm có: 
    - Dataloader
    - Tokenize
    - Vocab transform
    - Chuyển thành tensor

In [4]:
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# # Model parameter
# MAX_SEQ_LEN = 128
# PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# # Transforms
# vocab_path = "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt"
# label_transform = ToTensor(dtype=torch.float) 
# vo = vocab(tokenizer.vocab)
# vo.set_default_index(0)
# text_transform = Sequential(BERTTokenizer(vocab_path=vocab_path), VocabTransform(vo), 
#                             Truncate(max_seq_len=MAX_SEQ_LEN),ToTensor(),
#                             PadTransform(max_length=MAX_SEQ_LEN, pad_value=0),)

# class TextClassificationDataset(Dataset): 
#     def __init__(self, path): 
#         super().__init__()
#         self.df = pd.read_csv(path) 
#         self.label_col = 'target' 
#         self.text_col = 'keyword_text'

#     def __getitem__(self, index):
#         label = [self.df.loc[index, self.label_col].tolist()]
#         text = self.df.loc[index, self.text_col]
#         label = label_transform(label)
#         text = text_transform(text)[:MAX_SEQ_LEN]
#         return label, text

#     def __len__(self):
#         return len(self.df)
    
# # datasets
# train_dataset = TextClassificationDataset(path("train")) 
# valid_dataset = TextClassificationDataset(path("valid")) 
# test_dataset = TextClassificationDataset(path("test"))

# # Data loader
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True) 
# valid_loader = DataLoader(valid_dataset, batch_size=16) 
# test_loader = DataLoader(test_dataset, batch_size=16)

In [5]:
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
tokenizer.decode(tokenizer.encode("This is example of tokenizer."))

'[CLS] this is example of tokenizer. [SEP]'

In [7]:
tokenizer.sep_token

'[SEP]'

In [8]:
# MAX_SEQ_LEN = 32
MAX_SEQ_LEN = 124

from tqdm.auto import tqdm
tqdm.pandas()

class SiameseDataset(Dataset):

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.df["encoded"] = self.df.keyword.fillna("") + f" {tokenizer.sep_token} " + self.df.text.fillna("")
        self.encoded = tokenizer.batch_encode_plus(list(df.encoded.apply(lambda x: x.replace("_"," ")).values), max_length=max_length, truncation=True)["input_ids"]
        # Check if the dataframe has the target column
        if hasattr(self.df, "target"):
            self.targets = self.df.target
        else:
            self.targets = None
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return {
            'ids': torch.tensor(self.encoded[index]), 
            # Return None if the targets are None
            'target': None if self.targets is None else torch.tensor(self.targets[index])
        }
pad_token_id = tokenizer.pad_token_id

def collate_fn(batch):
    ids = [x["ids"] for x in batch]
    targets = [x["target"] for x in batch]
    max_len = np.max([len(x) for x in ids])
    masks = []
    for i in range(len(ids)):
        if len(ids[i]) < max_len:
            ids[i]= torch.cat((ids[i], torch.tensor([pad_token_id,]*(max_len - len(ids[i])),dtype=torch.long)))
        masks.append(ids[i] != pad_token_id)
    # print(tokenizer.decode(ids[0]))
    # Check if the target is None
    if targets[0] is None:
        # Return only ids and masks
        outputs = {
            "ids": torch.vstack(ids),
            "masks": torch.vstack(masks)
        }
    else:
        # Return ids, masks and target as before
        outputs = {
            "ids": torch.vstack(ids),
            "masks": torch.vstack(masks),
            "target": torch.vstack(targets).view(-1)
        }
    return outputs

train_dataset = SiameseDataset(train, tokenizer, MAX_SEQ_LEN)
valid_dataset = SiameseDataset(valid, tokenizer, MAX_SEQ_LEN)
test_dataset = SiameseDataset(test_df, tokenizer, MAX_SEQ_LEN)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn) 
valid_loader = DataLoader(valid_dataset, batch_size=16, collate_fn=collate_fn) 
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

<div style="border-radius:10px;
            padding: 15px;
            background-color:#c0deed;">

- Xây dựng mô hình

In [9]:
# class BERT(nn.Module):
#     def __init__(self):
#         super(BERT, self).__init__()
#         options_name = 'bert-large-uncased'
#         self.model = BertForSequenceClassification.from_pretrained(
#                             options_name, hidden_dropout_prob=0.1)
    
#     def forward(self, input_ids, attention_mask, labels=None):
#         # Get the last hidden state from the BERT model
#         outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
#         # Get the loss and logits from the outputs
#         loss, logits = outputs[:2]
#         return loss, logits

In [10]:
# Save and Load functions
def save_checkpoint(save_path, model, valid_loss):
    if save_path is None:
        return
    
    state_dict = {
                     'model_state_dict': model.state_dict(),
                     'valid_loss': valid_loss
                 }
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    if load_path is None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']

def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    if save_path is None:
        return
    
    state_dict = {
                     'train_loss_list': train_loss_list,
                     'valid_loss_list': valid_loss_list,
                     'global_steps_list': global_steps_list
                 }
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')
   
def load_metrics(load_path):
    if load_path is None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    return state_dict['train_loss_list'], state_dict['valid_loss_list'],state_dict['global_steps_list']

In [11]:
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
def optimizer_scheduler(model, num_train_steps):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

    opt = AdamW(optimizer_parameters, lr=3e-5)
#     opt = AdamW(optimizer_parameters, lr=6e-6, eps = 1e-8)
    sch = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=int(0.05*num_train_steps),
        num_training_steps=num_train_steps,
        last_epoch=-1,
    )
    return opt, sch

In [12]:
device=torch.device("cuda:0")
model = BertForSequenceClassification.from_pretrained(
                             "bert-large-uncased", hidden_dropout_prob=0.1)
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [13]:
loss_fn = nn.BCEWithLogitsLoss()
# Training function
def train(model,
         criterion=nn.BCELoss(),
         train_loader=train_loader,
         valid_loader=valid_loader,
         num_epochs=5,
         eval_every=len(train_loader)//2,
         accumulation_steps=6,
         file_path="/kaggle/working",
         best_valid_loss=float('Inf')):
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []
    
    num_train_steps = len(train_loader) * num_epochs // accumulation_steps
    optimizer, scheduler = optimizer_scheduler(model, num_train_steps)

    # training loop
    model.train()
    for epoch in range(num_epochs):
        bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
        for step, data in bar:
            ids = data["ids"].to(device)
            masks = data["masks"].to(device)
            target = data["target"].to(device)
            res = model(ids, 
                        token_type_ids=None, 
                        attention_mask=masks, 
                        labels=target)
            loss = res.loss
            preds = res.logits
#             loss = loss_fn(preds, target.float())

            loss.backward()
            if (step + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
            
            # update running values
            running_loss += loss.item()
            global_step +=1 
            
#             bar.set_postfix(loss=loss.item())
            # evaluation step
            if global_step % eval_every == 5:
                model.eval()
                with torch.no_grad():
                    # validation loop
                    bar = tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False)
                    targets = []
                    all_preds = []
                    for step, data in bar:
                        ids = data["ids"].to(device)
                        masks = data["masks"].to(device)
                        target = data["target"].to(device)
                        res = model(ids, 
                                    token_type_ids=None, 
                                    attention_mask=masks, 
                                    labels=target)
                        loss = res.loss
                        preds = res.logits
                        
                        valid_running_loss += loss.item()
                    all_preds = np.array(all_preds)
                    targets = np.array(targets)
                    
#                 total = sum(valid["target"] == all_preds.argmax(dim=1))
#                 print(total/valid.shape[0])
                
                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / eval_every
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)
                
                # reset running values
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()
                
                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train loss: {:.4f}, Valid loss: {:.4f}'
                      .format(epoch + 1, num_epochs, global_step, num_epochs * len(train_loader), average_train_loss, average_valid_loss))
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    save_metrics(file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_steps_list)

# model = BERT().to(device)
train(model=model)

  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [1/5], Step [5/1905], Train loss: 0.0190, Valid loss: 0.3673
Model saved to ==> /kaggle/working/model.pt
Model saved to ==> /kaggle/working/metrics.pt


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [1/5], Step [195/1905], Train loss: 0.5779, Valid loss: 0.2309
Model saved to ==> /kaggle/working/model.pt
Model saved to ==> /kaggle/working/metrics.pt


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [2/5], Step [385/1905], Train loss: 0.4296, Valid loss: 0.2036
Model saved to ==> /kaggle/working/model.pt
Model saved to ==> /kaggle/working/metrics.pt


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [2/5], Step [575/1905], Train loss: 0.3465, Valid loss: 0.2030
Model saved to ==> /kaggle/working/model.pt
Model saved to ==> /kaggle/working/metrics.pt


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [3/5], Step [765/1905], Train loss: 0.3571, Valid loss: 0.2007
Model saved to ==> /kaggle/working/model.pt
Model saved to ==> /kaggle/working/metrics.pt


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [3/5], Step [955/1905], Train loss: 0.2801, Valid loss: 0.2063


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [4/5], Step [1145/1905], Train loss: 0.2841, Valid loss: 0.2665


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [4/5], Step [1335/1905], Train loss: 0.2533, Valid loss: 0.2297


  0%|          | 0/381 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [5/5], Step [1525/1905], Train loss: 0.2513, Valid loss: 0.2279


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [5/5], Step [1715/1905], Train loss: 0.2131, Valid loss: 0.2393


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch [5/5], Step [1905/1905], Train loss: 0.1995, Valid loss: 0.2434
Model saved to ==> /kaggle/working/metrics.pt


In [14]:
model.eval()
with torch.no_grad():
    # validation loop
    valid_running_loss = 0.0
    bar = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
    targets = []
    all_preds = []
    for step, data in bar:
        ids = data["ids"].to(device)
        masks = data["masks"].to(device)
#         target = data["target"].to(device)
        res = model(ids, masks)
        all_preds.extend(res.logits)
#         targets.extend(target)
        
    all_preds = np.array([tensor.cpu().numpy() for tensor in all_preds])
#     targets = np.array([tensor.cpu().numpy() for tensor in targets])

  0%|          | 0/204 [00:00<?, ?it/s]

In [15]:
all_preds.argmax(axis=1)

array([1, 1, 1, ..., 1, 1, 1])

In [16]:
perfect_score = pd.read_csv("/kaggle/input/perfectsubmission/perfect_submission.csv")
perfect_score["target"]

0       1
1       1
2       1
3       1
4       1
       ..
3258    0
3259    1
3260    1
3261    1
3262    1
Name: target, Length: 3263, dtype: int64

In [17]:
ground_truth = perfect_score["target"].values
(ground_truth == all_preds.argmax(axis=1)).sum() / len(ground_truth)

0.8283787925222188

In [18]:
perfect_score["target"] = all_preds.argmax(axis=1)

In [19]:
perfect_score.to_csv("/kaggle/working/submission02.csv", index=False)

<div style="border-radius:10px;
            padding: 15px;
            background-color:#c0deed;">

- Load model tốt nhất 

In [20]:
model_new = BertForSequenceClassification.from_pretrained(
                    "bert-large-uncased")

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly

In [21]:
load_checkpoint("/kaggle/working/model.pt", model_new)

Model loaded from <== /kaggle/working/model.pt


0.2006850636515178

In [22]:
model_new.to(device)
model_new.eval()
with torch.no_grad():
    # validation loop
    valid_running_loss = 0.0
    bar = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
    targets = []
    all_preds = []
    for step, data in bar:
        ids = data["ids"].to(device)
        masks = data["masks"].to(device)
#         target = data["target"].to(device)
        res = model_new(ids, masks)
        all_preds.extend(res.logits)
#         targets.extend(target)
        
    all_preds = np.array([tensor.cpu().numpy() for tensor in all_preds])
#     targets = np.array([tensor.cpu().numpy() for tensor in targets])

  0%|          | 0/204 [00:00<?, ?it/s]

In [23]:
perfect_score = pd.read_csv("/kaggle/input/perfectsubmission/perfect_submission.csv")
perfect_score["target"]

0       1
1       1
2       1
3       1
4       1
       ..
3258    0
3259    1
3260    1
3261    1
3262    1
Name: target, Length: 3263, dtype: int64

In [24]:
ground_truth = perfect_score["target"].values

In [25]:
(ground_truth == all_preds.argmax(axis=1)).sum() / len(ground_truth)

0.8384921851057309

In [26]:
perfect_score["target"] = all_preds.argmax(axis=1)

In [27]:
perfect_score.to_csv("/kaggle/working/submission03.csv", index=False)