<a href="https://colab.research.google.com/github/DanZter/BERT_TRAINING_sentiment_model/blob/master/BERT_TRAINING_sentiment_model_(GPU).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install -U scikit-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 7.0MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 16.2MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 44.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K 

In [3]:
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
from sklearn import model_selection
from scipy import stats
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")
# import logging
# logging.basicConfig(level=logging.ERROR)

In [5]:
# ---- Hyperparameters --------------------------------------------------------
MAX_LEN = 256              # token length passed to the tokenizer as max_length
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
# ACCUMULATION = 2

# ---- Locations on the mounted Google Drive ----------------------------------
BERT_PATH = "/content/drive/My Drive/Colab Notebooks/input/bert_base_uncased"
TRAINING_FILE = "/content/drive/My Drive/Colab Notebooks/input/IMDB Dataset.csv"
MODEL_PATH ="/content/drive/My Drive/Colab Notebooks/models/bert_sentiment_model.bin"

# ---- Shared tokenizer, loaded once from the local BERT directory ------------
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH,
    do_lower_case=True,
)


In [11]:
class BERTDataset:
    """Map-style dataset that encodes review text into fixed-length BERT inputs.

    Parameters
    ----------
    review : sequence
        Review texts, indexable by integer position.
    target : sequence
        Labels aligned with ``review`` (0 or 1 in this notebook).
    tokenizer : optional
        Object exposing ``encode_plus``. Defaults to the module-level
        ``TOKENIZER`` when omitted.
    max_len : int, optional
        Length every encoded sequence is truncated/padded to. Defaults to the
        module-level ``MAX_LEN`` when omitted.
    """

    def __init__(self, review, target, tokenizer=None, max_len=None):
        self.review = review
        self.target = target
        # The globals are only looked up when no override is supplied, so the
        # class can be used (and tested) without loading the real tokenizer.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.review)

    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns
        -------
        dict
            ``ids``, ``mask`` and ``token_type_ids`` as ``torch.long`` tensors
            and ``target`` as a ``torch.float`` tensor.
        """
        # str() guards against non-string cells; split/join collapses runs of
        # whitespace into single spaces.
        review = " ".join(str(self.review[item]).split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,                      # no second sentence
            add_special_tokens=True,   # adds the [CLS] / [SEP] tokens
            max_length=self.max_len,
            padding="max_length",      # replaces deprecated pad_to_max_length=True
            truncation=True,
        )
        # The tokenizer already pads to max_length, so no manual padding of
        # ids / mask / token_type_ids is needed here.

        # target is float because the loss used in this notebook is
        # BCEWithLogitsLoss; a cross-entropy loss over 2 outputs would need
        # torch.long instead.
        return {
            'ids': torch.tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.tensor(inputs["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'target': torch.tensor(self.target[item], dtype=torch.float)
        }

In [15]:
def loss_fn(outputs, target):
    """Binary cross-entropy on raw logits.

    ``target`` is reshaped to a single column so it lines up with the
    one-logit-per-sample ``outputs`` produced by the model.
    """
    column_target = target.view(-1, 1)
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, column_target)

def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training pass over ``data_loader``.

    Parameters
    ----------
    data_loader : iterable
        Yields dict batches with ``ids``, ``mask``, ``token_type_ids`` and
        ``target`` tensors.
    model : callable module
        Called as ``model(ids=..., mask=..., token_type_ids=...)``; put into
        training mode before the loop.
    optimizer
        Stepped once per batch.
    device
        Device every batch tensor is moved to.
    scheduler : optional
        Learning-rate scheduler stepped once per batch. May be omitted, in
        which case the learning rate is left to the optimizer alone.
    """
    model.train()

    for bi, d in enumerate(data_loader):
        # Move the batch to the target device with the dtypes the model/loss expect.
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        target = d["target"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        loss = loss_fn(outputs, target)
        loss.backward()

        optimizer.step()
        # scheduler is optional (default None); only step it when one was given.
        if scheduler is not None:
            scheduler.step()

        # Gradient accumulation is not implemented: the optimizer steps on
        # every batch.
        if bi % 10 == 0:
            print(f"batch_index={bi}, loss={loss.item()}")

def eval_fn(data_loader, model, device):
    """Score every batch of ``data_loader`` without tracking gradients.

    The model is switched to evaluation mode, called as
    ``model(ids=..., mask=..., token_type_ids=...)`` on each batch, and its raw
    outputs are passed through a sigmoid.

    Returns
    -------
    tuple of (list, list)
        ``(fin_outputs, fin_target)``: the sigmoid outputs and the targets,
        both converted to plain Python lists and concatenated across batches.
    """
    model.eval()
    collected_probs = []
    collected_targets = []
    input_keys = ("ids", "mask", "token_type_ids")

    with torch.no_grad():
        for batch in data_loader:
            # Inputs go to the device as integer tensors, labels as floats.
            model_inputs = {
                key: batch[key].to(device, dtype=torch.long) for key in input_keys
            }
            labels = batch["target"].to(device, dtype=torch.float)

            logits = model(**model_inputs)
            probs = torch.sigmoid(logits)

            collected_targets.extend(labels.cpu().detach().numpy().tolist())
            collected_probs.extend(probs.cpu().detach().numpy().tolist())

    return collected_probs, collected_targets

In [16]:
import transformers
import torch.nn as nn

class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head.

    The encoder weights are loaded from ``BERT_PATH``. The head emits one raw
    logit per sample (binary classification); switching to two outputs would
    also require changing the loss function.
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the loaded config rather than hard-coding 768, so
        # the class keeps working if BERT_PATH points at a different BERT size.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per sample for the given encoded batch."""
        bert_outputs = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # BERT returns two things:
        #   [0] last hidden state: one hidden vector per token, usable for
        #       max/average pooling;
        #   [1] pooler output: one hidden vector per sample.
        # Index instead of tuple-unpacking: unpacking a dict-like model output
        # would yield its keys rather than its tensors.
        pooled = bert_outputs[1]
        dropped = self.bert_drop(pooled)
        return self.out(dropped)

In [17]:
import torch
import pandas as pd
import numpy as np
# import torch.nn as nn       # for multi-gpu

# from model import BERTBaseUncased
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup


def run(save_best=False):
    """Fine-tune ``BERTBaseUncased`` on the reviews in ``TRAINING_FILE``.

    Steps: read the CSV, map the ``sentiment`` column to 1 ("positive") / 0
    (anything else), make a stratified 90/10 train/validation split, then train
    for ``EPOCHS`` epochs, printing validation accuracy after each one.

    Parameters
    ----------
    save_best : bool, optional
        When True, the model's ``state_dict`` is written to ``MODEL_PATH``
        every time validation accuracy improves. Defaults to False, which
        keeps the previous behaviour of not writing anything to disk.
    """
    dfx = pd.read_csv(TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values  # keep the positive/negative ratio equal in both splits
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,  # new batch order every epoch
        num_workers=4
    )

    valid_dataset = BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        num_workers=1
    )

    # Prefer the GPU, but fall back to CPU instead of failing when none exists.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(torch.cuda.is_available(), device)
    model = BERTBaseUncased().to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    # These groups, the decay and the learning rate are all tunable.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # train_fn steps the scheduler once per batch, so the schedule length is
    # the real number of batches per epoch (including a final partial batch).
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    # model = nn.DataParallel(model)              # converting to multi gpu model

    best_accuracy = 0
    for epoch in tqdm(range(EPOCHS), total=EPOCHS):
        print("X"*20, "EPOCH :", epoch, "X"*100)
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        outputs, target = eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5  # threshold sigmoid outputs into class predictions
        accuracy = metrics.accuracy_score(target, outputs)
        print(f"Accuracy score = {accuracy}")
        if accuracy > best_accuracy:
            if save_best:
                torch.save(model.state_dict(), MODEL_PATH)  # checkpoint only on improvement
            best_accuracy = accuracy


# run() returns None and selects its own device internally, so it is called
# on its own: chaining .to(device) would raise AttributeError once training ends.
run()

True cuda


  0%|          | 0/10 [00:00<?, ?it/s]

XXXXXXXXXXXXXXXXXXXX EPOCH : 0 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
batch_index=0, loss=0.630736231803894
batch_index=10, loss=0.5436104536056519
batch_index=20, loss=0.6880532503128052
batch_index=30, loss=0.7165029048919678
batch_index=40, loss=0.5221825242042542
batch_index=50, loss=0.612755537033081
batch_index=60, loss=0.4160301089286804
batch_index=70, loss=0.5801146626472473
batch_index=80, loss=0.10606885701417923
batch_index=90, loss=0.5991536974906921
batch_index=100, loss=0.29199567437171936
batch_index=110, loss=0.6438760757446289
batch_index=120, loss=0.4128738045692444
batch_index=130, loss=0.42486417293548584
batch_index=140, loss=0.13386133313179016
batch_index=150, loss=0.13998660445213318
batch_index=160, loss=0.09322542697191238
batch_index=170, loss=0.30200475454330444
batch_index=180, loss=0.5537563562393188
batch_index=190, loss=0.23777258396148682
batch_index=200, loss=0.6004219651222229
batch_index=

 10%|█         | 1/10 [1:06:17<9:56:41, 3977.94s/it]

Accuracy score = 0.914
XXXXXXXXXXXXXXXXXXXX EPOCH : 1 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
batch_index=0, loss=0.21776549518108368
batch_index=10, loss=0.07216081768274307
batch_index=20, loss=0.10269412398338318
batch_index=30, loss=0.014318596571683884
batch_index=40, loss=0.0632098838686943
batch_index=50, loss=0.03609033674001694
batch_index=60, loss=0.23828370869159698
batch_index=70, loss=0.0776645764708519
batch_index=80, loss=0.08358469605445862
batch_index=90, loss=0.17218410968780518
batch_index=100, loss=0.25186312198638916
batch_index=110, loss=0.06747061014175415


KeyboardInterrupt: ignored