In [None]:
!pip install -q ipywidgets
!jupyter nbextension enable --py widgetsnbextension
# from ipywidgets import FloatProgress

In [None]:
import os
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optimize
import transformers
from tqdm import tqdm
#import torchdata

### Set Up hyperparameters and file paths

In [2]:
# --- Data -------------------------------------------------------------------
# CSV file containing the Yelp reviews that are loaded into the dataset.
file_path = "/mnt/alluxio/fuse/yelp-review/yelp_review_sample_large.csv"

# --- Training / data loading ------------------------------------------------
num_epochs = 3      # full passes over the training split
batch_size = 32     # samples per DataLoader batch
num_workers = 8     # DataLoader worker processes

# --- Profiling --------------------------------------------------------------
# When enabled, a torch profiler writes TensorBoard traces to profiler_log_path.
profiler_log_path = "../log/nlp-demo"
profiler_enabled = False

In [3]:
# Suppress every warning of category UserWarning from this point on
# (this installs a process-wide filter, so it affects all later cells).
warnings.filterwarnings("ignore", category=UserWarning)

### Checking device used for training

In [None]:
# Pick the training device: CUDA when PyTorch reports it available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

### Build and Preprocess Data Using torchdata.datapipe (Optional)

In [5]:
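# Alternative loader using torchdata.datapipes, written against torchdata version=0.3.0.
# Disabled reference code: it relies on a classify() helper like the one shown in the
# 0.6.0 variant further down in this cell.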

# datapipe = torchdata.datapipes.iter.FileLister(file_path).filter(
#     filter_fn=lambda filename: filename.endswith("large.csv"))

# datapipe = torchdata.datapipes.iter.FileOpener(datapipe, mode="rt")
# datapipe = datapipe.parse_csv(delimiter = ",", skip_lines = 1)

# _,_,_,score,_,_,_,text,_ = datapipe.unzip(sequence_length=9)
# map_score = score.map(classify)

# # Lower Reviews for BertTokenizer
# def uncase(x):
#     return x.lower()

# lower_text = text.map(uncase)
# clean_datapipe = lower_text.zip(map_score)

# Using torchdata.datapipe to load data, version=0.6.0 (tbd)
# datapipe = torchdata.datapipes.iter.FileLister(file_path).filter(
#     filter_fn=lambda filename: filename.endswith(".csv")
#     )
# datapipe = torchdata.datapipes.iter.FileOpener(datapipe, mode="rt")
# datapipe = datapipe.parse_csv(delimiter = ",", skip_lines = 1)
# N_rows = 500

# # Drop irrelevant cols
# r_datapipe = datapipe.drop([0,1,2,4,5,6,8])

# # Classify Stars: 1,2 -> negative; 3,4,5 -> positive
# score,text = r_datapipe.unzip(sequence_length=2)
# def classify(x):
#     if int(x) >2:
#         return 1
#     else:
#         return 0
# map_score = score.map(classify)

# # Lower Reviews for BertTokenizer
# def uncase(x):
#     return x.lower()

# lower_text = text.map(uncase)
# clean_datapipe = lower_text.zip(map_score)

### Build and Preprocess Data Using Dataset and Dataloader

In [7]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

class YelpDataset(Dataset):
    """Map-style dataset of Yelp reviews tokenized for binary sentiment classification.

    The CSV is loaded fully into memory with ``pandas.read_csv``. Columns are
    addressed by position: the star rating is read from ``STARS_COLUMN`` and
    the review text from ``TEXT_COLUMN``.

    Args:
        tokenizer: Object exposing ``encode_plus`` (e.g. ``transformers.BertTokenizer``).
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like object).
        max_length: Number of tokens every review is padded / truncated to.
    """

    # Positional indices of the columns read from the CSV.
    TEXT_COLUMN = 7
    STARS_COLUMN = 3
    # Ratings strictly above this value are positive: 1, 2 -> 0; 3, 4, 5 -> 1.
    POSITIVE_THRESHOLD = 2

    def __init__(self, tokenizer, file_path, max_length):
        super().__init__()

        self.df = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews (rows) in the loaded CSV."""
        return len(self.df)

    @classmethod
    def classify(cls, stars):
        """Map a star rating to a binary label: 1 (positive) or 0 (negative)."""
        return 1 if int(stars) > cls.POSITIVE_THRESHOLD else 0

    def __getitem__(self, index):
        """Return the tokenized review and its label at row ``index``.

        Returns:
            dict with ``torch.long`` tensors under the keys ``"encoded_text"``,
            ``"label"``, ``"mask"`` and ``"token_type_ids"``.
        """
        text = self.df.iloc[index, self.TEXT_COLUMN]
        if not isinstance(text, str):
            # pandas yields NaN (a float) for an empty cell; treat it as empty text
            # instead of crashing on ``.lower()``.
            text = "" if pd.isna(text) else str(text)
        # Lower-case the review for the uncased BERT vocabulary.
        text = text.lower()

        label = self.classify(self.df.iloc[index, self.STARS_COLUMN])

        # Tokenize, pad and encode the review.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            max_length=self.max_length,
            padding="max_length",
            add_special_tokens=True,
            return_attention_mask=True,
            truncation=True,
        )

        return {
            "encoded_text": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }

# Build Dataset with Bert Tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
dataset = YelpDataset(tokenizer, file_path, max_length=128)

# Split Dataset into train / valid / test at 80% / 10% / 10%.
# Sizes are derived from the actual row count (8000/1000/1000 for 10000 rows)
# so the split does not raise when the CSV has a different number of rows;
# the test split absorbs any rounding remainder.
n_total = len(dataset)
n_train = n_total * 8 // 10
n_valid = n_total // 10
n_test = n_total - n_train - n_valid

# A seeded generator makes the split reproducible across kernel restarts.
train_ds, valid_ds, test_ds = random_split(
    dataset,
    [n_train, n_valid, n_test],
    generator=torch.Generator().manual_seed(42),
)

# Build DataLoaders for the train and valid splits (test_ds is held out; no
# loader is built for it in this cell).
train_dl = DataLoader(dataset=train_ds, num_workers=num_workers, batch_size=batch_size, shuffle=True)

# Validation only accumulates a loss, so sample order is irrelevant: no shuffling.
valid_dl = DataLoader(dataset=valid_ds, num_workers=num_workers, batch_size=batch_size, shuffle=False)

### Set up the model from pretrained BERT for sequence classification

In [10]:
class BERT(nn.Module):
    """Thin wrapper around ``BertForSequenceClassification`` with two output labels.

    The pretrained ``bert-base-uncased`` weights are loaded when the module is
    constructed.
    """

    def __init__(self):
        super().__init__()
        self.encoder = transformers.BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=2,
            return_dict=True,
        )

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Run the encoder and return ``(loss, logits)``.

        Args:
            input_ids: Token ids passed positionally to the encoder.
            attention_mask: Forwarded as ``attention_mask``.
            token_type_ids: Forwarded as ``token_type_ids``.
            labels: Optional targets. When omitted, ``loss`` is ``None``.

        Returns:
            Tuple ``(loss, logits)`` taken from the encoder output.
        """
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        # Read the fields by name: slicing the output as a tuple drops a missing
        # loss, which breaks two-value unpacking when no labels are given.
        return outputs.loss, outputs.logits

# Save model
def save_checkpoints(path, model, valid_loss):
    """Save the model weights together with the validation loss they achieved.

    Args:
        path: Destination passed to ``torch.save``. ``None`` disables saving.
        model: Module whose ``state_dict()`` is stored under ``"model_state_dict"``.
        valid_loss: Value stored under ``"valid_loss"``.

    Returns:
        None.
    """
    # Identity check: ``== None`` would go through a custom ``__eq__`` if the
    # path object defines one.
    if path is None:
        return

    checkpoint = {
        "model_state_dict": model.state_dict(),
        "valid_loss": valid_loss,
    }
    torch.save(checkpoint, path)
    print(f"Model saved to {path}")
    
    
# Load Model
# def load_checkpoints(path, model):
#     if path == None:
#         return
#     state_dict = torch.load(path, map_location=device)
#     print(f"Model loaded from {path}")
    
#     model.load_state_dict(state_dict["model_state_dict"])
#     return state_dict["valid_loss"]

In [12]:
# Instantiate the classifier on the selected device, then give its parameters to Adam.
model = BERT().to(device)
optimizer = optimize.Adam(
    params=model.parameters(),
    lr=1e-6,
    weight_decay=0.01,
)

### Set Up Training Loops

In [15]:
def _batch_loss(model, batch):
    """Move one collated batch to the global ``device`` and return the model's loss for it."""
    input_ids = batch["encoded_text"].to(device)
    # unsqueeze(1) inserts a size-1 dimension at position 1 of the collated labels.
    labels = batch["label"].unsqueeze(1).to(device)
    attention_mask = batch["mask"].to(device)
    token_type_ids = batch["token_type_ids"].to(device)

    output = model(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        labels=labels,
    )
    # The model returns (loss, logits); only the loss is needed here.
    return output[0]


def train(
    num_epoch = num_epochs,
    train_dl = train_dl,
    valid_dl = valid_dl, 
    model = model, 
    optimizer = optimizer, 
    criterion = nn.BCELoss(), 
    file_path = file_path
):
    """Fine-tune ``model`` and checkpoint it whenever the validation loss improves.

    Relies on the notebook globals ``device``, ``profiler_enabled``, ``profiler``
    and ``save_checkpoints``.

    Args:
        num_epoch: Number of passes over ``train_dl``.
        train_dl: Iterable of training batches (dicts with the keys
            ``"encoded_text"``, ``"label"``, ``"mask"``, ``"token_type_ids"``).
        valid_dl: Iterable of validation batches with the same keys.
        model: Module called as ``model(ids, attention_mask=, token_type_ids=,
            labels=)`` whose first output is the loss.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Accepted for interface compatibility; unused, because the
            loss is taken from the model output.
        file_path: Where to write ``model.pt``. A directory is used as is; if
            it is an existing file (e.g. the dataset CSV), the checkpoint goes
            next to that file. ``None`` disables checkpointing.

    Returns:
        Tuple ``(total_train_loss, total_valid_loss)``: per-epoch average losses.
    """
    # Resolve the checkpoint location once. Appending "/model.pt" to a *file*
    # path (the CSV) would produce a path that cannot be written.
    if file_path is None:
        checkpoint_path = None
    else:
        checkpoint_dir = os.path.dirname(file_path) if os.path.isfile(file_path) else file_path
        checkpoint_path = os.path.join(checkpoint_dir, "model.pt")

    start_time = time.perf_counter()
    print(f"Started training at the timestamp {start_time}")

    # Per-epoch history and best validation loss seen so far.
    total_train_loss = []
    total_valid_loss = []
    lowest_loss = float("Inf")

    for epoch in range(num_epoch):

        # Training loop. Training mode must be re-entered every epoch, because
        # the evaluation phase below leaves the model in eval mode.
        model.train()
        train_loss = 0.0
        for item in tqdm(train_dl, leave = True):
            loss = _batch_loss(model, item)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation loop (no gradients, eval mode).
        model.eval()
        valid_loss = 0.0
        with torch.no_grad():
            for item in valid_dl:
                valid_loss += _batch_loss(model, item).item()

        avg_train_loss = train_loss / len(train_dl)
        avg_valid_loss = valid_loss / len(valid_dl)
        total_train_loss.append(avg_train_loss)
        total_valid_loss.append(avg_valid_loss)

        # Monitor training progress
        print("Epoch [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}"
              .format(epoch+1, num_epoch,
                      avg_train_loss, avg_valid_loss))

        # Save the model if the validation loss got lower. The average is used
        # so the stored value matches the metric reported above.
        if avg_valid_loss < lowest_loss:
            lowest_loss = avg_valid_loss
            save_checkpoints(checkpoint_path, model, lowest_loss)

        if profiler_enabled:
            profiler.step()

    end_time = time.perf_counter()

    print(f"Finished training at the timestamp {end_time}")
    print(f"Training time in {end_time - start_time:0.4f} seconds")

    if profiler_enabled:
        profiler.stop()
        print("The profiler is completed. Please open the TensorBoard to browse the metrics.")

    return total_train_loss, total_valid_loss


### Setup Profiler

In [13]:
# Create and start a torch profiler only when profiling is switched on;
# otherwise `profiler` stays None.
if profiler_enabled:
    profiler = torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(profiler_log_path),
    )
    profiler.start()
else:
    profiler = None

### Fine Tune the Model

In [None]:
# Launch fine-tuning; the returned per-epoch (train, valid) loss lists are
# this cell's displayed value.
train(
    num_epoch=num_epochs,
    train_dl=train_dl,
    valid_dl=valid_dl,
    model=model,
    optimizer=optimizer,
    criterion=nn.BCELoss(),
    file_path=file_path,
)