In [1]:
import torch
from torchtext import data
from torchtext import datasets
import transformers 
import torch.optim as optimize
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm import tqdm
import torchtext
import torchdata
import time
import warnings

In [2]:
file_path = "/Users/aowu/Downloads/yelp_test"
batch_size = 16
#num_workers = 2
num_epochs = 10
profiler_enabled = True
profiler_log_path = "/Users/aowu/Downloads/yelp_test_log"

In [3]:
warnings.filterwarnings("ignore", category=UserWarning)

### Checking device used for training

In [4]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device}")

Using mps


### Build and Preprocess use torchdata.datapipes

In [5]:
datapipe = torchdata.datapipes.iter.FileLister(file_path).filter(filter_fn=lambda filename: filename.endswith('.csv'))
datapipe = torchdata.datapipes.iter.FileOpener(datapipe, mode = 'rt')
datapipe = datapipe.parse_csv(delimiter = ',', skip_lines = 1)
N_rows = 500

# Drop irrelevant cols
r_datapipe = datapipe.drop([0,1,2,4,5,6,8])

# Classify Stars: 1,2 -> negative; 3,4,5 -> positive
score,text = r_datapipe.unzip(sequence_length=2)
def classify(x):
    if int(x) >2:
        return 1
    else:
        return 0
map_score = score.map(classify)

# Lower Reviews for BertTokenizer
def uncase(x):
    return x.lower()

lower_text = text.map(uncase)
clean_datapipe = lower_text.zip(map_score)

In [6]:
#clean_datapipe = lower_text.zip(map_score)

### Build Dataset and DataLoader using datapipes 

In [7]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

class yelpDataset(Dataset):
    def __init__(self, tokenizer, dp, max_length):
        super(yelpDataset, self).__init__()
        self.dp = dp
        self.tokenizer = tokenizer
        self.max_length=max_length
        
    def __len__(self):
        return len(list(self.dp))
    
    def __getitem__(self, index):
        
        text = list(self.dp)[index][0]
        #tokenize,pad and encode reviews
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            pad_to_max_length=True,
            max_length=self.max_length,
        )
        
        encoded_text = inputs["input_ids"]

        return {
            "encoded_text": torch.tensor(encoded_text, dtype=torch.long),
            "label": torch.tensor(list(self.dp)[index][1], dtype=torch.long)
            }

tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
dataset= yelpDataset(tokenizer,clean_datapipe, max_length=128)

# Split Dataset to train, valid and test
train_ds, valid_ds,test_ds = random_split(dataset,[0.8,0.1,0.1])

# Build DataLoader for train, valid and test Dataset
train_dl = DataLoader(dataset = train_ds, batch_size = batch_size, shuffle = True)

valid_dl = DataLoader(dataset = valid_ds, batch_size = batch_size, shuffle = True)

test_dl = DataLoader(dataset = test_ds, batch_size = batch_size, shuffle = True)

In [8]:
#train_ds[0]["encoded_text"]

In [9]:
#next(iter(train_dl))

### Build model from pretrained BertModel

In [10]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        model_name = "bert-base-uncased"
        self.encoder = transformers.BertForSequenceClassification.from_pretrained(model_name, num_labels = 2, return_dict = True)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
            position_ids=None, head_mask=None, labels=None):
        loss, logits = self.encoder(input_ids, labels=labels)[:2]
        
        return loss,logits

# Save model
def save_checkpoints(path, model, valid_loss):
    
    if path == None:
        return
    state_dict =  {"model_state_dict": model.state_dict(),
                  "valid_loss": valid_loss}
    
    torch.save(state_dict, path)
    print(f"Model saved to {path}")
    
    
# Load Model
def load_checkpoints(path, model):
    if path == None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f"Model loaded from {path}")
    
    model.load_state_dict(state_dict["model_state_dict"])
    return state_dict["valid_loss"]


In [11]:
# model = BERT()
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# labels = torch.tensor([1]).unsqueeze(0)
# outputs = model(**inputs, labels=labels)
# loss = outputs[0]
# logits = outputs[1]

In [12]:
#outputs

### Setup Profiler

In [13]:
profiler = None
if profiler_enabled:
    profiler = torch.profiler.profile(
        schedule=torch.profiler.schedule(
            wait=1, warmup=1, active=1, repeat=1
        ),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(
            profiler_log_path
        ),
        profile_memory = True
    )
    profiler.start()

### Fine Tune the Model

In [14]:
model = BERT()
model = model.to(device)
optimizer = optimize.Adam(model.parameters(), lr = 1e-6, weight_decay = 0.01)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
def train(num_epoch = num_epochs,train_dl=train_dl,valid_dl=valid_dl, model=model, optimizer=optimizer, criterion = nn.BCELoss(), file_path=file_path):
    
    start_time = time.perf_counter()
    print(f"Started training at the timestamp{start_time}")
          
    # Set up metrics
    train_loss = 0.0
    valid_loss = 0.0
    total_train_loss = []
    total_valid_loss = []
    lowest_loss = float("Inf")
    
    #training model loop
    model.train()
    for  epoch in range(num_epoch):
        print("epoch:"+ str(epoch))
        
        for item in tqdm(train_dl, leave = True):
            text = item["encoded_text"]
            text = text.to(device)
            label = item["label"]
            label = label.unsqueeze(1)
            label = label.to(device)
            
            output = model(text,labels = label)
            loss = output[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Update train loss
            train_loss += loss.item()
            
            # Evaluate
        model.eval()
        with torch.no_grad():                    

            # validation loop

            for item in valid_dl:
                text = item["encoded_text"]
                text = text.to(device)
                label = item["label"]
                label = label.unsqueeze(1)
                label = label.to(device)
                output = model(text,labels = label)
                loss = output[0]

                valid_loss += loss.item()
                
        avg_train_loss = train_loss / len(train_dl)
        avg_valid_loss = valid_loss / len(valid_dl)
        total_train_loss.append(avg_train_loss)
        total_valid_loss.append(avg_valid_loss)

        # Monitor training progress
        print("Epoch [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}"
              .format(epoch+1, num_epoch,
                      avg_train_loss, avg_valid_loss))

        # Save model if valid loss gets lower
        if lowest_loss > valid_loss:
            lowest_loss = valid_loss
            save_checkpoints(file_path + '/' + 'model.pt', model, lowest_loss)

        # Reset Metrics
        train_loss = 0.0
        valid_loss = 0.0
        
        if profiler_enabled:
            profiler.step()
    
    end_time = time.perf_counter()
   
    print(f"Started training at the timestamp{end_time}")
    print(f"Training time in {end_time - start_time:0.4f} seconds")
    
    if profiler_enabled:
        profiler.stop()
        print("The profiler is completed. Please open the TensorBoard to browse the metrics.")
    
    return total_train_loss, total_valid_loss




In [16]:
train(num_epoch = num_epochs, train_dl = train_dl, valid_dl = valid_dl, 
      model = model, optimizer = optimizer, criterion = nn.BCELoss(), file_path = file_path)

Started training at the timestamp173.718095291
epoch:0


  0%|          | 0/25 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 25/25 [00:46<00:00,  1.86s/it]


Epoch [1/10], Train Loss: 0.6265, Valid Loss: 0.5374
Model saved to /Users/aowu/Downloads/yelp_test/model.pt
epoch:1


100%|██████████| 25/25 [00:52<00:00,  2.11s/it]


Epoch [2/10], Train Loss: 0.5267, Valid Loss: 0.5724
epoch:2


100%|██████████| 25/25 [01:30<00:00,  3.63s/it]


Epoch [3/10], Train Loss: 0.4810, Valid Loss: 0.5442
epoch:3


100%|██████████| 25/25 [00:52<00:00,  2.10s/it]


Epoch [4/10], Train Loss: 0.4545, Valid Loss: 0.3953
Model saved to /Users/aowu/Downloads/yelp_test/model.pt
epoch:4


100%|██████████| 25/25 [00:55<00:00,  2.21s/it]


Epoch [5/10], Train Loss: 0.4376, Valid Loss: 0.5590
epoch:5


100%|██████████| 25/25 [01:01<00:00,  2.47s/it]


Epoch [6/10], Train Loss: 0.4279, Valid Loss: 0.3714
Model saved to /Users/aowu/Downloads/yelp_test/model.pt
epoch:6


100%|██████████| 25/25 [01:01<00:00,  2.47s/it]


Epoch [7/10], Train Loss: 0.4179, Valid Loss: 0.3691
Model saved to /Users/aowu/Downloads/yelp_test/model.pt
epoch:7


100%|██████████| 25/25 [01:00<00:00,  2.43s/it]


Epoch [8/10], Train Loss: 0.4110, Valid Loss: 0.3537
Model saved to /Users/aowu/Downloads/yelp_test/model.pt
epoch:8


100%|██████████| 25/25 [01:01<00:00,  2.47s/it]


Epoch [9/10], Train Loss: 0.4046, Valid Loss: 0.6076
epoch:9


100%|██████████| 25/25 [01:02<00:00,  2.49s/it]


Epoch [10/10], Train Loss: 0.3984, Valid Loss: 0.3346
Model saved to /Users/aowu/Downloads/yelp_test/model.pt
Started training at the timestamp1063.468643958
Training time in 889.7505 seconds
The profiler is completed. Please open the TensorBoard to browse the metrics.


([0.6265216660499573,
  0.5266794669628143,
  0.48095343708992006,
  0.4544847583770752,
  0.4376257574558258,
  0.42785451412200926,
  0.41793574929237365,
  0.4110111713409424,
  0.40463495552539824,
  0.3984425389766693],
 [0.5373750329017639,
  0.5723609253764153,
  0.5441929996013641,
  0.3952593356370926,
  0.5590322911739349,
  0.37139831483364105,
  0.36912816762924194,
  0.35369179397821426,
  0.6075593680143356,
  0.3346228748559952])