In [9]:
import torch
import torch.nn as nn
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

### Data

In [2]:
class YelpDataset(Dataset):
	"""Map-style dataset that tokenizes review text on access.

	Each item is a ``(features, label)`` pair where ``features`` is a dict
	holding the ``input_ids`` and ``attention_mask`` tensors produced by the
	tokenizer, each padded/truncated to ``max_length`` tokens.
	"""

	def __init__(self, data, tokenizer, max_length = None):
		"""Store the raw rows and the tokenizer.

		Args:
			data: Row container exposing ``num_rows`` and integer indexing
				that yields a mapping with ``'text'`` and ``'label'`` keys
				(e.g. a split returned by ``load_dataset``).
			tokenizer: Callable accepting ``padding``, ``truncation``,
				``max_length`` and ``return_tensors`` keyword arguments and
				returning a mapping with ``'input_ids'`` and
				``'attention_mask'``.
			max_length: Sequence length to pad/truncate to. ``None`` keeps
				the previous fixed default of 512.
		"""
		self.data = data
		self.tokenizer = tokenizer
		# Honour the caller's value; previously this argument was ignored.
		self.max_length = 512 if max_length is None else max_length

	def __len__(self):
		"""Return the number of rows in the underlying data."""
		return self.data.num_rows
	
	def __getitem__(self, idx):
		"""Tokenize row ``idx`` and return ``(features, label)``."""
		# Index the row first: selecting a whole column and then indexing it
		# can materialize every value of that column on each lookup.
		row = self.data[idx]
		text = row['text']
		label = row['label']

		inputs = self.tokenizer(text,
						  padding = "max_length",
						  truncation = True,
						  max_length = self.max_length,
						  return_tensors = "pt")
		
		# Drop only the leading batch dimension added by return_tensors="pt";
		# a bare squeeze() would also collapse a length-1 sequence.
		input_ids = inputs['input_ids'].squeeze(0)
		attention_mask = inputs['attention_mask'].squeeze(0) # mask pads.
		
		return {"input_ids": input_ids, "attention_mask": attention_mask}, label

In [8]:
BATCH_SIZE = 32

# Load the Yelp review dataset and the BERT tokenizer used to encode the text.
ds = load_dataset("Yelp/yelp_review_full")
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

train_dataset = YelpDataset(ds['train'], tokenizer)
test_dataset = YelpDataset(ds['test'], tokenizer)

# Shuffle only the training split; keeping the test split in a fixed order
# makes evaluation runs reproducible and comparable batch-for-batch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Encoder-Only Transformer

In [None]:
class Classifier(nn.Module):
	"""Encoder-only transformer classifier building blocks.

	Holds a token embedding table and a single transformer encoder layer
	whose model width matches the embedding size.
	"""

	def __init__(self, word_embed_size = 800, nhead = 8, vocab_size = 35000):
		"""Create the embedding table and the encoder layer.

		Args:
			word_embed_size: Embedding dimension, also used as the encoder
				layer's ``d_model``.
			nhead: Number of attention heads in the encoder layer.
			vocab_size: Number of rows in the embedding table.
		"""
		super(Classifier, self).__init__()
		self.embedding = nn.Embedding(vocab_size, word_embed_size)

		# nhead is a required argument of nn.TransformerEncoderLayer; omitting
		# it raises TypeError as soon as the module is constructed.
		# NOTE(review): batch_first is left at its default, so the layer
		# takes (seq, batch, feature) input — confirm against the forward pass.
		self.encoder_layer = nn.TransformerEncoderLayer(
			d_model = word_embed_size,
			nhead = nhead,
		)