In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.amp as amp
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.notebook import tqdm

In [2]:
checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

In [4]:
path = './spark_workspace/web_app/data/reviews.csv'

In [5]:
reviews_df = pd.read_csv(path, usecols=['reviewText', 'overall'], nrows=100000, keep_default_na=False)

In [6]:
class ReviewsDataset(Dataset):
    
    def __init__(self, df):
        super(ReviewsDataset, self).__init__()
        self.reviews = df.reviewText
        self.targets = df.overall
        
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        reviews = self.reviews.iloc[idx]
        target = self.targets.iloc[idx]-1
        return reviews, target

In [7]:
reviews_ds = ReviewsDataset(reviews_df)

In [8]:
class DataCollator:
    
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, data):
        
        reviews, targets = zip(*data)
        reviews = self.tokenizer(list(reviews),
                                 padding=True, 
                                 truncation=True, 
                                 return_tensors='pt')
        targets = torch.tensor(targets, dtype=torch.float32)
        return reviews, targets

In [9]:
data_collator = DataCollator(tokenizer)

In [13]:
batch_size = 128-16

In [14]:
reviews_dl = DataLoader(reviews_ds, batch_size=batch_size, collate_fn=data_collator)

In [15]:
N = len(reviews_df)

In [16]:
top_1_acc = 0
top_2_acc = 0
off_by_1 = 0

In [17]:
@torch.no_grad()
@amp.autocast()
def accuracies(model, dl, N):
    global top_1_acc
    global top_2_acc
    global off_by_1
    model.eval()
    for i, (reviews, targets) in tqdm(enumerate(dl), total=len(dl)):
        reviews = {k: v.to(device) for k, v in reviews.items()}
        targets = targets.unsqueeze(dim=-1).to(device)
        out = model(**reviews)
        _, top_2 = torch.topk(out.logits, 2, dim=-1)
        out = top_2 == targets
        off_by_1 += ((top_2[:, 0] == targets) | (top_2[:, 0]+1 == targets) | (top_2[:, 0]-1 == targets)).sum().item()
        top_1_acc += out[:, 0].sum().item()
        top_2_acc += out.sum().item()
    return top_1_acc/N, top_2_acc/N, off_by_1/N

In [18]:
top_1_acc, top_2_acc, off_by_1 = accuracies(model, reviews_dl, N)

HBox(children=(FloatProgress(value=0.0, max=893.0), HTML(value='')))




ValueError: too many values to unpack (expected 2)

In [20]:
top_1_acc/N

0.64508

In [21]:
top_2_acc/N

0.85822

In [22]:
off_by_1/N

79.79195

### Based on 4,000,000 reviews

In [16]:
top_1_acc

0.6472885

In [17]:
top_2_acc

0.86649025

## Based on 100,000 reviews

In [14]:
top_1_acc

0.64509

In [15]:
top_2_acc

0.85823