In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
import torch
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 9

In [3]:
dataset = load_dataset('yelp_review_full', split='train').shuffle(seed=seed).select(range(100))

In [4]:
zero_bart = pipeline(task="zero-shot-classification", model = "facebook/bart-large-mnli", device=device)

In [5]:
candidate_tags = ["Restaurants", "Bars", "Coffee Shops", "Hotels", "Salons/Barbershops", "Auto Repair", "Home Services", "Medical Services", "Entertainment", "Pet Services", "Financial Services", "Travel & Tourism", "Education", "Real Estate", "Fitness", "Landscaping & Gardening Services", "Legal Services", "Photography Services", "Childcare Services", "Computer & Technology Services"]
candidate_labels = range(1, 20)
candidate_dict = dict(zip(candidate_tags, candidate_labels))
candidate_dict

{'Restaurants': 1,
 'Bars': 2,
 'Coffee Shops': 3,
 'Hotels': 4,
 'Salons/Barbershops': 5,
 'Auto Repair': 6,
 'Home Services': 7,
 'Medical Services': 8,
 'Entertainment': 9,
 'Pet Services': 10,
 'Financial Services': 11,
 'Travel & Tourism': 12,
 'Education': 13,
 'Real Estate': 14,
 'Fitness': 15,
 'Landscaping & Gardening Services': 16,
 'Legal Services': 17,
 'Photography Services': 18,
 'Childcare Services': 19}

In [6]:
%%time
def zero_shot_classification(example, candidate_tags):
    zeroshot = zero_bart(example['text'], candidate_tags)
    return {'text': example['text'], 'zeroshot': zeroshot}

# Apply zero-shot classification to the dataset
zero_shot_bart = dataset.map(lambda example: zero_shot_classification(example, candidate_tags))

CPU times: user 2.65 s, sys: 2.63 s, total: 5.27 s
Wall time: 5.27 s


In [9]:
l = []
for _ in zero_shot_bart['zeroshot']:
    l.append(_['labels'][0])
l

['Restaurants',
 'Entertainment',
 'Entertainment',
 'Bars',
 'Auto Repair',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Hotels',
 'Restaurants',
 'Restaurants',
 'Travel & Tourism',
 'Restaurants',
 'Bars',
 'Auto Repair',
 'Restaurants',
 'Restaurants',
 'Entertainment',
 'Entertainment',
 'Bars',
 'Restaurants',
 'Bars',
 'Restaurants',
 'Restaurants',
 'Bars',
 'Bars',
 'Restaurants',
 'Restaurants',
 'Entertainment',
 'Travel & Tourism',
 'Restaurants',
 'Restaurants',
 'Entertainment',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Bars',
 'Home Services',
 'Restaurants',
 'Restaurants',
 'Entertainment',
 'Entertainment',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Restaurants',
 'Home Services',
 'Travel & Tourism',
 'Restaurants',
 'Coffee Shops',
 'Restaurants',
 'Restaurants',
 'Bars',
 'Restaurants',
 'Entertainment',
 'Entertainment',
 'Entertainme

In [10]:
zero_shot_bart_prediction = [candidate_dict[k] for k in l]
zero_shot_bart_prediction

[1,
 9,
 9,
 2,
 6,
 1,
 1,
 1,
 4,
 1,
 1,
 12,
 1,
 2,
 6,
 1,
 1,
 9,
 9,
 2,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 9,
 12,
 1,
 1,
 9,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 7,
 1,
 1,
 9,
 9,
 1,
 1,
 1,
 1,
 7,
 12,
 1,
 3,
 1,
 1,
 2,
 1,
 9,
 9,
 9,
 1,
 12,
 1,
 10,
 1,
 1,
 1,
 9,
 7,
 1,
 1,
 1,
 12,
 9,
 9,
 9,
 9,
 1,
 1,
 9,
 1,
 1,
 1,
 8,
 9,
 1,
 9,
 9,
 4,
 1,
 1,
 1,
 13,
 1,
 7,
 3]

In [11]:
%%time
from transformers import T5ForConditionalGeneration, T5Tokenizer
# Load T5 model and tokenizer
model_name = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Define candidate labels
candidate_labels = ["Restaurants", "Bars", "Coffee Shops", "Hotels", "Salons/Barbershops", "Auto Repair", "Home Services",
                    "Medical Services", "Entertainment", "Pet Services", "Financial Services", "Travel & Tourism",
                    "Education", "Real Estate", "Fitness", "Landscaping & Gardening Services", "Legal Services",
                    "Photography Services", "Childcare Services", "Computer & Technology Services"]

# Define zero-shot classification function using the model directly
def zero_shot_classification_2(example):
    inputs = tokenizer("classify: " + example['text'], padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
    dummy_decoder_input_ids = torch.zeros_like(inputs["input_ids"]).to(device)
    with torch.no_grad():
        logits = model(input_ids=inputs["input_ids"], decoder_input_ids=dummy_decoder_input_ids).logits
    zeroshot = torch.sigmoid(logits).cpu().numpy()
    return {'text': example['text'], 'zeroshot': zeroshot}

# Apply zero-shot classification to the dataset
zero_shot_t5 = dataset.map(zero_shot_classification_2)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

CPU times: user 17.4 s, sys: 18.5 s, total: 35.9 s
Wall time: 31.9 s


In [12]:
zero_shot_t5

Dataset({
    features: ['label', 'text', 'zeroshot'],
    num_rows: 100
})

In [None]:
zero_shot_t5['zeroshot']

In [None]:
l2 = []
for _ in tqdm(zero_shot_t5['zeroshot']):
    l2.append(_['label'][0])
l2

In [None]:
zero_shot_t5_prediction = [candidate_dict[k] for k in l2]
zero_shot_t5_prediction