In [1]:
# First, check whether CUDA is available.
# The result decides whether we batch the inputs:
# batching is advisable on GPU for better performance; CPU processing,
# which runs here without batching, can be inefficient and potentially slower.

# Knowing the device also matters when actually using, modifying, or building new models.
# To check whether CUDA is installed, run "nvcc --version" in a terminal;
# if it is not, install it from the official site :)
# Then install the PyTorch build that matches your CUDA version from https://pytorch.org/get-started/locally/

import torch

# Choose the execution device and the matching batch size in one place.
if not torch.cuda.is_available():
    # CPU fallback: run one sample at a time.
    print("CUDA is not available !")
    device = torch.device("cpu")
    batch_size = 1
else:
    # `total_memory` is reported in bytes for GPU 0; two divisions by 1024 give MB.
    # Note that this is the card's total capacity, not the currently free amount.
    total_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    print("Available Memory : ", total_memory_bytes/1024/1024, "MB")
    device = torch.device("cuda")
    batch_size = 8

Available Memory :  2047.875 MB


In [2]:
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import datasets
# Load the "unsupervised" split of IMDB; only its "text" column is used below.
dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")

# Build the sentiment classifier on the device selected in the first cell.
# `device` is the pipeline argument for placing the whole model on a single
# device and accepts a `torch.device` directly. `device_map` is instead
# forwarded to `from_pretrained` for Accelerate-style placement, so it is not
# the right argument for a plain "cpu"/"cuda" choice.
pipe = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",  # pinned checkpoint revision
    device=device,
)

# Display a few sample reviews together with the pipeline object.
dataset[1:4], pipe




({'text': ['When I say this is my favourite film of all time, that comment is not to be taken lightly. I probably watch far too many films than is healthy for me, and have loved quite a few of them. I first saw "La Femme Nikita" nearly ten years ago, and it still manages to be my absolute favourite. Why?<br /><br />This is more than an incredibly stylish and sexy thriller. Luc Besson\'s great flair for impeccable direction, fashion, and appropriate usage of music makes this a very watchable film. But it is Anne Parillaud\'s perfect rendering of a complex character who transforms from a heartless killer into a compassionate, vibrant young woman that makes this film beautiful. I can\'t keep my eyes off of her when she is on screen.<br /><br />I have seen several of Luc Besson\'s films including "Subway", "The Professional", and the irritating "Fifth Element", and "Nikita" is without a doubt, far superior to any of these. Although this film has tragic elements, it is ultimately extremely 

In [3]:
# Slice off the first 64 rows; the slice is a dict of columns, so the review
# strings are available as a plain list under "text".
trimmed_dataset = dataset[:64]

# Classify the reviews using the batch size chosen for the current device.
predictions = pipe(
    trimmed_dataset["text"],
    batch_size=batch_size,
    truncation="only_first",
)

# Each prediction is a dict holding a 'label' and a 'score'.
for out in predictions:
    print(out)

{'label': 'POSITIVE', 'score': 0.573521077632904}
{'label': 'POSITIVE', 'score': 0.9997995495796204}
{'label': 'POSITIVE', 'score': 0.9996894598007202}
{'label': 'NEGATIVE', 'score': 0.9103317856788635}
{'label': 'NEGATIVE', 'score': 0.9979425072669983}
{'label': 'NEGATIVE', 'score': 0.9977090358734131}
{'label': 'POSITIVE', 'score': 0.9988546371459961}
{'label': 'POSITIVE', 'score': 0.9998350143432617}
{'label': 'POSITIVE', 'score': 0.9863672852516174}
{'label': 'POSITIVE', 'score': 0.9990979433059692}
{'label': 'NEGATIVE', 'score': 0.9988611936569214}
{'label': 'NEGATIVE', 'score': 0.7546835541725159}
{'label': 'POSITIVE', 'score': 0.9954826831817627}
{'label': 'POSITIVE', 'score': 0.9992594122886658}
{'label': 'POSITIVE', 'score': 0.9948545694351196}
{'label': 'NEGATIVE', 'score': 0.9996827840805054}
{'label': 'POSITIVE', 'score': 0.999333918094635}
{'label': 'NEGATIVE', 'score': 0.7249533534049988}
{'label': 'POSITIVE', 'score': 0.8955302238464355}
{'label': 'POSITIVE', 'score': 0.

In [4]:
from torch.utils.data import Dataset
from tqdm.auto import tqdm

In [5]:
# * Run only if GPU is enabled *
# Dataset with no compute
class MyDataset(Dataset):
    """Synthetic dataset made of 1000 copies of one short sentence.

    Every item is the same constant string, so fetching an item costs
    nothing and timing differences come from the consumer of the dataset.
    """

    _NUM_ITEMS = 1000
    _SAMPLE_TEXT = "This is a test"

    def __len__(self):
        """Return the fixed number of items."""
        return self._NUM_ITEMS

    def __getitem__(self, i):
        """Return the constant sample sentence; the index `i` is ignored."""
        return self._SAMPLE_TEXT


# Stream the same synthetic dataset through the pipeline at several batch sizes.
# Benchmark-specific names are used so that this cell does not overwrite the
# notebook-wide `dataset` (IMDB reviews) and `batch_size` (chosen from CUDA
# availability); overwriting them would change what the earlier cells do if
# they are re-run after this one.
benchmark_dataset = MyDataset()
for benchmark_batch_size in [1, 8, 64, 256]:
    print("-" * 30)
    print(f"Streaming batch_size={benchmark_batch_size}")
    # Predictions are discarded; only the tqdm progress/throughput readout matters.
    for out in tqdm(
        pipe(benchmark_dataset, batch_size=benchmark_batch_size),
        total=len(benchmark_dataset),
    ):
        pass

------------------------------
Streaming batch_size=1


  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=8


  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=64


  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=256


  0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
# # Uncomment to try another dataset, this time with some computation:
# # every 64th item is the test sentence repeated 100 times, all others are a single copy.

# class MyDataset(Dataset):
#     def __len__(self):
#         return 1000

#     def __getitem__(self, i):
#         if i % 64 == 0:
#             n = 100
#         else:
#             n = 1
#         return "This is a test" * n

# dataset = MyDataset()

# for batch_size in [1, 8, 64, 256]:
#     print("-" * 30)
#     print(f"Streaming batch_size={batch_size}")
#     for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
#         pass

------------------------------
Streaming batch_size=1


  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=8


  0%|          | 0/1000 [00:00<?, ?it/s]

------------------------------
Streaming batch_size=64


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 