In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


Step#1 Load dataset

In [2]:
from datasets import load_dataset

# Load dataset
dataset_name = "imdb"
dataset = load_dataset(dataset_name)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

Step#2 split the dataset into train, valid, test

In [5]:
# Split into train, validation, and test sets
train_data = dataset['train']
val_data = dataset['test']
test_data = dataset['unsupervised']

Select model 

In [6]:
# Load pre-trained tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [7]:
print(train_data[0]['text'])
print(tokenizer(train_data[0]['text']).keys())
print(tokenizer(train_data[0]['text']))


I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, eve

In [8]:
# decode the tokenized text
tokenizer.decode(tokenizer(train_data[0]['text'])['input_ids'])

'[CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this country, therefore being a fan of films considered " controversial " i really had to see this for myself. < br / > < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes 

Tokenize the text
`A tokenizer is responsible for breaking down text into smaller units (tokens) that a model can understand. It is an essential component in Natural Language Processing (NLP), especially in models like BERT, GPT, and other transformer-based architectures`

In [9]:
def tokenize_data(data):
    return tokenizer(data['text'], 
                     padding='max_length', 
                     truncation=True, 
                     max_length=512)

# Tokenize dataset
train_encodings = train_data.map(tokenize_data, batched=True)
val_encodings =val_data.map(tokenize_data, batched=True)
test_encodings = test_data.map(tokenize_data, batched=True)

Map: 100%|██████████| 50000/50000 [00:17<00:00, 2887.55 examples/s]


In [10]:
train_encodings[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [11]:
train_encodings['input_ids'][0]

[101,
 1045,
 12524,
 1045,
 2572,
 8025,
 1011,
 3756,
 2013,
 2026,
 2678,
 3573,
 2138,
 1997,
 2035,
 1996,
 6704,
 2008,
 5129,
 2009,
 2043,
 2009,
 2001,
 2034,
 2207,
 1999,
 3476,
 1012,
 1045,
 2036,
 2657,
 2008,
 2012,
 2034,
 2009,
 2001,
 8243,
 2011,
 1057,
 1012,
 1055,
 1012,
 8205,
 2065,
 2009,
 2412,
 2699,
 2000,
 4607,
 2023,
 2406,
 1010,
 3568,
 2108,
 1037,
 5470,
 1997,
 3152,
 2641,
 1000,
 6801,
 1000,
 1045,
 2428,
 2018,
 2000,
 2156,
 2023,
 2005,
 2870,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 1996,
 5436,
 2003,
 8857,
 2105,
 1037,
 2402,
 4467,
 3689,
 3076,
 2315,
 14229,
 2040,
 4122,
 2000,
 4553,
 2673,
 2016,
 2064,
 2055,
 2166,
 1012,
 1999,
 3327,
 2016,
 4122,
 2000,
 3579,
 2014,
 3086,
 2015,
 2000,
 2437,
 2070,
 4066,
 1997,
 4516,
 2006,
 2054,
 1996,
 2779,
 25430,
 14728,
 2245,
 2055,
 3056,
 2576,
 3314,
 2107,
 2004,
 1996,
 5148,
 2162,
 1998,
 2679,
 3314,
 1999,
 1996,
 2142,
 2163,
 1012,
 1999,
 2090,
 48

- convert dataset into pytorch tensor
We need to convert the data in pytorch format inorder to train on pytorch model
- create dataloader to load the date in model

In [12]:
# Create PyTorch datasets
train_encodings.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_encodings.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_encodings.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Create DataLoaders
batch_size = 8
train_loader = DataLoader(train_encodings, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_encodings, batch_size=batch_size)
test_loader = DataLoader(test_encodings, batch_size=batch_size)

print(len(train_loader), len(val_loader), len(test_loader)) 

3125 3125 6250


step# Train the model

In [13]:
batch = next(iter(train_loader))
batch['input_ids']

tensor([[ 101, 2242, 4873,  ...,    0,    0,    0],
        [ 101, 1006, 1054,  ..., 9061, 1010,  102],
        [ 101, 2065, 2017,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 1037, 2431,  ...,    0,    0,    0],
        [ 101, 1999, 2047,  ...,    0,    0,    0]])

In [22]:
# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(epochs):
    total_loss = 0
    for (inputs,labels) in train_loader:
        inputs, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(inputs).logits
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

# Validation loop
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in val_loader:
        inputs, labels = [b.to(device) for b in batch]
        outputs = model(inputs).logits
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
print(f"Validation Accuracy: {correct / total * 100:.2f}%")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'str' object has no attribute 'to'

In [None]:
# Inference
text = "This movie was fantastic! I really enjoyed it."
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs = {key: val.to(device) for key, val in inputs.items()}
model.eval()
with torch.no_grad():
    outputs = model(**inputs).logits
prediction = torch.argmax(outputs, dim=1).item()
print(f"Inference Result: {prediction}")

Making seperate function/pipelines and the calling in main function
- data.py: Contains the data loading and preprocessing functions
- model.py: Contains the model building and training functions
- train.py: Contains the main function to train the model
- inference.py: Contains the main function to make predictions

In [3]:
test = {'label':1, 'input_ids': [1,2,3,4,5], 'attention_mask': [1,1,1,1,1]}
r = [key for key in test]
print(r)

['label', 'input_ids', 'attention_mask']


Compiling everythin in a single place