In [1]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("imdb")


In [12]:
# Shuffle the test dataset
shuffled_test_dataset = dataset['test'].shuffle(seed=13)
# Select the first 2000 examples
subset_test_dataset = shuffled_test_dataset.select(range(5000))
# shuffle the train dataset
shuffled_train_dataset = dataset['train'].shuffle(seed=13)
# Select the first 2000 examples
subset_train_dataset = shuffled_test_dataset.select(range(5000))

In [13]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the subset
tokenized_subset = subset_test_dataset.map(
    lambda examples: tokenizer(examples['text'], padding=True, truncation=True, max_length=256, return_tensors="pt"),
    batched=True,
    batch_size=1000, 
    remove_columns=["text"]
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [14]:
tokenized_subset_train = subset_train_dataset.map(
    lambda examples: tokenizer(examples['text'], padding=True, truncation=True, max_length=256, return_tensors="pt"),
    batched=True,
    batch_size=1000, 
    remove_columns=["text"]
)

# training 

In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

trainer = Trainer(model=model)

2024-04-05 05:36:27.091057: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_subset_train,
    eval_dataset=tokenized_subset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.train()

Step,Training Loss
500,0.5075
1000,0.451
1500,0.3358
2000,0.3218
2500,0.3171
3000,0.1589
3500,0.1686
4000,0.1187
4500,0.1026
5000,0.0721


TrainOutput(global_step=12500, training_loss=0.11815477290809155, metrics={'train_runtime': 4182.6957, 'train_samples_per_second': 11.954, 'train_steps_per_second': 2.989, 'total_flos': 6577776384000000.0, 'train_loss': 0.11815477290809155, 'epoch': 10.0})

In [61]:
torch.cuda.empty_cache()

In [62]:
model.eval()
torch.save(model, "trained_bert.pt")

In [63]:
torch.save(model.state_dict(), "trained_bert_state_dict.pth")

In [46]:
device = torch.device('cuda')

In [70]:
import random

index = random.randint(0, 25000)
text = dataset['test']['text'][index]
print(text)
print(dataset['test']['label'][index])
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)

Just do a little research on the making of this film. Something so simple as a Google search. It was funded by the US Army and promoted just in time for the elections. It is a great idea, but I'd much rather see a DOCUMENTARY, not something edited by the Bush Administration and told its reality. The timing of the movie's release, its tone, and the fact that MS&L promoted it, raised questions about the intent of the movie. "According to MS&L Managing Director Joe Gleason, he and his colleagues also deliver key targeted messages about the war in Iraq to specific constituencies," wrote Eartha Melzer. "Was the left-leaning art house crowd one of those constituencies? Is the government hiring documentary filmmakers to propagandize the U.S. population? Nobody involved with the film is willing to say who initially put up the money for the film or how they ended up represented by the Army's PR firm."
0


In [71]:
logits = outputs.logits
predictions = torch.softmax(logits, dim=1)
print(predictions)

tensor([[1.0000e+00, 3.7152e-06]], device='cuda:0')
