In [None]:
from datasets import load_dataset

# Fetch every split of the IMDB sentiment dataset
dataset = load_dataset('imdb')

# Keep a handle on the training split
train_dataset = dataset['train']

# Pull the raw review strings and their labels out of the training split
# (no tokenization or tensor conversion happens in this cell)
train_texts, train_labels = train_dataset['text'], train_dataset['label']

In [None]:
# Split the IMDB training split into two equal halves: one to train on, one held out.
# The previous call used an un-imported `train_test_split` and an undefined `df`;
# the Hugging Face Dataset object provides the same operation as a method.
# Splitting from dataset['train'] (not from `train_dataset`) keeps this cell
# idempotent: re-running it does not halve an already-halved dataset.
# The fixed seed makes the shuffle, and therefore the split, reproducible.
_split = dataset['train'].train_test_split(test_size=0.5, seed=42)
train_dataset, test_dataset = _split['train'], _split['test']

In [None]:
# Materialise the 'text' and 'label' columns of the training data as plain lists
train_texts, train_labels = (
    list(train_dataset[column]) for column in ('text', 'label')
)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score
import torch

# Use the GPU when one is visible to torch; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights) with a freshly initialised classification head.
# num_labels=2 is the library default; it is spelled out because the task is
# binary sentiment (positive / negative).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# Define the optimizer.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the values the deprecated class used by
# default, so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Illustrative example of the inputs this cell expects (kept commented out;
# the real train_texts / train_labels are not defined in this cell):
#train_texts = ["I love this movie!", "This film is terrible..."]
#train_labels = [1, 0]  # 1 is positive, 0 is negative

# Tokenize input
# truncation=True / padding=True ask the tokenizer to truncate over-long texts
# and pad the rest so the resulting 'input_ids' rows can be stacked into a
# single tensor later on.
print('Tokenizing the input...')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

In [None]:
# Build the token-id and label tensors on the target device
print('Converting to tensors...')
train_inputs = torch.tensor(train_encodings['input_ids'], device=device)
train_labels = torch.tensor(train_labels, device=device)

# Pair every example with its label, then serve shuffled mini-batches of 16
print('Loading the data...')
train_data = [example for example in zip(train_inputs, train_labels)]
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=16,
    sampler=train_sampler,
)

In [None]:
# GPUtil is an optional extra; its install and import stay disabled here:
# !pip install GPUtil
# from GPUtil import showUtilization as gpu_usage


def gpu_usage():
    """Print CUDA memory usage for every device visible to torch.

    For each device, reports the memory currently allocated to tensors and the
    memory reserved by torch's caching allocator for this process. When CUDA is
    unavailable a short notice is printed instead, so the call never raises on
    CPU-only machines.

    Returns:
        None
    """
    if not torch.cuda.is_available():
        print('CUDA is not available; no GPU usage to report.')
        return
    mib = 2 ** 20
    for index in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(index) / mib
        reserved = torch.cuda.memory_reserved(index) / mib
        print(f'GPU {index}: {allocated:.1f} MiB allocated, {reserved:.1f} MiB reserved')


# Check GPU usage
gpu_usage()

In [None]:
import gc
# Run a garbage-collection pass, then ask torch to release the CUDA memory its
# caching allocator is holding but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from tqdm import tqdm

NUM_EPOCHS = 3  # Number of training epochs

# Training loop
print('Training...')
model.train()
for epoch in tqdm(range(NUM_EPOCHS), desc='epochs'):
    print('Epoch: ', epoch)
    # Report this process's CUDA footprint through torch, so the loop does not
    # depend on the optional GPUtil helper.
    if torch.cuda.is_available():
        print(f'GPU memory: {torch.cuda.memory_allocated() / 2**20:.1f} MiB allocated, '
              f'{torch.cuda.memory_reserved() / 2**20:.1f} MiB reserved')
    # Iterate the DataLoader itself (not enumerate(...)) so tqdm can read its
    # length and show real progress.
    for b_input_ids, b_labels in tqdm(train_dataloader, desc=f'epoch {epoch}', leave=False):
        # No-op when the batch already lives on `device`.
        b_input_ids = b_input_ids.to(device)
        b_labels = b_labels.to(device)
        # Rebuild the attention mask from the padding id so the model ignores
        # padded positions. Assumes the pad token appears only as padding.
        b_attention_mask = (b_input_ids != tokenizer.pad_token_id).long()
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation
# Two hand-written reviews with their expected sentiment labels
eval_texts, eval_labels = (
    ["I really enjoyed this film.", "I didn't like the movie."],
    [1, 0],
)

# Put the model into evaluation mode
model.eval()

In [None]:
# Check GPU usage
# Reported through torch so this cell does not depend on the optional GPUtil helper.
if torch.cuda.is_available():
    print(f'GPU memory: {torch.cuda.memory_allocated() / 2**20:.1f} MiB allocated, '
          f'{torch.cuda.memory_reserved() / 2**20:.1f} MiB reserved')
else:
    print('CUDA is not available; no GPU usage to report.')

In [None]:
# Clear cache: run a garbage-collection pass, then ask torch to release the CUDA
# memory its caching allocator is holding but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Check GPU usage
# Reported through torch so this cell does not depend on the optional GPUtil helper.
if torch.cuda.is_available():
    print(f'GPU memory: {torch.cuda.memory_allocated() / 2**20:.1f} MiB allocated, '
          f'{torch.cuda.memory_reserved() / 2**20:.1f} MiB reserved')
else:
    print('CUDA is not available; no GPU usage to report.')

In [None]:
# Encode the evaluation sentences with the same tokenizer settings as training
eval_encodings = tokenizer(eval_texts, truncation=True, padding=True)

# Build token-id and label tensors on the target device
eval_inputs = torch.tensor(eval_encodings['input_ids'], device=device)
eval_labels = torch.tensor(eval_labels, device=device)

# Pair every example with its label and serve them in their original order
eval_data = [example for example in zip(eval_inputs, eval_labels)]
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(
    dataset=eval_data,
    batch_size=16,
    sampler=eval_sampler,
)


In [None]:
# Accumulators for the evaluation pass. They are (re)initialised here so the
# cell works on a fresh kernel and can be re-run without double counting.
eval_loss = 0.0       # running sum of per-batch losses
nb_eval_steps = 0     # number of batches seen
preds = []            # one array of logits per batch

for b_input_ids, b_labels in eval_dataloader:
    # Follow the notebook-wide `device` instead of a hard-coded 'cuda'.
    b_input_ids = b_input_ids.to(device)
    b_labels = b_labels.to(device)
    # Rebuild the attention mask from the padding id so the model ignores
    # padded positions. Assumes the pad token appears only as padding.
    b_attention_mask = (b_input_ids != tokenizer.pad_token_id).long()
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
    logits = outputs.logits
    # Use this batch's own loss; the bare name `loss` would be whatever the
    # training loop last left behind.
    eval_loss += outputs.loss.item()
    nb_eval_steps += 1
    preds.append(logits.detach().cpu().numpy())

In [None]:
# Mean loss per batch. Stored under a new name so re-running this cell does not
# divide an already-averaged value again.
mean_eval_loss = eval_loss / nb_eval_steps

# Join the per-batch logits along the example axis. Each entry of `preds` is
# assumed to be (batch_size, num_labels), so the result is
# (num_examples, num_labels) even when the last batch is smaller than the rest.
all_logits = torch.cat([torch.as_tensor(batch_logits) for batch_logits in preds], dim=0)

# Predicted class = index of the largest logit along the label axis.
pred_flat = all_logits.argmax(dim=1).cpu().numpy()
labels_flat = eval_labels.cpu().numpy().flatten()
eval_accuracy = accuracy_score(labels_flat, pred_flat)

print('Validation loss: ', mean_eval_loss)
print('Validation Accuracy: ', eval_accuracy)