In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from torch.nn import functional as F
import numpy as np

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Indexable sequence of review texts; each item is passed
            through ``str()`` before tokenization.
        targets: Indexable sequence of labels aligned with ``reviews``
            (same length, same order).
        tokenizer: Callable tokenizer accepting the Hugging Face
            ``__call__`` keyword arguments used in ``__getitem__`` and
            returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded review is padded/truncated to.

    Raises:
        ValueError: If ``reviews`` and ``targets`` differ in length.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        # Fail early: a length mismatch would otherwise only show up as an
        # IndexError mid-epoch (or as silently ignored trailing labels).
        if len(reviews) != len(targets):
            raise ValueError(
                f'reviews and targets must have the same length, '
                f'got {len(reviews)} and {len(targets)}'
            )
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Tokenize the review at position ``item``.

        Returns:
            dict with keys ``'review_text'`` (the review as ``str``),
            ``'input_ids'`` and ``'attention_mask'`` (flattened tensors taken
            from the tokenizer output) and ``'targets'`` (the label as a
            ``torch.long`` tensor).
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # transformers in favour of ``__call__`` with the same keyword arguments.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'review_text': review,
            # return_tensors='pt' yields a leading batch dimension for the single
            # review; flatten() removes it so the DataLoader can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }


In [3]:
# Load the raw IMDB reviews; the columns used below are 'review' and 'sentiment'.
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

# Convert positive to 1 and negative to 0.
# An explicit mapping (instead of "everything that is not 'positive' is 0")
# surfaces missing or misspelled labels rather than silently marking them negative.
label_to_id = {'negative': 0, 'positive': 1}
sentiment_ids = df['sentiment'].map(label_to_id)
unmapped = sentiment_ids.isna()
if unmapped.any():
    bad_labels = sorted(df.loc[unmapped, 'sentiment'].astype(str).unique())
    raise ValueError(f'Unexpected sentiment labels: {bad_labels}')
df['sentiment'] = sentiment_ids.astype('int64')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Shared settings so the train and test pipelines cannot drift apart.
MAX_LEN = 128     # tokens per review after padding/truncation
BATCH_SIZE = 16   # reviews per batch

# Create the DataLoaders.
# .to_numpy() drops the shuffled pandas index so the dataset can index positionally.
train_dataset = IMDBDataset(
    train_df['review'].to_numpy(),
    train_df['sentiment'].to_numpy(),
    tokenizer,
    max_len=MAX_LEN,
)
test_dataset = IMDBDataset(
    test_df['review'].to_numpy(),
    test_df['sentiment'].to_numpy(),
    tokenizer,
    max_len=MAX_LEN,
)

# Shuffle only the training data; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Load the pretrained cased BERT checkpoint with a 2-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=2,
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [5]:
# Fine-tune the classifier: run on the GPU when one is available, otherwise on the CPU.
epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(epochs):
    print(f'Starting epoch {epoch+1}/{epochs}')
    model.train()

    # Per-epoch running statistics.
    total_loss = 0
    total_correct = 0
    total_count = 0
    batch_num = 0

    # Batches are numbered from 1 so the progress message fires on every 100th batch.
    for batch_num, data in enumerate(train_loader, start=1):
        input_ids, attention_mask, targets = (
            data[key].to(device)
            for key in ('input_ids', 'attention_mask', 'targets')
        )

        # Standard optimisation step; passing labels makes the model return the loss.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)
        total_correct += (preds == targets).sum().item()
        total_count += targets.numel()

        if not batch_num % 100:
            print(f'Batch {batch_num}: Loss = {loss}')

    # Mean of the per-batch losses, and accuracy over every example seen this epoch.
    print(f'Epoch {epoch+1} loss = {total_loss / batch_num}')
    print(f'Training accuracy: {total_correct / total_count}')

Starting epoch 1/3
Batch 100: Loss = 0.5363600254058838
Batch 200: Loss = 0.31495028734207153
Batch 300: Loss = 0.10615691542625427
Batch 400: Loss = 0.5514608025550842
Batch 500: Loss = 0.23782433569431305
Batch 600: Loss = 0.3502776026725769
Batch 700: Loss = 0.27058520913124084
Batch 800: Loss = 0.1295887529850006
Batch 900: Loss = 0.5224992036819458
Batch 1000: Loss = 0.6010627746582031
Batch 1100: Loss = 0.5560705065727234
Batch 1200: Loss = 0.2468046396970749
Batch 1300: Loss = 0.6059460639953613
Batch 1400: Loss = 0.4713125228881836
Batch 1500: Loss = 0.49384579062461853
Batch 1600: Loss = 0.12369626015424728
Batch 1700: Loss = 0.0779247060418129
Batch 1800: Loss = 0.28342047333717346
Batch 1900: Loss = 0.2286701202392578
Batch 2000: Loss = 0.25090712308883667
Batch 2100: Loss = 0.2216954082250595
Batch 2200: Loss = 0.17405080795288086
Batch 2300: Loss = 0.2309088408946991
Batch 2400: Loss = 0.4224179685115814
Batch 2500: Loss = 0.3777514398097992
Epoch 1 loss = 0.32945073449984

In [6]:
# Evaluate the model on the held-out test set.
model.eval()  # switch to evaluation mode for inference
predictions = []
targets = []

# One no_grad context for the whole loop: no gradients are needed for inference.
with torch.no_grad():
    for data in test_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Reduce each batch to class ids right away instead of keeping every
        # row of logits around until the end of the loop.
        logits = outputs.logits.detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1).tolist())

        # Collect labels as plain Python ints rather than 0-d torch tensors,
        # so scikit-learn receives an ordinary 1-d array-like.
        targets.extend(data['targets'].tolist())

predictions = np.asarray(predictions)
print('Accuracy:', accuracy_score(targets, predictions))

Accuracy: 0.8834
