## Basic Use

In [None]:
from transformers import pipeline

# Sentiment analysis with a pre-trained checkpoint. The model is named
# explicitly: without a `model` argument the pipeline falls back to an implicit
# default (a DistilBERT checkpoint fine-tuned on SST-2, not plain BERT) and warns
# that relying on that default is not recommended.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)
result = classifier("I love using Hugging Face in Python!")
print(result)

## With Classes

Step 1: Load and preprocess the IMDb dataset:
First, let's load the IMDb dataset and preprocess it into a format suitable for fine-tuning.

In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset

# Load the IMDb dataset (you can replace this with your own dataset)
# Assume the dataset is in the format: {'text': 'review text', 'label': 0 or 1}
def load_imdb_dataset():
    """Placeholder for the IMDb loading/preprocessing step.

    The real loading code is intentionally omitted; substitute your own code
    that produces records shaped like
    ``{'text': 'review text', 'label': 0 or 1}``.

    Returns:
        ``None`` as written, because nothing is loaded yet.
    """
    return None

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one review per lookup.

    Args:
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        data: Sized, indexable collection of records with ``'text'`` and
            ``'label'`` keys.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 128, the value that was previously hard-coded.
    """

    def __init__(self, tokenizer, data, max_length=128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors together with its label."""
        record = self.data[idx]
        encoding = self.tokenizer.encode_plus(
            record['text'],
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return {
            # flatten() collapses the tensors returned for this single text to
            # 1-D so that the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': record['label'],
        }

# Tokenizer -> raw records -> Dataset -> shuffled mini-batches of 32
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
imdb_dataset = load_imdb_dataset()
train_dataset = CustomDataset(tokenizer=tokenizer, data=imdb_dataset)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)


Step 2: Fine-tune the pre-trained BERT model:
Next, we'll load the pre-trained BERT model and fine-tune it on the IMDb dataset using a simple sentiment classification task.

In [None]:
from transformers import BertForSequenceClassification, AdamW
import torch

# Load the pre-trained BERT model for sequence classification (2 labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer (constant learning rate; no scheduler is used here).
# PyTorch's own AdamW is used because `transformers.AdamW` is deprecated in
# favour of `torch.optim.AdamW`. `eps` and `weight_decay` are passed explicitly
# to keep the values the transformers implementation defaulted to (torch's own
# defaults are 1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

# Fine-tuning loop
model.train()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    # Sum of the per-batch losses for this epoch (not an average).
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss on `outputs.loss`.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}')


Step 3: Save the fine-tuned model (optional):
You can save the fine-tuned model to use it later for inference:

In [None]:
# Save the model, and the tokenizer next to it, so the directory can be
# reloaded on its own for inference.
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')


## Without Classes

Step 1: Load and preprocess the IMDb dataset:
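First, let's load the IMDb dataset and preprocess it, similar to what we did before. This cell reuses `load_imdb_dataset` and the `torch` import from the "With Classes" section, so run those cells first.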

In [None]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset

# Load the IMDb dataset and preprocess it (same as before)
# Replace this with your own dataset loading and preprocessing code

# Load the tokenizer and the raw records; the DataLoader itself is built at the end of this cell
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
imdb_dataset = load_imdb_dataset()  # load_imdb_dataset is not defined in this cell; it must already be in scope

def encode_review(review):
    """Tokenize one review with the module-level ``tokenizer``.

    Args:
        review: The review text handed to ``tokenizer.encode_plus``.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the encoding.
    """
    tokenizer_kwargs = dict(
        add_special_tokens=True,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    encoded = tokenizer.encode_plus(review, **tokenizer_kwargs)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

# Collect the per-review encodings and labels in plain lists first.
input_id_rows, attention_mask_rows, label_values = [], [], []

for example in imdb_dataset:
    input_ids, attention_mask = encode_review(example['text'])
    input_id_rows.append(input_ids)
    attention_mask_rows.append(attention_mask)
    label_values.append(example['label'])

# Concatenate the per-review tensors along dim 0 and turn the labels into a tensor.
train_input_ids = torch.cat(input_id_rows, dim=0)
train_attention_masks = torch.cat(attention_mask_rows, dim=0)
train_labels = torch.tensor(label_values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(
    train_input_ids,
    train_attention_masks,
    train_labels,
)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)


Step 2: Fine-tune the pre-trained BERT model:
Next, we'll load the pre-trained BERT model and fine-tune it on the IMDb dataset, just like before, but this time without using classes.

In [None]:
from transformers import BertForSequenceClassification, AdamW
import torch

# Load the pre-trained BERT model for sequence classification (2 labels)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer (constant learning rate; no scheduler is used here).
# PyTorch's own AdamW is used because `transformers.AdamW` is deprecated in
# favour of `torch.optim.AdamW`. `eps` and `weight_decay` are passed explicitly
# to keep the values the transformers implementation defaulted to (torch's own
# defaults are 1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

# Fine-tuning loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    # Sum of the per-batch losses for this epoch (not an average).
    total_loss = 0.0
    model.train()

    for batch in train_dataloader:
        # Each batch is unpacked as (input_ids, attention_mask, labels).
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss on `outputs.loss`.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}')


Step 3: Save the fine-tuned model (optional):
You can save the fine-tuned model using the same method as before:

In [None]:
# Save the model, and the tokenizer next to it, so the directory can be
# reloaded on its own for inference.
model.save_pretrained('fine_tuned_model')
tokenizer.save_pretrained('fine_tuned_model')
