<a href="https://colab.research.google.com/github/DaloPotato/DaloPotato/blob/main/Test_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Install necessary libraries
!pip install wandb torch torchvision transformers datasets

import torch
import wandb
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset

# Pick the GPU when one is visible to PyTorch; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Start the WandB run and register the hyperparameters with it.
# The values go through `config=` rather than being assigned to
# `wandb.config`: rebinding that attribute replaces the run's config object
# with a plain dict, so nothing would be recorded on the run.
wandb.init(
    project="ag-news-classification",  # Replace with your own project name
    config={
        "learning_rate": 2e-5,
        "epochs": 3,
        "batch_size": 32,  # same value as per_device_train_batch_size below
    },
)

# Load AG News dataset
dataset = load_dataset('ag_news')

# Select 10% of the training dataset (shuffled with a fixed seed so the
# subset is reproducible)
train_subset = dataset['train'].shuffle(seed=42).select(range(int(0.1 * len(dataset['train']))))

# Select 10% of the test dataset (optional)
test_subset = dataset['test'].shuffle(seed=42).select(range(int(0.1 * len(dataset['test']))))

# Load the tokenizer that matches the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences longer than the tokenizer's maximum length are truncated.
    No padding is applied here: the DataCollatorWithPadding given to the
    Trainer pads each batch to its own longest sequence, which avoids
    padding every example out to the full maximum length up front.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``;
            only its "text" entry is read.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)
# Run the tokenizer over the two 10% subsets (train first, then test) rather
# than over the full dataset.
train_subset, test_subset = (
    split.map(tokenize_function, batched=True)
    for split in (train_subset, test_subset)
)

# Collator the Trainer uses to assemble batches with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Pre-trained BERT with a 4-label classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)

# Place the model on the selected device (GPU when available).
model.to(device)

# Training configuration, with metrics reported to WandB.
training_args = TrainingArguments(
    # Where artifacts and logs go, and how the run is labelled
    output_dir='./results',
    logging_dir='./logs',
    run_name='./testing',
    report_to=["wandb"],
    # Optimisation hyperparameters
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate once per epoch; emit training logs every 500 steps
    evaluation_strategy="epoch",
    logging_steps=500,
    # Mixed precision is switched on only when a CUDA device is present
    fp16=torch.cuda.is_available(),
)

# Wire model, data and configuration together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_subset,  # 10% training subset
    eval_dataset=test_subset,    # 10% test subset
)

# Fine-tune, then run a final evaluation pass and show its metrics.
trainer.train()
results = trainer.evaluate()
print(results)

# Close the WandB run now that training and evaluation are done.
wandb.finish()


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
