# Web Scraping Classification Training in Colab

This notebook fine-tunes a BERT model to classify web pages as either useful or not useful for scraping.

In [None]:
# Install dependencies
!pip install transformers datasets torch pandas

In [None]:
import inspect

import pandas as pd
import torch
import yaml
from datasets import Dataset
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification

In [None]:
# Experiment configuration, defined inline as a nested dict:
#   model    - pretrained checkpoint name and number of output classes
#   training - optimisation hyper-parameters
#   data     - CSV file names for the train / validation / test splits
#   output   - directories for the saved model and the training logs
config = dict(
    model=dict(name='bert-base-uncased', num_labels=2),
    training=dict(
        batch_size=16,
        learning_rate=2e-5,
        num_epochs=3,
        weight_decay=0.01,
        warmup_steps=500,
    ),
    data=dict(train_path='train.csv', val_path='val.csv', test_path='test.csv'),
    output=dict(model_save_path='./fine_tuned_model', logs_path='./logs'),
)

# Instantiate the tokenizer and the sequence-classification model from the
# checkpoint named in the config; the classification head size comes from
# config['model']['num_labels'].
model_cfg = config['model']
model_name = model_cfg['name']
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=model_cfg['num_labels'],
)

In [None]:
# Load and preprocess data
def preprocess_data(df, tokenizer, max_length=512):
    """Tokenize the ``text`` column of a DataFrame and return it as a ``Dataset``.

    Args:
        df: pandas DataFrame that must contain a ``text`` column. The frame
            passed in is not modified.
        tokenizer: Callable invoked on each batch of texts with
            ``truncation=True``, ``padding='max_length'`` and ``max_length``.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        The result of mapping the tokenizer over ``Dataset.from_pandas(df)``
        in batched mode.

    Raises:
        KeyError: If ``df`` has no ``text`` column.
    """
    # Fail early with a readable message instead of an error raised from
    # inside ``Dataset.map``.
    if 'text' not in df.columns:
        raise KeyError(
            f"preprocess_data expects a 'text' column; found columns: {list(df.columns)}"
        )

    # ``pd.read_csv`` yields NaN for empty cells and numbers for all-numeric
    # cells; the tokenizer expects strings, so normalise on a copy.
    text_df = df.assign(text=df['text'].fillna('').astype(str))

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)

    dataset = Dataset.from_pandas(text_df)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset

# Read the three splits from the CSV paths declared in config['data'], so the
# file locations are defined in exactly one place.
# Assumes train.csv, val.csv and test.csv have been uploaded to the working directory.
data_paths = config['data']
train_df = pd.read_csv(data_paths['train_path'])
val_df = pd.read_csv(data_paths['val_path'])
test_df = pd.read_csv(data_paths['test_path'])

# Tokenize each split with the shared tokenizer.
train_dataset = preprocess_data(train_df, tokenizer)
val_dataset = preprocess_data(val_df, tokenizer)
test_dataset = preprocess_data(test_df, tokenizer)

In [None]:
# Training arguments
# Recent transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old keyword was deprecated and later removed). Because the install cell
# does not pin a version, pick whichever keyword the installed
# TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=config['output']['model_save_path'],
    num_train_epochs=config['training']['num_epochs'],
    per_device_train_batch_size=config['training']['batch_size'],
    per_device_eval_batch_size=config['training']['batch_size'],
    learning_rate=config['training']['learning_rate'],
    weight_decay=config['training']['weight_decay'],
    warmup_steps=config['training']['warmup_steps'],
    logging_dir=config['output']['logs_path'],
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical alongside load_best_model_at_end.
    save_strategy="epoch",
    load_best_model_at_end=True,
    **{eval_strategy_key: "epoch"},
)

# Wire the model, the training arguments and the train / validation splits
# into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

# Write the model and the tokenizer into the same output directory.
save_dir = config['output']['model_save_path']
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

## Evaluation

Evaluate the fine-tuned model on the held-out test set.

In [None]:
# Run the trainer's evaluation loop over the test split and show the
# resulting metrics dictionary.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)