In [1]:
!pip install pandas tqdm torch transformers scikit-learn langchain langchain-community

Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import logging
from torch.nn import BCEWithLogitsLoss

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

def load_and_preprocess_data(train_path, test_path, labels_path):
    """
    Load and preprocess the data from CSV files.
    """
    logger.info("Loading and preprocessing data...")
    df_train = pd.read_csv(train_path, engine='python', encoding='utf-8')
    df_test = pd.read_csv(test_path, engine='python', encoding='utf-8')
    df_label = pd.read_csv(labels_path, engine='python', encoding='utf-8')

    # Preprocess training data
    toxic_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    df_train['labels'] = df_train[toxic_columns].values.tolist()
    df_train = df_train[['comment_text', 'labels']]

    # Preprocess test data
    df_test['labels'] = df_label[toxic_columns].values.tolist()

    # Clean text
    for dataset in [df_train, df_test]:
        dataset['comment_text'] = dataset['comment_text'].fillna("").str.replace("\xa0", " ", regex=False).str.split().str.join(" ")

    logger.info(f"Processed {len(df_train)} training samples and {len(df_test)} test samples")
    return df_train, df_test

def sample_data(df, initial_size=10_000):
    """
    Sample a subset of data for initial training, maintaining class distribution.
    """
    initial_samples = df.sample(n=initial_size, random_state=42)
    remaining_samples = df.drop(initial_samples.index)

    return initial_samples.reset_index(drop=True), remaining_samples.reset_index(drop=True)

class ToxicDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.text = dataframe.comment_text
        self.targets = dataframe.labels.values
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        target = self.targets[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

def train_model(model, data_loader, optimizer, scheduler, device, epoch):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(outputs.logits, targets)
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(data_loader)
    logger.info(f"Epoch {epoch} - Average training loss: {avg_loss:.4f}")
    return avg_loss

def evaluate_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.sigmoid(logits)  # Don't apply threshold here
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['targets'].cpu().numpy())

    return np.array(true_labels), np.array(predictions)

def compute_metrics(true_labels, predictions):
    # Ensure both are in the same format (multilabel-indicator)
    true_labels = (true_labels > 0.5).astype(int)
    predictions = (predictions > 0.5).astype(int)

    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'precision': precision_score(true_labels, predictions, average='micro', zero_division=0),
        'recall': recall_score(true_labels, predictions, average='micro', zero_division=0),
        'f1': f1_score(true_labels, predictions, average='micro', zero_division=0)
    }

class ActiveLearning:
    def __init__(self, df_train, df_test, tokenizer, device, max_len=128, batch_size=64, epochs=4, lr=2e-5, workers=4, query_size=2000, num_iterations=8):
        self.df_train = df_train
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.device = device
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.workers = workers
        self.query_size = query_size
        self.num_iterations = num_iterations
        self.results_df = pd.DataFrame(columns=['iteration', 'accuracy', 'precision', 'recall', 'F1'])

    def run(self):
        labeled_data, unlabeled_data = sample_data(self.df_train)
        labeled_data, val_data = train_test_split(labeled_data, test_size=0.2, random_state=42)

        # Initial evaluation
        self.initial_evaluation(labeled_data, val_data)

        for iteration in range(self.num_iterations):
            logger.info(f"Starting Active Learning Iteration {iteration + 1}")

            model = self.train_and_evaluate(labeled_data, val_data, unlabeled_data, iteration)

            # Instance selection
            unlabeled_dataset = ToxicDataset(unlabeled_data, self.tokenizer, self.max_len)
            unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=self.batch_size, num_workers=self.workers)

            model.eval()
            uncertainties = []
            with torch.no_grad():
                for batch in tqdm(unlabeled_loader, desc="Calculating uncertainties"):
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)

                    outputs = model(input_ids, attention_mask=attention_mask)
                    probabilities = torch.sigmoid(outputs.logits)
                    uncertainties.extend((-torch.abs(probabilities - 0.5)).mean(dim=1).cpu().numpy())

            selected_indices = np.argsort(uncertainties)[-self.query_size:]
            new_labeled_data = unlabeled_data.iloc[selected_indices]
            labeled_data = pd.concat([labeled_data, new_labeled_data])
            unlabeled_data = unlabeled_data.drop(unlabeled_data.iloc[selected_indices].index).reset_index(drop=True)

            labeled_data = labeled_data.reset_index(drop=True)
            labeled_data, val_data = train_test_split(labeled_data, test_size=0.2, random_state=42)

            logger.info(f"Added {len(new_labeled_data)} samples to labeled data. New sizes - Labeled: {len(labeled_data)}, Validation: {len(val_data)}, Unlabeled: {len(unlabeled_data)}")

        # Final evaluation on test set
        self.final_evaluation(model)

        self.results_df.to_csv("active_learning_results.csv", index=False)
        logger.info("Results saved to active_learning_results.csv")

    def initial_evaluation(self, labeled_data, val_data):
        logger.info("Performing initial evaluation on the validation set")
        model = self.train_and_evaluate(labeled_data, val_data, labeled_data, -1)

    def train_and_evaluate(self, labeled_data, val_data, eval_data, iteration):
        logger.info(f"Training and evaluating model")
        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6).to(self.device)

        train_dataset = ToxicDataset(labeled_data, self.tokenizer, self.max_len)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.workers)

        optimizer = AdamW(model.parameters(), lr=self.lr, correct_bias=False)
        total_steps = len(train_loader) * self.epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        for epoch in range(self.epochs):
            train_model(model, train_loader, optimizer, scheduler, self.device, epoch)

        val_dataset = ToxicDataset(val_data, self.tokenizer, self.max_len)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, num_workers=self.workers)
        true_labels, predictions = evaluate_model(model, val_loader, self.device)

        metrics = compute_metrics(true_labels, predictions)
        logger.info(f"Iteration {iteration + 1} Validation Results:")
        logger.info(f"Accuracy: {metrics['accuracy']:.4f}, Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1 Score: {metrics['f1']:.4f}")

        new_row = pd.DataFrame({
            'iteration': [iteration + 1],
            'accuracy': [metrics['accuracy']],
            'precision': [metrics['precision']],
            'recall': [metrics['recall']],
            'F1': [metrics['f1']]
        })
        self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)

        return model

    def final_evaluation(self, model):
        logger.info("Performing final evaluation on test set")
        test_dataset = ToxicDataset(self.df_test, self.tokenizer, self.max_len)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, num_workers=self.workers)

        true_labels, predictions = evaluate_model(model, test_loader, self.device)
        metrics = compute_metrics(true_labels, predictions)

        logger.info(f"Final Test Results:")
        logger.info(f"Accuracy: {metrics['accuracy']:.4f}, Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1 Score: {metrics['f1']:.4f}")

        new_row = pd.DataFrame({
            'iteration': ['test'],
            'accuracy': [metrics['accuracy']],
            'precision': [metrics['precision']],
            'recall': [metrics['recall']],
            'F1': [metrics['f1']]
        })
        self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)

def main():
    # Set paths for data files
    train_path = 'train.csv'
    test_path = 'test.csv'
    labels_path = 'test_labels.csv'

    df_train, df_test = load_and_preprocess_data(train_path, test_path, labels_path)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    active_learning = ActiveLearning(
        df_train, df_test, tokenizer, device,
        max_len=128, batch_size=64, epochs=4, lr=2e-5,
        workers=4, query_size=2000, num_iterations=8
    )
    active_learning.run()

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 0: 100%|██████████| 125/125 [01:17<00:00,  1.61it/s]
Training Epoch 1: 100%|██████████| 125/125 [01:19<00:00,  1.57it/s]
Training Epoch 2: 100%|██████████| 125/125 [01:19<00:00,  1.58it/s]
Training Epoch 3: 100%|██████████| 125/125 [01:19<00:00,  1.57it/s]
Evaluating: 100%|██████████| 32/32 [00:07<00:00,  4.06it/s]
  self.results_df = pd.concat([self.results_df, new_row], ignore_index=True)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weigh