In [None]:

# Install required libraries
!pip uninstall huggingface-hub
!pip install huggingface-hub
!pip install transformers datasets torch accelerate scikit-learn pandas numpy


In [None]:
# Load dataset directly from Hugging Face (English subset)
from datasets import load_dataset

# Load only the first 1000 rows of the 'en' split to keep the run small.
dataset = load_dataset("textdetox/multilingual_toxicity_dataset", split='en[:1000]')

# Rename 'toxic' column to 'labels' (the column name the later cells select).
dataset = dataset.rename_column("toxic", "labels")

# Split into train/test sets (80% / 20%).
# A fixed seed makes the split, and therefore the reported metrics,
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Inspect the dataset
print(dataset)


In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned in the following cells.
tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.
    """
    return tokenizer(
        examples["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )


# Encode both splits, processing the rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose only the columns the model consumes, as PyTorch tensors.
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head that
# has two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "NbAiLab/nb-bert-large",
    num_labels=2,
)


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            row is the argmax of the logits over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {'accuracy': accuracy_score(gold, predicted)}
    # The fourth element returned by sklearn (support) is not reported.
    prf = precision_recall_fscore_support(gold, predicted, average='weighted')
    metrics.update(zip(('precision', 'recall', 'f1'), prf[:3]))
    return metrics


In [None]:
from transformers import TrainingArguments

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old name. Pick whichever keyword the installed version
# accepts so this cell does not fail with a TypeError.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",           # save a checkpoint after every epoch
    logging_steps=10,
    learning_rate=2e-5,
    disable_tqdm=False,              # explicitly enable progress bars
    **{_eval_strategy_key: "epoch"}, # evaluate after every epoch
)


In [None]:
from transformers import Trainer

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Use whichever name the installed
# Trainer accepts.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initialize Trainer with the model, both tokenized splits and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Start training
print("🚩 Starting training...")
trainer.train()
print("✅ Training completed.")


In [None]:
# Run a final evaluation with the trainer and print the returned metrics.
# The trainer was built with the "test" split as its eval_dataset and with
# compute_metrics as its metric function.
results = trainer.evaluate()
print("🎯 Evaluation Results:", results)


In [None]:
# Save the model and the tokenizer side by side in one directory.
SAVE_DIR = "fine-tuned-bert-colab"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


In [None]:
import subprocess

# Step 1: Names of the packages to pin in requirements.txt.
# Each name is looked up below; packages that are not installed are skipped
# with a warning instead of being written to the file.
required_packages = [
    "transformers",
    "datasets",
    "torch",
    "scikit-learn",
    "numpy",
    "accelerate",       # Required for Trainer
    "pandas",           # Sometimes needed for data manipulation
    "google-colab"      # Only if running in Google Colab
]

# Step 2: Get the versions of installed packages
def get_version(package):
    """Return the installed version of ``package``, or ``None`` if it is absent.

    The version is read from the package metadata of the running interpreter
    rather than by shelling out to ``pip show``. The answer therefore always
    describes the kernel's own environment and does not depend on which
    ``pip`` executable is first on PATH.

    Args:
        package: Distribution name as used on PyPI (e.g. "scikit-learn").

    Returns:
        The version string, or ``None`` when the name is empty or the
        distribution is not installed.
    """
    # Local import keeps this cell self-contained.
    from importlib import metadata

    if not package:
        return None
    try:
        return metadata.version(package)
    except metadata.PackageNotFoundError:
        return None

# Step 3: Write one "name==version" line per installed package.
with open("requirements.txt", "w") as req_file:
    for pkg_name in required_packages:
        pinned = get_version(pkg_name)
        if not pinned:
            # No version was found: report the package and leave it out of the file.
            print(f"Warning: {pkg_name} is not installed.")
            continue
        req_file.write(f"{pkg_name}=={pinned}\n")

print("✅ requirements.txt created successfully!")
