In [1]:
import json
from datasets import Dataset, DatasetDict, ClassLabel # <-- Import ClassLabel

# --- Load your manually labeled data ---
# (This part is the same)
ground_truth_file = 'ground_truth.jsonl'
data = []
with open(ground_truth_file, 'r') as f:
    for line in f:
        data.append(json.loads(line))

full_dataset = Dataset.from_list(data)

# --- Create the labels for our model ---
# (This part is the same)
labels = ["other", "personal_blog"]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

def add_labels(examples):
    # This assumes you changed your file to have "human_classification"
    return {'label': [label2id[label] for label in examples['human_classifier']]}

full_dataset = full_dataset.map(add_labels, batched=True)

# ==========================================================
# ---  THE NEW LINE IS HERE  ---
# Convert the 'label' column to the ClassLabel type
# ==========================================================
full_dataset = full_dataset.cast_column("label", ClassLabel(names=labels))


# --- Split the data into training and testing sets ---
# (This part is the same and will now work)
train_test_split = full_dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column='label')

# Create a final DatasetDict object which is the standard format
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})


print("--- Dataset prepared for training ---")
print(dataset_dict)
print("\nExample from the training set:")
print(dataset_dict['train'][0])

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 50/50 [00:00<00:00, 4735.37 examples/s]
Casting the dataset: 100%|██████████| 50/50 [00:00<00:00, 6055.88 examples/s]

--- Dataset prepared for training ---
DatasetDict({
    train: Dataset({
        features: ['url', 'title', 'text', 'classification_v1', 'human_classifier', 'label'],
        num_rows: 40
    })
    test: Dataset({
        features: ['url', 'title', 'text', 'classification_v1', 'human_classifier', 'label'],
        num_rows: 10
    })
})

Example from the training set:
{'url': 'http://store.waitbutwhy.com/collections/plush-toys', 'title': 'Plush Toys - Wait But Why Store', 'text': 'Plush Toys - Wait But Why Store Menu 0 Store Blog home about archive minis the shed dinner table support wbw Posters All Posters Life Calendar Wrapping Paper One in a Million Litographs A Perspective on Time Dark Playground Life Mountain Apparel Unisex Women Kid\'s Sweatshirts All Sweatshirts Crewnecks Hoodies Characters Instant Gratification Monkey The Panic Monster The Mammoth Mugs Accessories Plushies Cards & Wrapping Paper Tote Bags Buttons Coffee Mugs Stickers Login USD CAD AUD GBP EUR JPY 0 Your Cart i




In [2]:
from transformers import AutoTokenizer

# The name of the pre-trained model we will use
model_checkpoint = "distilbert-base-uncased" 

# Load the tokenizer that corresponds to our model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create a function that will tokenize our text
def tokenize_function(examples):
    # The tokenizer will pad shorter texts and truncate longer ones to a standard length
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenizer to our entire dataset (both train and test splits)
# The `batched=True` flag makes this process much faster.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)


print("--- Tokenization Complete ---")
print("\nOur tokenized dataset now has new features:")
print(tokenized_datasets['train'].features)

print("\nExample of the new features for the first training example:")
# Let's look at the first example again to see the new fields
first_example = tokenized_datasets['train'][0]
print("input_ids (first 20):", first_example['input_ids'][:20])
print("attention_mask (first 20):", first_example['attention_mask'][:20])

Map: 100%|██████████| 40/40 [00:00<00:00, 64.92 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 252.81 examples/s]

--- Tokenization Complete ---

Our tokenized dataset now has new features:
{'url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'classification_v1': Value(dtype='string', id=None), 'human_classifier': Value(dtype='string', id=None), 'label': ClassLabel(names=['other', 'personal_blog'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

Example of the new features for the first training example:
input_ids (first 20): [101, 27729, 10899, 1011, 3524, 2021, 2339, 3573, 12183, 1014, 3573, 9927, 2188, 2055, 8756, 7163, 2015, 1996, 8328, 4596]
attention_mask (first 20): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]





In [3]:
!pip install transformers[torch]
!pip install 'accelerate>=0.26.0'
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# --- Load the Pre-trained Model ---
# We load the DistilBERT model and specify it's for sequence classification.
# We also tell it how many labels we have (2: "other" and "personal_blog")
# and provide our label2id/id2label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# --- Define Training Arguments ---
# This object contains all the settings for the training run.
training_args = TrainingArguments(
    output_dir="blog_classifier_model",  # The directory where the final model will be saved
    learning_rate=2e-5,                  # A standard, good learning rate for fine-tuning
    per_device_train_batch_size=8,       # How many examples to process at once during training
    per_device_eval_batch_size=8,        # How many examples to process at once during evaluation
    num_train_epochs=3,                  # The number of times to go through the entire training dataset
    weight_decay=0.01,                   # A technique to prevent the model from overfitting
    eval_strategy="epoch",         # Evaluate performance at the end of each epoch
    save_strategy="epoch",               # Save a checkpoint of the model at the end of each epoch
    load_best_model_at_end=True,         # Automatically load the best performing model at the end
)

# --- Create the Trainer ---
# The Trainer brings together the model, arguments, datasets, tokenizer, and evaluation metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# --- Start Training! ---
print("--- Starting Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


--- Starting Fine-Tuning ---


Epoch,Training Loss,Validation Loss
1,No log,0.651261
2,No log,0.610426
3,No log,0.593786


--- Fine-Tuning Complete ---
