In [None]:
import json
from datasets import Dataset, DatasetDict, ClassLabel

# --- Configuration ---
ground_truth_file = 'ground_truth.jsonl'

# --- Step 1: Load all data from the file ---
# The file is read as JSON Lines: one JSON document per line.
#  * It is decoded as UTF-8 explicitly so the result does not depend on the
#    platform's default locale encoding.
#  * Blank / whitespace-only lines are skipped instead of crashing json.loads.
#  * A malformed line is re-raised as the same exception type (JSONDecodeError)
#    but with the file's line number, because the parser only ever sees one line
#    and would otherwise always report "line 1".
all_records = []
try:
    with open(ground_truth_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                all_records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                raise json.JSONDecodeError(
                    f"'{ground_truth_file}', line {line_number}: {exc.msg}",
                    exc.doc,
                    exc.pos,
                ) from exc
except FileNotFoundError:
    raise FileNotFoundError(f"Error: The file '{ground_truth_file}' was not found. Please make sure it's in the correct directory.")

# --- Step 2: Filter for records that have been labeled ---
# A record counts as labeled only when 'human_classifier' holds a real value.
# A missing key, an empty string and a JSON null (None) are all treated as
# unlabeled. None has to be excluded explicitly: it is "not an empty string",
# so it would otherwise slip through and fail later when the label text is
# looked up in the label-to-id mapping.
labeled_data = [
    record for record in all_records
    if record.get('human_classifier') not in (None, "")
]

# --- Provide feedback on the filtering ---
total_records = len(all_records)
labeled_records_count = len(labeled_data)
dropped_records_count = total_records - labeled_records_count

print(f"Loaded {total_records} records from '{ground_truth_file}'.")
if dropped_records_count > 0:
    print(f"Found and dropped {dropped_records_count} records that were missing a 'human_classifier'.")
print(f"Proceeding with {labeled_records_count} labeled records.")

# Stop early with an actionable message instead of building an empty dataset.
# The message names the configured file rather than a hard-coded path.
if not labeled_data:
    raise ValueError(f"No valid labeled data found to train on. Please label some entries in your '{ground_truth_file}' file.")

# --- Step 3: Create the Dataset from the clean data ---
# Only the records that survived the filter above are used.
full_dataset = Dataset.from_list(labeled_data)


# --- Label vocabulary ---
# `labels` fixes the class order; a class's integer id is its position in this list.
# Both lookup directions are derived from that single list so they always agree.
labels = ["other", "personal_blog"]
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

def add_labels(examples):
    """Translate a batch of 'human_classifier' strings into integer class ids.

    Args:
        examples: A batch mapping column names to lists of values; only the
            'human_classifier' column is read.

    Returns:
        A dict with a single 'label' key holding one id per input row, in order.

    Raises:
        KeyError: If a value is not a key of the module-level ``label2id``.
    """
    label_ids = []
    for class_name in examples['human_classifier']:
        label_ids.append(label2id[class_name])
    return {'label': label_ids}

# Attach the integer 'label' column, then re-type it as a ClassLabel that carries
# the class names (presumably what lets the split below stratify on it).
full_dataset = (
    full_dataset
    .map(add_labels, batched=True)
    .cast_column("label", ClassLabel(names=labels))
)

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# reproducible, and stratifying on 'label' is requested for both halves.
train_test_split = full_dataset.train_test_split(
    stratify_by_column='label',
    seed=42,
    test_size=0.2,
)

# Collect the two halves under explicit 'train' / 'test' keys.
dataset_dict = DatasetDict(
    {split_name: train_test_split[split_name] for split_name in ('train', 'test')}
)

print("\n--- Dataset successfully prepared for training ---")
print(dataset_dict)

In [None]:
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint to fine-tune. The same variable is passed
# again when the classification model is loaded, so the tokenizer and the model
# are always taken from the same checkpoint.
model_checkpoint = "distilbert-base-uncased" 

# Load the tokenizer that belongs to that checkpoint; it is used by
# tokenize_function below and handed to the Trainer later.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create a function that will tokenize our text
def tokenize_function(examples):
    """Tokenize the 'text' column of a batch with the module-level ``tokenizer``.

    Padding is set to "max_length" and truncation is enabled, so short texts
    are padded and long texts are cut.

    Args:
        examples: A batch mapping column names to lists of values; only the
            'text' column is read.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    batch_texts = examples["text"]
    encoded_batch = tokenizer(batch_texts, truncation=True, padding="max_length")
    return encoded_batch

# Tokenize every split of the DatasetDict in one call.
# batched=True hands tokenize_function lists of rows instead of single rows.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)


print("--- Tokenization Complete ---")
print("\nOur tokenized dataset now has new features:")
print(tokenized_datasets['train'].features)

print("\nExample of the new features for the first training example:")
# Preview only the first 20 entries of each tokenizer output for the first
# training row; printing the whole sequences would flood the cell output.
first_example = tokenized_datasets['train'][0]
for field in ("input_ids", "attention_mask"):
    print(f"{field} (first 20):", first_example[field][:20])

In [None]:
!pip install transformers[torch]
!pip install 'accelerate>=0.26.0'
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# --- Load the Pre-trained Model ---
# Load the checkpoint with a sequence-classification head.
# The number of output classes is derived from the id2label mapping instead of
# being written out as a literal, so adding or removing a label only has to be
# done in one place and the head size can never disagree with the mappings.
# The label2id / id2label mappings are passed along with the model so that
# class ids can be translated back to the label names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# --- Define Training Arguments ---
# All settings for the fine-tuning run, grouped by what they control.
training_args = TrainingArguments(
    # Where the run writes its output.
    output_dir="blog_classifier_model",
    # Optimisation.
    num_train_epochs=3,                  # Full passes over the training split
    learning_rate=2e-5,                  # Learning rate used for fine-tuning
    weight_decay=0.01,                   # Regularisation against overfitting
    # Batch sizes (per device) for training and for evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save a checkpoint once per epoch, using the same schedule
    # for both, then reload the best checkpoint when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# --- Create the Trainer ---
# The Trainer brings together the model, the training arguments, the tokenized
# train/test splits and the tokenizer. No compute_metrics function is supplied
# here, so no custom evaluation metrics are computed.
import inspect

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the old `tokenizer` keyword; older releases only know `tokenizer`.
# The notebook installs transformers without a version pin, so choose the
# keyword that the installed Trainer actually accepts.
trainer_init_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_init_params else "tokenizer"
trainer_kwargs[tokenizer_kwarg] = tokenizer

trainer = Trainer(**trainer_kwargs)

# --- Start Training! ---
print("--- Starting Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")