In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input directory,
# then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bbc-articles-dataset/bbc_news_text_complexity_summarization.csv
/kaggle/input/bbc-articles-dataset/archive (2)/bbc-news-data.csv
/kaggle/input/bbc-articles-dataset/archive/bbc_text_cls.csv


# Multi-Class News Classification: Fine-Tuning BERT for BBC Content Analysis

### Import Libraries

In [6]:
# 1. Update pyarrow first to fix binary incompatibility, then install others
!pip install -q -U pyarrow datasets transformers accelerate evaluate
!pip install -q evaluate
import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import pyarrow # Verifying the fix
import evaluate

### Load Data

In [7]:
# Load dataset
df = pd.read_csv("/kaggle/input/bbc-articles-dataset/bbc_news_text_complexity_summarization.csv")
df.columns = df.columns.str.strip()  # header names may carry stray whitespace

# Set column names based on your file
target_col = 'labels'
text_col = 'text'

# Fail early with a readable message if the CSV does not have the expected columns
missing_cols = [col for col in (text_col, target_col) if col not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected column(s) {missing_cols}; found {list(df.columns)}")

# Rows without text or label cannot be used for training: `.cat.codes` encodes a
# missing label as -1, which is not a valid class index. The index is reset so the
# frame keeps a plain 0..n-1 index after any rows were removed.
rows_before = len(df)
df = df.dropna(subset=[text_col, target_col]).reset_index(drop=True)
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing text or label")

# Convert text categories to numerical IDs.
# The categorical is kept in its own variable so the id -> name mapping is built
# from the original names before the 'labels' column is replaced by integer codes.
label_categories = df[target_col].astype('category')
label_map = dict(enumerate(label_categories.cat.categories))
df['labels'] = label_categories.cat.codes  # Plural 'labels' for BERT loss calculation

num_labels = len(label_map)
print(f"Detected {num_labels} classes: {label_map}")

Detected 5 classes: {0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}


### Data Tokenization and Dataset Preparation

In [8]:
# Tokenizer belonging to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    Every sequence is truncated or padded to exactly 256 tokens, so all
    encoded examples share the same length.
    """
    batch_texts = examples[text_col]
    encoding = tokenizer(
        batch_texts,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoding

# Build a Hugging Face Dataset from the text and label columns,
# then hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
model_frame = df[[text_col, 'labels']]
hf_dataset = Dataset.from_pandas(model_frame).train_test_split(test_size=0.2, seed=42)

# Tokenize both splits in batches, drop the raw text column, and return torch tensors
encoded_dataset = hf_dataset.map(tokenize_function, batched=True).remove_columns([text_col])
encoded_dataset.set_format("torch")

Map:   0%|          | 0/1701 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

### Model Initialization and Training Configuration

In [9]:
# Load Model
# Seed torch before loading: the classification head is not part of the
# "bert-base-uncased" checkpoint and is randomly initialised, so without a fixed
# seed every run would start from different head weights.
torch.manual_seed(42)

# Store the class names in the model config so that saved checkpoints and
# downstream inference report the category names instead of generic LABEL_n ids.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label={int(idx): str(name) for idx, name in label_map.items()},
    label2id={str(name): int(idx) for idx, name in label_map.items()},
)

# Accuracy metric from the `evaluate` library
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    index of the largest logit along the last axis.
    """
    raw_scores, true_ids = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=true_ids, predictions=predicted_ids)

# Configure Training
# Hyperparameters are gathered in one dict so they can be inspected in a single place
training_config = dict(
    output_dir="./results",
    # Schedule: evaluate and checkpoint once per epoch
    num_train_epochs=3,
    eval_strategy="epoch",       # Updated from evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging
    logging_steps=10,            # Log more frequently to see training loss
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# Initialize Trainer with the train split for fitting and the held-out split for evaluation
train_split = encoded_dataset["train"]
eval_split = encoded_dataset["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Start Training (kept as the last expression so the TrainOutput summary is displayed)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.43,0.303737,0.971831
2,0.1172,0.11424,0.978873
3,0.0604,0.094417,0.978873




TrainOutput(global_step=162, training_loss=0.3877436544424222, metrics={'train_runtime': 141.5347, 'train_samples_per_second': 36.055, 'train_steps_per_second': 1.145, 'total_flos': 671345940496896.0, 'train_loss': 0.3877436544424222, 'epoch': 3.0})

### Test

In [10]:
# 1. Sentence to classify
text = "The team won the championship after a spectacular goal in the final minute!"

# 2. Encode the sentence, then place the tensors on the same device as the model
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
inputs = inputs.to(model.device)

# 3. Forward pass in evaluation mode, without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# 4. The highest-scoring logit gives the class id
prediction_id = outputs.logits.argmax(dim=1).item()

# 5. Translate the class id back to its category name via label_map
predicted_category = label_map[prediction_id]
print(f"Predicted Category: {predicted_category}")

Predicted Category: sport
