In [1]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the news CSV and discard every row that has a missing value in any column.
dataset = pd.read_csv("news_category_dataset.csv").dropna()

In [3]:
# Seed torch's global RNG before the model is built: the classification head is
# newly initialised rather than loaded from the checkpoint, so without a seed
# every kernel restart would start fine-tuning from different weights.
seed = 42
torch.manual_seed(seed)

model_name = "bert-base-uncased"
# Size of the classification head: one output per news category that the
# notebook encodes as an integer id.
num_labels = 10

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Fine-tuning hyperparameters used by the optimizer, the DataLoader and the training loop.
num_epochs = 3         # number of passes over the training data
batch_size = 32        # examples per batch drawn from the DataLoader
learning_rate = 2e-5   # learning rate handed to the optimizer

In [5]:
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in favour
# of `torch.optim.AdamW`. `eps` and `weight_decay` are set explicitly to the
# values the deprecated class defaulted to (1e-6 and 0.0), because PyTorch's
# defaults differ (1e-8 and 0.01) and would otherwise add weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)



In [6]:
def tokenize_data(texts, labels, max_length=None):
    """Tokenize texts and convert their labels to an integer tensor.

    Args:
        texts: Text(s) handed unchanged to the notebook-level ``tokenizer``.
        labels: Sequence of numeric class ids, one per text.
        max_length: Optional truncation length forwarded to the tokenizer.
            ``None`` (the default) leaves the limit to the tokenizer, which is
            what happened before this parameter existed.

    Returns:
        An ``(inputs, labels)`` tuple: the tokenizer output as PyTorch tensors
        (requested with ``padding=True`` and ``truncation=True``) and a
        ``torch.long`` tensor holding the labels.
    """
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # Pin the dtype to int64: without it, ids that arrive as floats would
    # silently produce a float tensor instead of integer class targets.
    labels = torch.tensor(labels, dtype=torch.long)
    return inputs, labels

In [7]:
# One training string per row: "<title> <description>". Column-wise
# concatenation builds the same strings as a row-by-row `apply`, without
# calling a Python lambda for every row.
text_data = (dataset["title"] + " " + dataset["description"]).tolist()

# Category names in label-id order: a name's position in this list is the
# integer id it is encoded as below.
category_names = [
    'Technology',
    'Environment',
    'Entertainment',
    'Politics',
    'Education',
    'Crime',
    'Sports',
    'Business',
    'Travel',
    'Money'
]

# Overwrite each category name in the DataFrame with its integer id.
# Rows whose category is not in `category_names` are left untouched.
for category_id, category_name in enumerate(category_names):
    dataset.loc[dataset['category'] == category_name, 'category'] = category_id

# `categories` becomes the per-row list of label values that is later passed to
# `tokenize_data`; the names themselves stay available in `category_names`.
categories = list(dataset.category)
categories[0]

0

In [8]:
# for i in range(len(dataset)):
#     if dataset['category'][i] in (0,1,2,3,4,5,6,7,8,9):
#         print(i)

In [9]:
# Spot-check a single row by position to eyeball its title, description and category.
spot_check_position = 23844
dataset.iloc[spot_check_position]

title          John Podesta Says 'Forces Within The FBI' Want...
description    “I think to this day it’s inexplicable that th...
category                                                       3
Name: 23845, dtype: object

In [10]:
# Tokenize every prepared text and turn the per-row label values into a tensor.
inputs, labels = tokenize_data(texts=text_data, labels=categories)

In [11]:
# Bundle token ids, attention masks and labels into one dataset of aligned tensors.
# It gets its own name (`train_dataset`) so the pandas DataFrame held in `dataset`
# is not overwritten and the earlier data-preparation cells can still be re-run.
train_dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], labels)

# Shuffled mini-batches of `batch_size` examples for the training loop.
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [12]:
# Fine-tune the model for `num_epochs` passes over the batches in `dataloader`.
for epoch in range(num_epochs):
    model.train()  # put the model in training mode at the start of every epoch
    epoch_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        # Unpack into `batch_labels`, not `labels`, so the loop does not overwrite
        # the notebook-level `labels` tensor that holds the labels of the whole
        # dataset (rebuilding the TensorDataset afterwards would otherwise fail).
        input_ids, attention_mask, batch_labels = batch
        # Passing `labels=` makes the model return the loss on `outputs.loss`.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # reset gradients so they do not accumulate into the next step
        epoch_loss += loss.item()
        num_batches += 1
    # Report the mean training loss once per epoch so progress is visible;
    # `max(..., 1)` avoids a division by zero if the dataloader is empty.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}")

: 

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory so the
# directory can be reloaded on its own with `from_pretrained`.
output_dir = "fine_tuned_bert_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
def predict_category(text):
    """Return the predicted integer class id for a single text.

    Uses the notebook-level ``tokenizer`` and ``model``. The forward pass runs
    in evaluation mode with gradient tracking disabled, and the model's
    previous train/eval mode is restored afterwards.

    Args:
        text: The text to classify. It is expected to be one string: the
            result is read with ``.item()``, which only works when there is
            exactly one prediction.

    Returns:
        The index of the largest logit as a Python ``int``.
    """
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Put the input tensors on the same device as the model.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # The training loop leaves the model in training mode; switch to eval mode
    # for the prediction, then put the model back the way it was found.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # no gradients are needed for inference
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return predicted_label

In [None]:
# Example input: a headline followed by its description in one string.
# The adjacent string literals below are joined into a single string.
sample_text = (
    "Maury Wills, Base-Stealing Shortstop For Dodgers, Dies At 89 "
    "Maury Wills, who helped the Los Angeles Dodgers win three World Series titles "
    "with his base-stealing prowess, has died"
)
predicted_label = predict_category(sample_text)

In [None]:
# Category names in label-id order. This must be the same order that was used
# to encode the `category` column as integers before training, so that
# `category_names[i]` is the name for predicted label `i`.
category_names = [
    'Technology',
    'Environment',
    'Entertainment',
    'Politics',
    'Education',
    'Crime',
    'Sports',
    'Business',
    'Travel',
    'Money'
]
predicted_category = category_names[predicted_label]
print(f'Predicted Category: {predicted_category}')