Download the News Category dataset from https://metatext.io/datasets/news-category-dataset, then upload it to your Google Drive in a folder called "datasets".

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive so files stored there can be opened by path.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Path of the dataset sample file inside the "datasets" folder under the /content/gdrive mount point.
file = "/content/gdrive/MyDrive/datasets/news_category_dataset_sample.txt"

In [None]:
import json

def process_file(file_path):
  """Read a JSON Lines file and return its records as a list.

  Args:
    file_path: Path to a text file holding one JSON document per line.

  Returns:
    A list with one parsed object per non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # Explicit UTF-8 keeps decoding independent of the platform's default locale.
  # Blank lines (e.g. stray empty lines at the end of the file) are skipped
  # because json.loads would raise on them.
  with open(file_path, 'r', encoding='utf-8') as handle:
    return [json.loads(line) for line in handle if line.strip()]

# Load every record from the dataset file and report how many were read.
news = process_file(file)
print(len(news))

124


In [None]:
news[5]

{'link': 'https://www.huffpost.com/entry/belk-worker-found-dead-columbiana-centre-bathroom_n_632c5f8ce4b0572027b0251d',
 'headline': 'Cleaner Was Dead In Belk Bathroom For 4 Days Before Body Found: Police',
 'category': 'U.S. NEWS',
 'short_description': 'The 63-year-old woman was seen working at the South Carolina store on Thursday. She was found dead Monday after her family reported her missing, authorities said.',
 'authors': '',
 'date': '2022-09-22'}

In [None]:
# Build one (text, category) tuple per record; the text is the headline and the
# short description joined by " || ".
text_cat_pairs = []

for news_item in news:
    # dict.get() returns None for a missing key, which would make the string
    # join below fail; `or ""` substitutes an empty string for missing/null fields.
    headline = news_item.get("headline") or ""
    short_description = news_item.get("short_description") or ""
    text = f"{headline} || {short_description}"
    category = news_item.get("category")
    text_cat_pairs.append((text, category))

text_cat_pairs[5]

('Cleaner Was Dead In Belk Bathroom For 4 Days Before Body Found: Police || The 63-year-old woman was seen working at the South Carolina store on Thursday. She was found dead Monday after her family reported her missing, authorities said.',
 'U.S. NEWS')

In [None]:
# Walk the (text, label) tuples in text_cat_pairs and give every distinct label
# the next free integer, in order of first appearance. index_to_label is the
# inverse mapping of label_to_index.

label_to_index = {}

for _, label in text_cat_pairs:
    # setdefault only inserts when the label is new, so the current dict size is
    # always the next unused index.
    label_to_index.setdefault(label, len(label_to_index))

index_to_label = {index: name for name, index in label_to_index.items()}
# number of distinct labels seen
i = len(label_to_index)

In [None]:
label_to_index

{'U.S. NEWS': 0,
 'COMEDY': 1,
 'PARENTING': 2,
 'WORLD NEWS': 3,
 'CULTURE & ARTS': 4,
 'TECH': 5,
 'SPORTS': 6,
 'ENTERTAINMENT': 7,
 'POLITICS': 8,
 'WEIRD NEWS': 9,
 'ENVIRONMENT': 10,
 'EDUCATION': 11,
 'CRIME': 12,
 'SCIENCE': 13}

In [None]:
index_to_label

{0: 'U.S. NEWS',
 1: 'COMEDY',
 2: 'PARENTING',
 3: 'WORLD NEWS',
 4: 'CULTURE & ARTS',
 5: 'TECH',
 6: 'SPORTS',
 7: 'ENTERTAINMENT',
 8: 'POLITICS',
 9: 'WEIRD NEWS',
 10: 'ENVIRONMENT',
 11: 'EDUCATION',
 12: 'CRIME',
 13: 'SCIENCE'}

In [None]:
import torch

def convert_labels(label):
  """Return the integer index of `label` (looked up in label_to_index) as a scalar tensor.

  Raises KeyError if the label is not present in label_to_index.
  """
  index = label_to_index[label]
  return torch.tensor(index)


# Pull the category out of every (text, category) pair, then show one label
# before and after conversion to a tensor.
labels = [category for _, category in text_cat_pairs]
sample_label = labels[5]
print(sample_label)
print(convert_labels(sample_label))

U.S. NEWS
tensor(0)


In [None]:
# Replace each label string with its scalar index tensor, then stack the
# scalars into a single 1-D target tensor.
labels = list(map(convert_labels, labels))
stacked_tensors_y = torch.stack(labels)
stacked_tensors_y.shape

torch.Size([124])

In [None]:
# Make sure the targets are int64, the dtype nn.CrossEntropyLoss expects for
# class-index targets (this is a no-op if the tensor is already int64).
stacked_tensors_y = stacked_tensors_y.long()

In [None]:
from transformers import BertTokenizer, BertModel

# Load the pre-trained tokenizer and the embedding model. Both come from the
# same 'bert-base-uncased' checkpoint, so the token ids produced by the
# tokenizer match the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
ex = text_cat_pairs[4][0]
ex

'Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer || Amy Cooper accused investment firm Franklin Templeton of unfairly firing her and branding her a racist after video of the Central Park encounter went viral.'

In [None]:
# Encode the example into token ids, then decode each id on its own to show
# how the text was split into individual tokens.
tokens = tokenizer.encode(ex)
print([tokenizer.decode([t]) for t in tokens])

['[CLS]', 'woman', 'who', 'called', 'cops', 'on', 'black', 'bird', '-', 'watch', '##er', 'loses', 'lawsuit', 'against', 'ex', '-', 'employer', '|', '|', 'amy', 'cooper', 'accused', 'investment', 'firm', 'franklin', 'temple', '##ton', 'of', 'unfair', '##ly', 'firing', 'her', 'and', 'branding', 'her', 'a', 'racist', 'after', 'video', 'of', 'the', 'central', 'park', 'encounter', 'went', 'viral', '.', '[SEP]']


In [None]:
def embed_sentence(sentence):
  """Return the embedding model's contextual token embeddings for one sentence.

  Args:
    sentence: A single text string.

  Returns:
    A 2-D tensor of shape (num_tokens, hidden_size) taken from the model's
    last hidden state. num_tokens includes the special tokens added by the
    tokenizer.
  """
  # truncation=True caps the input at the tokenizer's maximum length, so an
  # overly long text is shortened instead of failing inside the model.
  inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
  # The model is only used to extract features here, so skip gradient tracking.
  with torch.no_grad():
    outputs = embedding_model(**inputs)
  embeddings = outputs.last_hidden_state
  # Drop the leading batch axis (size 1 for a single sentence). Unlike a
  # reshape via view(), squeeze(0) cannot silently mix data across examples.
  return embeddings.squeeze(0)

In [None]:
len(tokens)

48

In [None]:
embedded_example = embed_sentence(ex)
embedded_example.shape

torch.Size([48, 768])

In [None]:
# Embed the text part of every (text, category) pair and check the shape of the last result.
embedded_sentences = [embed_sentence(text) for text, _ in text_cat_pairs]
print(embedded_sentences[-1].shape)

torch.Size([45, 768])


In [None]:
len(embedded_sentences)

124

In [None]:
# Zero-pad every (num_tokens, hidden_size) embedding along the token axis up to
# the longest sentence and stack the results into one
# (num_sentences, max_len, hidden_size) tensor. pad_sequence performs the
# padding and the stacking in a single call.
stacked_tensors_x = torch.nn.utils.rnn.pad_sequence(embedded_sentences, batch_first=True)

# Length of the longest sentence, i.e. the padded sequence length
max_len = stacked_tensors_x.shape[1]
stacked_tensors_x.shape

torch.Size([124, 69, 768])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerClassifier(nn.Module):
    """One self-attention layer, layer norm, mean pooling and a linear classification head.

    Args:
        input_dim: Size of each token embedding; must be divisible by ``num_heads``.
        num_classes: Number of output classes (logits produced per example).
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, input_dim, num_classes, num_heads=6, dropout=0.1):
        super().__init__()

        # Self-attention layer. batch_first=True means inputs are laid out as
        # (batch, seq_len, input_dim). kdim/vdim are left at their defaults,
        # which equal input_dim.
        self.transformer_encoder = nn.MultiheadAttention(
            input_dim, num_heads, dropout=dropout, bias=False, batch_first=True
        )
        self.norm = nn.LayerNorm(input_dim)
        # Fully connected head
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x, key_padding_mask=None):
        """Compute class logits for a batch of embedded sequences.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).
            key_padding_mask: Optional boolean tensor of shape (batch, seq_len)
                where True marks a padded position. Padded positions are then
                ignored as attention keys and left out of the mean pooling.
                When omitted, every position is treated as real.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized logits.
        """
        # Self-attention: queries, keys and values are all the input sequence.
        attended, _ = self.transformer_encoder(
            x, x, x, key_padding_mask=key_padding_mask, need_weights=False
        )
        attended = self.norm(attended)

        if key_padding_mask is None:
            # Average over all positions of the sequence.
            pooled = attended.mean(dim=1)
        else:
            # Average over real positions only, so padding does not dilute the
            # sentence representation.
            is_padding = key_padding_mask.unsqueeze(-1)
            summed = attended.masked_fill(is_padding, 0.0).sum(dim=1)
            # clamp avoids a division by zero for a fully padded sequence.
            real_counts = (~is_padding).sum(dim=1).clamp(min=1)
            pooled = summed / real_counts

        # Fully connected layer for classification
        return self.fc(pooled)


Above, we used only one attention layer. To stack multiple attention layers, we can use the `nn.TransformerEncoder` class from PyTorch (`torch.nn`) instead.

In [None]:
# The class count comes from the label mapping; the feature size is the last
# axis of the stacked input tensor.
num_classes = len(label_to_index)
embedding_dimensionality = stacked_tensors_x.size(-1)
embedding_dimensionality, num_classes

(768, 14)

In [None]:
# Instantiate the classifier: the embedding size is the input width and the
# number of classes is the output width.
input_size = embedding_dimensionality
output_size = num_classes
model = TransformerClassifier(input_size, output_size)

In [None]:
# Forward pass over the whole stacked input tensor as a single batch.
output = model(stacked_tensors_x)
output.shape

torch.Size([124, 14])

In [None]:
# Cross-entropy between the model's outputs and the integer class targets.
criterion = nn.CrossEntropyLoss()
loss = criterion(output, stacked_tensors_y)

In [None]:
loss

tensor(2.8244, grad_fn=<NllLossBackward0>)

In [None]:
# Backpropagate the loss to compute gradients.
loss.backward()

In [None]:
from torch import optim

# AdamW optimizer over all of the model's parameters
optimizer = optim.AdamW(model.parameters(), lr=0.005)

# take an optimization step to update the weights from their current gradients
optimizer.step()
# clear the accumulated gradients so they do not carry over into the next backward pass
optimizer.zero_grad()