# Installing dependencies
## Please make a copy of this notebook.

In [69]:
!pip install geopy > delete.txt
!pip install datasets > delete.txt
!pip install torch torchvision datasets > delete.txt
!pip install huggingface_hub > delete.txt
!rm delete.txt

# Huggingface login
You will need your personal Hugging Face access token.

In [None]:
# SECURITY: never paste an access token into the notebook; anyone who receives
# a copy of the file can use it. The token previously hardcoded in this cell
# must be treated as leaked and revoked in the Hugging Face account settings.
#
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise login() asks for it interactively.
import os

from huggingface_hub import login

login(token=os.environ.get("HF_TOKEN"))
# Alternative from a shell: !huggingface-cli login

# Part 1: Load Data

## Downloading the train and test dataset

### NOTE: You will need to create subfolders in {PATH_TO_YOUR_DATA_FOLDER} for each split (train/test/validation) or just (train/test). Next, place the corresponding images into each split after randomly shuffling them. Then, create a metadata.csv file for each split and place it in the corresponding directory.

In [71]:
from datasets import load_dataset

# Fetch the train and test splits of the same Hub dataset, in that order.
dataset_train, dataset_test = (
    load_dataset("CISProject/FOX_NBC", split=split_name)
    for split_name in ("train", "test")
)

In [72]:
# Integer class id for each news source found in the "news" column.
label_map = {"nbc": 0, "fox": 1}


def _encode_label(example):
    """Return the extra "labels" column for one row, derived from its "news" value."""
    return {"labels": label_map[example["news"]]}


dataset_train = dataset_train.map(_encode_label)
dataset_test = dataset_test.map(_encode_label)

In [73]:
# Show the columns and row counts of both splits, one summary per line.
print(dataset_train, dataset_test, sep="\n")

Dataset({
    features: ['title', 'news', 'index', 'url', 'labels'],
    num_rows: 3044
})
Dataset({
    features: ['title', 'news', 'index', 'url', 'labels'],
    num_rows: 761
})


# Part 2: Model

## Defining the Custom Model

In [None]:
import torch
from torch import nn
from transformers import BertModel
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from huggingface_hub import PyTorchModelHubMixin
from sklearn.metrics import accuracy_score, classification_report

class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time an item is read.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``return_tensors="pt"``,
            ``max_length``, ``padding="max_length"`` and ``truncation=True``;
            its result is indexed by "input_ids" and "attention_mask".
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # The dataset size is defined by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token ids, attention mask and label tensor for item ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        # Flatten each tokenizer tensor to one dimension.
        item = {
            field: encoded[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(sample_label)
        return item

class BertNewsClassifier(
    nn.Module,
    PyTorchModelHubMixin,
    repo_url="CISProject/bert_news_title_classifier",
    pipeline_tag="text-classification",
    license="mit",
):
    """BERT encoder with a dropout + linear head that labels a news title "nbc" or "fox".

    The class keyword arguments (``repo_url``, ``pipeline_tag``, ``license``)
    are handed to ``PyTorchModelHubMixin``; presumably they populate the model
    card metadata (verify against the huggingface_hub documentation).

    Args:
        bert_model_name: Name passed to ``BertTokenizer.from_pretrained`` and
            ``BertModel.from_pretrained``.
        num_classes: Output size of the linear classification head.
        max_length: Length every title is padded/truncated to when tokenized.
        batch_size: Batch size used by the data loader built in ``_load_dataset``.
    """

    def __init__(self, bert_model_name, num_classes, max_length, batch_size):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.max_length = max_length
        self.batch_size = batch_size

    def _switch_to_eval(self):
        """Put every submodule in eval mode and return the modes they had before."""
        previous_modes = [(module, module.training) for module in self.modules()]
        self.eval()
        return previous_modes

    @staticmethod
    def _restore_modes(previous_modes):
        """Reinstate the train/eval flags captured by ``_switch_to_eval``."""
        for module, was_training in previous_modes:
            module.training = was_training

    def forward(self, title):
        """Predict the source of a single title.

        The prediction runs in eval mode and without gradient tracking, so
        dropout cannot make the result random; the module's previous
        train/eval state is restored afterwards.

        Args:
            title: One news title. The result is reduced with ``.item()``, so
                exactly one prediction must come out of the model.

        Returns:
            "nbc" when the predicted class id is 0, otherwise "fox".
        """
        device = next(self.parameters()).device
        encoding = self.tokenizer(
            title,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        previous_modes = self._switch_to_eval()
        try:
            # The return value is a plain string, so no gradient can ever be
            # needed from this call.
            with torch.no_grad():
                logits = self.forward_raw(input_ids, attention_mask)
        finally:
            self._restore_modes(previous_modes)

        cate = torch.argmax(logits, dim=1).item()
        return "nbc" if cate == 0 else "fox"

    def forward_raw(self, input_ids, attention_mask):
        """Return the classification logits for already tokenized inputs.

        Gradient tracking and train/eval mode are left to the caller, which
        makes this the method to use inside a training loop.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

    def _load_dataset(self, dataset, shuffle=True):
        """Wrap ``dataset`` ("title" and "labels" columns) in a ``DataLoader``.

        Args:
            dataset: Object indexable by "title" and "labels".
            shuffle: Whether the loader shuffles the samples. Defaults to True,
                which matches the behavior before this parameter existed.
        """
        dataset = TextClassificationDataset(
            texts=dataset["title"],
            labels=dataset["labels"],
            tokenizer=self.tokenizer,
            max_length=self.max_length,
        )
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def evaluate(self, dataset):
        """Score the model on ``dataset``.

        Runs in eval mode without gradient tracking and restores the module's
        previous train/eval state on exit, so it can be called from inside a
        training loop.

        Returns:
            A ``(accuracy, report)`` tuple from ``accuracy_score`` and
            ``classification_report``.
        """
        device = next(self.parameters()).device
        # Sample order does not affect the metrics, so skip the shuffle.
        dataloader = self._load_dataset(dataset, shuffle=False)
        all_preds = []
        all_labels = []

        previous_modes = self._switch_to_eval()
        try:
            with torch.no_grad():
                for batch in dataloader:
                    input_ids = batch["input_ids"].to(device)
                    attention_mask = batch["attention_mask"].to(device)
                    labels = batch["label"]
                    logits = self.forward_raw(input_ids, attention_mask)
                    preds = torch.argmax(logits, dim=1)
                    all_preds.extend(preds.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())
        finally:
            self._restore_modes(previous_modes)

        return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)

    def save_model(self, save_path):
        """Save the model locally using the Hugging Face format."""
        self.save_pretrained(save_path)

    def push_model(self, repo_name):
        """Push the model to the Hugging Face Hub."""
        self.push_to_hub(repo_name)

In [None]:
from huggingface_hub import hf_hub_download

def get_device():
    """Return the name of the preferred torch device: "cuda", then "mps", then "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    # MPS is only probed when CUDA is not available.
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


# Identifier used when saving, pushing and loading the classifier.
REPO_NAME = "CISProject/bert_news_title_classifier"

In [76]:
# Disabled one-off export, kept for reference: it rebuilds the classifier from
# the config below, loads the epoch-40 checkpoint weights, and saves the model
# in Hugging Face format under REPO_NAME.
# config = {
#     "bert_model_name": "bert-base-uncased",
#     "num_classes": 2,
#     "max_length": 32,
#     "batch_size": 16,
# }
# model = BertNewsClassifier(**config)
# model.to(get_device())
# model.load_state_dict(torch.load("bert_checkpoints_original/bert_classifier_epoch_40.pth"))
# model.save_model(REPO_NAME)

## Evaluate Model

# Part 3: Pushing the Model to the Hugging Face Hub

In [77]:
# model.push_to_hub(REPO_NAME)

In [78]:
# Choose the device, load the weights stored under REPO_NAME, and move the
# model onto the device. The final expression shows the module structure as
# the cell output.
device = get_device()
model = BertNewsClassifier.from_pretrained(REPO_NAME)
model.to(device)

Loading weights from local directory


BertNewsClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [80]:
# Smoke test: classify one headline and print the predicted source.
sample_title = "NRA leaders knock back liberal pols 'who want to exaggerate our death': 'We haven't lost a beat'"
print(model(sample_title))

fox


In [81]:
# Score the model on the held-out split, then print the accuracy line
# followed by the per-class report.
accuracy, report = model.evaluate(dataset_test)
print(f"Accuracy: {accuracy:.4f}", report, sep="\n")

Accuracy: 0.8449
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       356
           1       0.89      0.81      0.85       405

    accuracy                           0.84       761
   macro avg       0.85      0.85      0.84       761
weighted avg       0.85      0.84      0.85       761



### NOTE: You need to ensure that your Hugging Face token has both read and write access to your repository and Hugging Face organization.