# Installing dependencies
## Please make a copy of this notebook.

In [1]:
!pip install geopy > delete.txt
!pip install datasets > delete.txt
!pip install torch torchvision datasets > delete.txt
!pip install huggingface_hub > delete.txt
!pip install ipywidgets > delete.txt
!rm delete.txt

# Hugging Face login
You will need your personal access token.

In [1]:
# !python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"
# !python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"
!huggingface-cli login --token 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 --add-to-git-credential
!huggingface-cli whoami

# from huggingface_hub import notebook_login
# notebook_login()

Token is valid (permission: write).
The token `notebook` has been saved to /Users/sam/.cache/huggingface/stored_tokens
Your token has been saved in your configured git credential helpers (osxkeychain).
Your token has been saved to /Users/sam/.cache/huggingface/token
Login successful.
The current active token is: `notebook`
Carperis
[1morgs: [0m CISProject


# Part 1: Load Data

## Downloading the train and test dataset

### NOTE: You will need to create subfolders in {PATH_TO_YOUR_DATA_FOLDER} for each split (train/test/validation) or just (train/test). Next, place the corresponding images into each split after randomly shuffling them. Then, create a metadata.csv file for each split and place it in the corresponding directory.

In [2]:
from datasets import load_dataset

# Fetch the train and test splits of the FOX/NBC headline dataset from the Hub.
dataset_train, dataset_test = (
    load_dataset("CISProject/FOX_NBC", split=split_name)
    for split_name in ("train", "test")
)

In [3]:
# Integer class id for each outlet name found in the "news" column.
label_map = {"nbc": 0, "fox": 1}


def _with_label(example):
    """Return the extra "labels" column value for one dataset row."""
    return {"labels": label_map[example["news"]]}


dataset_train = dataset_train.map(_with_label)
dataset_test = dataset_test.map(_with_label)

In [4]:
# Show the columns and row counts of both splits.
print(dataset_train, dataset_test, sep="\n")

Dataset({
    features: ['title', 'news', 'index', 'url', 'labels'],
    num_rows: 3044
})
Dataset({
    features: ['title', 'news', 'index', 'url', 'labels'],
    num_rows: 761
})


# Part 2: Model

In [5]:
import torch
from torch import nn
from transformers import BertModel
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from huggingface_hub import PyTorchModelHubMixin
from sklearn.metrics import accuracy_score, classification_report

def get_device():
    """Return the name of the preferred torch backend: "cuda", then "mps", then "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "mps" if torch.backends.mps.is_available() else "cpu"


# Hugging Face Hub repository id; used further down both to load the model from the Hub
# and to push the model to it.
REPO_NAME = "CISProject/bert_news_title_classifier_model"

## Defining the Custom Model (Method 1)

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            max_length=..., padding="max_length", truncation=True)``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` / ``attention_mask`` and a ``label`` tensor."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenizer_kwargs = dict(
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)
        item = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(sample_label)
        return item


class BertNewsTitleClassifier(
    nn.Module,
    PyTorchModelHubMixin,
    repo_url="CISProject/bert_news_title_classifier",
    pipeline_tag="text-classification",
    license="mit",
):
    """BERT encoder plus a linear head that labels a news title as "nbc" or "fox".

    The model owns its tokenizer, so ``forward`` accepts a raw title string.
    ``forward_raw`` works on already-tokenized tensors and returns logits.

    Args:
        bert_model_name: Name or path passed to ``BertTokenizer.from_pretrained``
            and ``BertModel.from_pretrained``.
        num_classes: Output size of the linear classification head.
        max_length: Sequence length the tokenizer pads / truncates to.
        batch_size: Batch size of the DataLoader built by ``_load_dataset``.
    """

    def __init__(self, bert_model_name, num_classes, max_length, batch_size):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.max_length = max_length
        self.batch_size = batch_size

    def forward(self, title):
        """Classify a single title and return "nbc" (class 0) or "fox" (any other class).

        Prediction always runs with dropout disabled and without gradient tracking,
        whatever mode the module is in; the previous train/eval mode is restored
        before returning.
        """
        device = next(self.parameters()).device
        encoding = self.tokenizer(
            title,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        # A freshly constructed nn.Module is in train mode; with dropout left active the
        # same title could receive different labels on repeated calls.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits = self.forward_raw(input_ids, attention_mask)
        finally:
            self.train(was_training)

        cate = torch.argmax(logits, dim=1).item()
        return "nbc" if cate == 0 else "fox"

    def forward_raw(self, input_ids, attention_mask):
        """Return classification logits for already-tokenized inputs."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

    def _load_dataset(self, dataset, shuffle=True):
        """Wrap ``dataset`` (with "title" and "labels" columns) in a tokenizing DataLoader.

        Args:
            dataset: Object indexable by the column names "title" and "labels".
            shuffle: Whether the DataLoader shuffles; defaults to True as before.
        """
        dataset = TextClassificationDataset(
            texts=dataset["title"],
            labels=dataset["labels"],
            tokenizer=self.tokenizer,
            max_length=self.max_length,
        )
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def evaluate(self, dataset):
        """Return ``(accuracy, classification_report)`` for ``dataset``.

        Switches the module to eval mode and leaves it there.
        """
        device = next(self.parameters()).device
        # The metrics do not depend on sample order, so there is nothing to gain from shuffling.
        dataloader = self._load_dataset(dataset, shuffle=False)
        self.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                logits = self.forward_raw(input_ids, attention_mask)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(batch["label"].cpu().numpy())
        return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)

    def save_model(self, save_path):
        """Save the model locally using the Hugging Face format."""
        self.save_pretrained(save_path)

    def push_model(self, repo_name):
        """Push the model to the Hugging Face Hub."""
        self.push_to_hub(repo_name)

In [None]:
# load model from local pretrained model

config = {
    "bert_model_name": "bert-base-uncased",
    "num_classes": 2,
    "max_length": 32,
    "batch_size": 16,
}

model = BertNewsTitleClassifier(**config)
model.to(get_device())
# map_location="cpu": the checkpoint loads even on a machine that lacks the device it was
# saved from; load_state_dict then copies the values into the model's own parameters.
# weights_only=True: restrict unpickling to tensors/containers instead of arbitrary objects.
model.load_state_dict(
    torch.load(
        "bert_checkpoints_original/bert_classifier_epoch_40.pth",
        map_location="cpu",
        weights_only=True,
    )
)

In [None]:
# load model from hub pretrained model

# model = BertNewsTitleClassifier.from_pretrained(REPO_NAME)
# device = get_device()
# model.to(device)

### Evaluate Model

In [None]:
# Classify one headline; the model returns the predicted outlet name as a string.
news = model(
    "NRA leaders knock back liberal pols 'who want to exaggerate our death': 'We haven't lost a beat'"
)
print(news)

In [None]:
# Score the model on the held-out split and show accuracy followed by the per-class report.
accuracy, report = model.evaluate(dataset_test)
print(f"Accuracy: {accuracy:.4f}", report, sep="\n")

## Defining the Custom Model (Method 2)

In [7]:
from transformers import PretrainedConfig, CONFIG_MAPPING

class BertNewsTitleClassifierConfig(PretrainedConfig):
    """Configuration for the news-title classifier.

    Args:
        bert_model_name: Name or path of the BERT checkpoint the model wraps.
        num_classes: Number of output classes of the classification head.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    # Identifier under which this config is registered with the Auto classes.
    model_type = "bert_news_title_classifier"

    def __init__(
        self,
        bert_model_name: str = "bert-base-uncased",
        num_classes: int = 2,
        **kwargs,
    ) -> None:

        # Custom fields are assigned before delegating to the base initializer.
        self.bert_model_name = bert_model_name
        self.num_classes = num_classes

        super().__init__(**kwargs)

# CONFIG_MAPPING.register("bert_news_title_classifier", BertNewsTitleClassifierConfig)

from transformers import PreTrainedModel, MODEL_MAPPING

class BertNewsTitleClassifer(PreTrainedModel):
    """``PreTrainedModel`` wrapper: BERT encoder, dropout, then a linear classification head.

    ``forward`` takes tokenized tensors and returns raw logits.
    """

    config_class = BertNewsTitleClassifierConfig

    def __init__(self, config):
        super().__init__(config)

        # NEED TO FIX (original author's note): do not do from_pretrained inside the model
        self.bert = BertModel.from_pretrained(config.bert_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, config.num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from BERT's pooled output."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoder_out.pooler_output))

    def save_model(self, save_path):
        """Write the model to ``save_path`` in the Hugging Face format."""
        self.save_pretrained(save_path)

    def push_model(self, repo_name):
        """Upload the model to the Hugging Face Hub repository ``repo_name``."""
        self.push_to_hub(repo_name)

# MODEL_MAPPING.register(BertNewsTitleClassifierConfig, BertNewsTitleClassifer)

from transformers import AutoConfig, AutoModel

# exist_ok=True keeps this cell re-runnable: without it a second execution raises because
# the model type / config class is already registered in this kernel.
AutoConfig.register("bert_news_title_classifier", BertNewsTitleClassifierConfig, exist_ok=True)
AutoModel.register(BertNewsTitleClassifierConfig, BertNewsTitleClassifer, exist_ok=True)

In [8]:
# load model from local pretrained model

config = BertNewsTitleClassifierConfig(
    bert_model_name="bert-base-uncased",
    num_classes=2,
)

model = BertNewsTitleClassifer(config)
model.to(get_device())
# map_location="cpu": the checkpoint loads even on a machine that lacks the device it was
# saved from; load_state_dict then copies the values into the model's own parameters.
# weights_only=True: restrict unpickling to tensors/containers instead of arbitrary objects
# (this is also what the torch.load FutureWarning asks for).
model.load_state_dict(
    torch.load(
        "bert_checkpoints_original/bert_classifier_epoch_40.pth",
        map_location="cpu",
        weights_only=True,
    )
)

  model.load_state_dict(torch.load("bert_checkpoints_original/bert_classifier_epoch_40.pth"))


<All keys matched successfully>

In [10]:
from transformers import AutoModel

# Download the model from the Hub repository named by REPO_NAME.
# NOTE(review): presumably this relies on the AutoConfig/AutoModel registration done in the
# class-definition cell so that the custom model type can be resolved -- confirm on a fresh kernel.
model = AutoModel.from_pretrained(REPO_NAME)
# Move the weights to the preferred device; as the cell's last expression this also displays the model.
model.to(get_device())

# # Load model directly
# from transformers import BertNewsTitleClassifer
# model = BertNewsTitleClassifer.from_pretrained("CISProject/bert_news_title_classifier_model")

config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

KeyboardInterrupt: 

### Evaluate Model

In [11]:
class TextClassificationDataset(Dataset):
    """Pairs each text with its label and tokenizes lazily, one item at a time.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``return_tensors="pt"``,
            ``max_length``, ``padding="max_length"`` and ``truncation=True``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def _encode(self, text):
        """Run the tokenizer on one text with the fixed padding/truncation settings."""
        return self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids`` / ``attention_mask`` and a ``label`` tensor."""
        raw_text = self.texts[idx]
        raw_label = self.labels[idx]
        encoded = self._encode(raw_text)
        ids = encoded["input_ids"].flatten()
        mask = encoded["attention_mask"].flatten()
        return {"input_ids": ids, "attention_mask": mask, "label": torch.tensor(raw_label)}


def evaluate(model, dataset, eval_config):
    """Compute accuracy and a per-class report for ``model`` on ``dataset``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns logits
            with the class dimension at ``dim=1``. It is switched to eval mode and left there.
        dataset: Object indexable by the column names "title" and "labels".
        eval_config: Dict with the keys "bert_model_name" (tokenizer to load),
            "max_length" and "batch_size".

    Returns:
        Tuple ``(accuracy, report)`` from ``accuracy_score`` and ``classification_report``.

    Raises:
        KeyError: If ``eval_config`` is missing one of the required keys.
    """
    bert_model_name = eval_config["bert_model_name"]
    max_length = eval_config["max_length"]
    batch_size = eval_config["batch_size"]

    eval_data = TextClassificationDataset(
        texts=dataset["title"],
        labels=dataset["labels"],
        tokenizer=BertTokenizer.from_pretrained(bert_model_name),
        max_length=max_length,
    )
    # The metrics do not depend on sample order, so evaluation does not shuffle: this keeps
    # the batches reproducible and leaves the global RNG untouched.
    dataloader = DataLoader(eval_data, batch_size=batch_size, shuffle=False)

    device = next(model.parameters()).device
    model.eval()
    all_preds = []
    all_labels = []
    # No gradients are needed anywhere in the loop, including the argmax.
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["label"].cpu().numpy())
    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)

In [12]:
# Tokenizer and batching settings used for evaluation.
eval_config = dict(
    bert_model_name="bert-base-uncased",
    max_length=32,
    batch_size=16,
)
accuracy, report = evaluate(model, dataset_test, eval_config)
print(f"Accuracy: {accuracy:.4f}", report, sep="\n")

Accuracy: 0.8449
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       356
           1       0.89      0.81      0.85       405

    accuracy                           0.84       761
   macro avg       0.85      0.85      0.84       761
weighted avg       0.85      0.84      0.85       761



# Part 3: Pushing the Model to the Hugging Face Hub

In [13]:
# Upload the current `model` to the Hub repository named by REPO_NAME.
# The logged-in token needs write access to that repository.
# model.save_model(REPO_NAME) # save model to local in folder as specified in REPO_NAME
model.push_to_hub(REPO_NAME) # push model to Hugging Face Hub for sharing

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CISProject/bert_news_title_classifier_model/commit/dd49927ab794f3bc6590903eb42ec556ee8ccab8', commit_message='Upload BertNewsTitleClassifer', commit_description='', oid='dd49927ab794f3bc6590903eb42ec556ee8ccab8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CISProject/bert_news_title_classifier_model', endpoint='https://huggingface.co', repo_type='model', repo_id='CISProject/bert_news_title_classifier_model'), pr_revision=None, pr_num=None)

### NOTE: You need to ensure that your Hugging Face token has both read and write access to your repository and Hugging Face organization.