# Overview

The data is the Hyperpartisan News Detection dataset (`bypublisher` configuration) from Hugging Face.

It is cleaned by dropping null entries and removing HTML artifacts from the article text.

Article text is tokenized using the DistilBERT tokenizer.

# Setup

In [None]:
# Mount Google Drive at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE: To access the shared files, go to Drive and choose "Add shortcut to Drive"
# in the shared folder's options; otherwise the folder is not reachable once mounted.

In [None]:
# Path to the working folder on the mounted Google Drive
path = "/content/gdrive/MyDrive/"
# Change into that folder so relative paths used later (e.g. "processed_dataset") resolve there
%cd {path}
# Print the current working directory to confirm the change
%pwd

In [None]:
# Pip installs: Hugging Face packages (this notebook imports `transformers` and `datasets` below)
%pip install transformers datasets evaluate accelerate huggingface_hub

In [None]:
import torch
import datasets
from datasets import load_dataset
import pandas as pd
import html
import re


# CUDA check: report the PyTorch version, then pick the compute device.
# Availability is queried once, at the point where its result is actually used;
# a bare call in the middle of a cell would neither display nor store anything.
print(torch.__version__)

from torch import cuda  # keeps the name `cuda` available to later cells
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# Data Processing
This can take a couple of minutes.
The dataset is downloaded from Hugging Face, filtered to drop null entries, and then cleaned: HTML tags are removed and HTML entities are unescaped.

In [None]:
# Load the "bypublisher" configuration of the Hyperpartisan News Detection
# dataset with the Hugging Face `datasets` library.
dataset = load_dataset("hyperpartisan_news_detection", "bypublisher")

# Show the available splits, then the summary of each split used below.
print(dataset.keys())
for split_name in ("train", "validation"):
    print(dataset[split_name])

In [None]:
# Data Cleaning
# Placeholder strings that mark a missing article text; matched after the text
# has been stripped of surrounding whitespace and lower-cased.
error_values = ['null', 'n/a', 'nan', 'none', '']


def clean_dataset(example):
    """Decide whether a dataset row is usable.

    A row is rejected when its "text" is not a string (e.g. ``None``), when
    the text is blank or equals one of ``error_values`` (ignoring case and
    surrounding whitespace), or when its "bias" label is null.

    Args:
        example: A single row, indexable by the keys "text" and "bias".

    Returns:
        bool: True to keep the row, False to drop it.
    """
    text = example["text"]
    # Non-string text must be dropped here: the HTML-stripping step applies a
    # regex substitution to this field and would raise TypeError on None.
    if not isinstance(text, str):
        return False
    if text.strip().lower() in error_values:
        return False
    # `not` also turns a possible numpy bool from pandas into a plain bool.
    return not pd.isnull(example["bias"])

# Apply the row filter to every split using 16 worker processes.
cleaned_dataset = dataset.filter(clean_dataset, num_proc=16)

# Report how many rows each split had before and after filtering.
for heading, split in (("Train\tOriginal:", "train"), ("Validation\tOriginal:", "validation")):
    print(heading, dataset[split].num_rows, "\tCleaned:", cleaned_dataset[split].num_rows)

In [None]:
# Data Processing

# Matches one HTML/XML tag: "<", any run of characters other than "<" or ">", then ">".
HTML_REMOVE = re.compile(r'<[^<>]*>')


def process_dataset(example):
    """Strip HTML tags from a row's "text" and decode its HTML entities.

    Tags are removed first and entities are unescaped afterwards, so text that
    was written as escaped markup (e.g. ``&lt;b&gt;``) survives as literal
    characters instead of being stripped.

    Args:
        example: A single row whose "text" value is a string.

    Returns:
        The same row object, with "text" replaced by the cleaned string.
    """
    # Use the precompiled module-level pattern rather than repeating the literal.
    example["text"] = html.unescape(HTML_REMOVE.sub('', example["text"]))
    return example

# Clean the article text of every split using 16 worker processes.
processed_dataset = cleaned_dataset.map(process_dataset, num_proc=16)

# Preview the first article of each split before and after cleaning.
# Rows are indexed first (ds[0]['text']) instead of the column (ds['text'][0]):
# the latter reads the whole text column just to display one article.
print("Train\tOriginal:\n", cleaned_dataset['train'][0]['text'].replace("\n\n","\n"))
print("Train\tCleaned:\n", processed_dataset['train'][0]['text'].replace("\n\n","\n"))
print("##################################################")
print("Validation\tOriginal:\n", cleaned_dataset['validation'][0]['text'].replace("\n\n","\n"))
print("Validation\tCleaned:\n", processed_dataset['validation'][0]['text'].replace("\n\n","\n"))

In [None]:
# Save the cleaned dataset to the "processed_dataset" folder (relative to the
# current working directory) so the tokenization section can reload it.
processed_dataset.save_to_disk("processed_dataset")

# Tokenize
Article text is tokenized using the DistilBERT tokenizer (`distilbert-base-uncased`).

In [None]:
from transformers import AutoTokenizer

In [None]:
# Reload the cleaned dataset from the "processed_dataset" folder and show its
# splits and per-split summaries.
processed_dataset = datasets.load_from_disk("processed_dataset")
print(processed_dataset.keys())
print(processed_dataset["train"])
print(processed_dataset["validation"])
# Index the row first: ds['text'][0] would read the whole text column just to
# display a single article.
print(processed_dataset['train'][0]['text'].replace("\n\n","\n"))

In [None]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Tokenize article text with the DistilBERT tokenizer.

    Text is truncated and padded to the tokenizer's maximum length. No
    ``return_tensors`` is requested: the dataset stores plain lists anyway,
    and asking for tensors on a single example adds a leading batch dimension
    that would be saved as nested ``[[...]]`` lists.

    Args:
        example: A row, or a batch of rows, whose "text" holds a string or a
            list of strings.

    Returns:
        The tokenizer output (e.g. "input_ids", "attention_mask"), which
        ``map`` merges into the dataset as new columns.
    """
    return tokenizer(example["text"], truncation=True, padding="max_length")


# batched=True passes lists of texts to the tokenizer in one call instead of
# tokenizing one article per call.
# NOTE(review): each row's "input_ids" is now a flat list rather than a
# one-element nested list — confirm downstream code does not expect the
# extra dimension.
tokens = processed_dataset.map(tokenize, batched=True, num_proc=16)
print("Train\tOriginal:\n", processed_dataset['train'])
print("Train\tTokenized:\n", tokens['train'])
print("Validation\tOriginal:\n", processed_dataset['validation'])
print("Validation\tTokenized:\n", tokens['validation'])

In [None]:
# Upload the tokenized dataset to the Hugging Face Hub repository
# "bzhao18/hyperpartisan-news-distilbert-tokens".
# NOTE(review): presumably requires being logged in to the Hub with write
# access to that repo; no login step is visible in this notebook — confirm.
tokens.push_to_hub("bzhao18/hyperpartisan-news-distilbert-tokens")