## Hyperparameters

In [1]:
train_percent = 0.8
validation_percent = 0.1
test_percent = 0.1
vocab_cutoff = 2

random_seed = 100

In [30]:
import torch
import sqlite3
import pandas as pd
from tqdm import tqdm
import nltk
import numpy as np
import random
from collections import Counter
import json
import re
import html
from HeadlineDataset import HeadlineDataset
from torch.utils.data import RandomSampler, DataLoader, WeightedRandomSampler
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau, ExponentialLR

In [3]:

def clean_text(text):
    text = (
        text.replace("#39;", "'")
        .replace("amp;", "&")
        .replace("#146;", "'")
        .replace("nbsp;", " ")
        .replace("#36;", "$")
        .replace("\\n", "\n")
        .replace("quot;", "'")
        .replace("<br />", "\n")
        .replace('\\"', '"')
        .replace(" @.@ ", ".")
        .replace(" @-@ ", "-")
        .replace(" @,@ ", ",")
        .replace("\\", " \\ ")
    )
    text = re.compile(r"  +").sub(" ", html.unescape(text))
    text = re.sub(r"(\n(\s)*){2,}", "\n", text)
    text =  re.sub(r"([/#\n])", r" \1 ", text)
    text = re.sub(" {2,}", " ", text).strip()
    return text

## Load Data

In [4]:
con = sqlite3.connect("../Data/Headlines.db")

fake_headlines = pd.read_sql("SELECT * FROM fake_headlines", con)

real_headlines = pd.read_sql("""
select headline.title as text
from headline join feed f on headline.url = f.url
WHERE f.name not in ("The Onion (Fake News)", "Babylon Bee (Fake News)")
""", con)
con.close()

In [5]:

seed = 100
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
del seed
fake_headlines["tokenized"] = fake_headlines["text"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower()))).sample(frac=1).reset_index(drop=True)

real_headlines["tokenized"] = real_headlines["text"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower()))).sample(frac=1).reset_index(drop=True)

fake_headlines['label'] = 1
real_headlines['label'] = 0

## Split Up Data
80% for training, 10% for validation, 10% for testing

In [6]:
print(f"{train_percent * 10}% for training, {validation_percent * 10}% for validation, and {test_percent * 10}% for testing")
def split_train_val_test(df, props):
    assert round(sum(props), 2) == 1
    results = []
    start = 0
    for percent in props:
        length = int(len(df) * percent)
        end = start + length + 1
        results.append(df.iloc[start:end])
        start = end
    return results
props = [train_percent, validation_percent, test_percent]
fake_sets = split_train_val_test(fake_headlines, props)
for i in range(len(fake_sets)):
    for j in range(4):
        fake_sets[i] = pd.concat([fake_sets[i], fake_sets[i]])
real_sets = split_train_val_test(real_headlines, props)

train_set, validation_set, test_set = [pd.concat([x, y]).sample(frac=1).reset_index(drop=True) for x, y in zip(fake_sets, real_sets)]

8.0% for training, 1.0% for validation, and 1.0% for testing


In [7]:
print(f"training headlines: {train_set.shape[0]}")
print(f"validation headlines: {validation_set.shape[0]}")
print(f"test headlines: {test_set.shape[0]}")

training headlines: 429089
validation headlines: 53643
test headlines: 53624


Generate a vocab

In [8]:
PAD, UNK = 0, 1
def count_tokens(df):
    print("starting")
    count = Counter()
    for tokens in df["tokenized"]:
        count += Counter(tokens)
    return count
def generate_vocab_map(df, cutoff):
    vocab          = {"": PAD, "UNK": UNK}
    reversed_vocab = {PAD: "", UNK: "UNK"}
    count = Counter(np.concatenate(df['tokenized'].to_numpy()))
    keep_keys = [key for key in count if count[key] > cutoff]
    for i in range(len(keep_keys)):
        key = keep_keys[i]
        vocab[key] = i + 2
        reversed_vocab[i + 2] = key

    return vocab, reversed_vocab
vocab, reversed_vocab = generate_vocab_map(train_set, cutoff=2)
print(f"Vocab Length: {len(vocab)}")

Vocab Length: 40023


Store the vocab and reversed_vocab for later use after training

In [9]:
with open('../Data/vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab))
with open('../Data/reversed_vocab.json', 'w') as reversed_vocab_file:
    reversed_vocab_file.write(json.dumps(reversed_vocab))
del vocab_file, reversed_vocab_file

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('cuda' if torch.cuda.is_available() else 'cpu')

cuda


In [11]:
train_data = HeadlineDataset(vocab, train_set)
validation_data = HeadlineDataset(vocab, validation_set)
test_data = HeadlineDataset(vocab, validation_set)



collate_func = lambda batch: (pad_sequence([x[0] for x in batch], padding_value=PAD, batch_first=True).to(device), torch.FloatTensor([x[1] for x in batch]).to(device))


def create_data_iterators(batch_size):
    train_sampler = RandomSampler(train_data)
    validation_sampler = RandomSampler(validation_data)
    test_sampler = RandomSampler(test_data)
    train_iterator = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=collate_func)
    validation_iterator = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size, collate_fn=collate_func)
    test_iterator = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_func)
    return train_iterator, validation_iterator, test_iterator


# Train and Validation Loop

In [28]:
def training_loop(model, loss_func, optimizer, iterator):
    model.train()
    total_loss = 0

    for data, labels in tqdm(iterator):
        optimizer.zero_grad()
        out = model(data)
        if out.shape != labels.shape:
            continue
        loss = loss_func(out, labels)
        total_loss += loss
        loss.backward()
        optimizer.step()
    return total_loss

def validation_loop(model, iterator):
    predictions, labels = [], []
    validator = ((data, new_labels) for data, new_labels in iterator if predictions.append(model(data).round()) is None and (labels.append(new_labels)) is None and False)
    list(validator)
    predictions, labels = torch.round(torch.cat(predictions).squeeze()).to(device), torch.round(torch.cat(labels).squeeze()).to(device)
    return labels, predictions


## Metrics

In [13]:
def accuracy(true, pred):
    tp = ((pred == True) & (true == True)).sum()
    tn = ((pred == False) & (true == False)).sum()
    return (tp + tn).sum() / len(true)
def binary_f1(true, pred, selected_class=True):
    tp = ((pred == selected_class) & (true == selected_class)).sum()
    fp = ((pred == selected_class) & (true != selected_class)).sum()
    fn = ((pred != selected_class) & (true == selected_class)).sum()
    if tp + fn == 0 or tp + fp == 0:
        return 0
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    return 2 * precision * recall / (precision + recall)

def binary_macro_f1(true, pred):
    return (binary_f1(true, pred, True) + binary_f1(true, pred, False)) / 2

def evaluate(model, iterator):
        labels, predictions  = validation_loop(model, iterator)
        f1 = binary_macro_f1(labels, predictions)
        acc = accuracy(labels, predictions)
        return f1, acc

# Train Model

In [14]:
def train_model(model, loss_func, optimizer, epochs, train_iterator, validation_iterator):
    validation_labels, validation_predictions = validation_loop(model, validation_iterator)
    validation_f1_pre_train = binary_macro_f1(validation_labels, validation_predictions)
    validation_accuracy_pre_train = accuracy(validation_predictions, validation_labels)
    print(f"""
Before Training
F1: {validation_f1_pre_train}
Accuracy: {validation_accuracy_pre_train}
""")
    for epoch in range(epochs):
        train_loss = training_loop(model, loss_func, optimizer, train_iterator)

        print(f"EPOCH: {epoch}")
        print(f"TRAIN LOSS: {train_loss}")
        f1, acc = evaluate(model, validation_iterator)
        print(f"VAL F-1: {f1}")
        print(f"VAL ACC: {acc}")
# def train_epoch(model, loss_func, )

# Model 1
## Neural Bag Of Words ([NBOW](https://www.aclweb.org/anthology/P15-1162.pdf))


In [26]:
def train_method_1(model, loss_func, optimizer, scheduler, epochs, train_iterator, validation_iterator):
    validation_labels, validation_predictions = validation_loop(model, validation_iterator)
    validation_f1_pre_train = binary_macro_f1(validation_labels, validation_predictions)
    validation_accuracy_pre_train = accuracy(validation_predictions, validation_labels)

    print(f"""
Before Training
F1: {validation_f1_pre_train}
Accuracy: {validation_accuracy_pre_train}
""")
    for epoch in range(epochs):
        train_loss = training_loop(model, loss_func, optimizer, train_iterator)
        val_labels, val_predictions  = validation_loop(model, validation_iterator)
        print(f"EPOCH: {epoch + 1}")
        print(f"TRAIN LOSS: {train_loss}")
        f1 = binary_macro_f1(val_labels, val_predictions)
        acc = accuracy(val_labels, val_predictions)
        print(f"VAL F-1: {f1}")
        print(f"VAL ACC: {acc}")
        scheduler.step()
        # scheduler.step(loss_func(val_predictions, val_labels))


In [35]:
from NBOW import NBOW
model_1 = NBOW(len(vocab.keys()), 300).to(device)
model_1_epochs = 20
model_1_batch_size = 16
model_1_optimizer = Adam(model_1.parameters(), lr=0.05)
# loss_func_1 = nn.BCEWithLogitsLoss()
loss_func_1 = nn.BCELoss()
# scheduler_1 = ReduceLROnPlateau(model_1_optimizer, mode='min')
scheduler_1 = ExponentialLR(model_1_optimizer, 0.1)



model_1_train, model_1_validation, model_1_test = create_data_iterators(model_1_batch_size)
train_method_1(model_1, loss_func_1, model_1_optimizer, scheduler_1, model_1_epochs, model_1_train, model_1_validation)





Before Training
F1: 0.2653433084487915
Accuracy: 0.3610536456108093



100%|██████████| 26819/26819 [01:37<00:00, 275.75it/s]


EPOCH: 1
TRAIN LOSS: 8170.75439453125
VAL F-1: 0.8089728355407715
VAL ACC: 0.8366050124168396


100%|██████████| 26819/26819 [01:33<00:00, 286.43it/s]


EPOCH: 2
TRAIN LOSS: 2738.53662109375
VAL F-1: 0.8093274831771851
VAL ACC: 0.8400909900665283


100%|██████████| 26819/26819 [01:33<00:00, 285.82it/s]


EPOCH: 3
TRAIN LOSS: 2174.182373046875
VAL F-1: 0.8140965700149536
VAL ACC: 0.8432227969169617


100%|██████████| 26819/26819 [01:33<00:00, 287.22it/s]


EPOCH: 4
TRAIN LOSS: 2114.98681640625
VAL F-1: 0.8151694536209106
VAL ACC: 0.8441548943519592


100%|██████████| 26819/26819 [01:32<00:00, 288.45it/s]


EPOCH: 5
TRAIN LOSS: 2116.216796875
VAL F-1: 0.8161953091621399
VAL ACC: 0.8449005484580994


100%|██████████| 26819/26819 [01:33<00:00, 286.94it/s]


EPOCH: 6
TRAIN LOSS: 2102.933837890625
VAL F-1: 0.8153846263885498
VAL ACC: 0.8440616726875305


100%|██████████| 26819/26819 [01:33<00:00, 287.58it/s]


EPOCH: 7
TRAIN LOSS: 2103.52197265625
VAL F-1: 0.816043496131897
VAL ACC: 0.8447887301445007


100%|██████████| 26819/26819 [01:34<00:00, 284.76it/s]


EPOCH: 8
TRAIN LOSS: 2108.86279296875
VAL F-1: 0.8170247673988342
VAL ACC: 0.8453852534294128


100%|██████████| 26819/26819 [01:32<00:00, 288.55it/s]


EPOCH: 9
TRAIN LOSS: 2105.103271484375
VAL F-1: 0.8177109956741333
VAL ACC: 0.8460936546325684


100%|██████████| 26819/26819 [01:33<00:00, 286.94it/s]


EPOCH: 10
TRAIN LOSS: 2108.978271484375
VAL F-1: 0.8170309662818909
VAL ACC: 0.8454039096832275


100%|██████████| 26819/26819 [01:33<00:00, 287.89it/s]


EPOCH: 11
TRAIN LOSS: 2101.121826171875
VAL F-1: 0.8171756863594055
VAL ACC: 0.8455530405044556


100%|██████████| 26819/26819 [01:33<00:00, 286.87it/s]


EPOCH: 12
TRAIN LOSS: 2109.872314453125
VAL F-1: 0.8149775266647339
VAL ACC: 0.8438566327095032


100%|██████████| 26819/26819 [01:32<00:00, 290.65it/s]


EPOCH: 13
TRAIN LOSS: 2109.208740234375
VAL F-1: 0.8152182698249817
VAL ACC: 0.8440244197845459


100%|██████████| 26819/26819 [01:32<00:00, 288.57it/s]


EPOCH: 14
TRAIN LOSS: 2105.4072265625
VAL F-1: 0.8161212205886841
VAL ACC: 0.8447514176368713


100%|██████████| 26819/26819 [01:33<00:00, 286.24it/s]


EPOCH: 15
TRAIN LOSS: 2104.37548828125
VAL F-1: 0.8160717487335205
VAL ACC: 0.8446582555770874


100%|██████████| 26819/26819 [01:32<00:00, 289.72it/s]


EPOCH: 16
TRAIN LOSS: 2109.271240234375
VAL F-1: 0.8163934946060181
VAL ACC: 0.8449192047119141


100%|██████████| 26819/26819 [01:32<00:00, 291.16it/s]


EPOCH: 17
TRAIN LOSS: 2112.466064453125
VAL F-1: 0.8172512650489807
VAL ACC: 0.8456276059150696


100%|██████████| 26819/26819 [01:34<00:00, 285.27it/s]


EPOCH: 18
TRAIN LOSS: 2110.344482421875
VAL F-1: 0.8162152767181396
VAL ACC: 0.8448632955551147


100%|██████████| 26819/26819 [01:33<00:00, 287.54it/s]


EPOCH: 19
TRAIN LOSS: 2105.7822265625
VAL F-1: 0.8156701326370239
VAL ACC: 0.8445091247558594


100%|██████████| 26819/26819 [01:33<00:00, 286.24it/s]


EPOCH: 20
TRAIN LOSS: 2103.451416015625
VAL F-1: 0.8155826330184937
VAL ACC: 0.844397246837616


# Evaluate

In [36]:
print(f"Evaluation on test data")
f1, acc = evaluate(model_1, model_1_test)
print(f"Test F-1: {f1}")
print(f"Test ACC: {acc}")

Evaluation on test data
Test F-1: 0.81651771068573
Test ACC: 0.8451429009437561


In [40]:
torch.save(model_1.state_dict(), "NBOW.pt")