In [1]:
# Execution-environment selector read by the path-setup cell below.
# "COLAB" mounts Google Drive and changes into the project folder;
# "AWS" adds the repository's bias/ directory to sys.path.
ENVIRONMENT = "AWS"

In [None]:
import tensorflow as tf
from tensorflow import keras

In [2]:
# Adds local modules to sys path for importing
import sys
import os

# If running in colab, mount drive
if ENVIRONMENT == "COLAB":
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/CS105BProject/bias' )
    sys.path.append('/content/drive/MyDrive/CS105BProject')
elif ENVIRONMENT == "AWS":
    # sys.path entries are used verbatim by the import system: "~" is a shell
    # convention and is never expanded, so it has to be resolved explicitly.
    sys.path.append(os.path.expanduser("~/fake-news-reasoning/code-acl/bias"))
# The current working directory is made importable in every environment.
sys.path.append(os.getcwd())


Mounted at /content/drive


In [None]:
%%bash
# Get data
# git clone --branch test --single-branch https://github.com/CS115-fake-news-detection/fake-news-reasoning.git

# Stop at the first failing command so a broken download is not reported as
# "Data downloaded" and a partial archive is never unpacked.
set -euo pipefail

# Make the cell safe to re-run: the final mv fails once the target exists.
if [ -d ../multi_fc_publicdata ]; then
    echo "Data already present in ../multi_fc_publicdata, skipping download"
    exit 0
fi

echo "Downloading data..."
wget -q https://www.dropbox.com/s/3v5oy3eddg3506j/multi_fc_publicdata.zip
echo "Data downloaded"
unzip -q multi_fc_publicdata.zip
rm multi_fc_publicdata.zip
echo "Data unzipped"
mv multi_fc_publicdata/ ../

Downloading data...
Data downloaded
Data unzipped




In [4]:
!pip install transformers
!pip install pytorch-nlp

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 8.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 8.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 71.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 84.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [8]:
import sys
import os
# sys.path.append('../../code-acl')
# sys.path.append(os.getcwd())
sys.path.append('/content/drive/MyDrive/CS105BProject/bias/')
os.environ['OMP_NUM_THREADS'] = "1"
import argparse
import pandas as pd
import pickle
from model.generator import TransformerDataset, transformer_collate
from model.bertmodel import MyBertModel
from model.lstmmodel import LSTMModel
import torch
from parameters import BERT_MODEL_PATH, CLAIM_ONLY, CLAIM_AND_EVIDENCE, EVIDENCE_ONLY, DEVICE, INPUT_TYPE_ORDER
from transformers import AdamW
import numpy as np
from utils.utils import print_message, clean_str
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
from torchnlp.word_to_vector import GloVe
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
#from hypopt import GridSearch
from model_selection import GridSearch
from tqdm import tqdm

def load_data(dataset, data_root="../multi_fc_publicdata/"):
    """Load the four files that make up one dataset.

    Reads, from ``<data_root>/<dataset>/``:
      * ``<dataset>.tsv``            -> main_data (tab separated, no header row)
      * ``<dataset>_snippets.tsv``   -> snippets_data (tab separated, no header row)
      * ``<dataset>_labels.pkl``     -> label_order
      * ``<dataset>_index_split.pkl``-> splits

    Args:
        dataset: dataset name, used both as sub-directory and file prefix.
        data_root: directory containing the per-dataset folders. Defaults to
            the location this notebook has always used.

    Returns:
        Tuple ``(main_data, snippets_data, label_order, splits)``.
    """
    base = os.path.join(data_root, dataset)

    main_data = pd.read_csv(os.path.join(base, dataset + ".tsv"), sep="\t", header=None)
    snippets_data = pd.read_csv(os.path.join(base, dataset + "_snippets.tsv"), sep="\t", header=None)

    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # data files from a trusted source.
    # Context managers close the handles deterministically (the previous
    # pickle.load(open(...)) form left them to the garbage collector).
    with open(os.path.join(base, dataset + "_labels.pkl"), "rb") as fh:
        label_order = pickle.load(fh)
    with open(os.path.join(base, dataset + "_index_split.pkl"), "rb") as fh:
        splits = pickle.load(fh)

    return main_data, snippets_data, label_order, splits

def make_generators(main_data, snippets_data, label_order, splits, params, dataset_generator=TransformerDataset, other_dataset=False):
    """Build one DataLoader per split and the class weights of the first split.

    Args:
        main_data: DataFrame whose third column (index 2) holds the label.
        snippets_data: DataFrame indexed positionally like ``main_data``.
        label_order: list of label names; a label's position is its class id.
        splits: sequence of row-index collections; at least three are needed,
            because the first three resulting loaders are returned.
        params: two-element sequence of DataLoader keyword dicts. ``params[0]``
            is used for the first split, ``params[1]`` for every other split.
        dataset_generator: callable building a dataset from
            ``(main_rows, snippet_rows, label_order)``.
        other_dataset: when True no class weights are computed.

    Returns:
        ``(loader_0, loader_1, loader_2, label_weights)`` where
        ``label_weights`` is a float32 tensor of "balanced" class weights for
        the first split, or None when ``other_dataset`` is True.
    """
    # .values materialises the frame as an array; do it once instead of once
    # per split.
    main_values = main_data.values
    snippet_values = snippets_data.values

    # Print the label distribution over the whole dataset.
    all_labels = main_values[:, 2]
    counter = Counter(all_labels)
    # "\\%" emits the same backslash-percent text as before, without relying on
    # the invalid escape sequence "\%".
    ss = "".join(
        ", " + str(c) + " (" + str(np.around(counter[c] / len(all_labels) * 100, 1)) + "\\%)"
        for c in label_order
    )
    print("len", len(all_labels), ss)

    generators = []
    for isplit, split in enumerate(splits):
        split_dataset = dataset_generator(main_values[split], snippet_values[split], label_order)
        loader_kwargs = params[0] if isplit == 0 else params[1]
        generators.append(torch.utils.data.DataLoader(split_dataset, **loader_kwargs))

    # make class weights
    # The index lookup runs even for other_dataset=True, so a label missing
    # from label_order still raises ValueError here.
    labels = main_values[splits[0]][:, 2]
    labels = np.array([label_order.index(v) for v in labels])

    if not other_dataset:
        label_weights = torch.tensor(compute_class_weight("balanced", classes=np.arange(len(label_order)), y=labels).astype(np.float32))
    else:
        label_weights = None

    return generators[0], generators[1], generators[2], label_weights

def evaluate(generator, model, other_from=None, ignore_snippet=None):
    """Run ``model`` over every batch of ``generator`` and score the predictions.

    This function neither switches the model to eval mode nor disables
    gradient tracking; the caller is expected to do both.

    Args:
        generator: iterable of batches; elements 0..3 of a batch are read as
            claim ids, claims, labels and snippets.
        model: callable ``model(claims, snippets)`` returning a logits tensor.
        other_from: "snes" folds predicted class 0 into class 1; "pomt" and
            None leave predictions unchanged.
        ignore_snippet: optional index (or indices) whose snippet is
            overwritten, in place, with the string "filler" for every claim.

    Returns:
        ``(f1_micro, f1_macro, claim_ids, logits, labels, predictions)``;
        logits are nested Python lists.
    """
    gold_labels, predicted = [], []
    seen_ids, seen_logits = [], []

    for batch in generator:
        claimIDs, claims, labels, snippets = batch[0], batch[1], batch[2], batch[3]

        if ignore_snippet is not None:
            for claim_snippets in snippets:
                claim_snippets[ignore_snippet] = "filler"

        logits = model(claims, snippets)
        batch_predictions = torch.argmax(logits, 1).cpu().numpy()

        # other data is snes, and model is trained on pomt: both
        # "pants on fire!" (0) and "false" (1) count as false.
        # When other_from == "pomt" (model trained on snes) nothing changes.
        if other_from == "snes":
            batch_predictions[batch_predictions == 0] = 1

        gold_labels.extend(labels)
        predicted.extend(batch_predictions.tolist())
        seen_ids.extend(claimIDs)
        seen_logits.extend(logits.cpu().numpy().tolist())

    micro = f1_score(gold_labels, predicted, average="micro")
    macro = f1_score(gold_labels, predicted, average="macro")

    return micro, macro, seen_ids, seen_logits, gold_labels, predicted

def train_step(optimizer, vals, model, criterion):
    """Perform one optimisation step on a single batch and return its loss.

    Args:
        optimizer: optimiser whose gradients are reset, then stepped.
        vals: batch; element 1 holds the claims, 2 the labels, 3 the snippets.
        model: callable ``model(claims, snippets)`` producing logits.
        criterion: loss callable ``criterion(logits, labels)``.

    Returns:
        The loss tensor of this batch (after ``backward`` was called on it).
    """
    optimizer.zero_grad()

    batch_claims = vals[1]
    batch_snippets = vals[3]
    targets = torch.tensor(vals[2]).to(DEVICE)

    batch_loss = criterion(model(batch_claims, batch_snippets), targets)
    batch_loss.backward()
    optimizer.step()

    return batch_loss


def get_embedding_matrix(generators, dataset, min_occurrence=1):
    """Build (or load from cache) a GloVe-initialised embedding matrix.

    The result is cached in ``preprocessed/<dataset>_glove.pkl``; when that
    file exists it is loaded and returned without touching ``generators``.

    Args:
        generators: iterables of batches; element 1 of a batch holds the
            claims and element 3 the per-claim snippet collections.
        dataset: dataset name, used only for the cache file name.
        min_occurrence: a word is kept when it occurs strictly more often than
            this value. NOTE(review): with the default of 1, words seen
            exactly once are dropped - confirm that ">" rather than ">=" is
            intended.

    Returns:
        ``(embedding_matrix, word2idx, idx2word)``. Row 0 and row 1 of the
        matrix are reserved (mask / unknown token); real words start at 2.
        Rows without a GloVe vector keep their random initialisation.
    """
    savename = "preprocessed/" + dataset + "_glove.pkl"
    if os.path.exists(savename):
        # NOTE(review): pickle.load can execute arbitrary code; the cache file
        # must come from a trusted source.
        with open(savename, "rb") as fh:
            tmp = pickle.load(fh)
        return tmp[0], tmp[1], tmp[2]

    glove_vectors = GloVe('840B')

    texts = []
    for gen in generators:
        for vals in gen:
            texts.extend(clean_str(v) for v in vals[1])
            texts.extend(clean_str(item) for sublist in vals[3] for item in sublist)

    counter = Counter(word for text in texts for word in text.split(" "))
    # Sorted so that index assignment does not depend on set iteration order.
    vocabulary = sorted(word for word, count in counter.items() if count > min_occurrence)

    word2idx = {word: i + 2 for i, word in enumerate(vocabulary)}  # reserve 0 for potential mask and 1 for unk token
    idx2word = {idx: word for word, idx in word2idx.items()}

    glove_embedding_matrix = np.random.random((len(vocabulary) + 2, 300)) - 0.5
    for word, idx in word2idx.items():
        if word in glove_vectors:
            glove_embedding_matrix[idx] = glove_vectors[word]

    # Create the cache directory so the work above is not lost to a missing
    # folder, and close the handle deterministically.
    os.makedirs(os.path.dirname(savename), exist_ok=True)
    with open(savename, "wb") as fh:
        pickle.dump([glove_embedding_matrix, word2idx, idx2word], fh)
    return glove_embedding_matrix, word2idx, idx2word

def train_model(model, criterion, optimizer, train_generator, val_generator, test_generator, args, other_generator, savename):
    """Train with early stopping on validation macro-F1 and pickle the results.

    Each epoch runs ``train_step`` over ``train_generator``. Every
    ``args.eval_per_epoch`` epochs the validation set is scored; when its
    macro-F1 improves, the test set, the other-domain test set and (unless
    ``args.inputtype`` is "CLAIM_ONLY") the snippet-removal ablations are
    evaluated and the result bundle is written to ``savename``. Training stops
    after 10 consecutive evaluations without improvement.

    Args:
        model, criterion, optimizer: forwarded to ``train_step`` / ``evaluate``.
        train_generator, val_generator, test_generator, other_generator:
            batch iterables for the respective data.
        args: object providing ``eval_per_epoch``, ``dataset`` and
            ``inputtype``; it is stored in the pickled results.
        savename: path of the pickle file receiving
            ``[val_store, test_store, other_test_store, misc_store]``.
    """
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("model parameters", params)

    # Fail fast on an unusable output location instead of after training.
    save_dir = os.path.dirname(savename)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Which dataset the other-domain generator comes from (see evaluate()).
    other_from = "snes" if args.dataset == "pomt" else "pomt"
    ten = np.arange(10)

    num_epochs = 0
    patience_counter = 0
    patience_max = 10
    best_f1 = -np.inf
    while True:
        train_losses = []

        model.train()
        for vals in train_generator:
            loss = train_step(optimizer, vals, model, criterion)
            train_losses.append(loss.item())

        num_epochs += 1
        print_message("TRAIN loss", np.mean(train_losses), num_epochs)

        if num_epochs % args.eval_per_epoch != 0:
            continue

        model.eval()
        with torch.no_grad():
            val_f1micro, val_f1macro, val_claimIDs, val_logits, val_labels, val_predictions = evaluate(val_generator, model)
            print_message("VALIDATION F1micro, F1macro, loss:", val_f1micro, val_f1macro, len(val_claimIDs))

        if val_f1macro > best_f1:
            with torch.no_grad():
                test_f1micro, test_f1macro, test_claimIDs, test_logits, test_labels, test_predictions = evaluate(test_generator, model)
                print_message("TEST F1micro, F1macro, loss:", test_f1micro, test_f1macro, len(test_claimIDs))

                other_test_f1micro, other_test_f1macro, other_test_claimIDs, other_test_logits, other_test_labels, other_test_predictions = evaluate(other_generator, model, other_from=other_from)
                print_message("OTHER-TEST F1micro, F1macro, loss:", other_test_f1micro, other_test_f1macro, len(other_test_claimIDs))

                # Ablations: blank out the first (i+1) / last (i+1) snippets.
                test_remove_top_bottom = []
                test_remove_bottom_top = []
                other_test_remove_top_bottom = []
                other_test_remove_bottom_top = []
                if args.inputtype != "CLAIM_ONLY":
                    for i in tqdm(range(10)):
                        top_is = ten[:(i+1)]
                        bottom_is = ten[-(i+1):]
                        test_remove_top_bottom.append(evaluate(test_generator, model, ignore_snippet=top_is))
                        test_remove_bottom_top.append(evaluate(test_generator, model, ignore_snippet=bottom_is))
                        other_test_remove_top_bottom.append(evaluate(other_generator, model, other_from=other_from, ignore_snippet=top_is))
                        other_test_remove_bottom_top.append(evaluate(other_generator, model, other_from=other_from, ignore_snippet=bottom_is))

                    print_message([np.around(v[1], 4) for v in test_remove_top_bottom])
                    print_message([np.around(v[1], 4) for v in test_remove_bottom_top])
                    print_message([np.around(v[1], 4) for v in other_test_remove_top_bottom])
                    print_message([np.around(v[1], 4) for v in other_test_remove_bottom_top])

            patience_counter = 0
            best_f1 = val_f1macro
            val_store = [val_f1micro, val_f1macro, val_claimIDs, val_logits, val_labels, val_predictions]
            test_store = [test_f1micro, test_f1macro, test_claimIDs, test_logits, test_labels, test_predictions, test_remove_top_bottom, test_remove_bottom_top]
            other_test_store = [other_test_f1micro, other_test_f1macro, other_test_claimIDs, other_test_logits, other_test_labels, other_test_predictions, other_test_remove_top_bottom, other_test_remove_bottom_top]
            misc_store = [args]
            total_store = [val_store, test_store, other_test_store, misc_store]

            # Persist the best result immediately: the file content at early
            # stopping is identical to what a single final dump would write,
            # but an interrupted run no longer loses everything.
            with open(savename, "wb") as fh:
                pickle.dump(total_store, fh)
        else:
            patience_counter += 1

        print_message("PATIENCE", patience_counter, "/", patience_max)

        if patience_counter >= patience_max:
            break

def run_bert(args, train_generator, val_generator, test_generator, label_weights, inputtype, label_order, savename, other_generator):
    """Fine-tune the BERT classifier and write its results to ``savename``.

    Args:
        args: run configuration; ``args.lr`` is the learning rate and the
            object is forwarded to ``train_model``.
        train_generator, val_generator, test_generator, other_generator:
            batch iterables forwarded to ``train_model``.
        label_weights: tensor of per-class weights for the cross-entropy loss.
        inputtype: input-type id passed to the model as ``input_type``.
        label_order: list of labels; its length is the number of classes.
        savename: output path forwarded to ``train_model``.

    Returns:
        The trained model (mirrors ``run_lstm``; previously nothing was
        returned, so existing callers that ignore the result are unaffected).
    """
    model = MyBertModel.from_pretrained(BERT_MODEL_PATH, labelnum=len(label_order), input_type=inputtype)
    model.to(DEVICE)
    print("Model has been put on the torch device...")

    criterion = torch.nn.CrossEntropyLoss(weight=label_weights.to(DEVICE))
    # Only parameters that require gradients are handed to the optimiser.
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, eps=1e-8)
    optimizer.zero_grad()

    train_model(model, criterion, optimizer, train_generator, val_generator, test_generator, args, other_generator, savename)

    return model

def run_lstm(args, train_generator, val_generator, test_generator, label_weights, inputtype, label_order, savename, other_generator):
    """Train the GloVe-initialised LSTM classifier and return it.

    The vocabulary / embedding matrix is derived from all four generators via
    ``get_embedding_matrix``; LSTM hyper-parameters and the learning rate are
    read from ``args``. Training and result persistence are delegated to
    ``train_model``.
    """
    every_generator = [train_generator, val_generator, test_generator, other_generator]
    glove_embedding_matrix, word2idx, idx2word = get_embedding_matrix(every_generator, args.dataset)

    num_labels = len(label_order)
    model = LSTMModel(
        args.lstm_hidden_dim,
        args.lstm_layers,
        args.lstm_dropout,
        num_labels,
        word2idx,
        glove_embedding_matrix,
        input_type=inputtype,
    )
    model.to(DEVICE)

    weighted_loss = torch.nn.CrossEntropyLoss(weight=label_weights.to(DEVICE))
    # Only parameters that require gradients are optimised.
    trainable = (p for p in model.parameters() if p.requires_grad)
    adamw = AdamW(trainable, lr=args.lr, eps=1e-8)
    adamw.zero_grad()

    train_model(model, weighted_loss, adamw, train_generator, val_generator, test_generator, args, other_generator, savename)

    return model

def _compose_bow_sample(claim, claim_snippets, inputtype):
    """Return the cleaned text of one sample for the given input type."""
    if inputtype == CLAIM_ONLY:
        return clean_str(claim)
    if inputtype == EVIDENCE_ONLY:
        return " ".join([clean_str(v) for v in claim_snippets])
    if inputtype == CLAIM_AND_EVIDENCE:
        # The claim and the evidence are separated by a space. Previously they
        # were concatenated directly, which glued the last claim word to the
        # first snippet word and produced a bogus token.
        return " ".join([clean_str(claim)] + [clean_str(v) for v in claim_snippets])
    raise Exception("Unknown type", inputtype)


def filter_snippet_for_bow(generator, ignore_snippet, inputtype):
    """Build text samples with the snippets at ``ignore_snippet`` blanked out.

    Args:
        generator: iterable of batches; element 1 holds the claims and
            element 3 the per-claim snippet collections.
        ignore_snippet: index (or indices) overwritten, in place, with the
            string "filler" for every claim of every batch.
        inputtype: one of CLAIM_ONLY / EVIDENCE_ONLY / CLAIM_AND_EVIDENCE.

    Returns:
        List of sample strings in generator order.

    Raises:
        Exception: for an unknown ``inputtype``.
    """
    samples = []
    for vals in generator:
        claims = vals[1]
        snippets = vals[3]

        for i in range(len(snippets)):
            snippets[i][ignore_snippet] = "filler"

        for i in range(len(claims)):
            samples.append(_compose_bow_sample(claims[i], snippets[i], inputtype))
    return samples


def get_bows_labels(generators, dataset, inputtype):
    """Vectorise all generators with TF-IDF, including snippet ablations.

    Args:
        generators: list of batch iterables. The second-to-last entry is used
            as the test set and the last entry as the other-domain test set
            for the ablation variants.
        dataset: unused; kept for interface compatibility.
        inputtype: one of CLAIM_ONLY / EVIDENCE_ONLY / CLAIM_AND_EVIDENCE.

    Returns:
        ``(bows, all_labels, test_remove_top_bottom, test_remove_bottom_top,
        other_test_remove_top_bottom, other_test_remove_bottom_top)``.
        ``bows`` / ``all_labels`` have one entry per generator; each ablation
        list has 10 TF-IDF matrices (first / last ``i+1`` snippets blanked).
        The vectoriser is fitted on the unablated samples of all generators.

    Raises:
        Exception: for an unknown ``inputtype``.
    """
    all_samples = []
    all_labels = []

    for gen in generators:
        gen_samples = []
        gen_labels = []
        for vals in gen:
            claims = vals[1]
            labels = vals[2]
            snippets = vals[3]

            for i in range(len(claims)):
                gen_samples.append(_compose_bow_sample(claims[i], snippets[i], inputtype))
                gen_labels.append(labels[i])

        all_samples.append(gen_samples)
        all_labels.append(gen_labels)

    test_generator = generators[-2]
    other_generator = generators[-1]

    test_remove_top_bottom = []
    test_remove_bottom_top = []
    other_test_remove_top_bottom = []
    other_test_remove_bottom_top = []
    ten = np.arange(10)
    for i in tqdm(range(10)):
        top_is = ten[:(i + 1)]
        bottom_is = ten[-(i + 1):]
        test_remove_top_bottom.append(filter_snippet_for_bow(test_generator, top_is, inputtype))
        test_remove_bottom_top.append(filter_snippet_for_bow(test_generator, bottom_is, inputtype))
        other_test_remove_top_bottom.append(filter_snippet_for_bow(other_generator, top_is, inputtype))
        other_test_remove_bottom_top.append(filter_snippet_for_bow(other_generator, bottom_is, inputtype))

    vectorizer = TfidfVectorizer(min_df=2)
    vectorizer.fit([item for sublist in all_samples for item in sublist])

    bows = [vectorizer.transform(samples) for samples in all_samples]

    test_remove_top_bottom = [vectorizer.transform(samples) for samples in test_remove_top_bottom]
    test_remove_bottom_top = [vectorizer.transform(samples) for samples in test_remove_bottom_top]
    other_test_remove_top_bottom = [vectorizer.transform(samples) for samples in other_test_remove_top_bottom]
    other_test_remove_bottom_top = [vectorizer.transform(samples) for samples in other_test_remove_bottom_top]

    return bows, all_labels, test_remove_top_bottom, test_remove_bottom_top, other_test_remove_top_bottom, other_test_remove_bottom_top

def run_bow(args, train_generator, val_generator, test_generator, label_weights, inputtype, label_order, savename, other_test_generator):
    """Grid-search a random forest on TF-IDF features and pickle the results.

    Args:
        args: run configuration; only ``args.dataset`` is read here.
        train_generator, val_generator, test_generator, other_test_generator:
            batch iterables handed to ``get_bows_labels`` in this order.
        label_weights: tensor of per-class weights; converted to the
            ``class_weight`` dict of the random forest.
        inputtype: input-type id forwarded to ``get_bows_labels``.
        label_order: unused here; kept for a signature parallel to the other
            ``run_*`` functions.
        savename: path of the pickle file receiving
            ``[val_store, test_store, other_test_store, misc_store]``.
    """
    # Fail fast on an unusable output location instead of after the search.
    save_dir = os.path.dirname(savename)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    bows, labels, test_remove_top_bottom, test_remove_bottom_top, other_test_remove_top_bottom, other_test_remove_bottom_top = get_bows_labels([train_generator, val_generator, test_generator, other_test_generator], args.dataset, inputtype)

    train_bow, val_bow, test_bow, other_test_bow = bows[0], bows[1], bows[2], bows[3]
    train_labels, val_labels, test_labels, other_test_labels = labels[0], labels[1], labels[2], labels[3]

    label_weights = label_weights.numpy()
    weights = {i: label_weights[i] for i in range(len(label_weights))}

    param_grid = [
        {'n_estimators': [100, 500, 1000], 'min_samples_leaf': [1, 3, 5, 10], 'min_samples_split': [2, 5, 10]}
    ]

    opt = GridSearch(model=RandomForestClassifier(n_jobs=5, class_weight=weights), param_grid=param_grid, parallelize=False)
    opt.fit(train_bow, train_labels, val_bow, val_labels, scoring="f1_macro")

    # Which dataset the other-domain test set comes from (see rf_eval).
    other_origin = "snes" if args.dataset == "pomt" else "pomt"

    def rf_eval(model, bow, labels, other_from=None):
        """Predict on ``bow`` and return (f1_micro, f1_macro, labels, preds)."""
        preds = model.predict(bow)

        if other_from == "pomt": # other data is pomt, and model is trained on snes
            # this case is fine
            pass
        elif other_from == "snes": # other data is snes, and model is trained on pomt
            # in this case both "pants on fire!" and "false" should be considered as false
            preds[preds == 0] = 1 # 0 is "pants on fire!" and 1 is "false" for pomt.

        f1_macro = f1_score(labels, preds, average="macro")
        f1_micro = f1_score(labels, preds, average="micro")
        return f1_micro, f1_macro, labels, preds

    # Store layout: the two trailing entries of test_store / other_test_store
    # hold the per-ablation rf_eval results (top removed, then bottom removed).
    val_store = rf_eval(opt, val_bow, val_labels)
    test_store = list(rf_eval(opt, test_bow, test_labels)) + [
        [rf_eval(opt, test_remove_top_bottom[i], test_labels) for i in range(10)],
        [rf_eval(opt, test_remove_bottom_top[i], test_labels) for i in range(10)],
    ]
    other_test_store = list(rf_eval(opt, other_test_bow, other_test_labels, other_from=other_origin)) + [
        [rf_eval(opt, other_test_remove_top_bottom[i], other_test_labels, other_from=other_origin) for i in range(10)],
        [rf_eval(opt, other_test_remove_bottom_top[i], other_test_labels, other_from=other_origin) for i in range(10)],
    ]
    misc_store = [opt.get_best_params()]
    total_store = [val_store, test_store, other_test_store, misc_store]

    print_message("VALIDATION", val_store[0], val_store[1])
    print_message("TEST", test_store[0], test_store[1])
    print_message("OTHER-TEST", other_test_store[0], other_test_store[1])

    print_message([np.around(v[1], 4) for v in test_store[-2]])
    print_message([np.around(v[1], 4) for v in test_store[-1]])
    print_message([np.around(v[1], 4) for v in other_test_store[-2]])
    print_message([np.around(v[1], 4) for v in other_test_store[-1]])
    print(misc_store)

    with open(savename, "wb") as fh:
        pickle.dump(total_store, fh)

def filter_websites(snippets_data, snippets_dir="../multi_fc_publicdata/snippets/"):
    """Replace snippets that come from fact-checking websites with "filler".

    For every row, the file named after the row's first column is read from
    ``snippets_dir``; the last tab-separated field of each line is treated as
    the link of the corresponding snippet.

    Args:
        snippets_data: DataFrame whose first column holds the snippet-file
            name. It is modified in place. NOTE(review): the boolean column
            mask has 11 entries (id + 10 snippets), so the frame is assumed to
            have exactly 11 columns - confirm.
        snippets_dir: directory with the per-claim snippet files. The default
            sits under the same ``../multi_fc_publicdata/`` root that
            ``load_data`` reads from.

    Returns:
        The same ``snippets_data`` object, after modification.
    """
    bad_websites = ["factcheck.org", "politifact.com", "snopes.com", "fullfact.org", "factscan.ca"]
    ids = snippets_data.values[:, 0]
    remove_count = 0
    for i, claim_id in enumerate(ids):
        with open(os.path.join(snippets_dir, claim_id), "r", encoding="utf-8") as f:
            lines = f.readlines()

        links = [line.strip().split("\t")[-1] for line in lines]
        # 1 data sample has 11 links by mistake in the dataset: only the first
        # 10 are considered. (Truncating before flagging avoids the IndexError
        # the old fixed-size list raised on that sample.)
        remove = [any(bad in link for bad in bad_websites) for link in links[:10]]
        # Pad to exactly 10 flags when a file lists fewer links.
        remove += [False] * (10 - len(remove))
        # Leading False keeps the id column untouched.
        snippets_data.iloc[i, [False] + remove] = "filler"

        remove_count += np.sum(remove)
    print_message("REMOVE COUNT", remove_count)
    return snippets_data



In [9]:
class vars():
    """Hard-coded run configuration, standing in for parsed CLI arguments.

    NOTE: the class name shadows the builtin ``vars``; it is kept because the
    rest of the notebook instantiates it under this name.

    Args:
        mode: "bow", "lstm" or "bert"; also stored as ``self.model``.

    Raises:
        ValueError: for any other mode (previously an attribute-less object
            was returned and the failure surfaced later as AttributeError).
    """

    def __init__(self, mode):
        if mode not in ("bow", "lstm", "bert"):
            raise ValueError("Unknown mode %r; expected 'bow', 'lstm' or 'bert'" % (mode,))

        # Settings shared by all three models.
        self.dataset = "snes"
        self.inputtype = "CLAIM_AND_EVIDENCE"
        self.filter_websites = 0
        self.model = mode
        self.eval_per_epoch = 1

        # Model-specific settings.
        if mode == "bow":
            self.batchsize = 2
            self.lr = 0.0001
        elif mode == "lstm":
            self.batchsize = 16
            self.lr = 0.0001
            self.lstm_hidden_dim = 128
            self.lstm_layers = 2
            self.lstm_dropout = 0.1
        else:  # "bert"
            self.batchsize = 8
            self.lr = 0.000003


            
args = vars("lstm")

# Result file name: results/[<filter>-]<model>-<dataset>-<inputtype>-<lr>-<batchsize>[-<lstm params>].pkl
name_parts = [args.model, args.dataset, args.inputtype, args.lr, args.batchsize]
if args.filter_websites > 0.5:
    name_parts = [args.filter_websites] + name_parts
if args.model == "lstm":
    name_parts += [args.lstm_hidden_dim, args.lstm_layers, args.lstm_dropout]
savename = "results/" + "-".join([str(v) for v in name_parts]) + ".pkl"
# Make sure the output folder exists before any long-running work starts.
os.makedirs("results", exist_ok=True)

inputtype = INPUT_TYPE_ORDER.index(args.inputtype)
main_data, snippets_data, label_order, splits = load_data(args.dataset)

if args.filter_websites > 0.5:
    snippets_data = filter_websites(snippets_data)

params = {"batch_size": args.batchsize, "shuffle": True, "num_workers": 1, "collate_fn": transformer_collate, "persistent_workers": True, "prefetch_factor":5}
eval_params = {"batch_size": args.batchsize, "shuffle": False, "num_workers": 1, "collate_fn": transformer_collate, "persistent_workers": True, "prefetch_factor":5}

train_generator, val_generator, test_generator, label_weights = make_generators(main_data, snippets_data, label_order, splits, [params, eval_params])

# Cross-domain test set: load the *other* dataset and rename its labels so
# they match the label set of the training dataset.
if args.dataset == "snes":
    other_name = "pomt"
    label_renames = {"pants on fire!": "false", "half-true": "mixture"}
else:
    other_name = "snes"
    label_renames = {"mixture": "half-true"}

main_data, snippets_data, _, splits = load_data(other_name)
if args.filter_websites > 0.5:
    snippets_data = filter_websites(snippets_data)
for old_label, new_label in label_renames.items():
    # A plain NumPy boolean mask is used because positional .iloc indexing
    # with a boolean Series is not reliably supported.
    label_mask = (main_data.iloc[:, 2] == old_label).to_numpy()
    main_data.iloc[label_mask, 2] = new_label
_, _, other_test_generator, _ = make_generators(main_data, snippets_data, label_order, splits, [params, eval_params], other_dataset=True)


if args.model == "bert":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    run_bert(args, train_generator, val_generator, test_generator, label_weights, inputtype, label_order, savename, other_test_generator)
elif args.model == "lstm":
    model = run_lstm(args, train_generator, val_generator, test_generator, label_weights, inputtype, label_order, savename, other_test_generator)
elif args.model == "bow":
    run_bow(args, train_generator, val_generator, test_generator, label_weights, inputtype, label_order, savename, other_test_generator)





len 5069 , false (64.3\%), mostly false (7.5\%), mixture (12.3\%), mostly true (2.8\%), true (13.0\%)
len 13581 , false (29.7\%), mostly false (17.0\%), mixture (19.8\%), mostly true (18.8\%), true (14.8\%)
model parameters 1678089
[Nov 10, 21:45:49] TRAIN loss 1.6047742039233714 1
[Nov 10, 21:45:50] VALIDATION F1micro, F1macro, loss: 0.6429980276134122 0.1565426170468187 507
[Nov 10, 21:45:54] TEST F1micro, F1macro, loss: 0.6429980276134122 0.1565426170468187 1014
[Nov 10, 21:46:04] OTHER-TEST F1micro, F1macro, loss: 0.29738682370261316 0.09168794326241134 2717


100%|██████████| 10/10 [03:08<00:00, 18.85s/it]

[Nov 10, 21:49:12] [0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565]
[Nov 10, 21:49:12] [0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565]
[Nov 10, 21:49:12] [0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917]
[Nov 10, 21:49:12] [0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917]
[Nov 10, 21:49:12] PATIENCE 0 / 10





[Nov 10, 21:49:33] TRAIN loss 1.6014498187614992 2
[Nov 10, 21:49:34] VALIDATION F1micro, F1macro, loss: 0.6272189349112426 0.1694253011326182 507
[Nov 10, 21:49:38] TEST F1micro, F1macro, loss: 0.6370808678500987 0.17419002796535027 1014
[Nov 10, 21:49:47] OTHER-TEST F1micro, F1macro, loss: 0.29628266470371734 0.1336480398045242 2717


100%|██████████| 10/10 [03:07<00:00, 18.78s/it]

[Nov 10, 21:52:55] [0.1564, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1691, 0.1801]
[Nov 10, 21:52:55] [0.1742, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1801]
[Nov 10, 21:52:55] [0.1033, 0.0932, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0918, 0.1319]
[Nov 10, 21:52:55] [0.114, 0.0926, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.1319]
[Nov 10, 21:52:55] PATIENCE 0 / 10





[Nov 10, 21:53:16] TRAIN loss 1.600072490202414 3
[Nov 10, 21:53:17] VALIDATION F1micro, F1macro, loss: 0.5877712031558185 0.19531171442936152 507
[Nov 10, 21:53:21] TEST F1micro, F1macro, loss: 0.6025641025641025 0.18627007494718495 1014
[Nov 10, 21:53:30] OTHER-TEST F1micro, F1macro, loss: 0.2885535517114464 0.14338706308229868 2717


100%|██████████| 10/10 [03:07<00:00, 18.76s/it]

[Nov 10, 21:56:37] [0.1567, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.188, 0.1995]
[Nov 10, 21:56:37] [0.1859, 0.1592, 0.1566, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1995]
[Nov 10, 21:56:37] [0.115, 0.0941, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0918, 0.1459]
[Nov 10, 21:56:37] [0.1331, 0.0979, 0.0916, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.1459]
[Nov 10, 21:56:37] PATIENCE 0 / 10





[Nov 10, 21:56:58] TRAIN loss 1.596267115962398 4
[Nov 10, 21:57:00] VALIDATION F1micro, F1macro, loss: 0.41025641025641024 0.2050240824393747 507
[Nov 10, 21:57:03] TEST F1micro, F1macro, loss: 0.41420118343195267 0.19104199632888158 1014
[Nov 10, 21:57:13] OTHER-TEST F1micro, F1macro, loss: 0.25874125874125875 0.161902414786437 2717


100%|██████████| 10/10 [03:08<00:00, 18.82s/it]

[Nov 10, 22:00:21] [0.2156, 0.1801, 0.1563, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1828, 0.1874]
[Nov 10, 22:00:21] [0.1938, 0.2141, 0.1804, 0.1625, 0.1565, 0.1565, 0.1565, 0.1565, 0.1565, 0.1874]
[Nov 10, 22:00:21] [0.1713, 0.1361, 0.0988, 0.0917, 0.0917, 0.0917, 0.0917, 0.0917, 0.0918, 0.1421]
[Nov 10, 22:00:21] [0.1704, 0.1538, 0.1151, 0.0939, 0.0916, 0.0917, 0.0917, 0.0917, 0.0917, 0.1421]
[Nov 10, 22:00:21] PATIENCE 0 / 10





[Nov 10, 22:00:42] TRAIN loss 1.5953730989146877 5
[Nov 10, 22:00:43] VALIDATION F1micro, F1macro, loss: 0.5641025641025641 0.19591446389982947 507
[Nov 10, 22:00:43] PATIENCE 1 / 10
[Nov 10, 22:01:04] TRAIN loss 1.5883691912298803 6
[Nov 10, 22:01:06] VALIDATION F1micro, F1macro, loss: 0.4378698224852071 0.18619379429456256 507
[Nov 10, 22:01:06] PATIENCE 2 / 10
[Nov 10, 22:01:27] TRAIN loss 1.5866524641578261 7
[Nov 10, 22:01:29] VALIDATION F1micro, F1macro, loss: 0.46942800788954636 0.20951867713342426 507
[Nov 10, 22:01:32] TEST F1micro, F1macro, loss: 0.46844181459566075 0.2198495703274775 1014
[Nov 10, 22:01:41] OTHER-TEST F1micro, F1macro, loss: 0.25322046374677953 0.17793786954418278 2717


100%|██████████| 10/10 [03:07<00:00, 18.76s/it]

[Nov 10, 22:04:49] [0.2141, 0.1919, 0.1654, 0.1563, 0.1565, 0.1565, 0.1565, 0.1565, 0.1783, 0.1662]
[Nov 10, 22:04:49] [0.2206, 0.2063, 0.1855, 0.1784, 0.1652, 0.1658, 0.1564, 0.1565, 0.1565, 0.1662]
[Nov 10, 22:04:49] [0.1742, 0.1417, 0.1042, 0.0946, 0.0917, 0.0917, 0.0917, 0.0917, 0.0918, 0.1503]
[Nov 10, 22:04:49] [0.1761, 0.1647, 0.1235, 0.1015, 0.093, 0.0916, 0.0917, 0.0917, 0.0917, 0.1503]
[Nov 10, 22:04:49] PATIENCE 0 / 10





[Nov 10, 22:05:10] TRAIN loss 1.5768557496972986 8
[Nov 10, 22:05:12] VALIDATION F1micro, F1macro, loss: 0.4891518737672584 0.22096023865031036 507
[Nov 10, 22:05:15] TEST F1micro, F1macro, loss: 0.4960552268244576 0.24259108085323397 1014
[Nov 10, 22:05:24] OTHER-TEST F1micro, F1macro, loss: 0.26720647773279355 0.19138346956509092 2717


100%|██████████| 10/10 [03:07<00:00, 18.76s/it]

[Nov 10, 22:08:32] [0.2255, 0.2135, 0.1929, 0.1683, 0.1594, 0.1562, 0.1564, 0.1562, 0.163, 0.1007]
[Nov 10, 22:08:32] [0.2377, 0.2263, 0.2076, 0.1982, 0.1842, 0.1725, 0.1677, 0.159, 0.1557, 0.1007]
[Nov 10, 22:08:32] [0.1757, 0.1554, 0.1195, 0.0986, 0.0955, 0.0925, 0.0917, 0.0917, 0.0925, 0.1695]
[Nov 10, 22:08:32] [0.1836, 0.1685, 0.1448, 0.1107, 0.0987, 0.0952, 0.0923, 0.0926, 0.0933, 0.1695]
[Nov 10, 22:08:32] PATIENCE 0 / 10





[Nov 10, 22:08:53] TRAIN loss 1.5727000768120225 9
[Nov 10, 22:08:54] VALIDATION F1micro, F1macro, loss: 0.4911242603550296 0.23250743969332702 507
[Nov 10, 22:08:58] TEST F1micro, F1macro, loss: 0.4635108481262328 0.21430170591324518 1014
[Nov 10, 22:09:07] OTHER-TEST F1micro, F1macro, loss: 0.24843577475156423 0.16909741032298098 2717


100%|██████████| 10/10 [03:07<00:00, 18.75s/it]

[Nov 10, 22:12:15] [0.2146, 0.2136, 0.1995, 0.1657, 0.1596, 0.1565, 0.1565, 0.1562, 0.1487, 0.0472]
[Nov 10, 22:12:15] [0.2131, 0.2086, 0.2116, 0.2036, 0.1783, 0.1689, 0.1564, 0.1564, 0.1562, 0.0472]
[Nov 10, 22:12:15] [0.1663, 0.152, 0.124, 0.0994, 0.0948, 0.0917, 0.0917, 0.0917, 0.0936, 0.0706]
[Nov 10, 22:12:15] [0.1653, 0.166, 0.1433, 0.1104, 0.0964, 0.0924, 0.0917, 0.0917, 0.0917, 0.0706]
[Nov 10, 22:12:15] PATIENCE 0 / 10





[Nov 10, 22:12:36] TRAIN loss 1.5666737964561395 10
[Nov 10, 22:12:37] VALIDATION F1micro, F1macro, loss: 0.5167652859960552 0.23396341810320304 507
[Nov 10, 22:12:41] TEST F1micro, F1macro, loss: 0.4990138067061144 0.2192531551518421 1014
[Nov 10, 22:12:50] OTHER-TEST F1micro, F1macro, loss: 0.26720647773279355 0.18145989133212745 2717


100%|██████████| 10/10 [03:08<00:00, 18.82s/it]

[Nov 10, 22:15:58] [0.215, 0.2097, 0.1734, 0.1597, 0.1565, 0.1565, 0.1565, 0.1565, 0.1496, 0.0416]
[Nov 10, 22:15:58] [0.2203, 0.2148, 0.2062, 0.1779, 0.1658, 0.1596, 0.1564, 0.1565, 0.1564, 0.0416]
[Nov 10, 22:15:58] [0.1663, 0.1438, 0.1134, 0.0965, 0.0935, 0.0918, 0.0918, 0.0917, 0.0941, 0.066]
[Nov 10, 22:15:58] [0.1745, 0.1669, 0.1277, 0.0996, 0.0933, 0.0925, 0.0916, 0.0917, 0.0917, 0.066]
[Nov 10, 22:15:58] PATIENCE 0 / 10





[Nov 10, 22:16:19] TRAIN loss 1.5596770944896046 11
[Nov 10, 22:16:21] VALIDATION F1micro, F1macro, loss: 0.40236686390532544 0.2150121328727717 507
[Nov 10, 22:16:21] PATIENCE 1 / 10
[Nov 10, 22:16:42] TRAIN loss 1.551466575613967 12
[Nov 10, 22:16:43] VALIDATION F1micro, F1macro, loss: 0.4161735700197239 0.22452309438835893 507
[Nov 10, 22:16:43] PATIENCE 2 / 10
[Nov 10, 22:17:04] TRAIN loss 1.5449568027848597 13
[Nov 10, 22:17:06] VALIDATION F1micro, F1macro, loss: 0.46745562130177515 0.24534613845538217 507
[Nov 10, 22:17:09] TEST F1micro, F1macro, loss: 0.46055226824457596 0.24426055803448188 1014
[Nov 10, 22:17:19] OTHER-TEST F1micro, F1macro, loss: 0.2528524107471476 0.21689772045671357 2717


100%|██████████| 10/10 [03:07<00:00, 18.73s/it]

[Nov 10, 22:20:26] [0.2351, 0.2134, 0.2014, 0.1939, 0.1964, 0.1944, 0.1969, 0.1512, 0.078, 0.0279]
[Nov 10, 22:20:26] [0.2412, 0.2331, 0.2122, 0.2117, 0.2157, 0.2184, 0.2155, 0.1969, 0.1356, 0.0279]
[Nov 10, 22:20:26] [0.2008, 0.1735, 0.1571, 0.145, 0.1316, 0.1242, 0.1339, 0.157, 0.1573, 0.058]
[Nov 10, 22:20:26] [0.2117, 0.1884, 0.1621, 0.1508, 0.1362, 0.1338, 0.1242, 0.1426, 0.16, 0.058]
[Nov 10, 22:20:26] PATIENCE 0 / 10





[Nov 10, 22:20:47] TRAIN loss 1.5436873929994601 14
[Nov 10, 22:20:49] VALIDATION F1micro, F1macro, loss: 0.42406311637080873 0.23169454641604564 507
[Nov 10, 22:20:49] PATIENCE 1 / 10
[Nov 10, 22:21:10] TRAIN loss 1.5383128765467051 15
[Nov 10, 22:21:11] VALIDATION F1micro, F1macro, loss: 0.40828402366863903 0.23096778815874516 507
[Nov 10, 22:21:11] PATIENCE 2 / 10
[Nov 10, 22:21:33] TRAIN loss 1.5212226939630937 16
[Nov 10, 22:21:34] VALIDATION F1micro, F1macro, loss: 0.3431952662721893 0.1977870696400626 507
[Nov 10, 22:21:34] PATIENCE 3 / 10
[Nov 10, 22:21:55] TRAIN loss 1.514678061545432 17
[Nov 10, 22:21:57] VALIDATION F1micro, F1macro, loss: 0.3905325443786982 0.22951326574349123 507
[Nov 10, 22:21:57] PATIENCE 4 / 10
[Nov 10, 22:22:18] TRAIN loss 1.507170110135465 18
[Nov 10, 22:22:19] VALIDATION F1micro, F1macro, loss: 0.5226824457593688 0.2661521275876955 507
[Nov 10, 22:22:23] TEST F1micro, F1macro, loss: 0.5167652859960552 0.260544850028564 1014
[Nov 10, 22:22:32] OTHER-TE

100%|██████████| 10/10 [03:07<00:00, 18.74s/it]

[Nov 10, 22:25:40] [0.218, 0.1906, 0.1844, 0.1816, 0.1767, 0.1713, 0.1331, 0.048, 0.028, 0.0279]
[Nov 10, 22:25:40] [0.2591, 0.2167, 0.1917, 0.1818, 0.1822, 0.1786, 0.1637, 0.1192, 0.035, 0.0279]
[Nov 10, 22:25:40] [0.1805, 0.1576, 0.1481, 0.1453, 0.1464, 0.1397, 0.135, 0.1227, 0.0855, 0.058]
[Nov 10, 22:25:40] [0.1886, 0.1729, 0.1482, 0.1397, 0.1401, 0.1377, 0.1349, 0.134, 0.1063, 0.058]
[Nov 10, 22:25:40] PATIENCE 0 / 10





[Nov 10, 22:26:01] TRAIN loss 1.500718953373196 19
[Nov 10, 22:26:02] VALIDATION F1micro, F1macro, loss: 0.33136094674556216 0.21563349781162464 507
[Nov 10, 22:26:02] PATIENCE 1 / 10
[Nov 10, 22:26:23] TRAIN loss 1.4949647902368426 20
[Nov 10, 22:26:25] VALIDATION F1micro, F1macro, loss: 0.39447731755424065 0.23302513830802618 507
[Nov 10, 22:26:25] PATIENCE 2 / 10
[Nov 10, 22:26:46] TRAIN loss 1.47785011390308 21
[Nov 10, 22:26:48] VALIDATION F1micro, F1macro, loss: 0.41025641025641024 0.24125015770768016 507
[Nov 10, 22:26:48] PATIENCE 3 / 10
[Nov 10, 22:27:08] TRAIN loss 1.461178274841996 22
[Nov 10, 22:27:10] VALIDATION F1micro, F1macro, loss: 0.42011834319526625 0.2343754333709173 507
[Nov 10, 22:27:10] PATIENCE 4 / 10
[Nov 10, 22:27:31] TRAIN loss 1.4545292567025434 23
[Nov 10, 22:27:33] VALIDATION F1micro, F1macro, loss: 0.3431952662721893 0.23580114068056926 507
[Nov 10, 22:27:33] PATIENCE 5 / 10
[Nov 10, 22:27:54] TRAIN loss 1.4422206637021657 24
[Nov 10, 22:27:55] VALIDATION

In [10]:
# Persist the trained model. The context manager makes sure the file handle
# is flushed and closed even if pickling raises part-way through.
with open('lstm_claim_and_evidence', "wb") as model_file:
    pickle.dump(model, model_file)

In [12]:
def evaluate_after_train(generator, model, other_from=None, ignore_snippet=None):
    """Run ``model`` over every batch of ``generator`` and score its predictions.

    Args:
        generator: iterable of batches; each batch is read positionally as
            (claimIDs, claims, labels, snippets).
        model: callable invoked as ``model(claims, snippets)`` that returns a
            logits tensor; the predicted class is the argmax over dim 1.
        other_from: which dataset the batches come from when it differs from
            the training set. ``"snes"`` folds predicted class 0 into class 1;
            ``"pomt"`` and ``None`` leave the predictions untouched.
        ignore_snippet: when not None, the snippet slot at this index is
            overwritten with ``"filler"`` for every example before the forward
            pass. This mutates the batch's snippet containers in place.

    Returns:
        Tuple ``(f1_micro, f1_macro, all_claimIDs, all_logits, all_labels,
        all_predictions)``; the last four are flat lists accumulated over all
        batches in iteration order.
    """
    all_labels, all_predictions = [], []
    all_claimIDs, all_logits = [], []

    # NOTE(review): the model's train/eval mode is not touched here; if the
    # model has dropout or batch-norm the caller presumably needs to call
    # model.eval() first — confirm.
    # Nothing is backpropagated during evaluation, so skip building the
    # autograd graph for the forward passes.
    with torch.no_grad():
        for vals in generator:
            claimIDs, claims, labels, snippets = vals[0], vals[1], vals[2], vals[3]

            if ignore_snippet is not None:
                for i in range(len(snippets)):
                    snippets[i][ignore_snippet] = "filler"

            all_labels += labels
            logits = model(claims, snippets)

            predictions = torch.argmax(logits, 1).cpu().numpy()

            # other_from == "pomt" (model trained on snes) needs no remapping.
            if other_from == "snes":  # other data is snes, and model is trained on pomt
                # in this case both "pants on fire!" and "false" should be considered as false
                predictions[predictions == 0] = 1  # 0 is "pants on fire!" and 1 is "false" for pomt.

            all_predictions += predictions.tolist()

            all_claimIDs += claimIDs
            all_logits += logits.detach().cpu().numpy().tolist()

    f1_micro = f1_score(all_labels, all_predictions, average="micro")
    f1_macro = f1_score(all_labels, all_predictions, average="macro")

    return f1_micro, f1_macro, all_claimIDs, all_logits, all_labels, all_predictions

In [13]:
# evaluate_after_train hands back, in order:
#   f1_micro, f1_macro, all_claimIDs, all_logits, all_labels, all_predictions
# The claim IDs and logits are not needed here, so they are discarded.
f1_micro, f1_macro, _, _, labels, predictions = evaluate_after_train(
    test_generator,
    model,
    other_from=None,
    ignore_snippet=None,
)


In [14]:
# Split test positions by whether the prediction matched the gold label.
# The stored values are positions in the labels/predictions lists.
correct, incorrect = [], []
for i, (label, pred) in enumerate(zip(labels, predictions)):
    (correct if label == pred else incorrect).append(i)

In [18]:
# First five positions of each bucket (misclassified first, then correct)
print(incorrect[:5])
print(correct[:5])

[0, 5, 6, 7, 8]
[1, 2, 3, 4, 19]


In [27]:
dataset = "snes"
path = "../multi_fc_publicdata/" + dataset + "/"
# Read the split inside a context manager so the file handle is closed.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load this archive from a trusted source.
with open(path + dataset + "_index_split.pkl", "rb") as split_file:
    splits = pickle.load(split_file)
# NOTE(review): element 2 is used as the test split — presumably the file is
# ordered (train, validation, test); confirm.
test_samples = splits[2]

# Lookup table maps sample in dataset -> its index in the labels/predictions lists
# NOTE(review): assumes this split is the one test_generator was built from,
# so that labels/predictions line up with test_samples — confirm.
sample_to_test_idx = {sample_idx: i for i, sample_idx in enumerate(test_samples)}



In [58]:
# Position (in labels/predictions) of the first misclassified test example
print(incorrect[0])

# Row of main_data that this test example corresponds to
first_incorrect_row = test_samples[incorrect[0]]
print(first_incorrect_row)
cols = [1, 2, 4] # headline, label, text

# Observe this sample. The row is taken from the lookup above instead of a
# hard-coded literal, so the cell stays correct if the split or the model's
# predictions change on a re-run.
obs = main_data.iloc[first_incorrect_row, cols]
print(obs)
print(type(obs))

0
4696
1    Because of the failure to pass a repeal bill, ...
2                                              mixture
4    Chalk up Sen. David Perdue, R-Ga., as someone ...
Name: 4696, dtype: object
<class 'pandas.core.series.Series'>


In [62]:
# obs holds (headline, label, text) in that order; print one field per line.
values = obs.values
print(
    f"Headline: {values[0]}",
    f"Label: {values[1]}",
    f"Text: {values[2]}",
    sep="\n",
)

Headline: Because of the failure to pass a repeal bill, "Obamacare remains the law of the land ... This means more than 300,000 Georgians below the poverty line will still not have access to the insurance Obamacare promised."
Label: mixture
Text: Chalk up Sen. David Perdue, R-Ga., as someone who was unhappy about the Senate’s failure to pass a bill to repeal and replace the Affordable Care Act. After the Senate fell one vote short of overturning key elements of President Barack Obama’s signature health care law, Perdue offered a statement, reprinted here in part: "Throughout this entire process, we have witnessed everything that’s wrong with Washington. The Senate had a real opportunity to dismantle the most damaging parts of Obamacare. As Republicans have railed against the failures of Obamacare for the last seven years, Democrats have failed to acknowledge any shortcomings of Obamacare and refused to try to fix a broken system. "Now, due to an unworkable budget process and politician

In [77]:
import textwrap

def write_samples_to_file(data, indices, filename, samples=None):
    """Write the headline, label and text of selected test examples to a text file.

    Args:
        data: DataFrame addressed positionally; columns 1, 2 and 4 are read
            as headline, label and text.
        indices: the label/prediction indices you want the corresponding
            sample for.
        filename: path of the output file; it is overwritten if it exists.
        samples: sequence mapping a label/prediction index to a row position
            in ``data``. Defaults to the notebook-level ``test_samples``.
    """
    if samples is None:
        samples = test_samples
    cols = [1, 2, 4]
    rows = [samples[i] for i in indices]
    data_wanted = data.iloc[rows, cols]
    # Explicit UTF-8: the article text contains non-ASCII characters (curly
    # quotes etc.), which fail to encode under some platform default codecs.
    with open(filename, 'w', encoding='utf-8') as f:
        for _, row in data_wanted.iterrows():
            headline, label, text = row.values
            f.write("HEADLINE:\n")
            for line in textwrap.wrap(headline, width=80):
                f.write(line + '\n')
            f.write("LABEL: " + label + '\n')
            f.write("TEXT: ")
            text_lines = textwrap.wrap(text, width=80)
            for line in text_lines:
                f.write(line + '\n')
            if not text_lines:
                # Empty text wraps to no lines; end the "TEXT: " line so the
                # separator still starts on its own line.
                f.write('\n')
            f.write("===============================\n")

    

In [79]:
# One report per bucket: correctly classified examples, then the misclassified ones.
write_samples_to_file(data=main_data, indices=correct, filename='correct.txt')
write_samples_to_file(data=main_data, indices=incorrect, filename='incorrect.txt')

In [105]:
# Show entries 10-14 of `correct` (positions in labels/predictions that were classified correctly)
correct[10:15]

[32, 35, 36, 38, 40]

In [107]:
# Full record for test position 35 (one of the correctly classified positions
# listed by correct[10:15]). Uses positional .iloc, matching every other
# main_data lookup made through test_samples in this notebook.
# NOTE(review): assumes test_samples holds row positions rather than index
# labels — the two coincide only while main_data keeps its default index; confirm.
main_data.iloc[test_samples[35]]

0                                            pomt-12218
1     Kim Jong Un dead: North Koreans calling Trump ...
2                                                 false
3     /punditfact/statements/2017/jul/20/blog-postin...
4     A report that Kim Jong Un is dead and that it ...
5                                                  None
6                                              Bloggers
7                                                  None
8                                                  None
9                                                  None
10                                  2017-07-20T17:25:19
11                                           2017-07-06
12                                             ['None']
Name: 2750, dtype: object