In [1]:
import os
import torch
import torchvision
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


In [2]:
# preprocessing
fake_news_frame = pd.read_csv('fake_and_real_news/Fake.csv')
true_news_frame = pd.read_csv('fake_and_real_news/True.csv')

# add authenticity label


fake_column = ["Fake"] * len(fake_news_frame)
#print(len(fake_column))

fake_news_frame.insert(4, 'authenticity', fake_column)
#fake_news_frame.to_csv(path_or_buf='fake_and_real_news/NewFake.csv')


true_column = ["True"] * len(true_news_frame)
#print(len(true_column))

true_news_frame.insert(4, 'authenticity', true_column)
#true_news_frame.to_csv(path_or_buf='fake_and_real_news/NewTrue.csv')


# combine datasets

frames = [fake_news_frame, true_news_frame]
combined_news_frame = pd.concat(frames)

combined_news_frame.to_csv(path_or_buf='fake_and_real_news/Combined.csv')

In [3]:
news_frame = pd.read_csv('fake_and_real_news/Combined.csv')

n = 1
title = news_frame.iloc[n, 1]
text = news_frame.iloc[n, 2]
subject = news_frame.iloc[n, 3]
date = news_frame.iloc[n, 4]
authenticity = news_frame.iloc[n, 5]

print('Title: {}'.format(title))
print('Text: {}'.format(text))
print('Subject: {}'.format(subject))
print('Date: {}'.format(date))
print('Authenticity: {}'.format(authenticity))

Title:  Drunk Bragging Trump Staffer Started Russian Collusion Investigation
Text: House Intelligence Committee Chairman Devin Nunes is going to have a bad day. He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investigation so he s been lashing out at the Department of Justice and the FBI in order to protect Trump. As it happens, the dossier is not what started the investigation, according to documents obtained by the New York Times.Former Trump campaign adviser George Papadopoulos was drunk in a wine bar when he revealed knowledge of Russian opposition research on Hillary Clinton.On top of that, Papadopoulos wasn t just a covfefe boy for Trump, as his administration has alleged. He had a much larger role, but none so damning as being a drunken fool in a wine bar. Coffee boys  don t help to arrange a New York meeting between Trump and President Abdel Fattah el-Sisi of Egypt two months before the election. It was known bef

In [4]:
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html


class NewsDataset(Dataset):
    """ News dataset. """
    
    def __init__(self, csv_file, root_dir):
        """Args:
            csv_file (string): Path to the news csv file.
            root_dir (string): Path to the root directory
        """
        
        self.news_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        
    def __len__(self):
        return len(self.news_frame)
    
    
    def __getitem__(self, idx):
        news = self.news_frame.iloc[idx, 1:]
        
        # we are going to have to make some way to parse text as words, and to feed it to the NN
        
        return news

In [5]:
news_dataset = NewsDataset('fake_and_real_news/Combined.csv', 'fake_and_real_news/')

In [6]:
# processing: populate text dataset with news article texts

rows = np.arange(len(news_dataset))

text_dataset = []

for idx in range(0, len(news_dataset)):
    text_dataset.append(news_dataset[idx][1])

In [7]:
pip install --user -U nltk

Requirement already up-to-date: nltk in /home/paul/.local/lib/python3.6/site-packages (3.5)
Note: you may need to restart the kernel to use updated packages.


In [8]:
# processing: encode one news article using nltk encoder
import nltk

from torchnlp.encoders.text import TreebankEncoder


encoder = TreebankEncoder(text_dataset)
encoder.encode("Test input.")
print(len(encoder.vocab))

encoder.encode(text_dataset[10923])

#lineno = 0

#for text in text_dataset.values():
#    encoder.encode(text)
#    if lineno % 5000 == 0:
#        print(lineno)
#    lineno += 1

[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


296562


tensor([])

In [9]:
encoder.encode(text_dataset[10923])

tensor([])

In [10]:
lineno = 0

lengths = []

for text in text_dataset:
    lengths.append(len(text))
#     lineno += 1
#     if lineno % 5000 == 0:
#         print(lineno)

MAX_TEXT_LENGTH = max(lengths)

In [11]:
len(text_dataset[0])

2893

In [12]:
MAX_TEXT_LENGTH

51794

In [13]:
# processing: pad texts to have matching length

from torchnlp.encoders import Encoder
from torchnlp.encoders.text import pad_tensor
from torchnlp.encoders.text.default_reserved_tokens import DEFAULT_PADDING_INDEX

lineno = 0

padded_texts = []

for text in text_dataset:
    padded_texts.append(pad_tensor(encoder.encode(text).long(), MAX_TEXT_LENGTH))
    if lineno % 5000 == 0:
        print(lineno)
    lineno += 1

0
5000
10000
15000
20000
25000
30000
35000
40000


In [14]:
# processing: convert longs to floats (for NN)

# lineno = 0

# for text in padded_texts:
#     for long in text:
#         long = float(long)
#     if lineno % 5000 == 0:
#         print(lineno)
#     lineno += 1

In [15]:
padded_texts[1]

tensor([272, 273, 274,  ...,   0,   0,   0])

In [16]:
# processing: split news article texts into strings, store in dictionary

# dictionary = []
# lengths = []

# split_texts = []
# lineno = 0

# for text in text_dataset.values():
#     text_words = text.split()
#     split_texts.append(text_words)
#     #print(len(text), lineno)
#     lengths.append(len(text))
#     lineno += 1
#     for word in text_words:
#         if word not in dictionary:
#             dictionary.append(word)

            
# MAX_TEXT_LENGTH = max(lengths)
# print(MAX_TEXT_LENGTH)


In [17]:
# processing: pad data to provide uniform input to NN

# dictionary = {'fake': 0, 'true': 1}


###

# random train data to test NN

# split_texts = []

# split_texts.append(['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake'])
# split_texts.append(['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake'])
# split_texts.append(['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake'])
# split_texts.append(['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake'])
# split_texts.append(['true', 'true', 'true', 'true', 'true', 'true', 'true'])
# split_texts.append(['true', 'true', 'true', 'true', 'true', 'true', 'true'])
# split_texts.append(['true', 'true', 'true', 'true', 'true', 'true', 'true'])
# split_texts.append(['true', 'true', 'true', 'true', 'true', 'true', 'true'])

# MAX_TEXT_LENGTH = 7

###

# for text in split_texts:
#     while len(text) < MAX_TEXT_LENGTH:
#         text.append('word')

In [18]:
# processing: convert split texts (lists of strings) into lists of ints, using dictionary

# converted_split_texts = []

# for text in split_texts:
#     converted_text = []
#     for word in text:
#         converted_text.append(dictionary[word])
#     converted_split_texts.append(converted_text)
    
# print(converted_split_texts)

In [19]:
# processing: collect vector of true/fake booleans to train dataset

rows = np.arange(len(news_dataset))

true_fake_dataset = []

for idx in np.nditer(rows):
    if news_dataset[int(idx)][4] == 'Fake':
        true_fake_dataset.append(0)
    if news_dataset[int(idx)][4] == 'True':
        true_fake_dataset.append(1)
        
###

# random train data to test NN

# true_fake_dataset = [0, 0, 0, 0, 1, 1, 1, 1]

###

In [20]:
# define Net class

import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(MAX_TEXT_LENGTH, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)


In [21]:
# initialize net and print parameters

net = Net()
print(net)

params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

Net(
  (fc1): Linear(in_features=51794, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
)
8
torch.Size([64, 51794])


In [22]:
# optimize net with backprop; 3 epochs

# import torch.optim as optim

# optimizer = optim.Adam(net.parameters(), lr=0.001)

# EPOCHS = 3


# for epoch in range(EPOCHS):
#     for i, padded_text in enumerate(padded_texts, start=0):
#         # data is a batch of featuresets and labels
#         #print(true_fake_dataset[i])
#         X = padded_text
#         #print(X)
#         y = torch.tensor([true_fake_dataset[i]], dtype=torch.long)
#         if i % 500 == 0:
#             print(i)
#         #print(y)
#         net.zero_grad()
#         X_float = X.new_tensor(X, dtype=torch.float) # convert tensor long to float, NN must read in float
# #         print(X_float)
#         output = net(X_float.view(-1, MAX_TEXT_LENGTH))
#         loss = F.nll_loss(output, y)
#         loss.backward()
#         optimizer.step()
#     print(loss)


In [23]:
# calculate and print accuracy

# correct = 0
# total = 0

# with torch.no_grad():
#     for i, padded_text in enumerate(padded_texts, start=0):
#         X = padded_text
#         y = torch.tensor([true_fake_dataset[i]], dtype=torch.long)
#         X_float = X.new_tensor(X, dtype=torch.float) # convert tensor long to float
#         output = net(X_float.view(-1, MAX_TEXT_LENGTH))
# #         print(torch.argmax(output))
#         for idx, i in enumerate(output):
#             if torch.argmax(i) == y[idx]:
#                 correct += 1
#             total += 1

# print("Accuracy: ", round(correct/total, 3))

In [24]:
# import glove relations

import bcolz
import pickle

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'6B.50.dat', mode='w')


with open(f'glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
#         print(vectors[idx])
    
# print(vectors[20000000:20000050])
    
# save outputs to disk    

vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'6B.50.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'6B.50_idx.pkl', 'wb'))

In [25]:
# create glove dictionary

vectors = bcolz.open(f'6B.50.dat')[:]
words = pickle.load(open(f'6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

In [26]:
glove['<unk>']

array([ 0.072617, -0.51393 ,  0.4728  , -0.52202 , -0.35534 ,  0.34629 ,
        0.23211 ,  0.23096 ,  0.26694 ,  0.41028 ,  0.28031 ,  0.14107 ,
       -0.30212 , -0.21095 , -0.10875 , -0.33659 , -0.46313 , -0.40999 ,
        0.32764 ,  0.47401 , -0.43449 ,  0.19959 , -0.55808 , -0.34077 ,
        0.078477,  0.62823 ,  0.17161 , -0.34454 , -0.2066  ,  0.1323  ,
       -1.8076  , -0.38851 ,  0.37654 , -0.50422 , -0.012446,  0.046182,
        0.70028 , -0.010573, -0.83629 , -0.24698 ,  0.6888  , -0.17986 ,
       -0.066569, -0.48044 , -0.55946 , -0.27594 ,  0.056072, -0.18907 ,
       -0.59021 ,  0.55559 ])

In [27]:
# populate weights matrix for entire vocab of encoder

matrix_len = len(encoder.vocab)
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

for i, word in enumerate(encoder.vocab):
    try: 
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(50))

In [28]:
weights_matrix.shape

(296562, 50)

In [29]:
from torch.autograd import Variable

def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = torch.Tensor(weights_matrix).size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': torch.Tensor(weights_matrix)})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class ToyNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers):
        super().__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.lin = nn.Linear(hidden_size, 2)
    
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))
    
    def forward(self, inp, hidden):
        output, h_n = self.gru(self.embedding(inp), self.init_hidden(10))
        print(output, "GRU output (output)")
#         print(x.output.view(seq_len, batch, num_directions, hidden_size), "GRU output, (output) unpacked")
#         print(x.h_n, "GRU output (h_n)")
#         print(x.h_n.view(num_layers, num_directions, batch, hidden_size), "GRU output (h_n) unpacked")
        x = F.relu(self.lin(output[:,-1,:]))
        print(x, "Relu output")
        return x

In [30]:
toynet = ToyNN(weights_matrix, 20, 3)

In [31]:
print(toynet)

params = list(toynet.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

ToyNN(
  (embedding): Embedding(296562, 50)
  (gru): GRU(50, 20, num_layers=3, batch_first=True)
  (lin): Linear(in_features=20, out_features=2, bias=True)
)
15
torch.Size([296562, 50])


In [32]:
embedding = nn.Embedding(296562, 50)

output = embedding(torch.stack(padded_texts[0:10]))
# plus_batch = torch.Tensor([3, output])
output.size()

torch.Size([10, 51794, 50])

In [33]:
tens = [torch.Tensor([1, 2, 3, 4]), torch.Tensor([5, 6, 7, 8])]
tens = torch.stack(tens)
tens

tensor([[1., 2., 3., 4.],
        [5., 6., 7., 8.]])

In [34]:
stacked_padded_texts = torch.stack(padded_texts)

In [35]:
print(stacked_padded_texts.size())
print(len(padded_texts))
padded_texts[10923]

torch.Size([44898, 51794])
44898


tensor([0, 0, 0,  ..., 0, 0, 0])

In [36]:
output = toynet(stacked_padded_texts[0:10], torch.zeros([20]))

tensor([[[ 0.0802, -0.0089, -0.0682,  ..., -0.0275, -0.0292, -0.0552],
         [ 0.1526, -0.0160, -0.1275,  ..., -0.0467, -0.0839, -0.0824],
         [ 0.1915,  0.0049, -0.1622,  ..., -0.0642, -0.0947, -0.1064],
         ...,
         [ 0.1878,  0.0209, -0.2578,  ..., -0.2110, -0.2441, -0.2395],
         [ 0.1878,  0.0209, -0.2578,  ..., -0.2110, -0.2441, -0.2395],
         [ 0.1878,  0.0209, -0.2578,  ..., -0.2110, -0.2441, -0.2395]],

        [[ 0.1070, -0.0142, -0.0691,  ...,  0.0047, -0.0547, -0.0737],
         [ 0.1517,  0.0010, -0.1778,  ..., -0.0149, -0.0944, -0.1389],
         [ 0.1791,  0.0353, -0.2583,  ..., -0.0364, -0.1395, -0.1748],
         ...,
         [ 0.1878,  0.0209, -0.2578,  ..., -0.2110, -0.2441, -0.2395],
         [ 0.1878,  0.0209, -0.2578,  ..., -0.2110, -0.2441, -0.2395],
         [ 0.1878,  0.0209, -0.2578,  ..., -0.2110, -0.2441, -0.2395]],

        [[ 0.0557, -0.0187, -0.0648,  ..., -0.0122, -0.0182, -0.0220],
         [ 0.1206, -0.0307, -0.0808,  ..., -0

In [37]:
output[0]

tensor([0.0683, 0.0000], grad_fn=<SelectBackward>)