In [1]:
import os
import torch
import torchvision
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


In [2]:
# preprocessing
fake_news_frame = pd.read_csv('fake_and_real_news/Fake.csv')
true_news_frame = pd.read_csv('fake_and_real_news/True.csv')

# add authenticity label


fake_column = ["Fake"] * len(fake_news_frame)
#print(len(fake_column))

fake_news_frame.insert(4, 'authenticity', fake_column)
#fake_news_frame.to_csv(path_or_buf='fake_and_real_news/NewFake.csv')


true_column = ["True"] * len(true_news_frame)
#print(len(true_column))

true_news_frame.insert(4, 'authenticity', true_column)
#true_news_frame.to_csv(path_or_buf='fake_and_real_news/NewTrue.csv')


# combine datasets

frames = [fake_news_frame, true_news_frame]
combined_news_frame = pd.concat(frames)

combined_news_frame.to_csv(path_or_buf='fake_and_real_news/Combined.csv')

In [3]:
news_frame = pd.read_csv('fake_and_real_news/Combined.csv')

n = 1
title = news_frame.iloc[n, 1]
text = news_frame.iloc[n, 2]
subject = news_frame.iloc[n, 3]
date = news_frame.iloc[n, 4]
authenticity = news_frame.iloc[n, 5]

print('Title: {}'.format(title))
print('Text: {}'.format(text))
print('Subject: {}'.format(subject))
print('Date: {}'.format(date))
print('Authenticity: {}'.format(authenticity))

Title:  Drunk Bragging Trump Staffer Started Russian Collusion Investigation
Text: House Intelligence Committee Chairman Devin Nunes is going to have a bad day. He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investigation so he s been lashing out at the Department of Justice and the FBI in order to protect Trump. As it happens, the dossier is not what started the investigation, according to documents obtained by the New York Times.Former Trump campaign adviser George Papadopoulos was drunk in a wine bar when he revealed knowledge of Russian opposition research on Hillary Clinton.On top of that, Papadopoulos wasn t just a covfefe boy for Trump, as his administration has alleged. He had a much larger role, but none so damning as being a drunken fool in a wine bar. Coffee boys  don t help to arrange a New York meeting between Trump and President Abdel Fattah el-Sisi of Egypt two months before the election. It was known bef

In [4]:
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html


class NewsDataset(Dataset):
    """ News dataset. """
    
    def __init__(self, csv_file, root_dir):
        """Args:
            csv_file (string): Path to the news csv file.
            root_dir (string): Path to the root directory
        """
        
        self.news_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        
    def __len__(self):
        return len(self.news_frame)
    
    
    def __getitem__(self, idx):
        news = self.news_frame.iloc[idx, 1:]
        
        # we are going to have to make some way to parse text as words, and to feed it to the NN
        
        return news

In [5]:
news_dataset = NewsDataset('fake_and_real_news/Combined.csv', 'fake_and_real_news/')

In [6]:
# processing: populate text dataset with news article texts

rows = np.arange(len(news_dataset))

text_dataset = []

for idx in range(0, len(news_dataset)):
    text_dataset.append(news_dataset[idx][1])

In [7]:
pip install --user -U nltk

Requirement already up-to-date: nltk in /home/paul/.local/lib/python3.6/site-packages (3.5)
Note: you may need to restart the kernel to use updated packages.


In [8]:
# processing: encode one news article using nltk encoder
import nltk

from torchnlp.encoders.text import TreebankEncoder


encoder = TreebankEncoder(text_dataset)
encoder.encode("Test input.")
print(len(encoder.vocab))

encoder.encode(text_dataset[10923])

#lineno = 0

#for text in text_dataset.values():
#    encoder.encode(text)
#    if lineno % 5000 == 0:
#        print(lineno)
#    lineno += 1

[nltk_data] Downloading package perluniprops to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/paul/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


296562


tensor([])

In [9]:
encoder.encode(text_dataset[10923])

tensor([])

In [10]:
lineno = 0

lengths = []

for text in text_dataset:
    lengths.append(len(encoder.encode(text)))
    lineno += 1
    if lineno % 5000 == 0:
        print(lineno, "lineno")
        print(len(encoder.encode(text)), "len")
        print(text)

# MAX_TEXT_LENGTH = max(lengths)

average_len = sum(lengths)/len(lengths)
print(average_len)

TEXT_LENGTH = round(average_len)

5000 lineno
398 len
As Donald Trump s campaign continues to sink deeper into its self-sabotaging downward spiral, it s becoming clear that even Trump s campaign surrogates and former staffers are having trouble trying to stay positive about the outcome of this election. In just the past few days, we ve seen them completely deny that Trump s campaign made some massive changes in desperation, ignore polls that The Donald is losing and now thanks to former Trump campaign manager Corey Lewandowski, they re holding onto false hope that Trump can still somehow win this.Earlier today on CNN, Lewandowski   who is forbidden by contract to say anything negative about Trump   tried to convince everyone that Trump was still on track to win this election because his biggest opponent, Democratic nominee Hillary Clinton, was losing voters to Green Party candidate Jill Stein and Libertarian Party candidate Gary Johnson. Lewandowski said: This is not a two-person race. Gary Johnson and Jill Stein are i

In [11]:
lengths[2]

671

In [12]:
TEXT_LENGTH

435

In [13]:
# processing: pad texts to have matching length

from torchnlp.encoders import Encoder
from torchnlp.encoders.text import pad_tensor
from torchnlp.encoders.text.default_reserved_tokens import DEFAULT_PADDING_INDEX

lineno = 0

padded_texts = []

print(encoder.encode(text_dataset[5])[0:TEXT_LENGTH])
print(text_dataset[5])
print(lengths[5])
print(len(encoder.encode(text_dataset[5])[0:TEXT_LENGTH]))
print(len(encoder.encode(text_dataset[5])))
# print(encoder.encode(text_dataset[5])[TEXT_LENGTH])

for text in text_dataset:
    if len(encoder.encode(text)) < TEXT_LENGTH:
        padded_texts.append(pad_tensor(encoder.encode(text).long(), TEXT_LENGTH))
#         print(len(pad_tensor(encoder.encode(text).long(), TEXT_LENGTH)))
    if len(encoder.encode(text)) > TEXT_LENGTH:
        padded_texts.append(encoder.encode(text)[0:TEXT_LENGTH])
    if len(encoder.encode(text)) == TEXT_LENGTH:
        padded_texts.append(encoder.encode(text))
    if lineno % 5000 == 0:
        print(lineno)
    lineno += 1

tensor([  39,  696,   57,  997,   57,  998,  999,   17, 1000,  155,   57, 1001,
        1002,   26,  463,  449, 1003, 1004,   23,  673,  151, 1005, 1006,  153,
        1007,   26,   72, 1008, 1009,   17, 1010, 1011, 1012, 1013,  268,   33,
         843,   57, 1014, 1015, 1008,   13, 1016, 1017,   57, 1018, 1019,   23,
         408,  168,   13, 1020, 1021,  354,   13, 1022,   17, 1023, 1024,   23,
         317, 1025,  348, 1007,   26,   72, 1008, 1009,   17, 1026,   23,  542,
         102,  195, 1027,  195,  385, 1028, 1015, 1019,  286,  168,  448, 1029,
         574,  191, 1030,   57, 1031, 1032,  168,   30, 1033,   23,  136,  311,
          94,  463,   33, 1017,   94,   71, 1034,  153,  490, 1035, 1036, 1037,
        1038,  215,  217, 1039,   26, 1040,   17, 1041,  183,  168,  398, 1042,
        1043,   33, 1044,   94,  215,   91,   26,  463,   23, 1045, 1015, 1046,
        1018,  562,  153,   33,  998,   25,  398, 1047, 1048,  317,  844,  159,
         262, 1049, 1018, 1019,   23,  1

In [14]:
# processing: convert longs to floats (for NN)

lineno = 0

for text in padded_texts:
    for long in text:
        long = float(long)
    if lineno % 5000 == 0:
        print(lineno)
    lineno += 1

0
5000
10000
15000
20000
25000
30000
35000
40000


In [15]:
len(padded_texts[10])

435

In [19]:
# processing: collect vector of true/fake booleans to train dataset

rows = np.arange(len(news_dataset))

true_fake_dataset = []

lineno = 0

for idx in np.nditer(rows):
    if news_dataset[int(idx)][4] == 'Fake':
        true_fake_dataset.append(torch.Tensor([0]))
    if news_dataset[int(idx)][4] == 'True':
        true_fake_dataset.append(torch.Tensor([1]))
    if lineno % 5000 == 0:
        print(lineno)
    lineno += 1
        
###

# random train data to test NN

# true_fake_dataset = [0, 0, 0, 0, 1, 1, 1, 1]

###

0
5000
10000
15000
20000
25000
30000
35000
40000


In [20]:
# processing: add true/fake label to padded_texts

print(len(padded_texts))
print(len(true_fake_dataset))
print(len(text_dataset))

trainset = []

lineno = 0

for idx, text in enumerate(padded_texts):
    if lineno % 5000 == 0:
#         print(padded_texts[idx][0])
#         print(len(padded_texts[idx]))
        print(lineno)
    lineno += 1
    trainset.append((text, true_fake_dataset[idx]))

44898
44898
44898
0
5000
10000
15000
20000
25000
30000
35000
40000


In [21]:
trainset[30000][1]

tensor([1.])

In [22]:
trainset[0:10]

[(tensor([  5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,
           19,  20,  21,  22,  23,  24,  25,  26,  27,  13,  28,  29,  26,  30,
           31,  23,  32,  17,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
           43,  25,   7,  44,  45,  26,  46,  17,  24,   8,   9,  46,  47,  48,
           49,  50,  51,  52,  53,  17,  54,  23,  55,  56,  26,  10,  11,  57,
           58,  59,  23,  60,  23,  31,  23,  32,  23,  17,  61,  33,  34,  35,
           62,  63,  64,  23,  13,  14,  17,  65,  15,  16,  23,  66,  67,  68,
           69,  70,  71,  72,  13,  73,  74,  75,  76,  77,  48,  49,  50,  51,
           52,  53,  17,  54,  23,  55,  56,  26,  10,  11,  57,  58,  59,  23,
           60,  23,  31,  23,  32,  23,  17,  61,  33,  34,  35,  62,  63,  64,
           23,  13,  14,  17,  65,  15,  78,  70,  71,  72,  13,  73,  74,  75,
           76,  77,   5,  79,   6,  80,  81,  82,  83,  84,  85,  23,  86,  87,
           88,  89,  90,  91,  92,  93, 

In [25]:
# define Net class

import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(TEXT_LENGTH, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)


In [27]:
# initialize net and print parameters

net = Net()
print(net)

params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

Net(
  (fc1): Linear(in_features=435, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=2, bias=True)
)
8
torch.Size([64, 435])


In [29]:
# optimize net with backprop; 3 epochs

import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr=0.001)

EPOCHS = 3


for epoch in range(EPOCHS):
    for i, padded_text in enumerate(padded_texts, start=0):
        # data is a batch of featuresets and labels
        #print(true_fake_dataset[i])
        X = padded_text
        #print(X)
        y = torch.tensor([true_fake_dataset[i]], dtype=torch.long)
        if i % 500 == 0:
            print(i)
        #print(y)
        net.zero_grad()
        X_float = X.new_tensor(X, dtype=torch.float) # convert tensor long to float, NN must read in float
#         print(X_float)
        output = net(X_float.view(-1, TEXT_LENGTH))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
    print(loss)


0




500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
tensor(0., grad_fn=<NllLossBackward>)
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
3900

In [30]:
# calculate and print accuracy

correct = 0
total = 0

with torch.no_grad():
    for i, padded_text in enumerate(padded_texts, start=0):
        X = padded_text
        y = torch.tensor([true_fake_dataset[i]], dtype=torch.long)
        X_float = X.new_tensor(X, dtype=torch.float) # convert tensor long to float
        output = net(X_float.view(-1, TEXT_LENGTH))
#         print(torch.argmax(output))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

  # Remove the CWD from sys.path while we load stuff.


Accuracy:  0.477
