In [1]:
# built with help from this youtube guide: https://www.youtube.com/watch?v=BzcBsTou0C0&list=PLQVvvaa0QuDdeMyHEYc0gxFpYwHY2Qfdh

import os
import torch
import torchvision
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


In [3]:
# preprocessing
fake_news_frame = pd.read_csv('fake_and_real_news/Fake.csv')
true_news_frame = pd.read_csv('fake_and_real_news/True.csv')

# add authenticity label


fake_column = ["Fake"] * len(fake_news_frame)
#print(len(fake_column))

fake_news_frame.insert(4, 'authenticity', fake_column)
#fake_news_frame.to_csv(path_or_buf='fake_and_real_news/NewFake.csv')


true_column = ["True"] * len(true_news_frame)
#print(len(true_column))

true_news_frame.insert(4, 'authenticity', true_column)
#true_news_frame.to_csv(path_or_buf='fake_and_real_news/NewTrue.csv')


# combine datasets

frames = [fake_news_frame, true_news_frame]
combined_news_frame = pd.concat(frames)

combined_news_frame.to_csv(path_or_buf='fake_and_real_news/Combined.csv')

In [4]:
news_frame = pd.read_csv('fake_and_real_news/Combined.csv')

n = 1
title = news_frame.iloc[n, 1]
text = news_frame.iloc[n, 2]
subject = news_frame.iloc[n, 3]
date = news_frame.iloc[n, 4]
authenticity = news_frame.iloc[n, 5]

print('Title: {}'.format(title))
print('Text: {}'.format(text))
print('Subject: {}'.format(subject))
print('Date: {}'.format(date))
print('Authenticity: {}'.format(authenticity))

Title:  Drunk Bragging Trump Staffer Started Russian Collusion Investigation
Text: House Intelligence Committee Chairman Devin Nunes is going to have a bad day. He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investigation so he s been lashing out at the Department of Justice and the FBI in order to protect Trump. As it happens, the dossier is not what started the investigation, according to documents obtained by the New York Times.Former Trump campaign adviser George Papadopoulos was drunk in a wine bar when he revealed knowledge of Russian opposition research on Hillary Clinton.On top of that, Papadopoulos wasn t just a covfefe boy for Trump, as his administration has alleged. He had a much larger role, but none so damning as being a drunken fool in a wine bar. Coffee boys  don t help to arrange a New York meeting between Trump and President Abdel Fattah el-Sisi of Egypt two months before the election. It was known bef

In [5]:
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html


class NewsDataset(Dataset):
    """ News dataset. """
    
    def __init__(self, csv_file, root_dir):
        """Args:
            csv_file (string): Path to the news csv file.
            root_dir (string): Path to the root directory
        """
        
        self.news_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        
    def __len__(self):
        return len(self.news_frame)
    
    
    def __getitem__(self, idx):
        news = self.news_frame.iloc[idx, 1:]
        
        # we are going to have to make some way to parse text as words, and to feed it to the NN
        
        return news

In [6]:
news_dataset = NewsDataset('fake_and_real_news/Combined.csv', 'fake_and_real_news/')

In [7]:
# populate text dataset with news article texts

rows = np.arange(len(news_dataset))

text_dataset = {}

for idx in np.nditer(rows):
    text_dataset[int(idx)] = news_dataset[int(idx)][1]

In [8]:
# split news article texts into strings, store in dictionary

dictionary = []
lengths = []

split_texts = []
lineno = 0

for text in text_dataset.values():
    text_words = text.split()
    split_texts.append(text_words)
    #print(len(text), lineno)
    lengths.append(len(text))
    lineno += 1
    for word in text_words:
        if word not in dictionary:
            dictionary.append(word)

            
MAX_TEXT_LENGTH = max(lengths)
print(MAX_TEXT_LENGTH)


2893 0
1898 1
3597 2
2774 3
2346 4
1741 5
2166 6
2224 7
2772 8
1644 9
2003 10
1625 11
2252 12
1579 13
2795 14
2103 15
2979 16
1988 17
2623 18
2817 19
2932 20
2023 21
1909 22
1594 23
1859 24
2280 25
2017 26
2808 27
2109 28
2750 29
2102 30
2551 31
2599 32
3051 33
1762 34
1657 35
2243 36
2518 37
4039 38
3276 39
2633 40
1666 41
1933 42
2285 43
1785 44
2086 45
2338 46
1848 47
2099 48
1715 49
2967 50
2061 51
2515 52
2135 53
3457 54
3169 55
2491 56
3364 57
4967 58
2012 59
2424 60
2012 61
3234 62
2422 63
2857 64
6942 65
1951 66
1871 67
2996 68
2508 69
2602 70
4042 71
2741 72
2131 73
3943 74
3315 75
2136 76
2042 77
2638 78
4467 79
2356 80
1692 81
1593 82
3010 83
1150 84
2568 85
2209 86
3453 87
2912 88
3086 89
2100 90
3215 91
2348 92
1686 93
3114 94
2631 95
2616 96
3261 97
1882 98
2812 99
2032 100
2275 101
2529 102
2023 103
2604 104
2832 105
2445 106
2893 107
770 108
2754 109
2035 110
2926 111
2356 112
2220 113
3176 114
1881 115
2029 116
2157 117
2184 118
2048 119
1810 120
1381 121
2205 122
2672

In [None]:
# processing: convert dictionary to actual dictionary

print(len(dictionary))

real_dictionary = {}

for word in dictionary:
    real_dictionary.update({ word : dictionary.index(word) })
    if dictionary.index(word) % 50000 == 0:
        print(dictionary.index(word))

397481
0
50000
100000
150000
200000


In [None]:
# processing: write original dictionary to file (useful for later access)

import csv

with open('dictionary.csv', 'w', newline='') as f:
    

In [9]:
# processing: write dictionary to file (useful for later access)

with open('real_dictionary.csv', 'w', newline='') as f:
    fieldnames = ['word', 'number']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
 
    writer.writeheader()

    for key in real_dictionary:
        writer.writerow({ key : dictionary[key] })
        if dictionary[key] % 50000 == 0:
            print(dictionary[key])

TypeError: list indices must be integers or slices, not str

In [None]:
# processing: write split texts to file ( again useful for later access)

with open('split_texts.txt', 'w') as filehandle:
    filehandle.writelines("%s\n" % text for text in split_texts)

In [None]:
# processing: convert split texts (lists of strings) into lists of ints, using dictionary

converted_split_texts = []

for text in split_texts:
    converted_text = []
    for word in text:
        converted_text.append(real_dictionary[word])
    converted_split_texts.append(converted_text)
    
print(converted_split_texts)

In [None]:
# processing: pad data to provide uniform input to NN

for text in split_texts:
    while text.length < MAX_TEXT_LENGTH:
        text.append('word')

In [None]:
# processing: collect vector of true/fake booleans to train dataset

rows = np.arange(len(news_dataset))

true_fake_dataset = []

for idx in np.nditer(rows):
    if news_dataset[int(idx)][4] == 'Fake':
        true_fake_dataset.append(0)
    if news_dataset[int(idx)][4] == 'True':
        true_fake_dataset.append(1)

In [None]:
# define Net class

import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(MAX_TEXT_LENGTH, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return F.log_softmax(x, dim=1)


In [None]:
# initialize net and print parameters

net = Net()
print(net)

params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

In [None]:
# optimize net with backprop; 3 epochs

import torch.optim as optim

optimizer = optim.Adam(net.parameters(), lr=0.001)

EPOCHS = 3


for epoch in range(EPOCHS):
    for i, text in enumerate(converted_split_texts, start=0):
        # data is a batch of featuresets and labels
        #print(true_fake_dataset[i])
        X = torch.Tensor(text)
        #print(X)
        y = torch.tensor([true_fake_dataset[i]], dtype=torch.long)
        #print(i)
        #print(y)
        net.zero_grad()
        output = net(X.view(-1, MAX_TEXT_LENGTH))
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
    print(loss)


In [None]:
# calculate and print accuracy

correct = 0
total = 0

with torch.no_grad():
    for i, text in enumerate(converted_split_texts, start=0):
        X = torch.Tensor(text)
        y = torch.tensor([true_fake_dataset[i]], dtype=torch.long)
        output = net(X.view(-1, MAX_TEXT_LENGTH))
        #print(torch.argmax(output))
        for idx, i in enumerate(output):
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))