# Download data from Kaggle


In [0]:
import os
# Expose the Kaggle API credentials through environment variables for the
# `kaggle` command below. Replace the placeholders with the values from your
# kaggle.json file, and avoid committing real credentials.
os.environ['KAGGLE_USERNAME'] = "xxxxx" # username from the json file
os.environ['KAGGLE_KEY'] = "xxxxxxxxxxxxx" # key from the json file
# IPython shell escape: download the files of the "fake-news" competition.
!kaggle competitions download -c fake-news

In [0]:
import zipfile

# Unpack both downloaded archives; the empty path is relative, so the files
# land in the current working directory.
for archive_name in ('test.csv.zip', 'train.csv.zip'):
    with zipfile.ZipFile(archive_name, 'r') as archive:
        archive.extractall('')

In [0]:
import pandas as pd

# Read the three CSV files into DataFrames, in the order test, train, submit.
test_df, train_df, submit_df = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'train.csv', 'submit.csv')
)

# Attach the labels from submit.csv to the test rows. The assignment aligns on
# the DataFrame index, so it presumably relies on both files listing their
# rows in the same order -- TODO confirm.
# NOTE(review): confirm that submit.csv holds the reference labels for test.csv.
test_df['label'] = submit_df['label']

print("Rows in test: %s " % len(test_df))
print("Rows in train: %s " % len(train_df))

In [0]:
# Show the first five rows of the training data as this cell's output.
train_df.head(5)

# Imports

In [0]:
import torch
import torch.nn as nn
import math
import random
import string
import time
import matplotlib.pyplot as plt

%matplotlib inline

# Preparing Data

In [0]:
def read_data(dataframe):
    """Return the rows of *dataframe* as a list of ``(title, label)`` tuples.

    Only the ``title`` and ``label`` columns are kept, and every row with a
    missing value in either of them is dropped before conversion.
    """
    columns = ['title', 'label']
    complete_rows = dataframe[columns].dropna(subset=columns)
    return list(map(tuple, complete_rows.to_numpy()))


def random_training_pair(pairs):
    """Return one element of *pairs*, chosen uniformly at random.

    Raises ValueError when *pairs* is empty.
    """
    position = random.randrange(len(pairs))
    return pairs[position]

# Establish Tensors

In [0]:
# Character vocabulary: every character in string.printable. Its length is
# the width of the one-hot vectors and the input size of the network.
all_characters = string.printable
n_characters = len(all_characters)

In [0]:
def line_to_tensor(line):
    """Encode *line* as a one-hot tensor of shape <len(line) x 1 x n_characters>.

    Row ``i`` has a single 1 in the column of ``line[i]`` within
    ``all_characters``. Characters that are not part of the vocabulary
    (for example non-ASCII punctuation) are encoded as an all-zero row.
    """
    tensor = torch.zeros(len(line), 1, n_characters)
    for li, letter in enumerate(line):
        index = all_characters.find(letter)
        # str.find returns -1 for unknown characters; using that as an index
        # would silently mark the *last* vocabulary column instead.
        if index >= 0:
            tensor[li][0][index] = 1
    return tensor


def label_to_tensor(label):
    """Wrap *label* in a one-dimensional ``torch.long`` tensor of length 1."""
    values = [label]
    return torch.tensor(values, dtype=torch.long)

# Create Network

In [0]:
class RNNClassify(nn.Module):
    """Minimal recurrent classifier that is applied one time step at a time.

    Each call concatenates the current input with the previous hidden state
    and feeds the result through two linear layers: one produces the next
    hidden state, the other produces log-probabilities over the classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Width of one input vector.
            hidden_size: Width of the hidden state.
            output_size: Number of output classes.
        """
        super(RNNClassify, self).__init__()

        self.hidden_size = hidden_size

        # Both layers read the concatenated [input, hidden] vector.
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden=None):
        """Process a single time step.

        Args:
            input: 2-D tensor ``(batch, input_size)`` for the current step.
            hidden: 2-D tensor ``(batch, hidden_size)`` carried over from the
                previous step. When omitted, a zero state matching the batch
                size of ``input`` is used.

        Returns:
            A tuple ``(output, hidden)`` with the log-probabilities over the
            classes and the next hidden state.
        """
        if hidden is None:
            # The default used to reach torch.cat as None and fail there.
            hidden = self.init_hidden(input.size(0))

        # Concatenate input tensor and hidden state along the feature axis
        combined = torch.cat((input, hidden), 1)

        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self, batch_size=1):
        """Return a zero hidden state of shape ``(batch_size, hidden_size)``.

        The tensor is created with the dtype and device of the layer weights
        so it can be concatenated with inputs after the model has been moved.
        """
        return self.i2h.weight.new_zeros(batch_size, self.hidden_size)


def category_from_output(output):
    """Return, as a Python int, the index of the highest-scoring category.

    For a 2-D *output* only the first row is considered.
    """
    best = output.topk(1)
    return best.indices[0].item()

# Train Network

In [0]:
def time_since(since):
    """Format the time elapsed since the timestamp *since* as ``'<m>m <s>s'``.

    *since* is a value previously obtained from ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def make_train_step(model, criterion, optimizer):
    """Build a closure that runs one optimisation step on a single sequence.

    The returned ``train_step(x, y)`` feeds ``x`` to *model* one element
    (along the first dimension) at a time, scores the final output against
    ``y`` with *criterion*, back-propagates, and applies *optimizer*. It
    returns the final output together with the loss as a Python float.
    """
    def train_step(x, y):
        # Switch the model to training mode
        model.train()

        # Start from a fresh hidden state and carry it through the sequence;
        # only the output of the last element is kept.
        state = model.init_hidden()
        for position in range(x.size(0)):
            output, state = model(x[position], state)

        # Loss on the final prediction, then gradients
        loss = criterion(output, y)
        loss.backward()

        # Apply the update and clear the gradients for the next call
        optimizer.step()
        optimizer.zero_grad()

        return output, loss.item()

    return train_step

def run(train_df, plot=False, *, n_iters=100000, print_every=5000,
        plot_every=1000, hidden_len=256, learning_rate=0.0002,
        model_path="test.model"):
    """Train an RNNClassify model on the titles of *train_df* and save it.

    Every iteration draws one random ``(title, label)`` pair, encodes the
    title character by character, and performs a single SGD step.

    Args:
        train_df: DataFrame with ``title`` and ``label`` columns.
        plot: When true, plot the averaged training loss at the end.
        n_iters: Number of training iterations (one sample each).
        print_every: Print a progress line every this many iterations.
        plot_every: Record the mean loss every this many iterations.
        hidden_len: Size of the hidden state of the network.
        learning_rate: Learning rate passed to the SGD optimizer.
        model_path: File the trained state dict is written to.

    Raises:
        ValueError: If *train_df* has no row with both a title and a label.
    """
    data_tuples = read_data(train_df)
    if not data_tuples:
        raise ValueError(
            "train_df contains no rows with both a title and a label")

    current_loss = 0
    all_losses = []

    # Two output classes: one per label value.
    model = RNNClassify(n_characters, hidden_len, 2)

    # Create the optimizer and loss function (criterion). NLLLoss matches the
    # log-probabilities produced by the model's LogSoftmax layer.
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    train_step = make_train_step(model, criterion, optimizer)

    start = time.time()

    for i in range(1, n_iters + 1):
        # Get data and turn input/target into tensors
        title, label = random_training_pair(data_tuples)
        input_tensor = line_to_tensor(title)
        target_tensor = label_to_tensor(label)

        # Run one training step
        output, loss = train_step(input_tensor, target_tensor)

        # Everything below only reports how the network is learning
        current_loss += loss

        if i % print_every == 0:
            guess = category_from_output(output)
            correct = '✓' if label == guess else '✗'
            print('%d %d%% (%s) %.4f %s %s' % (i, i / n_iters * 100, time_since(start), loss, title, correct))

        if i % plot_every == 0:
            # Store the mean loss of the last plot_every iterations
            all_losses.append(current_loss / plot_every)
            current_loss = 0

    if plot:
        plt.figure()
        plt.plot(all_losses)

    # Save the model
    torch.save(model.state_dict(), model_path)

# Train on the training set, plot the averaged loss curve, and save the
# weights to "test.model".
run(train_df, plot=True)

# Evaluate Model

In [0]:
def evaluate(title, model):
    """Predict the label index for *title* using *model*.

    The model is switched to evaluation mode and fed the one-hot encoded
    title one character at a time; the category with the highest score after
    the last character is returned.

    Raises:
        ValueError: If *title* is empty, since there is no output to read.
    """
    model.eval()
    input_tensor = line_to_tensor(title)
    if input_tensor.size(0) == 0:
        raise ValueError("cannot evaluate an empty title")

    # No gradients are needed for inference; skipping them avoids building an
    # autograd graph for every character.
    with torch.no_grad():
        hidden = model.init_hidden()
        for i in range(input_tensor.size(0)):
            output, hidden = model(input_tensor[i], hidden)
    return category_from_output(output)

def calculate_accuracy(model, test_df):
    """Compute accuracy, recall and precision of *model* on *test_df*.

    Rows without a title or label are skipped by ``read_data`` and are not
    counted in any metric. A truthy label is treated as the positive class.

    Returns:
        A tuple ``(accuracy, recall, precision)``. A metric whose denominator
        is zero (for example recall when there are no positive rows) is
        reported as 0.0.
    """
    false_positives = 0
    false_negatives = 0
    true_positives = 0
    true_negatives = 0

    tuples = read_data(test_df)
    for title, label in tuples:
        prediction = evaluate(title, model)
        is_correct = label == prediction
        if label:
            if is_correct:
                true_positives += 1
            else:
                false_negatives += 1
        else:
            if is_correct:
                true_negatives += 1
            else:
                false_positives += 1

    def ratio(numerator, denominator):
        # Avoid ZeroDivisionError when a class is absent from the data.
        return numerator / denominator if denominator else 0.0

    # Divide by the number of rows actually evaluated, not len(test_df):
    # rows dropped by read_data were never predicted.
    accuracy = ratio(true_positives + true_negatives, len(tuples))
    recall = ratio(true_positives, true_positives + false_negatives)
    precision = ratio(true_positives, true_positives + false_positives)
    return accuracy, recall, precision

In [0]:
# Rebuild the network with the sizes used in run() (hidden size 256, two
# output classes) so the saved state dict matches its parameters.
model = RNNClassify(n_characters, 256, 2)
# Restore the weights that run() saved to 'test.model'.
model.load_state_dict(torch.load('test.model'))
# Returns (accuracy, recall, precision); shown as this cell's output.
calculate_accuracy(model, test_df)