# Recurrent network for fake news classification using per-word token embeddings instead of a single embedding for the whole document

In [9]:
#!pip install torchvision --quiet

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# nlp stuff
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 

# PyTorch libraries
import torch
import nlp_nets as nlp
from torch import nn
import torch.optim as optim
from os.path import join as opj
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm.auto import tqdm

In [4]:
# Folder holding the dataset CSV files (absolute, machine-specific path).
data_folder = r'/Users/AdamHarris/Documents/neuromatch_nlp/Neuromatch_NLP/dataset'

# Read the three splits into DataFrames.
# NOTE(review): the test split is read from 'test_df.csv' while the other two
# come from '*_embeddings.csv' files -- confirm this difference is intended.
test_df = pd.read_csv(f'{data_folder}/test_df.csv')
val_df = pd.read_csv(f'{data_folder}/validation_df_embeddings.csv')
train_df = pd.read_csv(f'{data_folder}/train_df_embeddings.csv')

# Pull the 'text' and 'label' columns out of each split.
test_txt, test_label = test_df['text'], test_df['label']
train_txt, train_label = train_df['text'], train_df['label']
val_txt, val_label = val_df['text'], val_df['label']


### Tokenisation and word embedding

In [6]:
def tokenize(text):
    """
    Lower-case a string and split it on whitespace

    Args:
      text: string
        Text to tokenise

    Returns:
      tokens: list
        Lower-cased tokens in their original order; empty list if `text`
        is empty or contains only whitespace
    """
    lowered = text.lower()
    # split() with no separator splits on any run of whitespace and drops
    # leading/trailing whitespace, so no empty tokens are produced.
    tokens = lowered.split()
    return tokens

In [7]:
# Tokenise every document of each split; each result is a list holding one
# token list per document, in the same order as the source column.
tokenized_train = list(map(tokenize, train_txt))
tokenized_val = list(map(tokenize, val_txt))
tokenized_test = list(map(tokenize, test_txt))

In [8]:
"""
Apparently, while it is technically possible for RNNs to handle variable-length input,
in practice this makes things tricky.
Here are the preprocessing steps I've seen other people use:
1 - tokenise (i.e. split each article into individual words)
2 - use this to build a dictionary of individual words
3 - rank-order the words by frequency
4 - consider only the most common n words (e.g. a dictionary of 40,000)
5 - assign each word a number based on its position in the frequency ranking
6 - choose a 'sentence length' hyperparameter. Ours may be in the hundreds because the articles are longer than other NLP data I've seen (tweets)
7 - use the word-number IDs for each article, in the right order, to get a vector of length == sentence_length (e.g. [34,1,2,45,3, ...], or [0,0,0,0,34,56, ...])
This fixed sentence length makes things like batch processing play much nicer.
"""

['video',
 'hillary',
 'left',
 'want',
 'america',
 'see',
 'today',
 'compelled',
 'sit',
 'write',
 'letter',
 'anyone',
 'particular',
 'maybe',
 'even',
 'black',
 'female',
 'executive',
 'trump',
 'organization',
 'longer',
 'remain',
 'silent',
 'repeated',
 'reprehensible',
 'attempts',
 'align',
 'boss',
 'family',
 'racist',
 'hate',
 'mongering',
 'groups',
 'campaigns',
 'messaging',
 'daughter',
 'man',
 'born',
 'birmingham',
 'alabama',
 'rose',
 'odds',
 'become',
 'one',
 'established',
 'respected',
 'doctor',
 'yale',
 'university',
 'amount',
 'money',
 'world',
 'could',
 'buy',
 'loyalty',
 'family',
 'subscribed',
 'intolerant',
 'bigoted',
 'ideologies',
 'lynne',
 'patton',
 'reading',
 'powerful',
 'letter',
 'penned',
 'dispel',
 'lies',
 'donald',
 'j',
 'trump',
 'well',
 'family',
 'linkedin',
 'lynn',
 'patton',
 'chief',
 'staff',
 'eric',
 'trump',
 'ivanka',
 'trump',
 'donald',
 'trump',
 'jr',
 'may',
 'present',
 'oversee',
 'primary',
 'assistants

TypeError: 'str' object does not support item assignment

In [None]:
class Net(nn.Module):
  """
  Multi-layer perceptron (MLP) built from a list of hidden layer sizes.

  The layers live in `self.mlp`, an nn.Sequential made of one
  `Linear_<i>` / `Activation_<i>` pair per entry of `hidden_unit_nums`,
  followed by a final `Output_Linear` layer with no activation after it.
  """

  def __init__(self, actv, input_feature_num, hidden_unit_nums, output_feature_num):
    """
    Initialize MLP Network parameters

    Args:
      actv: string
        Activation constructor expression, evaluated as `nn.<actv>`;
        it must produce a module instance, e.g. 'ReLU()' or 'LeakyReLU(0.1)'
      input_feature_num: int
        Number of input features
      hidden_unit_nums: list
        Number of units per hidden layer, list of integers
        (an empty list gives a single linear layer)
      output_feature_num: int
        Number of output features

    Returns:
      Nothing
    """
    super().__init__()
    self.input_feature_num = input_feature_num  # Saved so forward() can flatten its input
    self.mlp = nn.Sequential()  # Container the layers are appended to by name

    in_num = input_feature_num  # Input width of the layer currently being built
    for i, out_num in enumerate(hidden_unit_nums):
      self.mlp.add_module(f'Linear_{i}', nn.Linear(in_num, out_num))

      # NOTE(security): eval() executes arbitrary code, so `actv` must only
      # ever come from trusted code, never from user-supplied input.
      # It is evaluated once per layer so every layer gets its own
      # activation instance.
      actv_layer = eval(f'nn.{actv}')
      self.mlp.add_module(f'Activation_{i}', actv_layer)

      in_num = out_num  # Next layer consumes this layer's output

    # Final projection to the requested number of outputs (no activation)
    self.mlp.add_module('Output_Linear', nn.Linear(in_num, output_feature_num))

  def forward(self, x):
    """
    Forward pass of the MLP Network

    Args:
      x: torch.tensor
        Input data; any shape whose elements can be regrouped into rows
        of `input_feature_num` values

    Returns:
      logits: torch.tensor
        Output of the final linear layer, of shape
        (batch_size, output_feature_num)
    """
    # Flatten inputs to (batch_size, input_feature_num), in case the input
    # is not 2D (like an image). reshape() is used rather than view() so
    # that non-contiguous inputs (e.g. permuted/transposed tensors) are
    # accepted too; for contiguous inputs it behaves exactly like view().
    x = x.reshape(-1, self.input_feature_num)

    logits = self.mlp(x)  # Forward pass of MLP
    return logits

### Now let's define our recurrent model

In [None]:
class RecurrentNLP(nn.module):
    def __init__(self, )