In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import math
import time
import numpy as np
import sys
import argparse
import os
import re
from nltk.util import ngrams

In [12]:
def decode(vocab,corpus):
    
    text = ''
    for i in range(len(corpus)):
        wID = corpus[i]
        text = text + vocab[wID] + ' '
    return(text)

def encode(words,text):
    corpus = []
    tokens = text.split(' ')
    for t in tokens:
        try:
            wID = words[t][0]
        except:
            wID = words['<unk>'][0]
        corpus.append(wID)
    return(corpus)

def read_encode(file_name,vocab,words,corpus,threshold):
    
    wID = len(vocab)
    
    if threshold > -1:
        with open(file_name,'rt', encoding='utf8') as f:
            for line in f:
                line = line.replace('\n','')
                # Added lower-casing
                line = line.lower()
                
                # Strips out all charcters other than alphanumeric
                line = re.sub('[\W_]+', ' ', line, flags=re.UNICODE)
                
                # Strips out numbers
                line = re.sub('\d+', '', line)
                
                tokens = line.split(' ')
                for t in tokens:
                    try:
                        elem = words[t]
                    except:
                        elem = [wID,0]
                        vocab.append(t)
                        wID = wID + 1
                    elem[1] = elem[1] + 1
                    words[t] = elem

        temp = words
        words = {}
        vocab = []
        wID = 0
        words['<unk>'] = [wID,100]
        vocab.append('<unk>')
        for t in temp:
            if temp[t][1] >= threshold:
                vocab.append(t)
                wID = wID + 1
                words[t] = [wID,temp[t][1]]
            
                    
    with open(file_name,'rt', encoding='utf8') as f:
        for line in f:
            line = line.replace('\n','')
            tokens = line.split(' ')
            for t in tokens:
                try:
                    wID = words[t][0]
                except:
                    wID = words['<unk>'][0]
                corpus.append(wID)
                
    return [vocab,words,corpus]

In [3]:
params = {
        'd_model': 100,
        'd_hidden': 100,
        'n_layers': 2,
        'batch_size': 20,
        'seq_len': 30,
        'printevery': 5000,
        'window': 3,
        'epochs': 20,
        'lr': 0.0001,
        'dropout': 0.35,
        'clip': 2.0,
        'model': 'FFNN',
        'savename': 'lstm',
        'loadname': None,
        'trainname': 'mix.train.txt',
        'validname': 'mix.valid.txt',
        'testname': 'mix.test.txt'
    }
torch.manual_seed(0)

[vocab,words,train] = read_encode(params['trainname'],[],{},[],3)
print('vocab: %d train: %d' % (len(vocab), len(train)))
[vocab,words,test] = read_encode(params['testname'], vocab,words,[],-1)
print('vocab: %d test: %d' % (len(vocab),len(test)))
params['vocab_size'] = len(vocab)

vocab: 33633 train: 3366260
vocab: 33633 test: 441210


In [25]:
words

{'<unk>': [0, 100],
 '': [1, 516387],
 'start': [2, 8048],
 'bio': [3, 15934],
 'hildebrand': [4, 17],
 'bothe': [5, 11],
 'october': [6, 1610],
 'september': [7, 1525],
 'was': [8, 50623],
 'a': [9, 52817],
 'swiss': [10, 276],
 'botanist': [11, 1152],
 'born': [12, 6232],
 'in': [13, 98637],
 'strasbourg': [14, 213],
 'and': [15, 77821],
 'studied': [16, 3552],
 'medicine': [17, 1291],
 'stuttgart': [18, 180],
 'bonn': [19, 186],
 'with': [20, 11974],
 'dissertation': [21, 343],
 'on': [22, 18211],
 'plants': [23, 810],
 'collected': [24, 414],
 'at': [25, 26330],
 'the': [26, 146257],
 'geofried': [27, 1],
 'trier': [28, 28],
 'sent': [29, 360],
 'by': [30, 9396],
 'natural': [31, 1790],
 'history': [32, 2917],
 'collection': [33, 895],
 'of': [34, 118356],
 'london': [35, 3215],
 'society': [36, 5339],
 'to': [37, 37719],
 'belgium': [38, 135],
 'he': [39, 56398],
 'became': [40, 6711],
 'plant': [41, 787],
 'pathologist': [42, 107],
 'started': [43, 598],
 'work': [44, 5276],
 'as

In [4]:
# Removed threshold -- all words will be trained
[vocab,words,train] = read_encode(params['trainname'],[],{},[],0)

In [13]:
# Returns bios in [(bio without puncutation, label), ...]
# 0: FAKE
# 1: REAL
def read_bios(file_name):
    with open(file_name,'rt', encoding='utf8') as f:
        all_bios = f.readlines()
        
    split_bios = []
    curr_bio = ""
    curr_index = 0
    while curr_index < len(all_bios):
        curr_line = all_bios[curr_index].lower()
        # Strips out all charcters other than alphanumeric
        curr_line = re.sub('[\W_]+', ' ', curr_line, flags=re.UNICODE)
        
        # Strips out numbers
        curr_line = re.sub('\d+', '', curr_line)
        
        curr_line = curr_line.strip()
        
        if curr_line == "start bio":
            # Skips their name
            curr_index += 1
        
        elif curr_line == "end bio":
            curr_index += 2
            if "FAKE" in all_bios[curr_index]:
                label = 0
            else:
                label = 1
            
            split_bios.append((curr_bio, label))
            curr_bio = ""
        
        else:
            # Check to ensure not empty space
            if curr_line:
                if curr_bio == "":
                    curr_bio = curr_line
                else:
                    curr_bio += " " + curr_line
        
        curr_index += 1
        
    
    return split_bios
    
split_bios = read_bios(params['trainname'])

In [6]:
# Create windows
# split_bios: [(bio without puncutation, label), ...]
# Returns sliding windows (multiple per biography):
# [
#  [
#   (['Hildebrand', 'Bothe', 'October'], 'September'),
#   (['Bothe', 'October', 'September'], 'was'),
#  ],
#  [
#   ([Hermann', 'Robert', 'Kaiser'], 'September'),
#   ...
#  ]
# ]

# Reference: https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
def create_windows(split_bios, window_size):
    sliding_windows = []
    for bio, _ in split_bios:
        bio_without_nums = ''.join([i for i in bio if not i.isdigit()])
        tokens = [token for token in bio_without_nums.split(" ") if token != ""]
        
        ngrams = []
        for i in range(len(tokens) - window_size):
            ngrams.append((
                [tokens[i + j] for j in range(window_size)],
                tokens[i + window_size]
            ))
        
        sliding_windows.append(ngrams)
    
    return sliding_windows

windows = create_windows(split_bios, 3)

In [83]:
all_context = []
all_labels = []
skipped_labels = 0

for each_bio in windows:
    for context, label in each_bio:
        found_in_words = [word in words for word in context]
        found_in_words.extend([label in words])
        if all(found_in_words):
            all_context.append([words[word][0] for word in context])
            all_labels.append([words[label][0]])
        else:
            all_context.append([0] * len(context))
            all_labels.append([0])
            
            skipped_labels += 1

# Skipping certain sliding windows because they weren't found in the dictionary
# Unk'd?
print(skipped_labels)

all_context = torch.LongTensor(all_context)
all_labels = torch.LongTensor(all_labels)

14


In [155]:
class FFNN(nn.Module):
    # d_model = embedding dimensions
    def __init__(self, vocab, words,d_model, d_hidden, dropout):
        super().__init__() 
    
        self.vocab = vocab
        self.words = words
        self.vocab_size = len(self.vocab)
        self.d_model = d_model
        self.d_hidden = d_hidden
        self.dropout = dropout
        self.embeds = nn.Embedding(self.vocab_size,d_model)
        
        # Context size * dimensions
        self.linear1 = nn.Linear(3 * d_model, 128)
        self.linear2 = nn.Linear(128, self.vocab_size)
        

    def forward(self, src):
        embeds = self.embeds(src).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
                
    def init_weights(self):
        pass

In [156]:
## NON BATCHED

model = FFNN(vocab, words, d_model=100, d_hidden=100, dropout=0.1)
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(2):
    for i in range(len(all_context)):
        context = all_context[i]
        label = all_labels[i]
        log_probabilities = model(context)
        loss = loss_function(log_probabilities, label)
        
        loss.backward()
        optimizer.step()
        
        model.zero_grad()
        
    print("Epoch {}/{}: Loss {:.4f}".format(i, epochs, loss))

KeyboardInterrupt: 

In [157]:
import torch
from torch.utils.data import DataLoader, TensorDataset

BATCH_SIZE = 50

dataset = TensorDataset(all_context, all_labels)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [165]:
# AFTER BATCHING

class FFNN(nn.Module):
    # d_model = embedding dimensions
    def __init__(self, vocab, words,d_model, d_hidden, dropout):
        super().__init__() 
    
        self.vocab = vocab
        self.words = words
        self.vocab_size = len(self.vocab)
        self.d_model = d_model
        self.d_hidden = d_hidden
        self.dropout = dropout
        self.embeds = nn.Embedding(self.vocab_size,d_model)
        
        # Context size * dimensions
        self.linear1 = nn.Linear(3 * d_model, 128)
        self.linear2 = nn.Linear(128, self.vocab_size)
        

    def forward(self, src):
        embeds = self.embeds(src).view((BATCH_SIZE, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
                
    def init_weights(self):
        pass

model = FFNN(vocab, words, d_model=100, d_hidden=100, dropout=0.1)
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    running_loss = 0
    for i, (context, label) in enumerate(dataloader):
        log_probabilities = model(context)
        # Collapsing labels to correct dimensions
        label = label.squeeze()
        loss = loss_function(log_probabilities, label)
        
        loss.backward()
        optimizer.step()
        
        model.zero_grad()
        
        running_loss += loss.item()        
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
            
            torch.save({
                'epoch': epoch,
                'batch': i,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, "fnn.pt")
        
    print("Epoch {}/{}: Loss {:.4f}".format(i, epochs, loss))
    
print("Finished!")

  batch 1000 loss: 7.588064741134644
  batch 2000 loss: 7.059284504413605
  batch 3000 loss: 7.006073610782623
  batch 4000 loss: 6.927615993022918
  batch 5000 loss: 6.919543179988861
  batch 6000 loss: 6.864118236541748
  batch 7000 loss: 6.855780578136444
  batch 8000 loss: 6.875775602340698


KeyboardInterrupt: 