In [None]:
%pylab inline  

import pandas as pd
import re
import string
import operator

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader

## 0. Data Cleanup

In [None]:
# Load the scraped stories from disk
df = pd.read_csv('nosleep.csv')

# Drop posts whose author was deleted as well as posts without any text
df = df[(df.Author != '[deleted]') & df.Text.notna()]

## 1. Word2Vec
### 1.1 Preprocess Text and Prepare Word Lists

In [None]:
%%time

NUM_STORIES = 2500

# Take the first NUM_STORIES stories
stories = df.Text[:NUM_STORIES]

# Our final list of clean sentences
data = []

# Cumulative statistics: one entry is appended per processed story
num_sentences = []
num_unique_words = []

# Our dictionary, pre-initialized with the three special tokens.
# (Each token must be its own string; '<UNK>, <NUM>' as one string would be a bogus word.)
unique_words = {'<PAD>', '<UNK>', '<NUM>'}

# Word counts, so we can use the most common words
word_counts = {}

# Patterns are compiled once instead of once per sentence; raw strings keep the
# backslash escapes from being interpreted by Python itself.
SENTENCE_END_RE = re.compile(r"[.!?]")
WHITESPACE_RE = re.compile(r"\s+")
NUMBER_RE = re.compile(r"\d+")

# Maps every punctuation symbol (ASCII plus a few typographic ones) to a space
PUNCTUATION_TO_SPACE = str.maketrans({char: ' ' for char in string.punctuation + "’”“…—-"})

stories_handled = 0
for stories_handled, this_story in enumerate(stories, start=1):

    # A story is broken up into markdown paragraphs, one per line
    for this_paragraph in this_story.split("\n"):

        # Ignore empty lines
        if not this_paragraph:
            continue

        # Skip the whole paragraph if it looks like it contains a markdown link
        if "[" in this_paragraph and 'https://' in this_paragraph:
            continue

        # Split paragraph into sentences based on end-of-sentence punctuation
        for this_sentence in SENTENCE_END_RE.split(this_paragraph):

            # Replace punctuation symbols with spaces
            this_sentence = this_sentence.translate(PUNCTUATION_TO_SPACE)

            # Collapse every run of whitespace (including tabs and non-breaking spaces)
            # into a single space, trim both ends, and make everything lowercase
            this_sentence = WHITESPACE_RE.sub(" ", this_sentence).strip().lower()

            # Replace numbers with the special <NUM> token (done after lowercasing so
            # the token keeps its uppercase spelling)
            this_sentence = NUMBER_RE.sub("<NUM>", this_sentence)

            # Process non-empty sentences
            if this_sentence:
                data.append(this_sentence)
                for this_word in this_sentence.split(' '):
                    # Gather statistics
                    unique_words.add(this_word)
                    word_counts[this_word] = word_counts.get(this_word, 0) + 1

    # Print progress every once in a while
    if stories_handled % 50 == 0:
        print("Parsed {} stories".format(stories_handled))

    # Append current (cumulative) statistics to a list
    num_sentences.append(len(data))
    num_unique_words.append(len(unique_words))

In [None]:
# Randomly sample some sentences so we can have a look

# Draw ten cleaned sentences at random and print them as a quick sanity check
print(np.random.choice(data, size=10))

In [None]:
# Cumulative number of sentences as a function of the number of stories processed.
# The x values are derived from the statistics list itself (entry i holds the total
# after i+1 stories), so the plot still works when fewer than NUM_STORIES stories exist.
figure(figsize=(20,7))
plot(range(1, len(num_sentences) + 1), num_sentences)
xlabel("# Of Stories")
ylabel("# Of Sentences");
title("# of Stories vs. # of Sentences")
grid(axis='both')
# Dashed reference line and label at the final total
hlines(num_sentences[-1],0, len(num_sentences),linestyles='dashed',colors='red')
text(0,0.95*num_sentences[-1],num_sentences[-1],fontsize=18);

In [None]:
# Cumulative number of distinct words as a function of the number of stories processed.
# The x values are derived from the statistics list itself (entry i holds the total
# after i+1 stories), so the plot still works when fewer than NUM_STORIES stories exist.
figure(figsize=(20,7))
plot(range(1, len(num_unique_words) + 1), num_unique_words)
xlabel("# Of Stories")
ylabel("# Of Unique Words");
title("# of Stories vs. # of Unique Words")
grid(axis='both')
# Dashed reference line and label at the final total
hlines(num_unique_words[-1],0, len(num_unique_words),linestyles='dashed',colors='red')
text(0,0.95*num_unique_words[-1],num_unique_words[-1],fontsize=18);

In [None]:
# Histogram of how many times each distinct word occurs, with a log-scaled count axis
figure(figsize=(20,7))
# A dict view is not a sequence; materialise the counts as a list before plotting
hist(list(word_counts.values()), bins=250, log=True)
title("Distribution of Occurrences of Individual Words")
xlabel("# of Occurrences")
ylabel("Count of Individual Words");

### 1.2 Prepare Vocabulary

In [None]:

# Size of vocabulary to use (special tokens are added on top of this)
MAX_VOCAB_SIZE = 10000

# Skip n most common words (only applied when the vocabulary is truncated)
NUM_COMMON_WORDS_TO_SKIP = 0

# Tokens that must always be part of the vocabulary
SPECIAL_TOKENS = ['<PAD>', '<UNK>', '<NUM>']

if MAX_VOCAB_SIZE < len(unique_words):

    # Take only the MAX_VOCAB_SIZE most common words
    ranked_words = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)
    kept_words = ranked_words[NUM_COMMON_WORDS_TO_SKIP:NUM_COMMON_WORDS_TO_SKIP + MAX_VOCAB_SIZE]
    unique_words = {word for word, _ in kept_words}
else:
    # Keep every word that was actually observed in the corpus
    unique_words = set(word_counts)

# The special tokens are added in both cases, so the '<UNK>' lookup below cannot fail
unique_words = unique_words.union(SPECIAL_TOKENS)

# Lookup tables. The vocabulary is sorted so that the word -> index assignment does not
# depend on set iteration order and is therefore identical across kernel restarts.
vocabulary = sorted(unique_words)
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = {index: word for index, word in enumerate(vocabulary)}


unk_index = word_to_index['<UNK>']

# Helpers that convert between plain text and vocabulary indices
def sentence_to_indices(sentence):
    """Return the vocabulary index of every space-separated word in `sentence`.

    Words missing from `word_to_index` are mapped to `unk_index`.
    """
    indices = []
    for token in sentence.split(' '):
        indices.append(word_to_index.get(token, unk_index))
    return indices

def indices_to_sentence(indices):
    """Return the space-joined words for `indices`.

    Indices missing from `index_to_word` are rendered as '<UNK>'.
    """
    return ' '.join(index_to_word.get(position, '<UNK>') for position in indices)

# Number of entries in the final vocabulary
VOCAB_SIZE = len(unique_words)
print("Vocabulary Size: {}".format(VOCAB_SIZE))

### 1.3 CBOW

In [None]:
# Prepare data for CBOW: for each target word, collect the CONTEXT_SIZE words to its left
# and the CONTEXT_SIZE words to its right and combine them into one context window.

CONTEXT_SIZE = 3

X = []
y = []

# Index used to pad the beginning and end of every sentence
pad_index = word_to_index.get('<PAD>', unk_index)
padding = [pad_index] * CONTEXT_SIZE

sentence_count = 0
for sentence_count, this_sentence in enumerate(data, start=1):

    # Convert words to indices in vocabulary, padded on both sides so that the first and
    # last words of the sentence also have a full context window
    indices = padding + [word_to_index.get(word, unk_index) for word in this_sentence.split(' ')] + padding

    # Prepare context words in X, target word in y
    for target_location in range(CONTEXT_SIZE, len(indices) - CONTEXT_SIZE):
        left_context = indices[target_location - CONTEXT_SIZE:target_location]
        right_context = indices[target_location + 1:target_location + CONTEXT_SIZE + 1]
        X.append(left_context + right_context)
        y.append(indices[target_location])

    if sentence_count % 5000 == 0:
        print("Prepared {} sentences".format(sentence_count))

# np.int64 replaces the np.long alias, which no longer exists in current NumPy releases.
# The reshape keeps X two-dimensional even when no training pairs were produced.
X = np.array(X, dtype=np.int64).reshape(-1, 2 * CONTEXT_SIZE)
y = np.array(y, dtype=np.int64)

print("X shape: ", X.shape)
print("y shape: ", y.shape)

In [None]:
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 512

class CBOW(nn.Module):
    """Continuous bag-of-words model: predicts a target word from its context words.

    The embeddings of all context words are summed, passed through one hidden
    layer, and projected to one score (logit) per vocabulary entry.

    Args:
        vocab_size: number of vocabulary entries; defaults to the global VOCAB_SIZE.
        embedding_size: embedding dimension; defaults to the global EMBEDDING_SIZE.
        hidden_size: hidden layer width; defaults to the global HIDDEN_SIZE.
        **kwargs: forwarded unchanged to nn.Module.__init__.
    """

    def __init__(self, vocab_size=None, embedding_size=None, hidden_size=None, **kwargs):
        super(CBOW, self).__init__(**kwargs)

        # Fall back to the notebook-level constants at construction time
        vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        embedding_size = EMBEDDING_SIZE if embedding_size is None else embedding_size
        hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size

        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return unnormalized logits for the target word.

        `x` holds context word indices; dimension 1 is summed over after the
        embedding lookup, so it is expected to index the context words.
        """
        x = self.embeddings(x).sum(dim=1)
        x = F.relu(self.linear1(x))
        # No activation on the output: F.cross_entropy expects raw logits, and a ReLU
        # here would clamp every negative score to zero.
        return self.linear2(x)

In [None]:
NUM_EPOCHS = 5
BATCH_SIZE = 2048

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

cbow = CBOW().to(device)
optimizer = torch.optim.Adam(cbow.parameters())

# Keep the word indices as integer tensors from the start instead of converting them to
# float32 and back to long for every batch
dataset = TensorDataset(torch.from_numpy(X).long(), torch.from_numpy(y).long())
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(NUM_EPOCHS):
    for batch_index, (context, target) in enumerate(dataloader):
        context, target = context.to(device), target.to(device)

        optimizer.zero_grad()
        prediction = cbow(context)
        loss = F.cross_entropy(prediction, target)
        loss.backward()

        optimizer.step()

        if batch_index % 10 == 0:
            print("Epoch: {}   Batch: {}   Loss: {}".format(epoch, batch_index, loss.item()))
        
        


In [None]:
embeddings = cbow.embeddings.weight

def get_closest(embeddings, word, n=10):
    """Return the `n` vocabulary words whose embeddings are closest to that of `word`.

    Closeness is the Euclidean distance between embedding rows. The query word
    itself is excluded from the result.

    Args:
        embeddings: 2-D tensor with one embedding row per vocabulary index.
        word: query word; must be a key of `word_to_index`.
        n: number of neighbours to return.

    Returns:
        List of words ordered from closest to farthest.

    Raises:
        KeyError: if `word` is not in the vocabulary.
    """
    word_index = word_to_index[word]

    # no_grad: this is a read-only query, so no autograd graph should be recorded even
    # when `embeddings` is a trainable parameter
    with torch.no_grad():
        # Distance from the query row to every row, computed in one vectorized step
        differences = embeddings - embeddings[word_index]
        distances = differences.pow(2).sum(dim=1).sqrt()

        # Make sure the query word can never be returned as its own neighbour
        distances[word_index] = float('inf')
        n_best = torch.argsort(distances)[:n]

    return [index_to_word[i] for i in n_best.tolist()]

In [None]:
# Inspect the CBOW embeddings: nearest neighbours of the word 'crime'
get_closest(embeddings, word='crime')

### 1.4 Skip-gram

In [None]:
# NOTE: these reassign the notebook-level constants that the CBOW cell set earlier
EMBEDDING_SIZE = 40
HIDDEN_SIZE = 128

class Skipgram(nn.Module):
    """Skip-gram model: predicts every context word from a single input word.

    The input word is embedded, passed through one hidden layer, and then fed
    to one independent output layer per context position.

    Args:
        vocab_size: number of vocabulary entries; defaults to the global VOCAB_SIZE.
        embedding_size: embedding dimension; defaults to the global EMBEDDING_SIZE.
        hidden_size: hidden layer width; defaults to the global HIDDEN_SIZE.
        context_size: context words per side; defaults to the global CONTEXT_SIZE.
            The model has 2 * context_size output layers.
        **kwargs: forwarded unchanged to nn.Module.__init__.
    """

    def __init__(self, vocab_size=None, embedding_size=None, hidden_size=None,
                 context_size=None, **kwargs):
        super(Skipgram, self).__init__(**kwargs)

        # Fall back to the notebook-level constants at construction time
        vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        embedding_size = EMBEDDING_SIZE if embedding_size is None else embedding_size
        hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size
        context_size = CONTEXT_SIZE if context_size is None else context_size

        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, hidden_size)
        self.outputs = []

        # Multiple outputs - one layer for each context word we want to predict
        for i in range(context_size * 2):
            layer = nn.Linear(hidden_size, vocab_size)
            # setattr registers the layer as a submodule (output1, output2, ...) so its
            # parameters are trained; the plain list only provides ordered access
            setattr(self, 'output' + str(i+1), layer)
            self.outputs.append(layer)

    def forward(self, x):
        """Return a list with one tensor of unnormalized logits per context position."""
        x = self.embeddings(x)
        x = F.relu(self.linear1(x))
        # No activation on the outputs: F.cross_entropy expects raw logits, and a ReLU
        # here would clamp every negative score to zero. Iterating over the layers that
        # were actually built keeps forward() independent of the global CONTEXT_SIZE.
        return [layer(x) for layer in self.outputs]

In [None]:
NUM_EPOCHS = 5
BATCH_SIZE = 1024

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

skipgram = Skipgram().to(device)
optimizer = torch.optim.Adam(skipgram.parameters())

# We already have context and target words - for skip-gram the roles are reversed: the
# single (center) word in y is the input and the surrounding words in X are the outputs.
# The indices are kept as integer tensors instead of being converted to float32 and back.
dataset = TensorDataset(torch.from_numpy(y).long(), torch.from_numpy(X).long())

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(NUM_EPOCHS):
    for batch_index, (center_words, context_words) in enumerate(dataloader):
        center_words, context_words = center_words.to(device), context_words.to(device)

        optimizer.zero_grad()
        predictions = skipgram(center_words)

        # Total loss is the sum of the individual losses, one per context position
        loss = sum(
            F.cross_entropy(prediction, context_words[:, position])
            for position, prediction in enumerate(predictions)
        )

        loss.backward()
        optimizer.step()

        if batch_index % 10 == 0:
            print("Epoch: {}   Batch: {}   Loss: {}".format(epoch, batch_index, loss.item()))
        
         
            

In [None]:
# Inspect the skip-gram embeddings: nearest neighbours of the word 'crime'
embeddings = skipgram.embeddings.weight
get_closest(embeddings, word='crime')