In [None]:
%pylab inline  

import pandas as pd
import re
import string
import operator

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## 0. Data Cleanup

In [None]:
# Load the scraped posts and drop the rows that cannot be used later on.
# Each .loc step sees the result of the previous one, exactly like the
# original chain of re-assignments.
df = (
    pd.read_csv('nosleep.csv')
    # Some posts were deleted
    .loc[lambda posts: posts.Author != '[deleted]']
    # Some text is empty
    .loc[lambda posts: posts.Text.notna()]
    # Some text is too short
    .loc[lambda posts: posts.Text.str.len() > 100]
)


In [None]:
# pandas >= 2.0 treats the literal string "None" as a missing value when
# reading a CSV, so un-flaired posts can arrive as NaN instead of 'None'.
# Map missing flairs back to the 'None' label so that this filter and every
# later `df.Flair == 'None'` comparison behave the same on all pandas versions.
# NOTE(review): this also labels genuinely empty flair cells as 'None' --
# confirm that is the intended meaning for this dataset.
df = df.assign(Flair=df.Flair.fillna('None'))

# Keep only the two classes used for classification
df = df[df.Flair.isin(['Series', 'None'])]
df.Flair.value_counts()

## 2. Classification
### 2.1 Preprocess Text and Prepare Word Lists

In [None]:
%%time

# How many stories to keep per flair category
NUM_STORIES = 1000

# How many trailing words of each story are used as model input
NUM_FINAL_WORDS = 30

# The first NUM_STORIES rows of each category, in their current order
stories_none = df.loc[df.Flair == 'None'].head(NUM_STORIES)
stories_series = df.loc[df.Flair == 'Series'].head(NUM_STORIES)




def get_words_from_story(text, num_words = NUM_FINAL_WORDS):
    """Return the last ``num_words`` cleaned tokens of a story.

    The story is handled paragraph by paragraph (split on newlines):

    * paragraphs containing both ``[`` and ``https://`` (markdown links)
      are skipped,
    * paragraphs starting with ``&amp;`` are skipped,
    * punctuation is replaced by spaces, the text is lower-cased and every
      run of digits is replaced by the ``<NUM>`` token.

    Parameters
    ----------
    text : str
        Raw story text.
    num_words : int
        Maximum number of trailing tokens to return.

    Returns
    -------
    list of str
        At most ``num_words`` tokens in story order. The list never contains
        empty strings; it is empty when ``num_words`` is not positive.
    """
    if num_words <= 0:
        # A [-0:] slice would return every token instead of none
        return []

    # Every punctuation character (ASCII plus the typographic ones) -> space
    punctuation_to_space = str.maketrans(
        {char: ' ' for char in string.punctuation + "’”“…—-"}
    )

    tokens = []
    for this_paragraph in text.split("\n"):

        # If the paragraph markdown contains a link, skip the paragraph
        if "[" in this_paragraph and 'https://' in this_paragraph:
            continue

        # Skip paragraphs that begin with an escaped HTML entity
        if this_paragraph.startswith("&amp;"):
            continue

        # Remove punctuation symbols and make everything lowercase
        this_paragraph = this_paragraph.translate(punctuation_to_space).lower()

        # Replace numbers with a special <NUM> token (after lower-casing, so
        # the token keeps its upper-case spelling)
        this_paragraph = re.sub(r"\d+", "<NUM>", this_paragraph)

        # Splitting on any whitespace drops leading, trailing and repeated
        # separators, so no empty "words" end up in the result
        tokens.extend(this_paragraph.split())

    return tokens[-num_words:]

def _collect_final_words(stories, vocabulary):
    """Tokenise each story and keep those that fill the whole word window.

    Parameters
    ----------
    stories : iterable of str
        Raw story texts.
    vocabulary : set of str
        Updated in place with every word of every kept story.

    Returns
    -------
    list of list of str
        One list of final words per kept story.
    """
    kept = []
    for this_story in stories:
        final_words = get_words_from_story(this_story)
        # Stories with fewer than NUM_FINAL_WORDS words cannot fill the
        # fixed-length model input, so they are skipped
        if len(final_words) < NUM_FINAL_WORDS:
            continue
        kept.append(final_words)
        # In-place update avoids rebuilding the whole set for every story
        vocabulary.update(final_words)
    return kept


unique_words = {'<UNK>', '<NUM>'}

# Same processing order as before: Series stories first, then None stories
words_series = _collect_final_words(stories_series.Text, unique_words)
words_none = _collect_final_words(stories_none.Text, unique_words)

print("Vocabulary Size: {}".format(len(unique_words)))

In [None]:
# Stories that are too short are skipped when the word lists are built, so
# either list can hold fewer than NUM_STORIES entries. Draw indices that are
# valid for both lists, and draw them without repetition.
num_available = min(len(words_none), len(words_series))
random_choices = choice(num_available, min(5, num_available), replace=False)

for i in random_choices:
    print(words_none[i])

print("----")

for i in random_choices:
    print(words_series[i])
    
    
    

### 2.2 Prepare Vocabulary

In [None]:

# Sort the vocabulary before numbering it. Iterating the raw set depends on
# string hashing, which Python randomises per process, so without sorting the
# same word would get a different index after every kernel restart.
vocabulary = sorted(unique_words)

# Lookup tables
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = dict(enumerate(vocabulary))


unk_index = word_to_index['<UNK>']

# Helper functions for translating back and forth between indices and text
def sentence_to_indices(sentence):
    """Map a space-separated sentence to a list of vocabulary indices.

    Words missing from the vocabulary are mapped to the index of '<UNK>'.
    """
    return [word_to_index.get(x, unk_index) for x in sentence.split(' ')]

def indices_to_sentence(indices):
    """Map an iterable of vocabulary indices to a space-separated sentence.

    Indices missing from the lookup table are rendered as '<UNK>'.
    """
    return ' '.join(index_to_word.get(index, '<UNK>') for index in indices)

VOCAB_SIZE = len(vocabulary)
print("Vocabulary Size: " + str(VOCAB_SIZE))

### 2.3 Classification

In [None]:
def _to_index_matrix(word_lists):
    """Convert word lists into an integer array of vocabulary indices.

    Parameters
    ----------
    word_lists : list of list of str
        Each inner list provides the NUM_FINAL_WORDS words of one story.

    Returns
    -------
    numpy.ndarray
        int64 array of shape (len(word_lists), NUM_FINAL_WORDS).
    """
    rows = [
        [word_to_index[word] for word in words[:NUM_FINAL_WORDS]]
        for words in word_lists
    ]
    # reshape keeps the second axis even when there are no stories at all
    return np.array(rows, dtype=np.int64).reshape(-1, NUM_FINAL_WORDS)


# Indices are stored as integers (not floats) because they are token ids
indices_none = _to_index_matrix(words_none)
indices_series = _to_index_matrix(words_series)

print("Indices_none shape: {}".format(indices_none.shape))
print("Indices_series shape: {}".format(indices_series.shape))

In [None]:
# Stack both classes into one feature matrix: None stories first, then Series
X = np.vstack((indices_none, indices_series))

# Matching labels: 0 for None stories, 1 for Series stories
y = np.concatenate((
    np.zeros(indices_none.shape[0], dtype=int),
    np.ones(indices_series.shape[0], dtype=int),
))

# 80% of the rows go to training, the rest to testing.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
EMBEDDING_SIZE = 70
HIDDEN_SIZE = 512

class Model(nn.Module):
    """Bidirectional LSTM classifier with learned weights over the time steps.

    Pipeline: embedding -> bidirectional LSTM -> linear layer whose output is
    soft-maxed along the time axis (the returned ``weights``) -> weighted
    combination reduced to one value per time step -> two linear layers
    producing two class logits.

    Parameters
    ----------
    vocab_size : int, optional
        Number of embedding rows. Defaults to the notebook-level VOCAB_SIZE.
    num_words : int, optional
        Sequence length the model is built for. Defaults to NUM_FINAL_WORDS.
    embedding_size : int, optional
        Embedding dimension. Defaults to EMBEDDING_SIZE.
    hidden_size : int, optional
        LSTM hidden size per direction. Defaults to HIDDEN_SIZE.
    **kwargs
        Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, *, vocab_size=None, num_words=None,
                 embedding_size=None, hidden_size=None, **kwargs):
        super(Model, self).__init__(**kwargs)

        # Sizes that are not given fall back to the notebook configuration;
        # the lookup happens here so the current values are always used
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        if num_words is None:
            num_words = NUM_FINAL_WORDS
        if embedding_size is None:
            embedding_size = EMBEDDING_SIZE
        if hidden_size is None:
            hidden_size = HIDDEN_SIZE

        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, bidirectional=True,
                           batch_first=True)
        self.fc1 = nn.Linear(num_words, num_words)
        self.output = nn.Linear(num_words, 2)

        self.attn = nn.Linear(2*hidden_size, 2*hidden_size)

    def forward(self, x):
        """Classify a batch of index sequences.

        Parameters
        ----------
        x : torch.LongTensor
            Token indices of shape (batch, num_words).

        Returns
        -------
        logits : torch.Tensor
            Raw (un-normalised) class scores of shape (batch, 2). The batch
            axis is kept even for a batch of one.
        weights : torch.Tensor
            Soft-max weights of shape (batch, num_words, 2 * hidden_size);
            they sum to one along the num_words axis.
        """
        embeddings = self.embeddings(x)  # (batch, num_words, embedding_size)

        # Use the LSTM outputs of every time step (both directions)
        x, _ = self.lstm(embeddings)     # (batch, num_words, 2 * hidden_size)

        # Attention implementation: soft-max over the time axis
        weights = F.softmax(self.attn(x), dim=1)
        x = torch.bmm(weights, x.transpose(1, 2))  # (batch, num_words, num_words)
        x = x.sum(dim=2)                           # (batch, num_words)

        # Run through fully-connected layers. The last layer has no
        # activation: cross_entropy expects raw logits, and a ReLU here would
        # clamp them at zero and can stop the gradient entirely.
        x = F.relu(self.fc1(x))
        logits = self.output(x)

        return logits, weights

In [None]:
NUM_EPOCHS = 20
BATCH_SIZE = 64

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Model().to(device)
model.train()

optimizer = torch.optim.Adam(model.parameters())

train_dataset = TensorDataset(torch.Tensor(X_train).long(), torch.Tensor(y_train).long())
train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)

for epoch in range(NUM_EPOCHS):
    for batch_index, (sequence, target) in enumerate(train_dataloader):

        sequence, target = sequence.to(device), target.to(device)
        optimizer.zero_grad()
        predictions, _ = model(sequence)

        # Keep an explicit (batch, classes) shape so that a final batch
        # holding a single sample is still accepted by cross_entropy
        predictions = predictions.reshape(target.shape[0], -1)

        loss = F.cross_entropy(predictions, target)
        loss.backward()

        optimizer.step()

        # Report the loss every fifth batch
        if batch_index % 5 == 0:
            print("Epoch: {}   Batch: {}   Loss: {}".format(epoch, batch_index, loss.item()))
      

In [None]:
model.eval()

# Run the evaluation on whichever device the model parameters live on
device = next(model.parameters()).device

# Inference only: skip autograd bookkeeping for the whole test set
with torch.no_grad():
    test_logits, _ = model(torch.Tensor(X_test).long().to(device))

# Predicted class = index of the larger of the two scores
test_results = test_logits.argmax(dim=1).cpu().numpy()

print(classification_report(y_test, test_results))

In [None]:
# Pick one test sequence at random
sequence_index = choice(range(len(X_test)))

# Run the model on the device its parameters live on, without autograd
device = next(model.parameters()).device
with torch.no_grad():
    _, weights = model(torch.Tensor(X_test[sequence_index:sequence_index+1]).long().to(device))

# Sum the weights over the last axis, take the only batch element, and
# normalise the resulting per-word scores with an explicit soft-max axis
weights = F.softmax(weights.sum(dim=-1)[0], dim=0)

words = ["'" + index_to_word[int(j)] + "'" for j in X_test[sequence_index]]

# Plot against word positions and label the ticks afterwards: passing the
# words themselves as x values would merge repeated words into a single bar
positions = list(range(len(words)))

figure(figsize=(20,7))
bar(positions, weights.cpu().numpy())
xticks(positions, words)