### Adversarial Text Example Experiment Runner Template
11/6/2017 - Basic pipeline to run adversarial text generation experiments.

### Dataset Preparation
Base dataset: The Enron Spam Dataset: http://www2.aueb.gr/users/ion/data/enron-spam/ 
    

In [13]:
import os
import numpy as np
import torch
from collections import defaultdict, Counter

In [11]:
## Spam Preprocessing - UNIX Command line
# 1. Removed all \n and replaced with spaces: find . -type f -exec perl -i. -pe 's/\r?\n/ /' {} +
# 2. Concatenated all spam into a single file and all ham into a single file.
#      To concatenate within dirs: awk 1 enron1/ham/*.txt > enron1_ham.txt 
# 3. Randomly shuffled: shuf input > output
# 4. Created 80/10/10 train, val, and test splits.

# Total ham: 16545 messages; train/val/test = 13236, 1654, 1655
# Total spam: 17171 messages; train/val/test = 13736, 1717, 1718

1718

In [33]:

base_data_dir = "/cvgl2/u/catwong/cs332_final_project/data/"
classes = ['spam', 'ham']
# NOTE(review): the encoding cells read 'data/email_train_vocab.txt' while this
# cell writes to the current working directory -- confirm both resolve to the same file.
vocabulary_filename = 'email_train_vocab.txt'
# Truncation and vocabulary shortening, using the train data only:
# 1. Truncate every spam and ham message to its first truncation_len
#    whitespace-separated tokens (no padding is added at this stage).
# 2. From the truncated messages, compile a vocabulary of the
#    class_vocabulary_size most frequent tokens for each class.
# 3. Write a vocabulary file holding the union of the per-class vocabularies,
#    one token per line.
truncation_len = 100
class_vocabulary_size = 3000

combined_vocab = []
for class_name in classes:
    filename = os.path.join(base_data_dir, 'train', class_name + '.txt')
    # Single-argument call form prints the same text under Python 2 and Python 3.
    print("Now processing: %s" % filename)
    with open(filename) as f:
        # One message per line; lower-case and split on whitespace.
        all_lines = [line.strip().lower().split() for line in f]

    # Keep only the first truncation_len tokens of every message.
    truncated_lines = [line[:truncation_len] for line in all_lines]

    # Count token frequencies for this class only.
    token_counts = Counter()
    for line in truncated_lines:
        token_counts.update(line)
    combined_vocab += [elem for (elem, count) in token_counts.most_common(class_vocabulary_size)]

# Merge the per-class vocabularies, dropping tokens shared by both classes.
combined_vocab = set(combined_vocab)
# Write the tokens in sorted order: set iteration order is arbitrary, and the
# line number of a token in this file determines its index in the encoder, so
# a stable order keeps encodings reproducible across runs.
with open(vocabulary_filename, 'w') as f:
    for token in sorted(combined_vocab):
        f.write(token + "\n")

Now processing: /cvgl2/u/catwong/cs332_final_project/data/train/spam.txt
Now processing: /cvgl2/u/catwong/cs332_final_project/data/train/ham.txt


In [71]:
# Class that takes a truncation length and a vocabulary file, and converts raw text into
# truncated, vocabulary-encoded sentences (and decodes them back).

class DatasetEncoderDecoder(object):
    """
    Encodes and decodes sentences according to a fixed vocabulary.

    Sentences are lower-cased, split on whitespace, truncated to at most
    truncation_len tokens and right-padded with <PAD> up to that length, then
    wrapped in <SOS> ... <EOS>. Out-of-vocabulary (OOV) tokens are replaced by
    <UNK>.

    Attributes:
        truncation_len: number of content tokens kept per sentence; an encoded
            sentence therefore holds truncation_len + 2 tokens.
        index2word: dict mapping integer index -> token.
        word2index: dict mapping token -> integer index.
    """

    # Reserved tokens; they always occupy indices 0..3, ahead of the file vocabulary.
    DEFAULT_TOKENS = ('<SOS>', '<EOS>', '<UNK>', '<PAD>')

    def __init__(self, truncation_len, vocab_file):
        """
        Builds the index <-> token tables.

        Args:
            truncation_len: number of content tokens to keep per sentence.
            vocab_file: path to a text file with one vocabulary token per line;
                the token on line i (0-based) receives index i + 4.
        """
        self.truncation_len = truncation_len
        num_default_tokens = len(self.DEFAULT_TOKENS)
        self.index2word = dict(enumerate(self.DEFAULT_TOKENS))
        self.word2index = dict((token, idx) for idx, token in enumerate(self.DEFAULT_TOKENS))
        with open(vocab_file) as f:
            vocab_tokens = [line.strip() for line in f]
        # File tokens are numbered right after the reserved tokens.
        for idx, token in enumerate(vocab_tokens, num_default_tokens):
            self.index2word[idx] = token
            self.word2index[token] = idx

    def encode_sentence(self, sentence):
        """
        Encodes a sentence according to the vocabulary.

        Args:
            sentence: raw text; it is lower-cased and split on whitespace.
        Returns:
            normalized: the normalized sentence, as it would be decoded.
            encoded: the space-separated numerical sentence.
        """
        # Use the instance's own length, not a module-level global, so the
        # value passed to the constructor is the one that takes effect.
        truncated = sentence.lower().split()[:self.truncation_len]
        truncated += ['<PAD>'] * max(self.truncation_len - len(truncated), 0)
        truncated = ['<SOS>'] + truncated + ['<EOS>']

        # Replace every OOV token with <UNK>, then look up the indices.
        normalized = [token if token in self.word2index else '<UNK>' for token in truncated]
        encoded = [str(self.word2index[token]) for token in normalized]
        return " ".join(normalized), " ".join(encoded)

    def decode_sentence(self, encoded):
        """
        Returns the decoded sentence.

        Args:
            encoded: space-separated integer indices, as produced by encode_sentence.
        Raises:
            ValueError: if a field of `encoded` is not an integer.
            KeyError: if an index is not in the vocabulary.
        """
        numerical_encoded = [int(token) for token in encoded.split()]
        return " ".join([self.index2word[token] for token in numerical_encoded])

# Demonstration: round-trip one sample e-mail through the encoder and decoder.
truncation_len = 100
vocab_file = 'data/email_train_vocab.txt'
sample_text = 'Subject: does your business depend on the online success of your website ? submitting your website in search engines may increase your online sales dramatically . if you invested time and money into your website , you simply must submit your website online otherwise it will be invisible virtually , which means efforts spent in vain . if you want people to know about your website and boost your revenues , the only way to do that is to make your site visible in places where people search for information , i . e . submit your website in multiple search engines . submit your website online and watch visitors stream to your e - business . best regards , myrtice melendez'
demo = DatasetEncoderDecoder(truncation_len, vocab_file)
normalized, encoded = demo.encode_sentence(sample_text)
decoded = demo.decode_sentence(encoded)

# Shown in order: raw text, normalized text, index encoding, decoded text.
# (Single-argument call form prints the same under Python 2 and Python 3.)
print(sample_text)
print(normalized)
print(encoded)
print(decoded)


Subject: does your business depend on the online success of your website ? submitting your website in search engines may increase your online sales dramatically . if you invested time and money into your website , you simply must submit your website online otherwise it will be invisible virtually , which means efforts spent in vain . if you want people to know about your website and boost your revenues , the only way to do that is to make your site visible in places where people search for information , i . e . submit your website in multiple search engines . submit your website online and watch visitors stream to your e - business . best regards , myrtice melendez
<SOS> subject: does your business <UNK> on the online success of your website ? submitting your website in search engines may increase your online sales dramatically . if you invested time and money into your website , you simply must submit your website online otherwise it will be invisible virtually , which means efforts s

In [80]:
# Write the train, val, and test encoded files using this encoder.
# For every <split>/<class>.txt under base_data_dir, an index-encoded copy is
# written next to it as <split>/encoded_<class>.txt, one message per line.
base_data_dir = "/cvgl2/u/catwong/cs332_final_project/data/"
splits = ['train', 'val', 'test']
classes = ['spam.txt', 'ham.txt']
truncation_len = 100
vocab_file = 'data/email_train_vocab.txt'

vocab_encoder = DatasetEncoderDecoder(truncation_len, vocab_file)
for split in splits:
    split_dir = os.path.join(base_data_dir, split)
    for class_file in classes:
        # Read the raw messages and keep only the numerical encoding
        # (second element of the encode_sentence result).
        with open(os.path.join(split_dir, class_file)) as source:
            encoded_lines = [vocab_encoder.encode_sentence(raw.strip())[1] for raw in source]

        # Write the encoded messages alongside the raw file.
        with open(os.path.join(split_dir, 'encoded_' + class_file), 'w') as sink:
            sink.writelines(message + "\n" for message in encoded_lines)

### Autoencoder - Seq2Seq Model
Source: http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html#the-seq2seq-model 