### Adversarial Text Example Experiment Runner Template
11/6/2017 - Basic pipeline to run adversarial text generation experiments.

### Dataset Preparation
Base dataset: The Enron Spam Dataset: http://www2.aueb.gr/users/ion/data/enron-spam/ 
    

In [105]:
import os
import numpy as np
import scipy
import scipy.stats
import sklearn
import sklearn.feature_extraction, sklearn.naive_bayes, sklearn.metrics
import torch
from collections import defaultdict, Counter

In [11]:
## Spam Preprocessing - UNIX Command line
# 1. Removed all \n and replaced with spaces: find . -type f -exec perl -i. -pe 's/\r?\n/ /' {} +
# 2. Concatenated all spam into a single file and all ham into a single file.
#      To concatenate within dirs: awk 1 enron1/ham/*.txt > enron1_ham.txt 
# 3. Randomly shuffled: shuf input > output
# 4. Create 80, 10, 10 train, val, and test splits.

# Total ham: 16545 messages; train/val/test = 13236, 1654, 1655
# Total spam: 17171 messages; train/val/test = 13736, 1717, 1718

1718

In [33]:

base_data_dir = "/cvgl2/u/catwong/cs332_final_project/data/"
classes = ['spam', 'ham']
vocabulary_filename = 'email_train_vocab.txt' 
# Truncation and vocabulary shortening:
# Using the train data only:
# 1. Truncate both the spam and ham messages to truncation_len characters (adding padding where needed).
# 2. From the truncated messages, compile a vocabulary of the class_vocabulary_size most frequent tokens for each class.
# 3. Write a vocabulary file composed of the full, combined vocabulary (ie. the most frequent tokens across both classes.)
truncation_len = 100
class_vocabulary_size = 3000

combined_vocab = []
for class_name in classes:
    filename = os.path.join(base_data_dir, 'train', class_name + '.txt') 
    print "Now processing: %s" % filename
    with open(filename) as f:
        all_lines = [line.strip().lower().split() for line in f.readlines()]
        
    # Truncate the files.
    truncated_lines = [line[:truncation_len] for line in all_lines]
    
    # Add tokens to the counter
    token_counts = Counter()
    for line in truncated_lines:
        token_counts.update(line)
    combined_vocab += [elem for (elem, count) in token_counts.most_common(class_vocabulary_size)]
    # Convert the combined vocabulary into a set.
combined_vocab = set(combined_vocab)
# Write out the combined_vocab to the vocabulary file
with open(vocabulary_filename, 'w') as f:
    for token in combined_vocab:
        f.write(token + "\n")

Now processing: /cvgl2/u/catwong/cs332_final_project/data/train/spam.txt
Now processing: /cvgl2/u/catwong/cs332_final_project/data/train/ham.txt


In [4]:
# Class that takes in a file and a vocabulary file (which has a truncation len) and converts the text into
# encoded/truncated sentences.

class DatasetEncoderDecoder(object):
    """
    Encodes and decodes sentences according to a vocabulary.
    
    Sentences are truncated. OOV words are assigned an <UNK> token, and <SOS>, <PAD>, and <EOS> tokens are added.
    
    truncation_len
    """
    def __init__(self, vocab_file):
        self.truncation_len = 100
        # Create index to word and word to index dicts from the vocab_file.
        num_default_tokens = 4
        self.index2word = {0:'<SOS>', 1:'<EOS>', 2: '<UNK>', 3: '<PAD>'}
        self.word2index = {'<SOS>':0, '<EOS>':1, '<UNK>': 2, '<PAD>': 3}
        with open(vocab_file) as f:
            all_lines = [line.strip() for line in f.readlines()]
        for idx, token in enumerate(all_lines):
            self.index2word[idx + num_default_tokens] = token
            self.word2index[token] = idx + num_default_tokens
          
    def encode(self, sentence):
        """
        Encodes a sentence according to the vocabulary.
        Returns:
            normalized: the normalized sentence, as it would be decoded.
            encoded: the space-separated numerical sentence.
        """
        truncated = sentence.lower().split()[:self.truncation_len]
        truncated += ['<PAD>'] * max(self.truncation_len - len(truncated), 0)
        truncated = ['<SOS>'] + truncated + ['<EOS>']
        
        normalized = []
        encoded = []
        # Encode, removing the UNK tokens
        for token in truncated:
            token = token if token in self.word2index else '<UNK>'
            normalized.append(token)
            encoded.append(str(self.word2index[token]))
        
        normalized = " ".join(normalized)
        encoded = " ".join(encoded)
        return normalized, encoded
    
    def decode(self, encoded):
        """Returns the decoded sentence."""
        numerical_encoded = [int(token) for token in encoded.split()]
        return " ".join([self.index2word[token] for token in numerical_encoded])

# Demonstration:
vocab_file = 'data/email_train_vocab.txt'
sample_text = 'Subject: does your business depend on the online success of your website ? submitting your website in search engines may increase your online sales dramatically . if you invested time and money into your website , you simply must submit your website online otherwise it will be invisible virtually , which means efforts spent in vain . if you want people to know about your website and boost your revenues , the only way to do that is to make your site visible in places where people search for information , i . e . submit your website in multiple search engines . submit your website online and watch visitors stream to your e - business . best regards , myrtice melendez'
demo = DatasetEncoderDecoder(vocab_file)
normalized, encoded = demo.encode(sample_text)
print sample_text
print normalized
print encoded
decoded = demo.decode(encoded)
print decoded


Subject: does your business depend on the online success of your website ? submitting your website in search engines may increase your online sales dramatically . if you invested time and money into your website , you simply must submit your website online otherwise it will be invisible virtually , which means efforts spent in vain . if you want people to know about your website and boost your revenues , the only way to do that is to make your site visible in places where people search for information , i . e . submit your website in multiple search engines . submit your website online and watch visitors stream to your e - business . best regards , myrtice melendez
<SOS> subject: does your business <UNK> on the online success of your website ? submitting your website in search engines may increase your online sales dramatically . if you invested time and money into your website , you simply must submit your website online otherwise it will be invisible virtually , which means efforts s

In [80]:
# Write the train, test, and text encoded files using this encoder.
base_data_dir = "/cvgl2/u/catwong/cs332_final_project/data/"
splits = ['train', 'val', 'test']
classes = ['spam.txt', 'ham.txt']
truncation_len = 100
vocab_file = 'data/email_train_vocab.txt'

vocab_encoder = DatasetEncoderDecoder(vocab_file)
for split in splits:
    for class_file in classes:
        raw_file = os.path.join(base_data_dir, split, class_file)
        with open(raw_file) as f:
            all_lines = [line.strip() for line in f.readlines()]
        # Encode the lines
        encoded_lines = [vocab_encoder.encode(line)[1] for line in all_lines]
        
        # Write out the encoded line
        encoded_file = os.path.join(base_data_dir, split, 'encoded_' + class_file)
        with open(encoded_file, 'w') as f:
            for line in encoded_lines:
                f.write(line + "\n")

In [8]:
# Samples of the encoded data
base_data_dir = "/cvgl2/u/catwong/cs332_final_project/data/"
splits = ['train', 'val', 'test']
classes = ['encoded_spam.txt', 'encoded_ham.txt']

truncation_len = 100
vocab_file = 'data/email_train_vocab.txt'
vocab_encoder = DatasetEncoderDecoder(vocab_file)
for class_file in classes:
    sample_file = os.path.join(base_data_dir, splits[0], class_file)
    print "Sample file: " + sample_file
    with open(sample_file) as f:
        all_lines = [line.strip() for line in f.readlines()]
    sample_line = all_lines[0]
    print "Sample line: " + sample_line
    print "Sample decoding: " + vocab_encoder.decode(sample_line)

Sample file: /cvgl2/u/catwong/cs332_final_project/data/train/encoded_spam.txt
Sample line: 0 542 672 3079 3946 2 4153 2 2780 2 3119 1755 1555 987 2402 771 2 2194 4305 2 3378 1613 672 2 1322 302 323 2881 1299 2421 1658 323 248 838 1299 4021 3946 2256 1352 3997 772 4403 4153 720 1852 2336 838 1613 672 2 2 4400 3658 2129 2 949 2780 2 3119 1755 1555 987 720 2 1399 2194 302 4427 720 3740 517 2129 2139 949 3119 1613 302 3081 302 3081 3534 2559 2707 2 2 2 2 302 2018 3267 3138 2607 302 3365 3473 2 501 2 2 302 4427 1
Sample decoding: <SOS> subject: news alert ( <UNK> ) <UNK> orders <UNK> $ 3 million dollars what is <UNK> technologies ? <UNK> issued 2 news <UNK> today , one during market hours and one after the market closed ( you can view it below ) . according to the 2 news <UNK> <UNK> signed letters of <UNK> for orders <UNK> $ 3 million dollars . <UNK> max technologies , inc . announces letter of intent for $ 2 , 000 , 000 from a major <UNK> <UNK> <UNK> <UNK> , ny : march 29 , 2004 ; <UNK> - 

In [5]:
class SpamDataset(object):
    """
    Dataset: encapsulates utility functions to get the dataset files.
    """
    def __init__(self,
                 base_data_dir="/cvgl2/u/catwong/cs332_final_project/data/",
                 splits=['train', 'val', 'test'],
                 label_names=['ham', 'spam'],
                 encoded_files=['encoded_ham.txt', 'encoded_spam.txt'],
                 vocab_file='/cvgl2/u/catwong/cs332_final_project/data/email_train_vocab.txt',
                 random_seed=10):
        self.base_data_dir = base_data_dir
        self.splits = splits
        self.label_names = label_names
        self.encoded_files = encoded_files
        self.vocab_encoder = DatasetEncoderDecoder(vocab_file)
        self.random_seed = random_seed
        
        # Read in all of the lines from the files.
        self.examples_dict = {}
        self.labels_dict = {}
        for split in splits:
            all_examples = []
            all_labels = []
            for label, encoded_file in enumerate(encoded_files):
                data_file = os.path.join(base_data_dir, split, encoded_file)
                with open(data_file) as f:
                    all_lines = [line.strip().split() for line in f.readlines()]
                all_examples += all_lines
                all_labels += [label] * len(all_lines)
            self.examples_dict[split] = all_examples
            self.labels_dict[split] = all_labels
            
    
    def examples(self, 
                 split, 
                 shuffled=False):
        """
        Args:
            split: one of the splits (ex. train, val, test) with labels.
            shuffled: whether to shuffle the examples.(default: True)
        Returns:
            examples: (list of lists)
            labels: (list)
        """
        examples = np.array(self.examples_dict[split]).astype(int)
        labels = np.array(self.labels_dict[split])
        if shuffled:
            examples, labels = sklearn.utils.shuffle(examples, labels, random_state=self.random_seed)
        return examples, labels
    
    def dataset_stats(self):
        """Prints useful stats about the dataset."""
        for split in self.splits:
            labels = self.labels_dict[split]
            num_pos = np.sum(labels)
            num_neg = len(labels) - num_pos
            print "Total %s examples: %d, %s: %d, %s: %d" % (split, len(labels), self.label_names[0], num_neg, self.label_names[1], num_pos)
            

# Demo
dataset = SpamDataset()
examples, labels =  dataset.examples(split='train', shuffled=True)
print examples[0]
print labels[0]
print dataset.vocab_encoder.decode(" ".join(examples[0].astype(str)))
dataset.dataset_stats()

[   0  542 1597 1132 3012 4078  660 2911 1453   29  838 2704 2109 1870 1658
 4125 3928  838 3541  143 1011   36    2  224  638  954 2595  954  640  720
  838 1355  771 3727 3544  971 1065  311  720  853 4096 3742 2559  720 2481
 1332 4227 3720 4149  302 3404 1767 3645  302 3599 1012 3415  501 1703  501
 2860   70 3415  501 1703  501  263 2349  501 1597 1132 3012 4078 1386  720
  529    3    3    3    3    3    3    3    3    3    3    3    3    3    3
    3    3    3    3    3    3    3    3    3    3    3    1]
0
<SOS> subject: calpine daily gas nomination we are still under the scheduled outage period and will bring the next unit down @ <UNK> saturday 03 / 24 / 01 . the following is our estimated burn until then . thanks > ricky a . archer fuel supply 700 louisiana , suite 2700 houston , texas 77002 713 - 830 - 8659 direct 713 - 830 - 8722 fax - calpine daily gas nomination 1 . doc <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <

### Discriminator
A general discriminator class and two implementations.

NBDiscriminator and RNNDiscriminator


In [None]:
class Discriminator(object):
    """
    Discriminator: a general discriminator class.
    """
    def __init__(self):
        pass
    
    def train(self, dataset):
        raise Exception("Not implemented")
        
    def evaluate(self, dataset):
        raise Exception("Not implemented")
    
    def save_model(self):
        # Outputs a path that can be passed into the restore.
        raise Exception("Not implemented")
    
    def restore_model(self, path):
        raise Exception("Not implemented")

class MultinomialNBDiscriminator(Discriminator):
    """
    MultinomialNB: Multinomial Naive Bayes Classifier w. alpha=1.0
    
    Trained using TF-IDF features.
    """
    def __init__(self):
        Discriminator.__init__(self)
        self.model = sklearn.naive_bayes.MultinomialNB()
    
    def examples_to_term_doc(self, examples, num_terms=4480):
        """
        Converts a numerically-encoded examples matrix into a sparse term-documents matrix.
        """
        all_row_inds = all_col_inds = all_data = None
        for row_ind, example in enumerate(small_examples):
            if row_ind % 5000 == 0:
                print "Now on examples to terms: " + str(row_ind)
            itemfreqs = scipy.stats.itemfreq(example).T
            # Column indices: the term indices in that document.
            col_inds = itemfreqs[0]
            # Data: the counts of the terms in that document.
            data = itemfreqs[1]
            # Row indices: the current document, for each of the terms in that document.
            row_inds = np.ones(itemfreqs.shape[1], dtype=np.int) * row_ind

            # Concatenate to the existing data.
            if all_row_inds is None:
                all_row_inds = row_inds
                all_col_inds = col_inds
                all_data = data
            else:
                all_row_inds = np.append(all_row_inds, row_inds)
                all_col_inds = np.append(all_col_inds, col_inds)
                all_data = np.append(all_data, data)

        num_docs = len(examples)
        return scipy.sparse.csr_matrix((all_data, (all_row_inds, all_col_inds)), shape=(num_docs, num_terms))

    def train(self, dataset):
        examples, labels = dataset.examples(split='train', shuffled=True)
        # Silly way to compute sparse doc term matrix from examples matrix by converting it back into "strings".
        self.train_counts = self.examples_to_term_doc(examples)
        
        # Featurize using TFIDF.
        self.tf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
        X_transformed = self.tf_transformer.fit_transform(doc_terms)
        
        # Fit the model to TFIDF counts.
        self.model.fit(X_transformed, labels)
        
    def evaluate(self, dataset, split, verbose=True):
        examples, labels = dataset.examples(split=split, shuffled=True)
        doc_terms = self.examples_to_term_doc(examples)
        X_transformed = self.tf_transformer.transform(doc_terms)
        log_probs = self.model.predict_log_proba(doc_terms)
        predicted = np.argmax(log_probs, axis=1)
        mean_accuracy = np.mean(predicted == labels)  
        #roc_auc = sklearn.metrics.roc_auc_score(log_probs, val_labels)
        print "Mean_accuracy: %f" % mean_accuracy
        #print "ROC AUC: %f" % roc_auc
        

# Demo
spam_dataset = SpamDataset()
discriminator = MultinomialNBDiscriminator()
discriminator.train(spam_dataset)
discriminator.evaluate(spam_dataset, 'val')

Now on examples to terms: 0
Now on examples to terms: 5000


In [69]:
examples, labels =  dataset.examples(split='train', shuffled=True)
# Naively construct a term-document matrix


In [90]:
small_examples = examples

def examples_to_term_doc(examples, num_terms):
    """
    Converts a numerically-encoded examples matrix into a sparse term-documents matrix.
    
    Iterates over each example and gets the column_inds, counts.
    Then converts all the documents into a big 
    """
    all_row_inds = all_col_inds = all_data = None
    for row_ind, example in enumerate(examples):
        if row_ind % 500 == 0:
            print "Now on: " + str(row_ind)
        itemfreqs = scipy.stats.itemfreq(example).T
        # Column indices: the term indices in that document.
        col_inds = itemfreqs[0]
        # Data: the counts of the terms in that document.
        data = itemfreqs[1]
        # Row indices: the current document, for each of the terms in that document.
        row_inds = np.ones(itemfreqs.shape[1], dtype=np.int) * row_ind

        # Concatenate to the existing data.
        if all_row_inds is None:
            all_row_inds = row_inds
            all_col_inds = col_inds
            all_data = data
        else:
            all_row_inds = np.append(all_row_inds, row_inds)
            all_col_inds = np.append(all_col_inds, col_inds)
            all_data = np.append(all_data, data)

    num_docs = len(examples)
    return scipy.sparse.csr_matrix((all_data, (all_row_inds, all_col_inds)), shape=(num_docs, num_terms))


doc_terms = examples_to_term_doc(small_examples, num_terms=4480)

In [86]:

tf_transformer = sklearn.feature_extraction.text.TfidfTransformer()
X_transformed = tf_transformer.fit_transform(doc_terms)
        
# Fit the model to TFIDF counts.
clf = sklearn.naive_bayes.MultinomialNB().fit(X_transformed, labels)


In [94]:
val_examples, val_labels =  dataset.examples(split='val', shuffled=True)
print val_examples.shape
val_doc_terms = examples_to_term_doc(val_examples, num_terms=4480)
predicted = clf.predict(val_doc_terms)
np.mean(predicted == val_labels)    

(3371, 102)
Now on: 0
Now on: 500
Now on: 1000
Now on: 1500
Now on: 2000
Now on: 2500
Now on: 3000


0.96024918421833283

In [107]:
log_probs = clf.predict_log_proba(val_doc_terms)
predicted = np.argmax(log_probs, axis=1)
mean_accuracy = np.mean(predicted == val_labels)  
roc_auc = sklearn.metrics.roc_auc_score(log_probs, val_labels)
print "Mean_accuracy: %f" % mean_accuracy
print "ROC AUC: %f" % roc_auc

ValueError: continuous-multioutput format is not supported