# Literal Listener

In [1]:
__author__ = "Christopher Leung"
__version__ = "CS224u, Stanford, Spring 2020"

## Set-up

See [colors_overview.ipynb](colors_overview.ipynb) for set-up in instructions and other background details.

In [2]:
from colors import ColorsCorpusReader
import os
from sklearn.model_selection import train_test_split
from torch_color_selector import (
    ColorizedNeuralListener, create_example_dataset)
import utils
from utils import START_SYMBOL, END_SYMBOL, UNK_SYMBOL
import numpy as np

In [3]:
utils.fix_random_seeds()

In [4]:
COLORS_SRC_FILENAME = os.path.join(
    "data", "colors", "filteredCorpus.csv")

## All two-word examples as a dev corpus

So that you don't have to sit through excessively long training runs during development, I suggest working with the two-word-only subset of the corpus until you enter into the late stages of system testing.

In [5]:
dev_corpus = ColorsCorpusReader(
    COLORS_SRC_FILENAME, 
    word_count=None, 
    normalize_colors=True)

In [6]:
dev_examples = list(dev_corpus.read())

This subset has about one-third the examples of the full corpus:

In [7]:
len(dev_examples)

46994

In [8]:
#dev_examples = [example for example in dev_examples if len(example.contents.split(" ")) > 14]
#for example in dev_examples:
#    print(example.contents)

## Dev dataset

The first step is to extract the raw color and raw texts from the corpus:

In [9]:
dev_rawcols, dev_texts = zip(*[[ex.colors, ex.parse_turns()] for ex in dev_examples])

The raw color representations are suitable inputs to a model, but the texts are just strings, so they can't really be processed as-is. Question 1 asks you to do some tokenizing!

## Replacing turns with token

In [10]:
dev_texts = ["#".join(text) for text in dev_texts]
print(dev_texts[:10])

['The darker blue one', 'purple', 'Medium pink#the medium dark one', 'lime', 'Mint green.', 'Mud brown', 'Mud brown', 'Camo green', 'Darkish red', 'Grey']


## Random train–test split for development

For the sake of development runs, we create a random train–test split:

In [11]:
dev_rawcols_train, dev_rawcols_test, dev_texts_train, dev_texts_test, dev_examples_train, dev_examples_test = \
    train_test_split(dev_rawcols, dev_texts, dev_examples)

In [12]:
#dev_rawcols_test, dev_rawcols_dev, dev_texts_test, dev_texts_tdev = \
#    train_test_split(dev_rawcols_test, dev_texts_test, test_size=0.2)

In [13]:
# Train=75%, dev=5%, test=20%
print(len(dev_rawcols_train))
print(len(dev_rawcols_test))
#print(len(dev_rawcols_dev))

35245
11749


## Improve the tokenizer


In [14]:
from colors_utils import heuristic_ending_tokenizer

def tokenize_example(s):
    
    # Improve me!
    
    return [START_SYMBOL] + heuristic_ending_tokenizer(s) + [END_SYMBOL]

def clean_test_and_training(dev_seqs_train, dev_seqs_test):
    # This method cleans the test set with $UNK, for those words that do not show up in the training set
    vocab = {}
    for toks in dev_seqs_train+dev_seqs_test:
        for w in toks:
            if w not in vocab:
                vocab[w]=0
            vocab[w]+=1
    removal_candidates = {k:v for k, v in vocab.items() if v == 1 }
    
    dev_seqs_train = [[w if w not in removal_candidates else UNK_SYMBOL for w in toks] for toks in dev_seqs_train]

    dev_seqs_test = [[w if w not in removal_candidates else UNK_SYMBOL for w in toks] for toks in dev_seqs_test]
    return dev_seqs_train, dev_seqs_test

In [15]:
tokenize_example(dev_texts_train[376])

['<s>', 'aqua', '</s>']

## Use the tokenizer

Once the tokenizer is working, run the following cell to tokenize your inputs:

In [16]:
dev_seqs_train = [tokenize_example(s) for s in dev_texts_train]

dev_seqs_test = [tokenize_example(s) for s in dev_texts_test]

#dev_seqs_dev = [tokenize_example(s) for s in dev_texts_test]

dev_seqs_train, dev_seqs_test = clean_test_and_training(dev_seqs_train, dev_seqs_test)

#_, dev_seqs_dev = clean_test_and_training(dev_seqs_train, dev_seqs_test)

We use only the train set to derive a vocabulary for the model:

In [17]:
dev_vocab = sorted({w for toks in dev_seqs_train for w in toks}) + [UNK_SYMBOL]

It's important that the `UNK_SYMBOL` is included somewhere in this list. Test examples with word not seen in training will be mapped to `UNK_SYMBOL`. If you model's vocab is the same as your train vocab, then `UNK_SYMBOL` will never be encountered during training, so it will be a random vector at test time.

In [18]:
len(dev_vocab)

2806

## Adding a different split for the Speaker

I should ensure that the speaker and the listener train on different datasets to prevent one from directly impling the other.

In [19]:
n = len(dev_seqs_train)
dev_seqs_train_listener, dev_seqs_train_speaker = \
    dev_seqs_train[:n//2], dev_seqs_train[n//2:]

In [20]:
dev_vocab_listener = sorted({w for toks in dev_seqs_train_listener for w in toks}) + [UNK_SYMBOL]
dev_vocab_speaker = sorted({w for toks in dev_seqs_train_speaker for w in toks}) + [UNK_SYMBOL]

In [21]:
print(len(dev_vocab_listener), len(dev_vocab_speaker))

1929 1964


## Improve the color representations


In [22]:
import colorsys

def represent_color_context(colors):
    
    # Improve me!
    
    return [represent_color(color) for color in colors]


def represent_color(color):
    import numpy.fft as fft
    # Improve me!
    #return color
    #return colorsys.rgb_to_hsv(*color)
    return fft.fft(color)

In [23]:
represent_color_context(dev_rawcols_train[0])

[array([2.07833333+0.j        , 0.24833333+0.19052559j,
        0.24833333-0.19052559j]),
 array([ 0.88 +0.j        , -0.215-0.23382686j, -0.215+0.23382686j]),
 array([1.145+0.j        , 0.29 -0.37239092j, 0.29 +0.37239092j])]

## Use the color representer

The following cell just runs your `represent_color_context` on the train and test sets:

In [24]:
dev_cols_train = [represent_color_context(colors) for colors in dev_rawcols_train]

dev_cols_test = [represent_color_context(colors) for colors in dev_rawcols_test]

#dev_cols_dev = [represent_color_context(colors) for colors in dev_rawcols_dev]

At this point, our preprocessing steps are complete, and we can fit a first model.

In [25]:
dev_cols_train_listener, dev_cols_train_speaker = \
    dev_cols_train[:n//2], dev_cols_train[n//2:]

## Question 3: GloVe embeddings [1 points]

The above model uses a random initial embedding, as configured by the decoder used by `ContextualColorDescriber`. This homework question asks you to consider using GloVe inputs. 

__Your task__: Complete `create_glove_embedding` so that it creates a GloVe embedding based on your model vocabulary. This isn't mean to be analytically challenging, but rather just to create a basis for you to try out other kinds of rich initialization.

In [9]:
GLOVE_HOME = os.path.join('data', 'glove.6B')

In [9]:
def create_glove_embedding(vocab, glove_base_filename='glove.6B.100d.txt'):
    
    # Use `utils.glove2dict` to read in the GloVe file:    
    ##### YOUR CODE HERE
    glove_dict = utils.glove2dict(os.path.join(GLOVE_HOME, glove_base_filename))

    
    # Use `utils.create_pretrained_embedding` to create the embedding.
    # This function will, by default, ensure that START_TOKEN, 
    # END_TOKEN, and UNK_TOKEN are included in the embedding.
    ##### YOUR CODE HERE
    embedding, new_vocab = utils.create_pretrained_embedding(glove_dict, vocab)

    
    # Be sure to return the embedding you create as well as the
    # vocabulary returned by `utils.create_pretrained_embedding`,
    # which is likely to have been modified from the input `vocab`.
    
    ##### YOUR CODE HERE
    return embedding, new_vocab


## Try the GloVe representations

In [11]:
dev_glove_embedding, dev_glove_vocab = create_glove_embedding(dev_vocab)

In [None]:
len(dev_vocab)

In [None]:
len(dev_glove_vocab)

## Save to pickles

In [None]:
def save_to_pickle():
    import pickle 

    with open('dev_vocab.pickle', 'wb') as handle:
        pickle.dump(dev_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_vocab_listener.pickle', 'wb') as handle:
        pickle.dump(dev_vocab_listener, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_vocab_speaker.pickle', 'wb') as handle:
        pickle.dump(dev_vocab_speaker, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_seqs_test.pickle', 'wb') as handle:
        pickle.dump(dev_seqs_test, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_seqs_train.pickle', 'wb') as handle:
        pickle.dump(dev_seqs_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_seqs_train_speaker.pickle', 'wb') as handle:
        pickle.dump(dev_seqs_train_speaker, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_seqs_train_listener.pickle', 'wb') as handle:
        pickle.dump(dev_seqs_train_listener, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_cols_test.pickle', 'wb') as handle:
        pickle.dump(dev_cols_test, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_cols_train.pickle', 'wb') as handle:
        pickle.dump(dev_cols_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_cols_train_speaker.pickle', 'wb') as handle:
        pickle.dump(dev_cols_train_speaker, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_cols_train_listener.pickle', 'wb') as handle:
        pickle.dump(dev_cols_train_listener, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_examples_train.pickle', 'wb') as handle:
        pickle.dump(dev_examples_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_examples_test.pickle', 'wb') as handle:
        pickle.dump(dev_examples_test, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('embedding.pickle', 'wb') as handle:
        pickle.dump(embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_glove_vocab.pickle', 'wb') as handle:
        pickle.dump(dev_glove_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('dev_glove_embedding.pickle', 'wb') as handle:
        pickle.dump(dev_glove_embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)
save_to_pickle()