In [1]:
import numpy as np
import nltk
from nltk.corpus import brown
from pattern.en import conjugate, lemma
import matplotlib.pyplot as plt
import random
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 

#map every word of the Brown corpus to a part-of-speech tag (if a word occurs with several tags, its last occurrence wins)
#NOTE: this rebinds the name `brown`, which up to this line referred to the imported corpus reader
brown = dict(nltk.corpus.brown.tagged_words())
#separate handle on the dictionary's keys
brown_keys = brown.keys()

In [8]:
#this notebook is organised around a few cooperating classes:
#Embedding provides tools for the manipulation/analysis of word embeddings
#Grammar provides tools for the application of formal grammars to word embeddings
#Wrapper (with its Character/Location/Object subclasses) lets the two interact, and Network collects the resulting entities
class Embedding:
    """A vocabulary of word vectors plus tools to filter, compare and re-base them.

    Attributes:
        vectors:  numpy array with one row per word.
        dims:     shape of `vectors`, i.e. (number of vectors, dimensionality of each).
        word2int: dict mapping each word to its row index in `vectors`.
        int2word: the reverse lookup (row index -> word).
        basis:    change-of-basis matrix accumulated over every change_basis/realign_PCA call.

    The `perm` argument shows up repeatedly: perm = False means "return the list of
    selected row indices", perm = True means "shrink this instance to those rows"
    (and return None).
    """

    def __init__(self, embeddings, numbers, basis = "I"):
        """Store the vectors (`embeddings`) and the word -> row index dict (`numbers`).

        basis = "I" is a sentinel for "identity matrix of the right size"; the exact size
        cannot be written in a default argument because it depends on the embeddings.
        """
        self.vectors = np.array(embeddings)
        self.dims = self.vectors.shape #(number of vectors, dimensionality of each)
        self.word2int = numbers
        self.int2word = {v: k for k, v in numbers.items()} #in case we want a reverse lookup
        self.important = None
        #only compare against the sentinel when basis really is a string: `array == "I"` is an
        #elementwise comparison and cannot be used as an if-condition
        if isinstance(basis, str) and basis == "I":
            basis = np.eye(self.dims[1])
        self.basis = basis #basis for the embedding vectors; can be changed, preserved across classes

    def _deliver(self, indices, perm):
        #shared tail of every filter: hand the indices back, or shrink this instance to them
        if not perm:
            return indices
        self.form_class(indices, True)

    def _distances_to(self, word, order):
        #distance (norm of the given order) from every stored vector to `word`'s vector, as a length-N array
        if self.dims[0] == 0:
            return np.zeros(0)
        return np.linalg.norm(self.vectors - self.get_vect(word), ord = order, axis = 1)

    def _rank(self, scores, prop, resort):
        #indices of the (prop * N) lowest scores; in index order if resort, else best-first
        scores = np.asarray(scores).tolist()
        ranked = sorted(range(self.dims[0]), key = scores.__getitem__)[:int(self.dims[0] * prop)]
        return sorted(ranked) if resort else ranked

    def get_vect(self, pos):
        """Get the vector for either the input word or the input index (None for any other type)."""
        if isinstance(pos, str):
            return self.vectors[self.word2int[pos]]
        if isinstance(pos, (int, np.integer)):
            return self.vectors[pos]
        return None

    def get_word(self, n):
        """Get the word for the input index; for a collection of indices, return the corresponding list of words."""
        is_scalar = isinstance(n, (int, np.integer, str)) or not hasattr(n, '__iter__')
        if is_scalar:
            return self.int2word[n]
        #always build a real list: callers index, slice and iterate the result more than once
        return [self.get_word(i) for i in n]

    def filter_by_words(self, words, perm = False):
        """Return the indices of the stored words that appear in `words` (or shrink to them if perm)."""
        hits = [i for i in range(self.dims[0]) if self.get_word(i) in words]
        return self._deliver(hits, perm)

    def filter_by_dot(self, words, angle = 90, normalize = True, prop = None, perm = False):
        """Filter by dot product relative to the selected words.

        normalize = True compares by angle (cosine), normalize = False by raw dot product
        (affine hyperplane). If several words are given, a candidate is judged by its *worst*
        match among them. With `prop`, the prop * N best candidates are kept; without it,
        every candidate whose worst score exceeds cos(angle) is kept (angle in degrees).
        """
        if isinstance(words, str):
            words = [words]
        threshold = np.cos(angle / 180.0 * np.pi) #degrees -> radians -> cosine
        V = np.asarray(self.vectors, dtype = float) #N x D
        w = np.array([self.get_vect(word) for word in words], dtype = float) #L x D
        scores = np.dot(w, V.T) #L x N: row = query word, column = candidate
        if normalize:
            scores = scores / np.linalg.norm(V, axis = 1)[np.newaxis, :]
            scores = scores / np.linalg.norm(w, axis = 1)[:, np.newaxis]
        worst = scores.min(axis = 0) #each candidate's worst match among the query words
        if prop is not None:
            #a larger dot product is a better match, so rank in descending order before cutting
            ranked = sorted(range(self.dims[0]), key = lambda x: worst[x], reverse = True)
            good = sorted(ranked[:int(self.dims[0] * prop)])
        else:
            #it's really hard to control the size of this list, so using prop is usually better
            good = [i for i, k in enumerate(worst) if k > threshold]
        return self._deliver(good, perm)

    def filter_by_norm2(self, words, weights = None, prop = 0.5, order = 2, power = 1, power2 = 3, resort = True, perm = False):
        """Like filter_by_norm, but `power` is applied to each weighted term before summing.

        The score of a candidate x is sum_i (weights[i] * ||words[i] - x|| ** power2) ** power,
        and the (prop * N) lowest-scoring candidates are kept.
        """
        if weights is None:
            weights = [1 for x in words]
        scores = np.zeros(self.dims[0])
        for i, word in enumerate(words):
            scores = scores + (weights[i] * self._distances_to(word, order) ** power2) ** power
        return self._deliver(self._rank(scores, prop, resort), perm)

    def filter_by_norm(self, words, weights = None, prop = 0.5, order = 2, power = 1, power2 = 1, resort = True, perm = False):
        """Keep the (prop * N) words closest to the words in `words`.

        The score of a candidate x is (sum_i weights[i] * ||words[i] - x|| ** power2) ** power,
        with `order` selecting the norm; lower is closer. `weights` controls which query words
        matter most (default: all equal) and must be at least as long as `words`.
        resort = True returns the indices in ascending index order rather than best-first.
        """
        if weights is None:
            weights = [1 for x in words]
        scores = np.zeros(self.dims[0])
        for i, word in enumerate(words):
            #one vectorised pass over the whole vocabulary per query word
            scores = scores + weights[i] * self._distances_to(word, order) ** power2
        if power != 1:
            scores = scores ** power
        return self._deliver(self._rank(scores, prop, resort), perm)

    def match(self, vect):
        """Return the index of the stored vector closest to `vect`, ignoring exact (distance 0) matches.

        e.g., match(get_vect("queen") + get_vect("man")) might return the index of "king".
        Returns None when no stored vector is at a non-zero distance.
        """
        if self.dims[0] == 0:
            return None
        dists = np.linalg.norm(self.vectors - np.asarray(vect), axis = 1)
        dists[~(dists > 0)] = np.inf #exact matches (and NaNs) can never win
        best = int(np.argmin(dists))
        if dists[best] == np.inf:
            return None
        return best

    def change_basis(self, basis):
        """Multiply every vector (and the accumulated basis) by the matrix `basis` on the right."""
        basis = np.asarray(basis)
        #np.dot rather than *: on plain arrays * would multiply elementwise
        self.vectors = np.dot(self.vectors, basis)
        self.dims = self.vectors.shape
        self.basis = np.dot(np.asarray(self.basis), basis)

    def to_identity(self):
        """Revert to the identity basis using the Moore-Penrose inverse of the accumulated basis."""
        self.change_basis(np.linalg.pinv(self.basis))

    def realign_PCA(self):
        """Rotate the vectors onto the eigenvectors of V^T V, strongest direction first.

        geometric intuition: https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/GaussianScatterPCA.svg/1200px-GaussianScatterPCA.svg.png
        The vectors are not mean-centred first, so V^T V is only proportional to the
        covariance matrix when the data already has zero mean.
        """
        V = np.asarray(self.vectors, dtype = float)
        scatter = np.dot(V.T, V)
        #scatter is symmetric, so eigh applies: real eigenvalues, returned in ascending order
        evals, evecs = np.linalg.eigh(scatter)
        strongest_first = np.argsort(evals)[::-1]
        self.change_basis(evecs[:, strongest_first])

    def project2d(self, title = None):
        """Scatter the words on the two strongest PCA axes (handy for playing with new embeddings)."""
        b = Embedding(self.vectors, self.word2int, self.basis)
        b.realign_PCA() #may as well make the two dimensions useful ones
        b.change_basis(np.eye(self.dims[1])[:, :2]) #keep only the first two coordinates
        xs, ys = b.vectors[:, 0], b.vectors[:, 1]
        fig, ax = plt.subplots(figsize = (10, 10))
        for word, x, y in zip(b.get_word(range(b.dims[0])), xs, ys):
            ax.annotate(word, (x, y))
        ax.plot([-10, 10], [0, 0], 'k--')
        ax.plot([0, 0], [-10, 10], 'k--')
        ax.set_xlim(min(xs), max(xs))
        ax.set_ylim(min(ys), max(ys))
        if title is not None:
            ax.set_title(title)
        plt.show()

    def strip_pos(self, acceptable = ["NN", "NNS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBZ", "VBP", "JJ"], perm = False):
        """Select the words whose tag in the module-level `brown` dict is in `acceptable`.

        This strips words that don't carry useful meanings, like "while" and "should".
        The default set covers nouns, adverbs, adjectives and verbs; words missing from
        `brown` are dropped. The default list is only read, never mutated.
        """
        keep = [i for w, i in self.word2int.items() if w in brown and brown[w] in acceptable]
        return self._deliver(keep, perm)

    def form_class(self, L, perm = False):
        """Build an Embedding restricted to the row indices in L (rows are renumbered from 0).

        perm = False returns the new Embedding; perm = True (short for permanent) overwrites
        *this* instance by re-running __init__ and returns None.
        """
        wanted = set(L) #set membership keeps this linear in the vocabulary size
        vec = []
        w2i = {}
        for i in range(self.dims[0]):
            if i in wanted:
                w2i[self.get_word(i)] = len(vec)
                vec.append(self.get_vect(i))
        if not perm:
            return Embedding(vec, w2i, self.basis)
        self.__init__(vec, w2i, self.basis)

    def select_random(self, n = 1):
        """Return one random word (n == 1) or a list of distinct random words.

        If n exceeds the vocabulary size, every word is returned (in random order);
        n <= 0 gives an empty list.
        """
        vocab = list(self.word2int.keys())
        if n == 1:
            return random.choice(vocab)
        if n <= 0:
            return []
        return random.sample(vocab, min(n, len(vocab)))

    def sample(self, words, weights, prop, n = 20):
        """Plot the neighbourhood of `words` and print up to n random words from it."""
        B = self.form_class(self.filter_by_norm(words, weights, prop))
        B.project2d(' '.join(words))
        picked = B.select_random(n)
        if isinstance(picked, str): #n == 1 yields a bare word
            picked = [picked]
        print(' '.join(words) + ': ' + ' '.join(picked))

    def copy(self):
        """Return a new Embedding with its own vector array and its own word -> index dict."""
        return Embedding(self.vectors, dict(self.word2int), self.basis)
    

In [9]:
class Grammar:
    """A small generative grammar whose content words are chosen with the help of an Embedding.

    Attributes:
        prods:     dict mapping a tag to its '|'-separated alternatives, each a space-separated tag sequence.
        terms:     dict mapping a terminal tag to a '|'-separated string of words. The dict passed
                   in is stored as-is (not copied), so grammars built from the same dict share it.
        pos_dict:  dict converting NLTK parts of speech to the terminals' own tags
                   (e.g. "VBG" and "VBN" to "<verb>", "JJ" to "<adjective>").
        embedding: the Embedding used by smart_fill.
    """

    def __init__(self, productions, terminals, pos_dict, embedding = None):
        """Store the tables; `embedding` defaults to the module-level `ref_embedding` if one exists."""
        self.prods = productions
        self.terms = terminals
        self.pos_dict = pos_dict
        if embedding is None:
            #fall back to the notebook-wide default without failing when it has not been created yet
            embedding = globals().get('ref_embedding')
        self.embedding = embedding

    def clean(self, string):
        """Tidy up a freshly filled-in structure (list of words, or an already joined string).

        e.g., ["a", "astronaut", "land", "on", "the", "moon", "."] becomes
        "An astronaut landed on the moon."
        """
        if isinstance(string, list):
            #first, make all verbs past tense (on a copy, so the caller's list is left alone)
            words = list(string)
            for idx, word in enumerate(words):
                if self.pos_dict.get(brown.get(word)) == '<verb>':
                    words[idx] = conjugate(word, tense = 'past')
            #then, convert list to space-separated string
            string = ' '.join(words)
        for mark in ';,.!':
            #then, remove spaces directly before punctuation (e.g., "Go !" -> "Go!")
            string = string.replace(' ' + mark, mark)
        i = 0
        while i < len(string): #re-read the length every step: inserting an "n" makes the string grow
            #then, capitalize all initial words in sentences
            if i == 0 or (i > 2 and string[i-2] in '.!\n'):
                string = string[:i] + string[i].upper() + string[i+1:]
            #then, replace "a" with "an" when necessary (including a just-capitalized "A")
            stands_alone = (i == 0 or string[i-1] == ' ') and i + 2 < len(string) and string[i+1] == ' '
            if string[i] in 'aA' and stands_alone and string[i+2] in 'aeiou':
                string = string[:i+1] + 'n' + string[i+1:]
            i += 1
        return string

    def fill(self, L):
        """Naively fill in the abstract sentence structure: one random terminal per tag in L."""
        return [random.choice(self.terms[tag].split('|')) for tag in L]

    def smart_fill(self, L, attn_span = 3, variety = 0.1, attn = None):
        """Fill in the abstract sentence structure with words related to an "attention" list.

        The attention list starts with the subject terminal followed by `attn`. Every content
        word ('<verb>', '<adjective>', '<noun>') is drawn from the `variety` proportion of the
        vocabulary closest to the attention list, restricted to the right part of speech, and
        is then added to the list; once the list exceeds attn_span a random entry is dropped.
        Other tags are filled with a random terminal.
        """
        attn = [self.terms['<subject>']] + list(attn or [])
        filled = []
        for tag in L:
            if tag in ('<verb>', '<adjective>', '<noun>'):
                #descending weights, earlier attention words counting more; always at least one weight per word
                weights = list(range(max(attn_span, len(attn)), 0, -1))
                nearby = self.embedding.get_word(self.embedding.filter_by_norm(attn, weights, prop = variety))
                candidates = [w for w in nearby if self.pos_dict.get(brown.get(w)) == tag]
                unused = [w for w in candidates if w not in filled]
                #prefer a word not used yet; allow a repeat, then a plain terminal, rather than hang or crash
                pool = unused or candidates or self.terms[tag].split('|')
                word = random.choice(pool)
                filled.append(word)
                attn.append(word)
                if len(attn) > attn_span:
                    attn.remove(random.choice(attn))
            else:
                filled.append(random.choice(self.terms[tag].split('|')))
        return filled

    def _expand(self, symbols, budget):
        #apply the productions pass by pass; return the finished tag list, or None if `budget` passes were not enough
        for _ in range(budget):
            expanded = []
            changed = False
            for symbol in symbols:
                if symbol in self.prods:
                    expanded += random.choice(self.prods[symbol].split('|')).split()
                    changed = True
                else:
                    expanded.append(symbol)
            if not changed:
                return expanded
            symbols = expanded
        return None

    def parse(self, general = False, start = '<sentence>', lim = 20, attn_span = 3, variety = 0.1, attn = None):
        """Use the productions to turn `start` into a structure made only of terminal tags.

        Tags are replaced according to the production rules until none of them has a
        production left. `lim` bounds the number of expansion passes (values below 1 are
        treated as 1); if it is exceeded we assume we're on a runaway train and start again
        from `start`. Unless general is True, the final structure is filled in by smart_fill
        using attn_span, variety and attn.

        Raises:
            RuntimeError: if no attempt finishes within the limit after many restarts.
        """
        budget = max(lim, 1)
        for attempt in range(1000):
            symbols = random.choice(start.split('|')).split()
            structure = self._expand(symbols, budget)
            if structure is not None:
                break
        else:
            raise RuntimeError("grammar expansion from %r never finished within lim=%r" % (start, lim))
        if general:
            return structure
        return self.smart_fill(structure, attn_span, variety, attn)

    def generate(self, start = '<sentence>', lim = 20, attn_span = 3, variety = 0.1, attn = None):
        """Simple wrapper for parsing and cleaning a sentence; produces full, clean descriptions on its own."""
        return self.clean(self.parse(False, start, lim, attn_span, variety, attn))

    def add_terminal(self, term, pos):
        """Add `term` to the terminals of tag `pos` ('<none>' is ignored; several space-separated tags are allowed).

        Verbs are stored in the past tense.
        """
        if pos == '<none>':
            return
        if ' ' in pos:
            for single in pos.split():
                self.add_terminal(term, single)
            return
        if pos == '<verb>':
            term = conjugate(term, tense = 'past')
        existing = self.terms[pos]
        self.terms[pos] = term if existing == '' else existing + '|' + term

    def add_terminal_auto(self, term):
        """Add a terminal, looking its tag up via the `brown` dict and self.pos_dict.

        Any collection of terms (list, tuple, dict keys, ...) is added one by one;
        terms that cannot be mapped to a tag are skipped.
        """
        if hasattr(term, '__iter__') and not isinstance(term, str):
            for single in term:
                self.add_terminal_auto(single)
            return
        try:
            pos = brown[term]
            self.add_terminal(term, self.pos_dict[pos])
        except KeyError:
            return

    def clear_terminals(self, pos):
        """Clear a part of speech, to start anew for whatever reason."""
        self.terms[pos] = ''

In [48]:
class Wrapper():
    """Base class for entities such as characters, objects, and locations.

    An entity owns three word lists, all of which feed gen_terms:
        rom:   permanent characteristics
        ram:   semi-permanent characteristics
        cache: a bounded list (at most cache_size entries) filled through queue()
    `previous`, `next` and `height` link entities to one another.
    """

    def __init__(self, embedding, grammar):
        """Attach the embedding and grammar; the grammar's embedding is pointed at `embedding` too."""
        self.embedding = embedding
        self.grammar = grammar
        self.grammar.embedding = embedding
        self.rom = [] #Permanent characteristics
        self.ram = [] #Semi-permanent characteristics
        self.cache = []
        self.cache_size = 8
        self.def_proportion = 200.0 / self.embedding.dims[0] #proportion of the vocabulary that amounts to 200 words
        self.previous = None
        self.height = 0
        self.next = []

    def increase_height(self):
        """Add one to this entity's height and to that of every entity up the `previous` chain."""
        self.height += 1
        if self.previous is not None:
            self.previous.increase_height()

    def base(self, term):
        """Append a term (or each term of a list/tuple) to the permanent characteristics."""
        if isinstance(term, (list, tuple)):
            for single in term:
                self.base(single)
            return
        self.rom.append(term)

    def queue(self, term, method = "lifo"):
        """Put a term (or each term of a list/tuple) into the bounded cache.

        While the cache has room the term is simply appended. Once it is full, `method`
        picks the entry to sacrifice: "lifo" drops the oldest entry and appends the term,
        "fifo" overwrites the newest entry, "random" overwrites a random one.
        """
        if isinstance(term, (list, tuple)):
            for single in term:
                self.queue(single, method)
            return
        if len(self.cache) < self.cache_size:
            self.cache.append(term)
            return
        if method == "fifo":
            self.cache[-1] = term
        if method == "lifo":
            self.cache = self.cache[1:]
            self.cache.append(term)
        if method == "random":
            i = random.randrange(self.cache_size)
            self.cache = self.cache[:i] + [term] + self.cache[i+1:]

    def queue_ram(self, term):
        """Append a term (or each term of a list/tuple) to the semi-permanent characteristics."""
        if isinstance(term, (list, tuple)):
            for single in term:
                self.queue_ram(single)
            return
        self.ram.append(term)

    def filter_by_pos(self, terms, pos):
        """Keep the terms whose tag (looked up via `brown`, then the grammar's pos_dict) equals `pos`.

        Terms that are missing from either table are dropped.
        """
        pos_dict = self.grammar.pos_dict
        return [x for x in terms if pos_dict.get(brown.get(x)) == pos]

    def gen_terms(self):
        """Return the words closest to this entity's characteristics (rom counted three times)."""
        seeds = self.rom*3 + self.ram + self.cache
        weights = list(range(len(seeds), 0, -1)) #descending: earlier seeds weigh more
        return self.embedding.get_word(self.embedding.filter_by_norm(seeds, weights, prop = self.def_proportion))

    def get_pos(self, pos):
        """Return the related words (see gen_terms) that have the given part of speech."""
        return self.filter_by_pos(self.gen_terms(), pos)

    def fill_characteristics(self, n = 4):
        """Add up to n random related adjectives to the semi-permanent characteristics."""
        A = self.get_pos("<adjective>")
        random.shuffle(A)
        self.queue_ram(A[:min([len(A), n])])

    def pass_characteristics(self, C):
        """Queue this entity's rom and ram into the cache of another entity C."""
        C.queue(self.rom + self.ram)
    
class Character(Wrapper):
    """A Wrapper with an identity (e.g. "wizard"), a name and a pronoun."""

    def __init__(self, embedding, grammar, identity, name = None, pronoun = None, bases = None):
        """Create the character.

        A missing pronoun is drawn from "he"/"she"; a missing name is drawn from the
        module-level `mnames` (for "he") or `fnames` (otherwise). `bases` are extra
        permanent characteristics added next to the identity.
        """
        Wrapper.__init__(self, embedding, grammar)
        self.identity = identity
        self.rom = [identity]
        self.pronoun = pronoun
        self.name = name
        if bases is not None:
            self.base(bases)
        if self.pronoun is None:
            self.pronoun = random.choice(["he", "she"])
        if name is None:
            if self.pronoun == "he":
                self.name = random.choice(mnames)
            else:
                self.name = random.choice(fnames)

    def __str__(self):
        """One line naming the character, followed by its semi-permanent traits if it has any."""
        text = self.name + " is a " + self.identity + "."
        traits = self.ram
        if len(traits) == 1:
            text += " " + self.pronoun.capitalize() + " is " + traits[0] + "."
        elif len(traits) > 1:
            text += " " + self.pronoun.capitalize() + " is " + ", ".join(traits[:-1]) + ", and " + traits[-1] + "."
        return text

    def gen_char(self):
        """Generate a description of this character.

        The grammar's '<subject>' and '<pronoun>' terminals are reset to this character's
        identity and pronoun first; the identity is also added under its automatically
        detected part of speech.
        """
        self.grammar.clear_terminals('<subject>')
        self.grammar.clear_terminals('<pronoun>')
        self.grammar.add_terminal(self.identity, '<subject>')
        self.grammar.add_terminal_auto(self.identity)
        self.grammar.add_terminal(self.pronoun, '<pronoun>')
        return self.grammar.generate(attn_span = self.cache_size, variety = self.def_proportion, attn = self.ram)

    def profile(self, detail = 5):
        """Return a text listing up to `detail` random related verbs, adjectives and nouns."""
        k = self.name + " is a " + self.identity + ".\n"
        for i, j in [("<verb>", "does"), ("<adjective>", "is"), ("<noun>", "likes")]:
            k += 'Some things ' + self.name + ' ' + j + ': \n'
            M = list(set(self.get_pos(i)))
            random.shuffle(M)
            shown = M[:min([detail, len(M)])]
            if i == '<verb>':
                shown = [lemma(v) for v in shown] #verbs are listed in their base form
            k += '\t' + ', '.join(shown) + '\n'
        return k
    
class Location(Wrapper):
    """A Wrapper with an identity (e.g. "castle") and a name."""

    def __init__(self, embedding, grammar, identity, name = None, bases = None):
        """Create the location; a missing name defaults to the capitalized identity.

        `bases` are extra permanent characteristics added next to the identity.
        """
        Wrapper.__init__(self, embedding, grammar)
        if name is None:
            name = identity.capitalize()
        self.identity = identity
        self.name = name
        self.rom = [identity]
        if bases is not None:
            self.base(bases)

    def __str__(self):
        """One line naming the location, followed by its semi-permanent traits if it has any."""
        text = self.name + " is a " + self.identity + "."
        traits = self.ram
        if len(traits) == 1:
            text += " It is " + traits[0] + "."
        elif len(traits) > 1:
            text += " It is " + ", ".join(traits[:-1]) + ", and " + traits[-1] + "."
        return text

    def profile(self, detail = 5):
        """Return a text listing up to `detail` random related verbs, adjectives and nouns."""
        k = ''
        for i, j in [("<verb>", "is used to do"), ("<adjective>", "is"), ("<noun>", "has")]:
            k += 'Some things ' + self.name + ' ' + j + ': \n'
            M = list(set(self.get_pos(i)))
            random.shuffle(M)
            shown = M[:min([detail, len(M)])]
            if i == '<verb>':
                shown = [lemma(v) for v in shown] #verbs are listed in their base form
            k += '\t' + ', '.join(shown) + '\n'
        return k
    
class Object(Wrapper):
    """A Wrapper with an identity (e.g. "jewel") and a name."""

    def __init__(self, embedding, grammar, identity, name = None, bases = None):
        """Create the object; a missing name defaults to the capitalized identity.

        `bases` are extra permanent characteristics added next to the identity.
        """
        Wrapper.__init__(self, embedding, grammar)
        if name is None:
            name = identity.capitalize()
        self.identity = identity
        self.name = name
        self.rom = [identity]
        if bases is not None:
            self.base(bases)

    def __str__(self):
        """One line naming the object, followed by its semi-permanent traits if it has any."""
        text = self.name + " is a " + self.identity + "."
        traits = self.ram
        if len(traits) == 1:
            text += " It is " + traits[0] + "."
        elif len(traits) > 1:
            text += " It is " + ", ".join(traits[:-1]) + ", and " + traits[-1] + "."
        return text

    def profile(self, detail = 5):
        """Return a text listing up to `detail` random related verbs, adjectives and nouns."""
        k = ''
        for i, j in [("<verb>", "is used for"), ("<adjective>", "is"), ("<noun>", "is used with")]:
            k += 'Some things ' + self.name + ' ' + j + ': \n'
            M = list(set(self.get_pos(i)))
            random.shuffle(M)
            shown = M[:min([detail, len(M)])]
            if i == '<verb>':
                shown = [lemma(v) for v in shown] #verbs are listed in their base form
            k += '\t' + ', '.join(shown) + '\n'
        return k

In [59]:
class Network:
    """A cast of linked Characters, Locations and Objects, from which quests can be read off.

    Attributes:
        identity_list: dict with the keys "locations", "characters" and "objects", each a list of identities.
        entities:      every entity created so far.
        objects:       the Object entities among them.
    """

    def __init__(self, grammar, embedding, identity_list = None):
        """Store the grammar, the embedding and the identity lists (an empty dict if none is given)."""
        #a fresh dict per instance: a {} default argument would be shared by every Network
        self.identity_list = identity_list if identity_list is not None else {}
        self.grammar = grammar
        self.embedding = embedding
        self.entities = []
        self.objects = []

    def make_canonical_char(self, identity, name = None, pronoun = None, adjs = None):
        """Build (but do not register) a Character with the given identity."""
        return Character(self.embedding, self.grammar, identity, name, pronoun, adjs)

    def make_canonical_loc(self, identity, name = None, adjs = None):
        """Build (but do not register) a Location with the given identity."""
        return Location(self.embedding, self.grammar, identity, name, adjs)

    def make_canonical_obj(self, identity, name = None, adjs = None):
        """Build (but do not register) an Object with the given identity."""
        return Object(self.embedding, self.grammar, identity, name, adjs)

    def add_char(self, identity, name, pronoun, adjs):
        """Create a Character, give it characteristics and register it."""
        C = self.make_canonical_char(identity, name, pronoun, adjs)
        C.fill_characteristics()
        self.entities.append(C)

    def add_loc(self, identity, name, adjs):
        """Create a Location, give it characteristics and register it."""
        C = self.make_canonical_loc(identity, name, adjs)
        C.fill_characteristics()
        self.entities.append(C)

    def add_obj(self, identity, name, adjs):
        """Create an Object, give it characteristics and register it (also in self.objects, as get_next does)."""
        C = self.make_canonical_obj(identity, name, adjs)
        C.fill_characteristics()
        self.entities.append(C)
        self.objects.append(C)

    def get_next(self, first = None, force_loc = False):
        """Create a random new entity, optionally hanging it below the entity `first`.

        Without `first`, or after a Character, any kind of entity may follow; after a
        Location or an Object only a Character may follow. force_loc = True ignores `first`
        and always creates a Location. When `first` is used it passes its characteristics
        to the new entity and the two are linked through next/previous.
        """
        if force_loc:
            first = None
            pools = {"locations": self.identity_list["locations"]}
        elif first is None or isinstance(first, Character):
            pools = self.identity_list
        elif isinstance(first, (Location, Object)):
            pools = {"characters": self.identity_list["characters"]}
        else:
            pools = {}
        kind = random.choice(list(pools.keys()))
        identity = random.choice(pools[kind])

        makers = {"locations": self.make_canonical_loc, "characters": self.make_canonical_char, "objects": self.make_canonical_obj}
        C = makers[kind](identity)
        if first is not None:
            first.increase_height()
            first.pass_characteristics(C)
            first.next.append(C)
            C.previous = first
        C.fill_characteristics()
        if isinstance(C, (Object, Location)) and C.ram:
            #e.g. "Ancient Castle"; skipped when no characteristic could be found
            C.name = C.ram[0].capitalize() + " " + C.name
        if isinstance(C, Character):
            C.name = C.identity.capitalize() + " " + C.name
        self.entities.append(C)
        if isinstance(C, Object):
            self.objects.append(C)
        return C

    def make(self, n = 10):
        """Create a cast of n entities (a Location first), printing progress and a rough time estimate."""
        print("The cast: ")
        print("\t" + self.get_next(force_loc = True).name + " (1/" + str(n) + ")")
        t = time.time()
        recent = [3, 3, 3, 3] #durations of the last four steps, seeded with a 3 second guess
        for i in range(n - 1):
            t0 = t
            t = time.time()
            recent = recent[1:] + [t - t0]
            mean = sum(recent) / float(len(recent))
            remaining = round(mean * (n - (i + 2)), 1)
            print("\t" + self.get_next(random.choice(self.entities)).name + " (" + str(i+2) + "/" + str(n) + ", " + str(remaining) + "s remaining)")
        print('Done!')

    def search(self, s):
        """Return the first entity whose name contains s, or None."""
        for entity in self.entities:
            if s in entity.name:
                return entity
        return None

    def _quest_step(self, entity, lead = ''):
        #one quest instruction for `entity`, e.g. lead 'Then, ' gives 'Then, find Bob.'; None for an unknown kind
        if isinstance(entity, Location):
            phrase = 'go to the ' + entity.name
        elif isinstance(entity, Character):
            phrase = 'find ' + entity.name
        elif isinstance(entity, Object):
            phrase = 'retrieve the ' + entity.name
        else:
            return None
        if lead == '':
            phrase = phrase[0].upper() + phrase[1:]
        return lead + phrase + '.'

    def _print_step(self, entity, lead, deets):
        #print the instruction for one quest step, plus the entity's description if deets
        line = self._quest_step(entity, lead)
        if line is not None:
            print(line)
        if deets:
            print("\t" + str(entity))

    def quest(self, seed = None, deets = False):
        """Print a quest: a chain of entities followed through their `next` links.

        The chain starts at `seed`, or at a random one of the six highest Locations, and at
        every step continues with one of the two highest successors until an entity without
        successors is reached. deets = True also prints each entity's description.
        """
        if seed is None:
            locations = [x for x in self.entities if isinstance(x, Location)]
            seed = random.choice(sorted(locations, key = lambda x: x.height, reverse = True)[:6])
        chain = [seed]
        while chain[-1].next != []:
            successors = sorted(chain[-1].next, key = lambda x: x.height, reverse = True)
            chain.append(random.choice(successors[:2]))
        print('Your quest is: \n')
        self._print_step(chain[0], '', deets)
        print('')
        for entity in chain[1:-1]:
            self._print_step(entity, 'Then, ', deets)
            print('')
        if len(chain) > 1: #a one-step quest has no separate final step
            self._print_step(chain[-1], 'Finally, ', deets)

    def profile(self):
        """Print every entity followed by the names of its successors."""
        for entity in self.entities:
            print(entity)
            print('\t' + ', '.join([x.name for x in entity.next]) + '\n')

    def char_profile(self, noun, detail = 8):
        """Print up to `detail` random verbs, adjectives and nouns related to a character of identity `noun`."""
        C = self.make_canonical_char(noun)
        for i, j in [("<verb>", "do"), ("<adjective>", "are"), ("<noun>", "like")]:
            print('Some things ' + noun + 's ' + j + ': ')
            M = list(set(C.get_pos(i)))
            random.shuffle(M)
            shown = M[:min([detail, len(M)])]
            if i == '<verb>':
                shown = [lemma(v) for v in shown] #verbs are listed in their base form
            print('\t' + ', '.join(shown))

In [12]:
#production rules: each tag maps to its alternatives, separated by '|'; an alternative is a space-separated tag sequence
char_productions = dict([
    ('<sentence>', '<type1> <punctuation> <type2> <punctuation>'
                   '|<type1> <punctuation> <type2> <comma> <connective> <type2> <punctuation>'
                   '|<sentence> <sentence2>'),
    ('<sentence2>', '<type12> <punctuation> <type2> <punctuation>'
                    '|<type12> <punctuation> <type2> <comma> <connective> <type2> <punctuation>'),
    ('<sentence3>', '<type12> <punctuation> <type2> <punctuation>'
                    '|<type12> <punctuation> <type2> <comma> <connective> <type2> <punctuation>'),
    ('<type1>', '<subj> <description> <comma> <connective> <pronoun> <action>'),
    ('<type12>', '<subj2> <description> <comma> <connective> <pronoun> <action>'),
    ('<type2>', '<pronoun> <type3>'
                '|<pronoun> <causal> <type3>'),
    ('<type3>', '<description>'
                '|<action>'),
    #'<the> <subject>' is listed twice, which makes it twice as likely as the adjective form
    ('<subj>', '<the> <subject>'
               '|<the> <adjective> <subject>'
               '|<the> <subject>'),
    ('<subj2>', '<the> <subject>'),
    ('<description>', '<be> <adjective>'),
    ('<action>', '<verb> <the> <adjective> <noun>'),
])

#terminals: each tag maps to its words, separated by '|'; the empty ones get filled in at runtime
char_terminals = dict([
    ('<article>', 'a|the'),
    ('<the>', 'the'),
    ('<be>', 'became|was'),
    ('<noun>', ''),
    ('<adjective>', ''),
    ('<verb>', ''),
    ('<adverb>', ''),
    ('<pronoun>', ''),
    ('<prep>', 'with'),
    ('<subject>', ''),
    ('<connective>', 'as|and|while|but|so'),
    ('<causal>', 'then|therefore|accordingly'),
    ('<modifier>', 'very|unusually|amazingly|unbelievably|somewhat|mildly|subtly'),
    ('<punctuation>', '.'),
    ('<comma>', ','),
    ('<semicolon>', ';'),
])

#conversion from part-of-speech tags (as found in the brown dict) to the terminal tags above
char_pos_dict = {
    tag: terminal
    for terminal, tags in [
        ('<adverb>', 'RB'),
        ('<adjective>', 'RBR RBS'),
        ('<noun>', 'NN NNS'),
        ('<adjective>', 'JJ JJR JJS'),
        ('<prep>', 'IN'),
        ('<verb>', 'VB VBD VBG VBP VBZ VBN'),
        ('<noun>', 'NNP NNPS'),
    ]
    for tag in tags.split()
}
                    

#identities to draw new entities from, per kind of entity
#("author" and "governor" each appear twice among the characters)
identity_list = {
    "locations": (
        "house cave town university company district hall school route museum "
        "city church river island street court club mountain institute station "
        "province kingdom organization ship festival aircraft capital hill hospital "
        "studio municipality parliament soviet airport empire fort castle temple "
        "camp tower campus corporation prison senate farm cemetery institution "
        "federation dynasty commune cathedral junction exhibition"
    ).split(),
    "characters": (
        "cat dog person wizard professor soldier warrior president journalist priest "
        "chief singer saint emperor author leader student officer child author "
        "governor artist lord god captain writer manager actor doctor architect "
        "lieutenant representative communist spirit criminal poet colonel fighter lawyer "
        "historian guitarist wolf traitor villager farmer governor"
    ).split(),
    "objects": (
        "jewel tree flower book document computer weapon aircraft car painting "
        "note plant program stone recording journal guitar prize machine letter vehicle "
        "gun message crown drug map instrument element composition"
    ).split(),
}


In [6]:
#Load the pretrained word vectors and keep only the rows whose word
#  - is longer than 2 characters (lots of useless length 2 words),
#  - appears in the English word list, and
#  - is not listed in removewords.
#Results: W (kept words), V (their vectors, index-aligned with W), WI (word -> row index).

ENGLISH_WORDS_PATH = '/Users/6081iprep/Documents/languages/english.txt'
EMBEDDINGS_PATH = 'bow2.words'
#alternative embedding source used previously: '/Users/6081iprep/Desktop/words.txt'

#context managers make sure both files are closed, even if reading fails
with open(ENGLISH_WORDS_PATH) as english_file:
    #line endings are stripped so that a CRLF file or a final line without a
    #trailing newline still matches; a set keeps the lookup fast
    Words2 = {line.rstrip('\r\n') for line in english_file}
with open(EMBEDDINGS_PATH, 'r') as embeddings_file:
    Pre = embeddings_file.readlines()

W = []
V = []
removewords = ["blond", "virgin", "pregnant", "sucking", "dumb", "entitled", "titled", "smile", "cry", "gang", "dice", "black", "dick", "bye", 
              "byed", "byes", "gangs", "ganged", "spic", "beardless", "swart", "yeller", "mustachioed", "unshaven", "anchorite", "canonist", 
              "sodden", "stolid", "workmanlike"] #these tend to create either inappropriate or nonsensical sentences

for row in Pre:
    fields = row.split()
    if not fields: #blank or whitespace-only line: nothing to parse
        continue
    word = fields[0]
    if len(word) <= 2 or word not in Words2 or word in removewords:
        continue
    try:
        #build a real list of floats (a bare map() is a lazy iterator on Python 3)
        vector = [float(value) for value in fields[1:]]
    except ValueError:
        #malformed row: skip it entirely so W and V never get out of step
        continue
    W.append(word)
    V.append(vector)

#word -> index into W/V; if a word occurs more than once, the last occurrence wins
WI = {word: index for index, word in enumerate(W)}

del Words2 #save memory
del Pre

#base name (without the '.txt' extension) of the names file read by load()
language='american'
def load(lang, names_dir='/Users/6081iprep/Documents/names/'):
    """Read the first names for one language and split them by gender.

    The file <names_dir>/<lang>.txt holds one comma-separated record per line,
    with the name in field 0 and the gender in field 2.

    Parameters:
        lang: base name of the names file, without the '.txt' extension.
        names_dir: directory containing the names files; defaults to the
            location this notebook has always read from.

    Returns:
        (fnames, mnames): names whose gender field is exactly 'female', and
        all remaining names, each in file order.

    Raises:
        IOError/OSError if the file cannot be opened.
        IndexError if a non-blank line has fewer than three fields.
    """
    import os #local import: only this function needs it

    fnames,mnames=[],[]
    #the with-block closes the file even if a malformed line raises
    with open(os.path.join(names_dir, lang+'.txt')) as names_file:
        for line in names_file:
            #strip only the line terminator ('\n' or '\r\n'); blindly chopping the
            #last character corrupts a final line that has no trailing newline
            record = line.rstrip('\r\n')
            if not record.strip(): #tolerate blank lines, e.g. at the end of the file
                continue
            fields = record.split(',')
            if fields[2]=='female':
                fnames.append(fields[0])
            else:
                mnames.append(fields[0])
    return fnames,mnames
        
#module-level female / male first-name lists for the selected language
fnames,mnames = load(language)

In [30]:
#Reference embedding built from the filtered vectors (V) and the word -> index map (WI).
#Non-Embedding objects turn to this instance whenever they need to work with word embeddings.
ref_embedding = Embedding(V, WI)
ref_embedding.strip_pos(perm = True)
print("Created embedding")

def _build_char_grammar():
    #One Grammar over the character productions, then add_terminal_auto on the embedding's words
    built = Grammar(char_productions, char_terminals, char_pos_dict)
    built.add_terminal_auto(ref_embedding.word2int.keys())
    return built

#two separately constructed grammars, each fully set up before the next one is created
grammar = _build_char_grammar()
basegrammar = _build_char_grammar()
print("Created grammar")

Created embedding
Created grammar


In [None]:
#Single-character demo: a Character with identity 'president' on the shared embedding and grammar.
#This cell has no execution count and no saved output in the notebook.
#NOTE(review): fill_characteristics() and gen_char() are defined outside this section;
#presumably they choose the character's traits and then generate its description - confirm.
char = Character(ref_embedding, grammar, 'president')
char.fill_characteristics()
char.gen_char()

In [66]:
#Build a Network over the shared grammar, the reference embedding and the identity_list nouns.
N = Network(grammar, ref_embedding, identity_list)
#In the saved output, make(10) lists a cast of ten named entities with a progress counter and a time estimate.
N.make(10)
#In the saved output, quest(deets = True) prints three quest steps, each followed by a description line.
#NOTE(review): deets presumably toggles those description lines - confirm against Network.quest.
N.quest(deets = True)

The cast: 
	Electoral Municipality (1/10)
	Saint Delores (2/10, 18.4s remaining)
	Artist Ethel (3/10, 17.5s remaining)
	Wizard William (4/10, 16.2s remaining)
	Graphic Computer (5/10, 16.0s remaining)
	Poet Kerry (6/10, 17.2s remaining)
	Farmer Roberta (7/10, 13.8s remaining)
	Wizard Tony (8/10, 9.2s remaining)
	Demoniac Journal (9/10, 4.4s remaining)
	Coppery Crown (10/10, 0.0s remaining)
Done!
Your quest is: 

Go to the Electoral Municipality.
	Electoral Municipality is a municipality. It is electoral, merrymaking, administrative, and populous.

Then, find Artist Ethel.
	Artist Ethel is a artist. She is merrymaking, benighted, electoral, and demoniac.

Finally, retrieve the Graphic Computer.
	Graphic Computer is a computer. It is graphic, phonic, merrymaking, and splashy.
