# Create Vocabulary 

In [2]:
class Vocabulary(object):
    '''
    Two-way mapping between tokens and integer indices.

    Tokens are assigned indices in insertion order, starting from the current
    size of the mapping. Optionally an "unknown" (UNK) token is registered so
    that lookups of unseen tokens return its index instead of raising.
    '''

    def __init__(self, token_to_idx=None, add_unk=True,
                 unk_token="<UNK>"):
        '''
        Args:
        token_to_idx (dict): a pre-existing map of tokens to indices
        add_unk (bool): a flag that indicates whether to add the UNK token
        unk_token (str): the UNK token to add into the Vocabulary
        '''
        if token_to_idx is None:
            token_to_idx = {}
        # Copy so that add_token never mutates the caller's dict (and so two
        # vocabularies built from the same serialized contents stay independent).
        self._token_to_idx = dict(token_to_idx)
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 means "no UNK token registered"; look_token relies on this.
        self.unk_index = -1
        if self._add_unk:
            self.unk_index = self.add_token(self._unk_token)

    def to_serializable(self):
        '''
        Returns a dictionary that can be serialized.

        The token map is returned as a copy, so modifying the result does not
        affect this Vocabulary.
        '''
        return {'token_to_idx': dict(self._token_to_idx),
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        '''
        Instantiates the Vocabulary from a serialized dictionary.

        Args:
        contents (dict): keyword arguments for the constructor, as produced
            by to_serializable
        '''
        return cls(**contents)

    def add_token(self, token):
        '''
        Update mapping dicts based on the token.

        Args:
        token (str): the item to add into the Vocabulary

        Returns:
        index (int): the integer corresponding to the token
        '''
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens get the next free position, i.e. the current size.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token

        return index

    def look_token(self, token):
        '''
        Retrieve the index associated with the token, or the UNK index if the
        token isn't present.

        Args:
        token (str): the token to look up

        Returns:
        index (int): the index corresponding to the token

        Raises:
        KeyError: if the token is absent and no UNK token was registered

        Notes:
        'unk_index' needs to be >= 0 (having been added into the Vocabulary)
        for the UNK functionality
        '''
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    # Alias matching the naming of lookup_index; look_token is kept for
    # existing callers.
    lookup_token = look_token

    def lookup_index(self, index):
        '''
        Return the token associated with the index.

        Args:
        index (int): the index to look up

        Returns:
        token (str): the token corresponding to the index

        Raises:
        KeyError: if the index is not in the Vocabulary
        '''
        if index not in self._idx_to_token:
            raise KeyError(f"the index {index} is not in the Vocabulary")

        return self._idx_to_token[index]

    def __str__(self):
        '''Return a short description that includes the vocabulary size.'''
        return f"<Vocabulary(size = {len(self)})>"

    def __len__(self):
        '''Return the number of tokens in the Vocabulary.'''
        return len(self._token_to_idx)