In [None]:
def ngram_creation(corpus, ngram_range=(1, 1)):
    """
    Builds the ngrams of every document for each size in a range.

    Parameters:
    corpus (list): The documents to process. Each document is passed
    unchanged to ngrams(), so it must be whatever sequence that helper
    expects (presumably a token sequence or string; verify against the
    caller).

    ngram_range (tuple): Two integers (smallest size, largest size), both
    inclusive. The first should be at least 1 and the second should be
    greater-than or equal-to the first; a second value below the first
    produces an empty result.

    Returns:
    ngram_list (list): One nested list per ngram size, in ascending size
    order. Each nested list holds one list per document, containing the
    ngrams produced for that document.
    """
    smallest = ngram_range[0]
    largest = ngram_range[1]

    # Outer level: one entry per ngram size; inner level: one entry per
    # document, materialised with list() so the result can be re-read.
    return [
        [list(ngrams(document, size)) for document in corpus]
        for size in range(smallest, largest + 1)
    ]


In [None]:
def token_refinement(dataframe_list, ngram_range=(1,1)):
    """
    Collects the distinct tokens of each ngram size for each dataframe.

    Parameters:
    dataframe_list (list): A list of dataframes, one per classification.
    For an ngram size n, the tokens are read from position n + 2 of each
    itertuples() row, i.e. the dataframe column at zero-based position
    n + 1 (position 0 of the row is the index).

    ngram_range (tuple): Two integers (smallest size, largest size), both
    inclusive. The first should be at least 1 and the second should be
    greater-than or equal-to the first.

    Returns:
    output_vocab (list): A flat list of sets, ordered dataframe by
    dataframe and, within a dataframe, by ascending ngram size. Each set
    holds every token seen in the matching column of that dataframe.
    """
    output_vocab = []
    for frame in dataframe_list:
        for size in range(ngram_range[0], ngram_range[1] + 1):
            # Row position of the column holding ngrams of this size.
            position = size + 2
            tokens = {
                token
                for record in frame.itertuples()
                for token in record[position]
            }
            output_vocab.append(tokens)
    return output_vocab

In [None]:
def unique_elements(list_of_lists: list):
    """
    Keeps, for each vocabulary, only the tokens found in no other vocabulary.

    Every collection in the input is compared against every other
    collection in the input. A token survives only if it is absent from
    all of the others, so each result holds the tokens exclusive to that
    position's vocabulary.

    Parameters:
    list_of_lists (list): A list of token collections (sets or lists),
    separated by both classification and ngrams.

    Returns:
    output_list (list): A new list with one list of tokens per input
    collection, in the same order as the input. Tokens keep the iteration
    order of the collection they came from. The input collections are not
    modified and are never returned directly.
    """
    output_list = []
    for position, current in enumerate(list_of_lists):
        # Exclude the current collection by position, not by equality: two
        # classifications with identical vocabularies must still filter
        # each other, otherwise both would keep every shared token.
        others = [
            other
            for other_position, other in enumerate(list_of_lists)
            if other_position != position
        ]
        exclusive = [
            item
            for item in current
            if not any(item in other for other in others)
        ]
        output_list.append(exclusive)
    return output_list

In [None]:
def merge_vocab(vocab, ngram_range):
    """
    Regroups vocabulary tokens into one list per ngram size.

    Parameters:
    vocab (list): A list of token collections. Each token must support
    len(); its length is taken as its ngram size.

    ngram_range (tuple): Two integers (smallest size, largest size), both
    inclusive. The first should be at least 1 and the second should be
    greater-than or equal-to the first.

    Returns:
    new_vocab (list): A list with one nested list per ngram size in
    ngram_range, in ascending size order. Tokens from all input
    collections are appended in the order they are encountered;
    duplicates are not removed.

    Raises:
    IndexError: If a token's length falls outside ngram_range.
    """
    smallest = ngram_range[0]
    largest = ngram_range[1]
    new_vocab = [[] for _ in range(smallest, largest + 1)]

    for sublist in vocab:
        for term in sublist:
            # Offset by the smallest size so the first bucket always holds
            # the smallest ngrams, even when the range does not start at 1.
            slot = len(term) - smallest
            if not 0 <= slot < len(new_vocab):
                # Reject explicitly: a negative slot would otherwise wrap
                # around and silently land in the wrong bucket.
                raise IndexError(
                    f"term {term!r} has length {len(term)}, "
                    f"outside ngram_range {ngram_range!r}"
                )
            new_vocab[slot].append(term)
    return new_vocab

In [None]:
def bag_of_words(dataframe, vocab):
    """
    Generates a bag-of-words model dataframe of token counts per note.

    Parameters:
    dataframe (pandas.DataFrame): A dataframe whose index identifies the
    note (described by the original author as a tuple of patient ID and
    note number), whose first column is the manually annotated
    classification, and whose following columns each hold the note's
    ngrams of one size.

    vocab (list): A list of token collections, one per ngram column and
    in the same order as those columns. Columns and vocab entries are
    paired positionally; any unpaired surplus on either side is ignored.

    Returns:
    bow_model (pandas.DataFrame): A dataframe with one row per distinct
    note identifier (the string form of the index value) and one column
    per vocabulary token (the string form of the token). Each value is
    the number of times the token occurs in the matching ngram column of
    that note, 0 when absent. A final '--classification--' column holds
    the string form of the note's classification. If the same identifier
    occurs more than once, later rows overwrite earlier ones.
    """
    # NOTE(review): the original called an undefined-in-view name `Count`;
    # its usage (indexing by possibly-absent terms) matches
    # collections.Counter, which is assumed here -- confirm.
    from collections import Counter

    output = {}
    for row in dataframe.itertuples():
        id_note = str(row[0])
        cls = str(row[1])

        note_counts = output.setdefault(id_note, {})

        # row[2:] are the ngram columns; pair each with its vocabulary
        # rather than assuming there are exactly three of them.
        for ngram_column, terms in zip(row[2:], vocab):
            cnt = Counter(ngram_column)
            for term in terms:
                # Counter yields 0 for terms that never occur in the note.
                note_counts[str(term)] = cnt[term]

        note_counts['--classification--'] = cls

    bow_model = pd.DataFrame.from_dict(output, orient='index')
    return bow_model
