Name: Daniel Yan

Email: daniel.yan@vanderbilt.edu

Classes and/or functions used to reformat pandas dataframes.

In [None]:
class Normalizer:
    """
    Normalizes a features matrix by the length of each sequence in a certain column.

    Attributes:
        df: Pandas dataframe to normalize
        pairs_col: Integer position of the column within the features matrix whose
            sequence length each row is normalized by
        normalize_cols: List of integers containing the positions of the columns to
            normalize by dividing by the length of the sequence in pairs_col
        _normalized: Boolean value for if object has been normalized
    """

    def __init__(self, df, pairs_col, normalize_cols):
        """
        Sets the data frame and the integer column number of the base pairs within the data frame
        for the Normalizer object. Sets internal _normalized flag to false to note that data frame
        has not been normalized.

        Keyword Arguments:
            df: Pandas data frame to normalize
            pairs_col: Integer column within the features matrix where the base pairs are stored
            normalize_cols: List of integers containing column numbers to normalize by dividing
                by length in pairs_col
        """
        self.df = df
        self.pairs_col = pairs_col
        self.normalize_cols = normalize_cols
        self._normalized = False

    def normalize_counts(self):
        """
        Normalize all kmer counts by dividing by the total number of base pairs.

        Every column listed in self.normalize_cols is divided, row by row, by the
        length of the sequence stored in column self.pairs_col of the same row.
        The work is done once: later calls return the already-normalized frame
        instead of dividing a second time.

        Returns:
            Normalized dataframe contained in self.df
        """
        # The flag guards against dividing the counts by the length twice.
        if self._normalized:
            return self.df

        # Length of the sequence in every row; .str.len() yields NaN for missing
        # sequences, so those rows become NaN rather than raising.
        seq_lengths = self.df.iloc[:, self.pairs_col].str.len()

        # Work on a copy so the frame handed in by the caller is left untouched.
        normalized = self.df.copy()
        target_cols = normalized.columns[list(self.normalize_cols)]

        # axis=0 aligns the lengths with the row index, so each row is divided
        # by its own sequence length. A zero-length sequence yields inf/NaN.
        normalized[target_cols] = normalized[target_cols].div(seq_lengths, axis=0)

        self.df = normalized
        self._normalized = True
        return self.df