Name: Daniel Yan

Email: daniel.yan@vanderbilt.edu

Finds the shuffled parts of the human genome within the features matrices in files 50-99 from /dors/capra_lab/users/yand1/te_ml/data/2018_07_11_pca_te_enhancers/batch_output
and normalizes their k-mer counts by dividing by the number of base pairs. Stores the new features matrix, containing only information about the shuffled parts of the human genome, in /dors/capra_lab/users/yand1/te_ml/data/2018_07_12_pca_te_enhancers/shuffled_features_matrix.tsv

In [None]:
# Libraries
import pandas as pd

In [None]:
class Normalizer:
    """
    Normalizes a k-mers features matrix by the length of each sequence (stored in each row).

    Attributes:
        df: Pandas data frame to normalize; replaced by the normalized copy once
            normalize_counts() has run
        pairs_col: Integer column within the features matrix where the base pairs are stored
        normalize_cols: List of integers containing column numbers to normalize by dividing
            by length in pairs_col
        _normalized: Boolean value for if object has been normalized
    """
    def __init__(self, df, pairs_col, normalize_cols):
        """
        Sets the data frame and the integer column number of the base pairs within the data frame
        for the Normalizer object. Sets internal _normalized flag to false to note that data frame
        has not been normalized.

        Keyword Arguments:
            self: Normalizer object passed to itself
            df: Pandas data frame to normalize
            pairs_col: Integer column within the features matrix where the base pairs are stored
            normalize_cols: List of integers containing column numbers to normalize by dividing
                by length in pairs_col
        """
        self.df = df
        self.pairs_col = pairs_col
        self.normalize_cols = normalize_cols
        self._normalized = False

    def normalize_counts(self):
        """
        Normalize all kmer counts by dividing by the total number of base pairs

        Every column listed in normalize_cols is divided, row by row, by the length of
        the string stored in pairs_col. The data frame passed to the constructor is not
        modified; a normalized copy is stored on the object and returned. Calling this
        method again returns the already-normalized frame instead of dividing twice.

        Keyword Arguments:
            self: Normalizer object passed to itself

        Returns:
            Data frame with the columns in normalize_cols divided by the sequence length
        """
        # The _normalized flag guards against dividing the counts a second time
        if self._normalized:
            return self.df

        # Number of base pairs per row; pairs_col is expected to hold the sequence strings.
        # A plain array is used so the division below is positional and does not depend
        # on the (possibly duplicated) index labels of a concatenated frame.
        lengths = self.df.iloc[:, self.pairs_col].str.len().to_numpy()

        # Work on a copy so the caller's data frame is left untouched
        normalized = self.df.copy()
        for position in self.normalize_cols:
            # Replacing the whole column lets integer counts become floats
            label = normalized.columns[position]
            normalized[label] = normalized.iloc[:, position] / lengths

        self.df = normalized
        self._normalized = True
        return self.df


def normalize_counts(df):
    """
    Normalize all kmer counts by dividing by the total number of base pairs

    Module-level entry point kept for the script section of this file, which calls
    normalize_counts(combined_df) directly. Applies normalize_row to every row.

    Keyword Arguments:
        df: Data frame to normalize counts

    Returns:
        Data frame with each row passed through normalize_row
    """
    # Normalize counts by dividing kmer counts in each row by the number of bases
    return df.apply(normalize_row, axis = "columns")

In [None]:
def normalize_row(row, pairs_col=None):
    """
    Divides the count of kmers by the number of pairs to get normalized value for PCA

    Every value positioned after the base-pairs column is divided by the length of
    the sequence stored in that column. The row passed in is left unchanged; a
    normalized copy is returned, so the function is safe to use with DataFrame.apply.

    Args:
        row(pd.Series): Single row representing a HERV with counts of k-mers
        pairs_col(int): Position of the base-pairs column within the row. When
            omitted, the module-level PAIRS value is used.

    Return:
        row(pd.Series): Row that has k-mer counts divided by number of base pairs
    """
    if pairs_col is None:
        # Fall back to the global column number this function originally relied on
        pairs_col = PAIRS

    # Get number of pairs in current row (positional lookup, consistent with the
    # positional updates below, so string column labels are handled correctly)
    pairs_length = len(row.iloc[pairs_col])

    # Update k-mer counts on a copy; the values are assigned as a plain array so
    # they are written by position rather than aligned on labels
    normalized = row.copy()
    first_count = pairs_col + 1
    normalized.iloc[first_count:] = (row.iloc[first_count:] / pairs_length).to_numpy()

    return normalized

In [None]:
def combine(file_list, axis = "index"):
    """
    Reads every file in the list as a table and concatenates the resulting data frames

    Args:
        file_list(list): List of files with same columns
        axis(string): Concatenate on index or columns

    Return:
        Data frame made by concatenating the tables of all files along the given axis
    """
    # Load each file into its own data frame, keeping the order of file_list
    tables = [pd.read_table(path) for path in file_list]

    # Stitch the data frames together along the requested axis
    combined = pd.concat(tables, axis = axis)
    return combined

In [None]:
if __name__ == "__main__":
    # Directory the files are in
    directory = "/dors/capra_lab/users/yand1/te_ml/data/2018_07_11_pca_te_enhancers/batch_output/"

    # Generate list of files with shuffled human genome data to combine
    # NOTE(review): the description at the top of this file mentions files 50-99,
    # but range(50) selects files 0-49 -- confirm which numbering is intended.
    file_list = [
        directory + "shuffle_{}_features_matrix.tsv".format(i)
        for i in range(50)
    ]

    # Get combined data frame
    print("Combining files...")
    combined_df = combine(file_list)

    # Normalize counts
    print("Normalizing counts...")
    combined_df = normalize_counts(combined_df)

    # Save to new file, tab separated, without the header row or the index column
    # NOTE(review): the description at the top of this file names a 2018_07_12
    # output directory, while this path uses 2018_07_11 -- confirm the destination.
    combined_df.to_csv("/dors/capra_lab/users/yand1/te_ml/data/2018_07_11_pca_te_enhancers/shuffled_features_matrix.tsv",
                       header = False, index = False, sep = '\t')