In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import glob
import csv

In [7]:
# Directory that is searched for the experiment's CSV output files
experimentPath = 'SampleEpiOutput'

In [8]:
def readGenotypes(filename, skipColumns=1):
    '''
    Description:
        * Returns the genotypes header from a run file.
    In:
        * filename: Path to the run file
        * skipColumns: Number of leading header columns to drop. Optional
            argument for compatibility in the cases where the patch number
            is still present in the data.
    Out:
        * List of genotypes strings (first CSV row minus the skipped columns)
    Notes:
        * The file is opened in a "with" block so the handle is always
            closed, even if reading the header fails.
        * An empty file raises StopIteration, because there is no first row.
    '''
    with open(filename) as csvFile:
        # Only the first row is needed; the rest of the file is never read.
        header = next(csv.reader(csvFile))
    return header[skipColumns:]


In [42]:
def readExperimentFilenamesEpi(
    experimentPath,
    stateIdentifiers={"human": "HUM", "male": "ADM", "female_e": "AF1_E", "female_i": "AF1_I", "female_s": "AF1_S"}
):
    """
    Description:
        * Auxiliary function that looks inside a directory and collects, for
            every entry of stateIdentifiers, the CSV files whose name starts
            with that entry's file head.
    In:
        * experimentPath: Path to the directory that contains the experiments'
            CSV files.
        * stateIdentifiers: Dictionary mapping a state name (the key of the
            result) to the head of the matching CSV filenames.
    Out:
        * Dictionary with one sorted list of filenames per key of
            stateIdentifiers
            ex.  "human" [list -> strings], "male" [list -> strings], etc
    Notes:
        * A key whose pattern matches nothing maps to an empty list.
    """
    return {
        stateName: sorted(glob.glob(experimentPath + "/" + fileHead + "*.csv"))
        for stateName, fileHead in stateIdentifiers.items()
    }


In [43]:
# Group the CSV paths found under experimentPath by state name, then
# display the list stored under the "female_e" key.
fileIdToFiles = readExperimentFilenamesEpi(experimentPath)
fileIdToFiles["female_e"]

['SampleEpiOutput/AF1_E_Patch000.csv', 'SampleEpiOutput/AF1_E_Patch001.csv']

In [34]:
# load one file
def loadNodeDataEpi(
    filename,
    dataType=float,
    skipHeader=1,
    skipColumns=1
):
    """
    Description:
        * Loads the data given by filename
    In:
        * filename: path to csv file
        * dataType: To save memory/processing time if possible (int/float).
        * skipHeader: Number of lines np.genfromtxt skips at the start of
            the file.
        * skipColumns: Number of leading columns dropped from BOTH the
            header and the data, so that "genotypes" and the columns of
            "population" always line up.
    Out:
        * Dictionary containing:
            "genotypes" [list -> strings]
            "population" [numpyArray, 2-D: rows x kept columns]
    Notes:
        * The header is always taken from the first line of the file.
    """
    # Read the whole header once: it provides the genotype names and the
    # column count used below to restore a 2-D shape.
    header = readGenotypes(filename, 0)
    data = np.genfromtxt(
            filename,
            dtype=dataType,
            skip_header=skipHeader,
            delimiter=",")
    # genfromtxt collapses a file with a single data row (or a single
    # column) to fewer than two dimensions, which would make the column
    # slice below fail; rebuild the (rows, columns) shape from the header.
    if data.ndim < 2 and header:
        data = data.reshape(-1, len(header))
    returnDictionary = {
            "genotypes": header[skipColumns:],
            "population": data[:, skipColumns:]
        }
    return returnDictionary

In [45]:
# Load a single human patch file and preview its header and first five rows.
humanNode = loadNodeDataEpi('SampleEpiOutput/HUM_Patch000.csv')
print(humanNode["genotypes"])
print(humanNode["population"][:5])

['S', 'I']
[[ 850.  150.]
 [ 855.  143.]
 [ 870.  149.]
 [ 883.  145.]
 [ 891.  141.]]


In [37]:
# function that loads all files in filenames
def loadLandscapeDataEpi(filenames, male=True, female=True, dataType=float):
    """
    Description:
        * Loads every node file listed in filenames and gathers the
            per-node population arrays into one dictionary.
    In:
        * filenames: List of paths to desired csv files
        * male, female: Accepted but not used by this function.
        * dataType: Passed on to "loadNodeDataEpi" for each file.
    Out:
        * None when filenames is empty, otherwise a dictionary containing:
            "genotypes" [list -> strings] (header of the first file)
            "landscape" [list -> numpyArrays] (one entry per file, in order)
    """
    if len(filenames) < 1:
        return
    # The header of the first file is used for the whole landscape.
    header = readGenotypes(filenames[0])
    populations = [
        loadNodeDataEpi(path, dataType)["population"] for path in filenames
    ]
    return {"genotypes": header, "landscape": populations}

In [55]:
# Load all human patch files; preview the header and the first five rows
# of the first patch.
humanDict = loadLandscapeDataEpi(fileIdToFiles["human"])
print(humanDict["genotypes"])
print(humanDict["landscape"][0][:5])

# Same preview for the male files.
maleDict = loadLandscapeDataEpi(fileIdToFiles["male"])
print(maleDict["genotypes"])
print(maleDict["landscape"][0][:5])

# Same preview for the files stored under the "female_e" key.
femaleEDict = loadLandscapeDataEpi(fileIdToFiles["female_e"])
print(femaleEDict["genotypes"])
print(femaleEDict["landscape"][0][:5])

['S', 'I']
[[ 850.  150.]
 [ 855.  143.]
 [ 870.  149.]
 [ 883.  145.]
 [ 891.  141.]]
['HH', 'HW', 'HR', 'WW', 'WR', 'RR']
[[   0.    0.    2.  745.    5.    0.]
 [   0.    0.    2.  737.    5.    0.]
 [   0.    0.    0.  711.    0.    0.]
 [   0.    0.    0.  726.    0.    0.]
 [   0.    0.    0.  694.    0.    0.]]
['HH', 'HW', 'HR', 'WW', 'WR', 'RR']
[[  0.   0.   0.  32.   0.   0.]
 [  0.   0.   0.  20.   0.   0.]
 [  0.   0.   0.  16.   0.   0.]
 [  0.   0.   0.  12.   0.   0.]
 [  0.   0.   0.  16.   0.   0.]]


In [53]:
def sumLandscapePopulationsEpi(
    landscapeData
):
    """
    Description:
        * Adds together, element by element, all the arrays stored in
            landscapeData["landscape"].
    In:
        * landscapeData: Data loaded with the "loadLandscapeDataEpi" function.
    Out:
        * None when the landscape list is empty, otherwise a dictionary
            containing:
            "genotypes" [list -> strings]
            "population" [numpyArray]
    Notes:
        * The accumulator takes the shape and dtype of the first array; the
            input arrays themselves are not modified.
    """
    nodeArrays = landscapeData["landscape"]
    if len(nodeArrays) < 1:
        return

    total = np.zeros_like(nodeArrays[0])
    for nodeArray in nodeArrays:
        total += nodeArray

    return {
        "genotypes": landscapeData["genotypes"],
        "population": total
    }


In [59]:
# Sum the human populations over all loaded patches and preview the result.
sumHumanDict = sumLandscapePopulationsEpi(humanDict)
print(sumHumanDict["genotypes"])
print(sumHumanDict["population"][:5])

# Same for the male data.
sumMaleDict = sumLandscapePopulationsEpi(maleDict)
print(sumMaleDict["genotypes"])
print(sumMaleDict["population"][:5])

# Same for the "female_e" data; the last line displays the element type
# of the summed array.
sumFemaleEDict = sumLandscapePopulationsEpi(femaleEDict)
print(sumFemaleEDict["genotypes"])
print(sumFemaleEDict["population"][:5])
type(sumFemaleEDict["population"][0, 0])

['S', 'I']
[[ 2550.   450.]
 [ 2485.   449.]
 [ 2488.   469.]
 [ 2469.   461.]
 [ 2455.   463.]]
['HH', 'HW', 'HR', 'WW', 'WR', 'RR']
[[    0.     0.     2.  1490.     5.     0.]
 [    0.     0.     2.  1474.     5.     0.]
 [    0.     0.     0.  1422.     0.     0.]
 [    0.     0.     0.  1452.     0.     0.]
 [    0.     0.     0.  1388.     0.     0.]]
['HH', 'HW', 'HR', 'WW', 'WR', 'RR']
[[  0.   0.   0.  64.   0.   0.]
 [  0.   0.   0.  40.   0.   0.]
 [  0.   0.   0.  32.   0.   0.]
 [  0.   0.   0.  24.   0.   0.]
 [  0.   0.   0.  32.   0.   0.]]


numpy.float64

In [64]:
def sumAlleleCounts(sumLandscapeData, alleleNames, columns):
    """
    Description:
        * This function sums the total count of each allele given by sumLandscapeData["population"]
    In:
        * sumLandscapeData: Dict loaded with the "sumLandscapePopulationsEpi" function.
        * alleleNames: list of individual alleles, eg ["W", "R"]
        * columns: List of lists (one per allele, same order as alleleNames)
            with the 1-indexed population columns to add for that allele.
            A column is listed once for every time it should be counted
            ex. [[1, 1, 2], [2, 3, 3]] if the given genotypes are WW, WR, and RR
    Out:
        * Dictionary containing:
            "alleles" [list -> strings]
            "totalCounts" [numpyArray, rows x number of alleles]
    Notes:
        * Raises IndexError when a column index is outside 1..number of
            columns. Without this check an index of 0 (or a negative one)
            would silently select a column counted from the end.
    """
    landscapeData = sumLandscapeData["population"]
    columnCount = landscapeData.shape[1]
    fillArray = np.zeros((len(landscapeData), len(alleleNames)))

    for i in range(len(alleleNames)):
        for index in columns[i]:
            if not 1 <= index <= columnCount:
                raise IndexError(
                    "column index {} is out of range; expected 1..{}".format(
                        index, columnCount
                    )
                )
            # subtract 1 because index is 1 indexed
            fillArray[:, i] += landscapeData[:, index - 1]

    returnDictionary = {
        "alleles": alleleNames,
        "totalCounts": fillArray
    }
    return returnDictionary


In [65]:
# Count H, W and R alleles in the summed male data. Each inner list holds
# 1-indexed genotype columns (order printed above: HH, HW, HR, WW, WR, RR);
# a column appears twice when the genotype carries two copies of the allele.
sumMaleAlleleDict = sumAlleleCounts(sumMaleDict, ['H', 'W', 'R'], [[1, 1, 2, 3], [2, 4, 4, 5], [3, 5, 6, 6]])
print(sumMaleAlleleDict["alleles"])
print(sumMaleAlleleDict["totalCounts"][:5])

['H', 'W', 'R']
[[  2.00000000e+00   2.98500000e+03   7.00000000e+00]
 [  2.00000000e+00   2.95300000e+03   7.00000000e+00]
 [  0.00000000e+00   2.84400000e+03   0.00000000e+00]
 [  0.00000000e+00   2.90400000e+03   0.00000000e+00]
 [  0.00000000e+00   2.77600000e+03   0.00000000e+00]]
