diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..185371b --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +# Black by its default setting the maximum length to 88 +max-line-length = 88 +extend-ignore = + # also the Black's default, whitespace before ':' + E203, + # this is for the line length, all the lines too long are comments now + E501, + # the profile decorator is required + F821, + # some broken code under development, will come back + F841, + diff --git a/.gitignore b/.gitignore index 8edd12c..042875a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ __pycache__/ .ipynb_checkpoints/ .Rapp.history +*build/ +.DS_Store \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2cc2c3b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: +- repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black +- repo: https://github.com/PyCQA/flake8 + rev: 6.0.0 + hooks: + - id: flake8 diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..ea72fc2 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,19 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.11" + jobs: + post_create_environment: + - pip install sphinx-rtd-theme + +sphinx: + configuration: docs/source/conf.py + +python: + install: + - requirements: docs/source/requirements.txt + +formats: + - epub \ No newline at end of file diff --git a/.workflow/format.yml b/.workflow/format.yml new file mode 100644 index 0000000..093de3d --- /dev/null +++ b/.workflow/format.yml @@ -0,0 +1,22 @@ +name: Evaluate code syntax +on: [push, pull_request] + +jobs: + pre-commit_and_pytest: + name: Run pre-commit + strategy: + matrix: + os: [ubuntu-latest, Windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + permissions: + contents: read + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.11.x" + + - uses: pre-commit/action@v3.0.0 + name: Run pre-commit diff --git a/BurrowsWheelerLibrary.py b/BurrowsWheelerLibrary.py index 998b010..c5fdcb9 100644 --- a/BurrowsWheelerLibrary.py +++ b/BurrowsWheelerLibrary.py @@ -1,46 +1,57 @@ -import random import numpy as np -import numba -from numba import njit, jit, int8, int32,int64, boolean, deferred_type, optional, float32 +from numba import ( + njit, + jit, + int8, + int64, +) from numba.experimental import jitclass from collections import OrderedDict try: profile -except: - def profile(x): +except NameError: + + def profile(x): return x -##################################### -### ### -### Burrows Wheeler ### -### ### -##################################### -class BurrowsWheelerLibrary(): +# Burrows Wheeler + + +class BurrowsWheelerLibrary: def __init__(self, haplotypeList): self.library = createBWLibrary(np.array(haplotypeList)) self.hapList = haplotypeList self.nHaps = len(haplotypeList) + def getHaplotypeMatches(self, haplotype, start, stop): - nHaps, hapIndexes = getConsistentHaplotypes(self.library, haplotype, start, stop) - haps = [(self.hapList[hapIndexes[index, 0]], hapIndexes[index, 1]) for index in range(nHaps)] + nHaps, hapIndexes = getConsistentHaplotypes( + self.library, haplotype, start, stop + ) + haps = [ + (self.hapList[hapIndexes[index, 0]], hapIndexes[index, 1]) + for index in range(nHaps) + ] return haps + @profile def getBestHaplotype(self, weights, start, stop): index = getHaplotypesPlusWeights(self.library, weights, start, stop) return self.hapList[index][start:stop] + jit_BurrowsWheelerLibrary_spec = OrderedDict() -jit_BurrowsWheelerLibrary_spec['a'] = int64[:,:] -jit_BurrowsWheelerLibrary_spec['d'] = int64[:,:] -jit_BurrowsWheelerLibrary_spec['zeroOccPrev'] = int64[:,:] -jit_BurrowsWheelerLibrary_spec['nZeros'] = int64[:] -jit_BurrowsWheelerLibrary_spec['haps'] = int8[:,:] +jit_BurrowsWheelerLibrary_spec["a"] = int64[:, :] +jit_BurrowsWheelerLibrary_spec["d"] = int64[:, :] +jit_BurrowsWheelerLibrary_spec["zeroOccPrev"] = int64[:, :] +jit_BurrowsWheelerLibrary_spec["nZeros"] = int64[:] +jit_BurrowsWheelerLibrary_spec["haps"] = int8[:, :] + @jitclass(jit_BurrowsWheelerLibrary_spec) -class jit_BurrowsWheelerLibrary(): +class jit_BurrowsWheelerLibrary: def __init__(self, a, d, nZeros, zeroOccPrev, haps): self.a = a self.d = d @@ -51,46 +62,44 @@ def __init__(self, a, d, nZeros, zeroOccPrev, haps): def getValues(self): return (self.a, self.d, self.nZeros, self.zeroOccPrev, self.haps) - def update_state(self, state, index): pass def get_null_state(self, value, index): if value == 0: lowerR = 0 - upperR = nZeros[stop-1] + upperR = nZeros[stop - 1] if value == 1: - lowerR = nZeros[stop-1] + lowerR = nZeros[stop - 1] upperR = nHaps pass + @njit def createBWLibrary(haps): - - #Definitions. + # Definitions. # haps : a list of haplotypes # a : an ordering of haps in lexographic order. # d : Number of loci of a[i,j+k] == a[i,-1, j+k] - nHaps = haps.shape[0] nLoci = haps.shape[1] - a = np.full(haps.shape, 0, dtype = np.int64) - d = np.full(haps.shape, 0, dtype = np.int64) + a = np.full(haps.shape, 0, dtype=np.int64) + d = np.full(haps.shape, 0, dtype=np.int64) - nZerosArray = np.full(nLoci, 0, dtype = np.int64) + nZerosArray = np.full(nLoci, 0, dtype=np.int64) + + zeros = np.full(nHaps, 0, dtype=np.int64) + ones = np.full(nHaps, 0, dtype=np.int64) + dZeros = np.full(nHaps, 0, dtype=np.int64) + dOnes = np.full(nHaps, 0, dtype=np.int64) - zeros = np.full(nHaps, 0, dtype = np.int64) - ones = np.full(nHaps, 0, dtype = np.int64) - dZeros = np.full(nHaps, 0, dtype = np.int64) - dOnes = np.full(nHaps, 0, dtype = np.int64) - nZeros = 0 nOnes = 0 for j in range(nHaps): - if haps[j, nLoci-1] == 0: + if haps[j, nLoci - 1] == 0: zeros[nZeros] = j if nZeros == 0: dZeros[nZeros] = 0 @@ -98,50 +107,48 @@ def createBWLibrary(haps): dZeros[nZeros] = 1 nZeros += 1 else: - ones[nOnes] = j + ones[nOnes] = j if nOnes == 0: dOnes[nOnes] = 0 else: dOnes[nOnes] = 1 nOnes += 1 if nZeros > 0: - a[0:nZeros, nLoci-1] = zeros[0:nZeros] - d[0:nZeros, nLoci-1] = dZeros[0:nZeros] + a[0:nZeros, nLoci - 1] = zeros[0:nZeros] + d[0:nZeros, nLoci - 1] = dZeros[0:nZeros] if nOnes > 0: - a[nZeros:nHaps, nLoci-1] = ones[0:nOnes] - d[nZeros:nHaps, nLoci-1] = dOnes[0:nOnes] + a[nZeros:nHaps, nLoci - 1] = ones[0:nOnes] + d[nZeros:nHaps, nLoci - 1] = dOnes[0:nOnes] + + nZerosArray[nLoci - 1] = nZeros - nZerosArray[nLoci-1] = nZeros + for i in range(nLoci - 2, -1, -1): + zeros = np.full(nHaps, 0, dtype=np.int64) + ones = np.full(nHaps, 0, dtype=np.int64) + dZeros = np.full(nHaps, 0, dtype=np.int64) + dOnes = np.full(nHaps, 0, dtype=np.int64) - for i in range(nLoci-2, -1, -1) : - zeros = np.full(nHaps, 0, dtype = np.int64) - ones = np.full(nHaps, 0, dtype = np.int64) - dZeros = np.full(nHaps, 0, dtype = np.int64) - dOnes = np.full(nHaps, 0, dtype = np.int64) - nZeros = 0 nOnes = 0 - dZerosTmp = -1 #This is a hack. + dZerosTmp = -1 # This is a hack. dOnesTmp = -1 - for j in range(nHaps) : - - dZerosTmp = min(dZerosTmp, d[j,i+1]) - dOnesTmp = min(dOnesTmp, d[j,i+1]) - if haps[a[j, i+1], i] == 0: - zeros[nZeros] = a[j, i+1] + for j in range(nHaps): + dZerosTmp = min(dZerosTmp, d[j, i + 1]) + dOnesTmp = min(dOnesTmp, d[j, i + 1]) + if haps[a[j, i + 1], i] == 0: + zeros[nZeros] = a[j, i + 1] dZeros[nZeros] = dZerosTmp + 1 nZeros += 1 dZerosTmp = nLoci else: - ones[nOnes] = a[j, i+1] + ones[nOnes] = a[j, i + 1] dOnes[nOnes] = dOnesTmp + 1 nOnes += 1 dOnesTmp = nLoci - if nZeros > 0: a[0:nZeros, i] = zeros[0:nZeros] d[0:nZeros, i] = dZeros[0:nZeros] @@ -151,68 +158,65 @@ def createBWLibrary(haps): d[nZeros:nHaps, i] = dOnes[0:nOnes] nZerosArray[i] = nZeros - - #I'm going to be a wee bit sloppy in creating zeroOccPrev - #Not defined at 0 so start at 1. - zeroOccPrev = np.full(haps.shape, 0, dtype = np.int64) + # I'm going to be a wee bit sloppy in creating zeroOccPrev + # Not defined at 0 so start at 1. + zeroOccPrev = np.full(haps.shape, 0, dtype=np.int64) for i in range(1, nLoci): count = 0 for j in range(0, nHaps): - if haps[a[j, i], i-1] == 0: + if haps[a[j, i], i - 1] == 0: count += 1 zeroOccPrev[j, i] = count - library = jit_BurrowsWheelerLibrary(a, d, nZerosArray, zeroOccPrev, haps) return library + @jit(nopython=True, nogil=True) def getConsistentHaplotypes(bwLibrary, hap, start, stop): a, d, nZeros, zeroOccPrev, haps = bwLibrary.getValues() nHaps = a.shape[0] nLoci = a.shape[1] + intervals = np.full((nHaps, 2), 0, dtype=np.int64) + intervals_new = np.full((nHaps, 2), 0, dtype=np.int64) - intervals = np.full((nHaps, 2), 0, dtype = np.int64) - intervals_new = np.full((nHaps, 2), 0, dtype = np.int64) - nIntervals = 0 nIntervals_new = 0 - - #Haps go from 0 to nHaps-1. Loci go from start to stop-1 (inclusive). - #The first hap with one is nZeros. The last hap with zero is nZeros -1. - #Last loci is stop -1 - #These are split out because they represent *distinct* haplotypes. - #Maybe could do this with tuple and list append but *shrug*. - - if hap[stop-1] == 0 or hap[stop-1] == 9: + + # Haps go from 0 to nHaps-1. Loci go from start to stop-1 (inclusive). + # The first hap with one is nZeros. The last hap with zero is nZeros -1. + # Last loci is stop -1 + # These are split out because they represent *distinct* haplotypes. + # Maybe could do this with tuple and list append but *shrug*. + + if hap[stop - 1] == 0 or hap[stop - 1] == 9: lowerR = 0 - upperR = nZeros[stop-1] + upperR = nZeros[stop - 1] if upperR >= lowerR: intervals[nIntervals, 0] = lowerR intervals[nIntervals, 1] = upperR nIntervals += 1 - - if hap[stop-1] == 1 or hap[stop-1] == 9: - lowerR = nZeros[stop-1] + + if hap[stop - 1] == 1 or hap[stop - 1] == 9: + lowerR = nZeros[stop - 1] upperR = nHaps if upperR >= lowerR: intervals[nIntervals, 0] = lowerR intervals[nIntervals, 1] = upperR nIntervals += 1 - #Python indexing is annoying. - #Exclude stop and stop-1, include start. - #Intervals are over haplotypes. - for i in range(stop-2, start-1, -1): + # Python indexing is annoying. + # Exclude stop and stop-1, include start. + # Intervals are over haplotypes. + for i in range(stop - 2, start - 1, -1): # print(intervals[0:nIntervals,:]) nIntervals_new = 0 - #Doing it on interval seems to make marginally more sense. + # Doing it on interval seems to make marginally more sense. for interval in range(nIntervals): - int_start = intervals[interval, 0] int_end = intervals[interval, 1] @@ -220,30 +224,33 @@ def getConsistentHaplotypes(bwLibrary, hap, start, stop): if int_start == 0: lowerR = 0 else: - lowerR = zeroOccPrev[int_start-1, i+1] - upperR = zeroOccPrev[int_end-1, i+1] #Number of zeros in the region. - if upperR > lowerR: #Needs to be greater than. Regions no longer inclusive. + lowerR = zeroOccPrev[int_start - 1, i + 1] + upperR = zeroOccPrev[ + int_end - 1, i + 1 + ] # Number of zeros in the region. + if ( + upperR > lowerR + ): # Needs to be greater than. Regions no longer inclusive. # print("Added 0:", int_start, int_end, "->>", lowerR, upperR) intervals_new[nIntervals_new, 0] = lowerR intervals_new[nIntervals_new, 1] = upperR nIntervals_new += 1 if hap[i] == 1 or hap[i] == 9: - # of ones between 0 and k (k inclusive) is k+1 - number of zeros. if int_start == 0: lowerR = nZeros[i] else: - lowerR = nZeros[i] + (int_start - zeroOccPrev[int_start-1, i+1]) - upperR = nZeros[i] + (int_end - zeroOccPrev[int_end-1, i+1]) + lowerR = nZeros[i] + (int_start - zeroOccPrev[int_start - 1, i + 1]) + upperR = nZeros[i] + (int_end - zeroOccPrev[int_end - 1, i + 1]) if upperR > lowerR: # print("Added 1:", int_start, int_end, "->>", lowerR, upperR) intervals_new[nIntervals_new, 0] = lowerR intervals_new[nIntervals_new, 1] = upperR nIntervals_new += 1 # else: - # print(i, "interval rejected:", int_start, int_end, "->", upperR, lowerR) - #This is basically intervals = intervals_new + # print(i, "interval rejected:", int_start, int_end, "->", upperR, lowerR) + # This is basically intervals = intervals_new for j in range(nIntervals_new): intervals[j, 0] = intervals_new[j, 0] intervals[j, 1] = intervals_new[j, 1] @@ -251,26 +258,25 @@ def getConsistentHaplotypes(bwLibrary, hap, start, stop): # print("Finished", i, "->", nIntervals) # print(intervals[0:nIntervals,:]) - hapIndexes = np.full((nHaps, 2), 0, dtype = np.int64) + hapIndexes = np.full((nHaps, 2), 0, dtype=np.int64) nHapsAssigned = 0 for i in range(nIntervals): - int_start = intervals[i, 0] int_end = intervals[i, 1] - - hapIndexes[nHapsAssigned, 0] = a[int_start,start] + hapIndexes[nHapsAssigned, 0] = a[int_start, start] hapIndexes[nHapsAssigned, 1] = int_end - int_start - nHapsAssigned +=1 + nHapsAssigned += 1 return (nHapsAssigned, hapIndexes) + # def printSortAt(loci, library): # a, d, nZeros, zeroOccPrev, haps = library.getValues() # vals = haps[a[:,loci],:] # for i in range(vals.shape[0]): # print(i, ' '.join(map(str, vals[i,:])) ) - + # # print(zeroOccPrev[:,:]) # hapLib = [np.array([1, 0, 0, 0, 0, 0, 1], dtype = np.int8), @@ -307,44 +313,46 @@ def getConsistentHaplotypes(bwLibrary, hap, start, stop): # for key, value in tmp: # print(key, value) + @njit def getConsistentHaplotypesBruteForce(bwLibrary, hap, start, stop): hap = hap[start:stop] a, d, nZeros, zeroOccPrev, haps = bwLibrary.getValues() recodedHaps = haps[:, start:stop] - + nHaps = recodedHaps.shape[0] nLoci = recodedHaps.shape[1] - consistent = np.full(nHaps, 0, dtype = np.int64) + consistent = np.full(nHaps, 0, dtype=np.int64) - #Last correct index + # Last correct index lastIndex = -1 for j in range(nHaps): - - #Otherwise, we have not enough information and need to search. + # Otherwise, we have not enough information and need to search. add = True for i in range(nLoci): - if hap[i] != 9 : + if hap[i] != 9: if recodedHaps[j, i] != hap[i]: add = False if add: consistent[j] = 1 - - hapIndexes = np.full((nHaps, 2), 0, dtype = np.int64) + + hapIndexes = np.full((nHaps, 2), 0, dtype=np.int64) nHapsAssigned = 0 for i in range(nHaps): if consistent[i] > 0: # hapIndexes[nHapsAssigned, 0] = a[i,start] hapIndexes[nHapsAssigned, 0] = i hapIndexes[nHapsAssigned, 1] = consistent[i] - nHapsAssigned +=1 + nHapsAssigned += 1 return (nHapsAssigned, hapIndexes) + + @njit def getHaplotypesPlusWeights(bwLibrary, weights, start, stop): - #Weights are weights to the original haplotypes (haps) + # Weights are weights to the original haplotypes (haps) a, d, nZeros, zeroOccPrev, haps = bwLibrary.getValues() recodedWeights = weights[a[:, start]] @@ -357,35 +365,35 @@ def getHaplotypesPlusWeights(bwLibrary, weights, start, stop): currentWeight = 0 for j in range(nHaps): - #Will need to double check this. This code will be slow! + # Will need to double check this. This code will be slow! if d[j, start] < nLoci: - #Process the last haplotype before moving on. - if currentWeight > bestWeight : + # Process the last haplotype before moving on. + if currentWeight > bestWeight: bestLoci = currentLoci bestWeight = currentWeight - + currentLoci = j currentWeight = 0 currentWeight += recodedWeights[j] - #Make sure we check the last haplotype. - if currentWeight > bestWeight : + # Make sure we check the last haplotype. + if currentWeight > bestWeight: bestLoci = currentLoci bestWeight = currentWeight - - #REMEMBER TO RECODE + # REMEMBER TO RECODE return a[bestLoci, start] -#Older version that doesn't use all of the meta data we have. + +# Older version that doesn't use all of the meta data we have. # @jit(nopython=True, nogil=True) # def getConsistentHaplotypes(bwLibrary, hap, start, stop): # a, d, nZeros, zeroOccPrev, haps = bwLibrary.getValues() # hap = hap[start:stop] # recodedHaps = haps[a[:, start], start:stop] - + # nHaps = recodedHaps.shape[0] # nLoci = recodedHaps.shape[1] # consistent = np.full(nHaps, 0, dtype = np.int64) @@ -402,10 +410,10 @@ def getHaplotypesPlusWeights(bwLibrary, weights, start, stop): # #By definition, all of 0 -> d[j, start]-1 inclusive is the same. # #All of 0 -> lastCorrect (inclusive) is correct. # #First error is the position of the first error. If firstError < nLoci, this is a problem. (nLoci is out of our bounds) - + # lastCorrect = min(lastCorrect, d[j, start]-1) # if firstError > d[j,start]-1: firstError = nLoci - + # #Two elif statements. # #First can we spot an error? # #Second if no error's exist, are we sure that the entire haplotype is right. @@ -427,7 +435,7 @@ def getHaplotypesPlusWeights(bwLibrary, weights, start, stop): # stopIters = False # i = lastCorrect # while not stopIters: -# i = i+1 #We know that the last correct loci is correct. So we're safe to start at last correct +1 +# i = i+1 #We know that the last correct loci is correct. So we're safe to start at last correct +1 # if hap[i] != 9 and recodedHaps[j, i] != hap[i]: # lastCorrect = i-1 # firstError = i @@ -435,16 +443,16 @@ def getHaplotypesPlusWeights(bwLibrary, weights, start, stop): # elif i == nLoci-1: # stopIters = True # lastCorrect = nLoci-1 -# firstError = nLoci +# firstError = nLoci -# if firstError < nLoci: +# if firstError < nLoci: # consistent[j] = 0 # lastIndex = -1 # elif lastCorrect >= nLoci-1: #This will probably be nLoci-1 since that is where our search stops. # consistent[j] = 1 # lastIndex = j - + # hapIndexes = np.full((nHaps, 2), 0, dtype = np.int64) # nHapsAssigned = 0 # for i in range(nHaps): diff --git a/CombinedHMM.py b/CombinedHMM.py index 60ae010..a3d166e 100644 --- a/CombinedHMM.py +++ b/CombinedHMM.py @@ -3,19 +3,19 @@ from . import NumbaUtils from . import ProbMath -class HaploidMarkovModel : - def __init__(self, n_loci, error, recombination_rate = None): +class HaploidMarkovModel: + def __init__(self, n_loci, error, recombination_rate=None): self.update_paramaters(n_loci, error, recombination_rate) self.directional_smoothing = self.create_directional_smoothing() self.apply_smoothing = self.create_apply_smoothing() self.apply_viterbi = self.create_viterbi_algorithm() - self.apply_sampling = self.create_sampling_algorithm(NumbaUtils.multinomial_sample) - - - def update_paramaters(self, n_loci, error, recombination_rate = None): + self.apply_sampling = self.create_sampling_algorithm( + NumbaUtils.multinomial_sample + ) + def update_paramaters(self, n_loci, error, recombination_rate=None): self.n_loci = n_loci if type(error) is float: @@ -24,16 +24,17 @@ def update_paramaters(self, n_loci, error, recombination_rate = None): self.error = error if recombination_rate is None: - recombination_rate = 1.0/n_loci + recombination_rate = 1.0 / n_loci if type(recombination_rate) is float: - self.recombination_rate = np.full(n_loci, recombination_rate, dtype=np.float32) + self.recombination_rate = np.full( + n_loci, recombination_rate, dtype=np.float32 + ) else: self.recombination_rate = recombination_rate - def get_mask(self, called_haplotypes): - return np.all(called_haplotypes != 9, axis = 0) + return np.all(called_haplotypes != 9, axis=0) # def get_run_option(default_arg, alternative_arg): # # Return the default arg as true if it is supplied, otherwise return the alternative arg. @@ -50,53 +51,61 @@ def get_mask(self, called_haplotypes): # else: # return not alternative_arg - - def run_HMM(self, point_estimates = None, algorithm = "marginalize", **kwargs): - + def run_HMM(self, point_estimates=None, algorithm="marginalize", **kwargs): # return_called_values = get_run_option(return_called_values, return_genotype_probabilities) if point_estimates is None: point_estimates = self.get_point_estimates(**kwargs) - + if algorithm == "marginalize": total_probs = self.apply_smoothing(point_estimates, self.recombination_rate) - genotype_probabilities = self.calculate_genotype_probabilities(total_probs, **kwargs) + genotype_probabilities = self.calculate_genotype_probabilities( + total_probs, **kwargs + ) elif algorithm == "viterbi": total_probs = self.apply_viterbi(point_estimates, self.recombination_rate) - genotype_probabilities = self.calculate_genotype_probabilities(total_probs, **kwargs) - + genotype_probabilities = self.calculate_genotype_probabilities( + total_probs, **kwargs + ) + elif algorithm == "sample": total_probs = self.apply_sampling(point_estimates, self.recombination_rate) - genotype_probabilities = self.calculate_genotype_probabilities(total_probs, **kwargs) + genotype_probabilities = self.calculate_genotype_probabilities( + total_probs, **kwargs + ) else: print(f"Valid alrogithm option not given: {alrogithm}") return genotype_probabilities - def call_genotype_probabilities(self, genotype_probabilities, threshold = 0.1): - + def call_genotype_probabilities(self, genotype_probabilities, threshold=0.1): return ProbMath.call_genotype_probs(genotype_probabilities, threshold) - - def get_point_estimates(self, individual, haplotype_library, library_calling_threshold = 0.95, **kwargs) : - called_haplotypes = haplotype_library.get_called_haplotypes(threshold = library_calling_threshold) - mask = self.get_mask(called_haplotypes) - point_estimates = self.njit_get_point_estimates(individual.genotypes, called_haplotypes, self.error, mask) + def get_point_estimates( + self, individual, haplotype_library, library_calling_threshold=0.95, **kwargs + ): + called_haplotypes = haplotype_library.get_called_haplotypes( + threshold=library_calling_threshold + ) + mask = self.get_mask(called_haplotypes) + point_estimates = self.njit_get_point_estimates( + individual.genotypes, called_haplotypes, self.error, mask + ) return point_estimates @staticmethod @jit(nopython=True, nogil=True) def njit_get_point_estimates(genotypes, haplotypes, error, mask): nHap, nLoci = haplotypes.shape - point_estimates = np.full((nLoci, nHap), 1, dtype = np.float32) + point_estimates = np.full((nLoci, nHap), 1, dtype=np.float32) for i in range(nLoci): if genotypes[i] != 9 and mask[i]: for j in range(nHap): sourceGeno = haplotypes[j, i] - if 2*sourceGeno == genotypes[i]: - point_estimates[i, j] = 1-error[i] + if 2 * sourceGeno == genotypes[i]: + point_estimates[i, j] = 1 - error[i] else: point_estimates[i, j] = error[i] @@ -104,20 +113,25 @@ def njit_get_point_estimates(genotypes, haplotypes, error, mask): @staticmethod @jit(nopython=True, nogil=True) - def transmission(cummulative_probabilities, previous_point_probability, recombination_rate, output): + def transmission( + cummulative_probabilities, + previous_point_probability, + recombination_rate, + output, + ): output[:] = cummulative_probabilities * previous_point_probability normalize(output) - output[:] *= (1-recombination_rate) + output[:] *= 1 - recombination_rate output[:] += recombination_rate - - def create_directional_smoothing(self) : + def create_directional_smoothing(self): transmission = self.transmission @jit(nopython=True, nogil=True) - def directional_smoothing(point_estimate, recombination_rate, forward = False, backward = False): - - output = np.full(point_estimate.shape, 1, dtype = np.float32) + def directional_smoothing( + point_estimate, recombination_rate, forward=False, backward=False + ): + output = np.full(point_estimate.shape, 1, dtype=np.float32) n_loci = point_estimate.shape[0] if forward: @@ -131,7 +145,12 @@ def directional_smoothing(point_estimate, recombination_rate, forward = False, b step = -1 for i in range(start, stop, step): - transmission(output[i-step,:], point_estimate[i - step,:], recombination_rate[i], output[i,:]) + transmission( + output[i - step, :], + point_estimate[i - step, :], + recombination_rate[i], + output[i, :], + ) return output @@ -144,9 +163,15 @@ def create_apply_smoothing(self): def apply_smoothing(point_estimate, recombination_rate): """Calculate normalized state probabilities at each loci using the forward-backward algorithm""" - est = ( point_estimate * - directional_smoothing(point_estimate, recombination_rate, forward = True) * - directional_smoothing(point_estimate, recombination_rate, backward = True) ) + est = ( + point_estimate + * directional_smoothing( + point_estimate, recombination_rate, forward=True + ) + * directional_smoothing( + point_estimate, recombination_rate, backward=True + ) + ) # Return normalized probabilities @@ -166,17 +191,21 @@ def sample_path(point_estimate, recombination_rate): # Right now using a matrix output; will improve later. n_loci = point_estimate.shape[0] - output = np.full(point_estimate.shape, 0, dtype = np.float32) + output = np.full(point_estimate.shape, 0, dtype=np.float32) - forward_and_point_estimate = point_estimate * directional_smoothing(point_estimate, recombination_rate, forward = True) + forward_and_point_estimate = point_estimate * directional_smoothing( + point_estimate, recombination_rate, forward=True + ) # First index. selected_index = selection_function(forward_and_point_estimate[-1].ravel()) - output[- 1].ravel()[selected_index] = 1 # Set the output value at the selected_index to 1. + output[-1].ravel()[ + selected_index + ] = 1 # Set the output value at the selected_index to 1. # Always sample backward (for tradition mostly). - locus_estimate = np.full(point_estimate[0].shape, 0, dtype = np.float32) - matrix_ones = np.full(point_estimate[0].shape, 1, dtype = np.float32) + locus_estimate = np.full(point_estimate[0].shape, 0, dtype=np.float32) + matrix_ones = np.full(point_estimate[0].shape, 1, dtype=np.float32) start = n_loci - 2 stop = -1 @@ -184,21 +213,27 @@ def sample_path(point_estimate, recombination_rate): for i in range(start, stop, step): # Pass along sampled value at the locus. - transmission(output[i-step,:], matrix_ones, recombination_rate[i], locus_estimate) + transmission( + output[i - step, :], + matrix_ones, + recombination_rate[i], + locus_estimate, + ) # Combine forward_estimate with backward_estimate - locus_estimate *= forward_and_point_estimate[i,:] + locus_estimate *= forward_and_point_estimate[i, :] selected_index = selection_function(locus_estimate.ravel()) - output[i].ravel()[selected_index] = 1 # Set the output value at the selected_index to 1. + output[i].ravel()[ + selected_index + ] = 1 # Set the output value at the selected_index to 1. # Return probabilities return output return sample_path - def create_viterbi_algorithm(self): - maximum_likelihood_step = self.maximum_likelihood_step + @jit(nopython=True, nogil=True) def viterbi_path(point_estimate, recombination_rate): """Calculate normalized state probabilities at each loci using the forward-backward algorithm""" @@ -206,25 +241,35 @@ def viterbi_path(point_estimate, recombination_rate): # Right now using a matrix output; will improve later. n_loci = point_estimate.shape[0] - path_score = np.full(point_estimate.shape, 0, dtype = np.float32) - previous_index = np.full(point_estimate.shape, 0, dtype = np.int64) + path_score = np.full(point_estimate.shape, 0, dtype=np.float32) + previous_index = np.full(point_estimate.shape, 0, dtype=np.int64) - output = np.full(point_estimate.shape, 0, dtype = np.float32) + output = np.full(point_estimate.shape, 0, dtype=np.float32) path_score[0] = point_estimate[0] - start = 1; stop = n_loci; step = 1 + start = 1 + stop = n_loci + step = 1 for i in range(start, stop, step): # Pass along sampled value at the locus. - maximum_likelihood_step(path_score[i-step], recombination_rate[i], point_estimate[i], path_score[i], previous_index[i]) + maximum_likelihood_step( + path_score[i - step], + recombination_rate[i], + point_estimate[i], + path_score[i], + previous_index[i], + ) # Traceback start_index = np.argmax(path_score[-1]) - output[n_loci-1].ravel()[start_index] = 1 + output[n_loci - 1].ravel()[start_index] = 1 index = start_index - start = n_loci-2; stop = -1; step = -1 + start = n_loci - 2 + stop = -1 + step = -1 for i in range(start, stop, step): - index = previous_index[i-step].ravel()[index] + index = previous_index[i - step].ravel()[index] output[i].ravel()[index] = 1 return output @@ -232,84 +277,120 @@ def viterbi_path(point_estimate, recombination_rate): @staticmethod @jit(nopython=True, nogil=True) - def maximum_likelihood_step(previous_path_score, recombination_rate, point_estimate, output_path_score, output_index): - + def maximum_likelihood_step( + previous_path_score, + recombination_rate, + point_estimate, + output_path_score, + output_index, + ): best_index = np.argmax(previous_path_score) best_score = previous_path_score[best_index] n_hap = previous_path_score.shape[0] for i in range(n_hap): - - no_rec_score = (1-recombination_rate)*previous_path_score[i] - rec_score = best_score*recombination_rate + no_rec_score = (1 - recombination_rate) * previous_path_score[i] + rec_score = best_score * recombination_rate if no_rec_score > rec_score: # No recombination - output_path_score[i] = no_rec_score*point_estimate[i] + output_path_score[i] = no_rec_score * point_estimate[i] output_index[i] = i else: # Recombination - output_path_score[i] = rec_score/n_hap*point_estimate[i] + output_path_score[i] = rec_score / n_hap * point_estimate[i] output_index[i] = best_index - + output_path_score /= np.sum(output_path_score) - def calculate_genotype_probabilities(self, total_probs, haplotype_library, **kwargs): + def calculate_genotype_probabilities( + self, total_probs, haplotype_library, **kwargs + ): haplotype_dosages = haplotype_library.get_haplotypes() - return self.njit_calculate_genotype_probabilities(total_probs, haplotype_dosages) + return self.njit_calculate_genotype_probabilities( + total_probs, haplotype_dosages + ) - - @staticmethod + @staticmethod @jit(nopython=True, nogil=True) - def njit_calculate_genotype_probabilities(total_probs, reference_haplotypes) : + def njit_calculate_genotype_probabilities(total_probs, reference_haplotypes): n_hap, n_loci = reference_haplotypes.shape - geno_probs = np.full((2, n_loci), 0.0000001, dtype = np.float32) # Adding a very small value as a prior incase all of the values are missing. + geno_probs = np.full( + (2, n_loci), 0.0000001, dtype=np.float32 + ) # Adding a very small value as a prior incase all of the values are missing. for i in range(n_loci): for j in range(n_hap): hap_value = reference_haplotypes[j, i] - prob_value = total_probs[i,j] + prob_value = total_probs[i, j] if hap_value != 9: - # Add in a sum of total_probs values. - geno_probs[0, i] += prob_value * (1-hap_value) + # Add in a sum of total_probs values. + geno_probs[0, i] += prob_value * (1 - hap_value) geno_probs[1, i] += prob_value * hap_value - geno_probs = geno_probs/np.sum(geno_probs, axis = 0) + geno_probs = geno_probs / np.sum(geno_probs, axis=0) return geno_probs -class DiploidMarkovModel(HaploidMarkovModel) : - def __init__(self, n_loci, error, recombination_rate = None): +class DiploidMarkovModel(HaploidMarkovModel): + def __init__(self, n_loci, error, recombination_rate=None): HaploidMarkovModel.__init__(self, n_loci, error, recombination_rate) - - def extract_reference_panels(self, haplotype_library = None, paternal_haplotype_library = None, maternal_haplotype_library = None) : - if maternal_haplotype_library is not None and paternal_haplotype_library is not None: + def extract_reference_panels( + self, + haplotype_library=None, + paternal_haplotype_library=None, + maternal_haplotype_library=None, + ): + if ( + maternal_haplotype_library is not None + and paternal_haplotype_library is not None + ): seperate_reference_panels = True - return paternal_haplotype_library, maternal_haplotype_library, seperate_reference_panels + return ( + paternal_haplotype_library, + maternal_haplotype_library, + seperate_reference_panels, + ) else: seperate_reference_panels = False return haplotype_library, haplotype_library, seperate_reference_panels - - def get_point_estimates(self, individual, library_calling_threshold= 0.95, **kwargs): - paternal_haplotype_library, maternal_haplotype_library, seperate_reference_panels = self.extract_reference_panels(**kwargs) - - paternal_called_haplotypes = paternal_haplotype_library.get_called_haplotypes(threshold = library_calling_threshold) - maternal_called_haplotypes = maternal_haplotype_library.get_called_haplotypes(threshold = library_calling_threshold) - - mask = self.get_mask(paternal_called_haplotypes) & self.get_mask(maternal_called_haplotypes) - return self.njit_get_point_estimates(individual.genotypes, paternal_called_haplotypes, maternal_called_haplotypes, self.error, mask) - + def get_point_estimates(self, individual, library_calling_threshold=0.95, **kwargs): + ( + paternal_haplotype_library, + maternal_haplotype_library, + seperate_reference_panels, + ) = self.extract_reference_panels(**kwargs) + + paternal_called_haplotypes = paternal_haplotype_library.get_called_haplotypes( + threshold=library_calling_threshold + ) + maternal_called_haplotypes = maternal_haplotype_library.get_called_haplotypes( + threshold=library_calling_threshold + ) + + mask = self.get_mask(paternal_called_haplotypes) & self.get_mask( + maternal_called_haplotypes + ) + return self.njit_get_point_estimates( + individual.genotypes, + paternal_called_haplotypes, + maternal_called_haplotypes, + self.error, + mask, + ) @staticmethod @jit(nopython=True, nogil=True) - def njit_get_point_estimates(indGeno, paternalHaplotypes, maternalHaplotypes, error, mask): + def njit_get_point_estimates( + indGeno, paternalHaplotypes, maternalHaplotypes, error, mask + ): nPat, nLoci = paternalHaplotypes.shape nMat, nLoci = maternalHaplotypes.shape - point_estimates = np.full((nLoci, nPat, nMat), 1, dtype = np.float32) + point_estimates = np.full((nLoci, nPat, nMat), 1, dtype=np.float32) for i in range(nLoci): if indGeno[i] != 9 and mask[i]: @@ -317,105 +398,161 @@ def njit_get_point_estimates(indGeno, paternalHaplotypes, maternalHaplotypes, er for k in range(nMat): sourceGeno = paternalHaplotypes[j, i] + maternalHaplotypes[k, i] if sourceGeno == indGeno[i]: - point_estimates[i, j, k] = 1-error[i] + point_estimates[i, j, k] = 1 - error[i] else: point_estimates[i, j, k] = error[i] return point_estimates - - def calculate_genotype_probabilities(self, total_probs, haplotype_library = None, paternal_haplotype_library= None, maternal_haplotype_library= None, **kwargs): - paternal_haplotype_library, maternal_haplotype_library, seperate_reference_panels = self.extract_reference_panels(haplotype_library, paternal_haplotype_library, maternal_haplotype_library) - return self.njit_calculate_genotype_probabilities(total_probs, paternal_haplotype_library.get_haplotypes(), maternal_haplotype_library.get_haplotypes(), seperate_reference_panels) + def calculate_genotype_probabilities( + self, + total_probs, + haplotype_library=None, + paternal_haplotype_library=None, + maternal_haplotype_library=None, + **kwargs, + ): + ( + paternal_haplotype_library, + maternal_haplotype_library, + seperate_reference_panels, + ) = self.extract_reference_panels( + haplotype_library, paternal_haplotype_library, maternal_haplotype_library + ) + return self.njit_calculate_genotype_probabilities( + total_probs, + paternal_haplotype_library.get_haplotypes(), + maternal_haplotype_library.get_haplotypes(), + seperate_reference_panels, + ) @staticmethod @jit(nopython=True, nogil=True) - def njit_calculate_genotype_probabilities(total_probs, paternal_haplotypes, maternal_haplotypes, seperate_reference_panels) : + def njit_calculate_genotype_probabilities( + total_probs, paternal_haplotypes, maternal_haplotypes, seperate_reference_panels + ): n_pat, n_loci = paternal_haplotypes.shape n_mat, n_loci = maternal_haplotypes.shape - geno_probs = np.full((4, n_loci), 0.00001, dtype = np.float32) + geno_probs = np.full((4, n_loci), 0.00001, dtype=np.float32) for i in range(n_loci): for j in range(n_pat): for k in range(n_mat): # diploid case where the markers are assumed independent. - if seperate_reference_panels or j != k: + if seperate_reference_panels or j != k: pat_value = paternal_haplotypes[j, i] mat_value = maternal_haplotypes[k, i] - prob_value = total_probs[i,j,k] + prob_value = total_probs[i, j, k] if pat_value != 9 and mat_value != 9: - # Add in a sum of total_probs values. - geno_probs[0, i] += prob_value * (1-pat_value)*(1-mat_value) #aa - geno_probs[1, i] += prob_value * (1-pat_value)*mat_value #aA - geno_probs[2, i] += prob_value * pat_value*(1-mat_value) #Aa - geno_probs[3, i] += prob_value * pat_value*mat_value #AA + # Add in a sum of total_probs values. + geno_probs[0, i] += ( + prob_value * (1 - pat_value) * (1 - mat_value) + ) # aa + geno_probs[1, i] += ( + prob_value * (1 - pat_value) * mat_value + ) # aA + geno_probs[2, i] += ( + prob_value * pat_value * (1 - mat_value) + ) # Aa + geno_probs[3, i] += prob_value * pat_value * mat_value # AA # Haploid case where the markers are not independent else: hap_value = paternal_haplotypes[j, i] - prob_value = total_probs[i,j,k] + prob_value = total_probs[i, j, k] if hap_value != 9: - geno_probs[0, i] += prob_value * (1-hap_value) + geno_probs[0, i] += prob_value * (1 - hap_value) geno_probs[1, i] += 0 geno_probs[2, i] += 0 geno_probs[3, i] += prob_value * hap_value - geno_probs = geno_probs/np.sum(geno_probs, axis = 0) + geno_probs = geno_probs / np.sum(geno_probs, axis=0) return geno_probs - - - @staticmethod @jit(nopython=True, nogil=True) - def transmission(cummulative_probabilities, previous_point_probability, recombination_rate, output): + def transmission( + cummulative_probabilities, + previous_point_probability, + recombination_rate, + output, + ): output[:] = cummulative_probabilities * previous_point_probability normalize(output) row_sums = np.sum(output, 0) col_sums = np.sum(output, 1) - output[:] *= (1 - recombination_rate)**2 # No recombination on either chromosome. - output[:] += np.expand_dims(row_sums, 0)/output.shape[0]*recombination_rate*(1-recombination_rate) # recombination on the maternal (second) chromosome) - output[:] += np.expand_dims(col_sums, 1)/output.shape[1]*recombination_rate*(1-recombination_rate) # recombination on the paternal (first) chromosome) - output[:] += recombination_rate**2/output.size # double recombination - + output[:] *= ( + 1 - recombination_rate + ) ** 2 # No recombination on either chromosome. + output[:] += ( + np.expand_dims(row_sums, 0) + / output.shape[0] + * recombination_rate + * (1 - recombination_rate) + ) # recombination on the maternal (second) chromosome) + output[:] += ( + np.expand_dims(col_sums, 1) + / output.shape[1] + * recombination_rate + * (1 - recombination_rate) + ) # recombination on the paternal (first) chromosome) + output[:] += recombination_rate**2 / output.size # double recombination + @staticmethod @jit(nopython=True, nogil=True) - def maximum_likelihood_step(previous_path_score, recombination_rate, point_estimate, output_path_score, output_index): - + def maximum_likelihood_step( + previous_path_score, + recombination_rate, + point_estimate, + output_path_score, + output_index, + ): n_pat = previous_path_score.shape[0] n_mat = previous_path_score.shape[1] combined_max_index = np.argmax(previous_path_score) - combined_max_score = previous_path_score.ravel()[combined_max_index] * recombination_rate**2/(n_mat*n_pat) + combined_max_score = ( + previous_path_score.ravel()[combined_max_index] + * recombination_rate**2 + / (n_mat * n_pat) + ) - paternal_max_index = np.full(n_pat, 0, dtype = np.int64) - paternal_max_value = np.full(n_pat, 0, dtype = np.float32) + paternal_max_index = np.full(n_pat, 0, dtype=np.int64) + paternal_max_value = np.full(n_pat, 0, dtype=np.float32) - maternal_max_index = np.full(n_mat, 0, dtype = np.int64) - maternal_max_value = np.full(n_mat, 0, dtype = np.float32) + maternal_max_index = np.full(n_mat, 0, dtype=np.int64) + maternal_max_value = np.full(n_mat, 0, dtype=np.float32) # Recombination on the maternal side, paternal side is fixed for i in range(n_pat): - index = np.argmax(previous_path_score[i,:]) - paternal_max_value[i] = previous_path_score[i, index] * (1-recombination_rate)*recombination_rate/n_mat - paternal_max_index[i] = i*n_mat + index - + index = np.argmax(previous_path_score[i, :]) + paternal_max_value[i] = ( + previous_path_score[i, index] + * (1 - recombination_rate) + * recombination_rate + / n_mat + ) + paternal_max_index[i] = i * n_mat + index # Recombination on the paternal side, maternal side is fixed for j in range(n_mat): index = np.argmax(previous_path_score[:, j]) - maternal_max_value[j] = previous_path_score[index, j] * (1-recombination_rate)*recombination_rate/n_pat - maternal_max_index[j] = index*n_mat + j + maternal_max_value[j] = ( + previous_path_score[index, j] + * (1 - recombination_rate) + * recombination_rate + / n_pat + ) + maternal_max_index[j] = index * n_mat + j for i in range(n_pat): for j in range(n_mat): + best_score = (1 - recombination_rate) ** 2 * previous_path_score[i, j] + best_index = i * n_mat + j - best_score = (1-recombination_rate)**2*previous_path_score[i,j] - best_index = i*n_mat + j - - # Paternal recombination + # Paternal recombination if paternal_max_value[i] > best_score: best_score = paternal_max_value[i] best_index = paternal_max_index[i] @@ -423,31 +560,30 @@ def maximum_likelihood_step(previous_path_score, recombination_rate, point_estim if maternal_max_value[j] > best_score: best_score = maternal_max_value[j] best_index = maternal_max_index[j] - + if combined_max_score > best_score: best_score = combined_max_score best_index = combined_max_index - output_path_score[i,j] = best_score*point_estimate[i,j] - output_index[i,j] = best_index - - + output_path_score[i, j] = best_score * point_estimate[i, j] + output_index[i, j] = best_index + output_path_score /= np.sum(output_path_score) -class JointMarkovModel(HaploidMarkovModel) : - def __init__(self, n_loci, error, recombination_rate = None): - HaploidMarkovModel.__init__(self, n_loci, error, recombination_rate) +class JointMarkovModel(HaploidMarkovModel): + def __init__(self, n_loci, error, recombination_rate=None): + HaploidMarkovModel.__init__(self, n_loci, error, recombination_rate) @staticmethod @jit(nopython=True, nogil=True) def njit_get_point_estimates(indGeno, haplotypes, error, mask): n_hap, n_loci = haplotypes.shape - point_estimates = np.full((n_loci, n_hap, n_hap + 1), 1, dtype = np.float32) + point_estimates = np.full((n_loci, n_hap, n_hap + 1), 1, dtype=np.float32) - diploid_section = point_estimates[:,:,0:-1] - haploid_section = point_estimates[:,:,-1] + diploid_section = point_estimates[:, :, 0:-1] + haploid_section = point_estimates[:, :, -1] # Diploid point estimates/Emission probabilities @@ -457,7 +593,7 @@ def njit_get_point_estimates(indGeno, haplotypes, error, mask): for k in range(n_hap): sourceGeno = haplotypes[j, i] + haplotypes[k, i] if sourceGeno == indGeno[i]: - diploid_section[i, j, k] = 1-error[i] + diploid_section[i, j, k] = 1 - error[i] else: diploid_section[i, j, k] = error[i] @@ -465,45 +601,50 @@ def njit_get_point_estimates(indGeno, haplotypes, error, mask): for i in range(n_loci): if indGeno[i] != 9 and mask[i]: for j in range(n_hap): - sourceGeno = 2*haplotypes[j, i] - if sourceGeno == indGeno[i]: - haploid_section[i, j] = 1-error[i] - else: - haploid_section[i, j] = error[i] + sourceGeno = 2 * haplotypes[j, i] + if sourceGeno == indGeno[i]: + haploid_section[i, j] = 1 - error[i] + else: + haploid_section[i, j] = error[i] return point_estimates - @staticmethod @jit(nopython=True, nogil=True) - def njit_calculate_genotype_probabilities(total_probs, reference_haplotypes) : + def njit_calculate_genotype_probabilities(total_probs, reference_haplotypes): n_hap, n_loci = reference_haplotypes.shape - geno_probs = np.full((4, n_loci), 0.00001, dtype = np.float32) + geno_probs = np.full((4, n_loci), 0.00001, dtype=np.float32) - diploid_section = total_probs[:,:,0:-1] - haploid_section = total_probs[:,:,-1] + diploid_section = total_probs[:, :, 0:-1] + haploid_section = total_probs[:, :, -1] for i in range(n_loci): for j in range(n_hap): for k in range(n_hap): # diploid case where the markers are assumed independent. - if j != k: + if j != k: pat_value = reference_haplotypes[j, i] mat_value = reference_haplotypes[k, i] - prob_value = diploid_section[i,j,k] + prob_value = diploid_section[i, j, k] if pat_value != 9 and mat_value != 9: - # Add in a sum of total_probs values. - geno_probs[0, i] += prob_value * (1-pat_value)*(1-mat_value) #aa - geno_probs[1, i] += prob_value * (1-pat_value)*mat_value #aA - geno_probs[2, i] += prob_value * pat_value*(1-mat_value) #Aa - geno_probs[3, i] += prob_value * pat_value*mat_value #AA + # Add in a sum of total_probs values. + geno_probs[0, i] += ( + prob_value * (1 - pat_value) * (1 - mat_value) + ) # aa + geno_probs[1, i] += ( + prob_value * (1 - pat_value) * mat_value + ) # aA + geno_probs[2, i] += ( + prob_value * pat_value * (1 - mat_value) + ) # Aa + geno_probs[3, i] += prob_value * pat_value * mat_value # AA # markers are not independent else: hap_value = reference_haplotypes[j, i] - prob_value = diploid_section[i,j,k] + prob_value = diploid_section[i, j, k] if hap_value != 9: - geno_probs[0, i] += prob_value * (1-hap_value) + geno_probs[0, i] += prob_value * (1 - hap_value) geno_probs[1, i] += 0 geno_probs[2, i] += 0 geno_probs[3, i] += prob_value * hap_value @@ -511,28 +652,29 @@ def njit_calculate_genotype_probabilities(total_probs, reference_haplotypes) : for i in range(n_loci): for j in range(n_hap): hap_value = reference_haplotypes[j, i] - prob_value = haploid_section[i,j] + prob_value = haploid_section[i, j] if hap_value != 9: - geno_probs[0, i] += prob_value * (1-hap_value) + geno_probs[0, i] += prob_value * (1 - hap_value) geno_probs[1, i] += 0 geno_probs[2, i] += 0 geno_probs[3, i] += prob_value * hap_value - geno_probs = geno_probs/np.sum(geno_probs, axis = 0) + geno_probs = geno_probs / np.sum(geno_probs, axis=0) return geno_probs - - - @staticmethod @jit(nopython=True, nogil=True) - def transmission(cummulative_probabilities, previous_point_probability, recombination_rate, output): - + def transmission( + cummulative_probabilities, + previous_point_probability, + recombination_rate, + output, + ): output[:] = cummulative_probabilities * previous_point_probability normalize(output) - diploid_section = output[:,0:-1] - haploid_section = output[:,-1] + diploid_section = output[:, 0:-1] + haploid_section = output[:, -1] diploid_weight = np.sum(diploid_section) haploid_weight = np.sum(haploid_section) @@ -540,23 +682,34 @@ def transmission(cummulative_probabilities, previous_point_probability, recombin row_sums = np.sum(diploid_section, 0) col_sums = np.sum(diploid_section, 1) - diploid_section[:] *= (1 - recombination_rate)**2 - diploid_section[:] += np.expand_dims(row_sums, 0)/diploid_section.shape[0]*recombination_rate*(1-recombination_rate) # recombination on the maternal (second) chromosome) - diploid_section[:] += np.expand_dims(col_sums, 1)/diploid_section.shape[1]*recombination_rate*(1-recombination_rate) # recombination on the paternal (first) chromosome) - diploid_section[:] += diploid_weight*recombination_rate**2/diploid_section.size # double recombination - - haploid_section[:] *= (1 - recombination_rate) - haploid_section[:] += haploid_weight*recombination_rate/haploid_section.size + diploid_section[:] *= (1 - recombination_rate) ** 2 + diploid_section[:] += ( + np.expand_dims(row_sums, 0) + / diploid_section.shape[0] + * recombination_rate + * (1 - recombination_rate) + ) # recombination on the maternal (second) chromosome) + diploid_section[:] += ( + np.expand_dims(col_sums, 1) + / diploid_section.shape[1] + * recombination_rate + * (1 - recombination_rate) + ) # recombination on the paternal (first) chromosome) + diploid_section[:] += ( + diploid_weight * recombination_rate**2 / diploid_section.size + ) # double recombination + + haploid_section[:] *= 1 - recombination_rate + haploid_section[:] += haploid_weight * recombination_rate / haploid_section.size # loose the recombination to the haploid section and add the haploid recombination to diploid - diploid_section[:] *= (1 - recombination_rate) - diploid_section[:] += recombination_rate * haploid_weight/diploid_section.size + diploid_section[:] *= 1 - recombination_rate + diploid_section[:] += recombination_rate * haploid_weight / diploid_section.size # loose the recombination to the haploid section and add the haploid recombination to diploid - haploid_section[:] *= (1 - recombination_rate) - haploid_section[:] += recombination_rate * diploid_weight/haploid_section.size + haploid_section[:] *= 1 - recombination_rate + haploid_section[:] += recombination_rate * diploid_weight / haploid_section.size - # @staticmethod # @jit(nopython=True, nogil=True) # def maximum_likelihood_step(previous_path_score, recombination_rate, point_estimate, output_path_score, output_index): @@ -579,7 +732,6 @@ def transmission(cummulative_probabilities, previous_point_probability, recombin # paternal_max_value[i] = previous_path_score[i, index] * (1-recombination_rate)*recombination_rate/n_mat # paternal_max_index[i] = i*n_mat + index - # # Recombination on the paternal side, maternal side is fixed # for j in range(n_mat): # index = np.argmax(previous_path_score[:, j]) @@ -592,7 +744,7 @@ def transmission(cummulative_probabilities, previous_point_probability, recombin # best_score = (1-recombination_rate)**2*previous_path_score[i,j] # best_index = i*n_mat + j - # # Paternal recombination + # # Paternal recombination # if paternal_max_value[i] > best_score: # best_score = paternal_max_value[i] # best_index = paternal_max_index[i] @@ -600,25 +752,23 @@ def transmission(cummulative_probabilities, previous_point_probability, recombin # if maternal_max_value[j] > best_score: # best_score = maternal_max_value[j] # best_index = maternal_max_index[j] - + # if combined_max_score > best_score: # best_score = combined_max_score # best_index = combined_max_index # output_path_score[i,j] = best_score*point_estimate[i,j] # output_index[i,j] = best_index - - + # output_path_score /= np.sum(output_path_score) @jit(nopython=True, nogil=True) def normalize(mat): - mat[:] /= np.sum(mat) + mat[:] /= np.sum(mat) + @jit(nopython=True, nogil=True) def normalize_along_first_axis(mat): for i in range(mat.shape[0]): - normalize(mat[i,:]) - - + normalize(mat[i, :]) diff --git a/DiploidHMM.py b/DiploidHMM.py index ad24f93..06e3cb6 100644 --- a/DiploidHMM.py +++ b/DiploidHMM.py @@ -2,11 +2,19 @@ import numpy as np from . import ProbMath from . import NumbaUtils -from . HaplotypeLibrary import haplotype_from_indices - - -def diploidHMM(individual, paternal_haplotypes, maternal_haplotypes, error, recombination_rate, calling_method='dosages', use_called_haps=True, include_geno_probs=False): - +from .HaplotypeLibrary import haplotype_from_indices + + +def diploidHMM( + individual, + paternal_haplotypes, + maternal_haplotypes, + error, + recombination_rate, + calling_method="dosages", + use_called_haps=True, + include_geno_probs=False, +): n_loci = len(individual.genotypes) # !!!! NEED TO MAKE SURE SOURCE HAPLOTYPES ARE ALL NON MISSING!!! @@ -25,36 +33,65 @@ def diploidHMM(individual, paternal_haplotypes, maternal_haplotypes, error, reco # Construct penetrance values (point estimates) if use_called_haps: - point_estimates = getDiploidPointEstimates(individual.genotypes, individual.haplotypes[0], individual.haplotypes[1], paternal_haplotypes, maternal_haplotypes, error) - elif calling_method in ('sample', 'dosages', 'Viterbi'): + point_estimates = getDiploidPointEstimates( + individual.genotypes, + individual.haplotypes[0], + individual.haplotypes[1], + paternal_haplotypes, + maternal_haplotypes, + error, + ) + elif calling_method in ("sample", "dosages", "Viterbi"): n_pat = len(paternal_haplotypes) n_mat = len(maternal_haplotypes) point_estimates = np.ones((n_loci, n_pat, n_mat), dtype=np.float32) - getDiploidPointEstimates_geno(individual.genotypes, paternal_haplotypes, maternal_haplotypes, error, point_estimates) + getDiploidPointEstimates_geno( + individual.genotypes, + paternal_haplotypes, + maternal_haplotypes, + error, + point_estimates, + ) else: probs = ProbMath.getGenotypeProbabilities_ind(individual) - point_estimates = getDiploidPointEstimates_probs(probs, paternal_haplotypes, maternal_haplotypes, error) + point_estimates = getDiploidPointEstimates_probs( + probs, paternal_haplotypes, maternal_haplotypes, error + ) # Do 'sample' and 'Viterbi' before other 'calling_method' as we don't need the forward-backward probs - if calling_method == 'sample': - haplotypes = getDiploidSample(point_estimates, recombination_rate, paternal_haplotypes, maternal_haplotypes) + if calling_method == "sample": + haplotypes = getDiploidSample( + point_estimates, + recombination_rate, + paternal_haplotypes, + maternal_haplotypes, + ) individual.imputed_haplotypes = haplotypes return - if calling_method == 'Viterbi': - haplotypes = get_viterbi(point_estimates, recombination_rate, paternal_haplotypes, maternal_haplotypes) + if calling_method == "Viterbi": + haplotypes = get_viterbi( + point_estimates, + recombination_rate, + paternal_haplotypes, + maternal_haplotypes, + ) individual.imputed_haplotypes = haplotypes return # Run forward-backward algorithm on penetrance values total_probs = diploidForwardBackward(point_estimates, recombination_rate) - if calling_method == 'dosages': - dosages = getDiploidDosages(total_probs, paternal_haplotypes, maternal_haplotypes) + if calling_method == "dosages": + dosages = getDiploidDosages( + total_probs, paternal_haplotypes, maternal_haplotypes + ) individual.dosages = dosages - if calling_method == 'probabilities': - values = getDiploidProbabilities(total_probs, paternal_haplotypes, maternal_haplotypes) + if calling_method == "probabilities": + values = getDiploidProbabilities( + total_probs, paternal_haplotypes, maternal_haplotypes + ) individual.info = values - if calling_method == 'callhaps': - raise ValueError('callhaps not yet implimented.') + if calling_method == "callhaps": + raise ValueError("callhaps not yet implimented.") @jit(nopython=True) @@ -74,7 +111,9 @@ def getDiploidDosages(hapEst, paternalHaplotypes, maternalHaplotypes): for i in range(nLoci): for j in range(nPat): for k in range(nMat): - dosages[i] += hapEst[i, j, k]*(paternalHaplotypes[j, i] + maternalHaplotypes[k, i]) + dosages[i] += hapEst[i, j, k] * ( + paternalHaplotypes[j, i] + maternalHaplotypes[k, i] + ) return dosages @@ -82,7 +121,7 @@ def getDiploidDosages(hapEst, paternalHaplotypes, maternalHaplotypes): def getDiploidProbabilities(hapEst, paternalHaplotypes, maternalHaplotypes): nPat, nLoci = paternalHaplotypes.shape nMat, nLoci = maternalHaplotypes.shape - probs = np.full((4, nLoci), 0, dtype = np.float32) + probs = np.full((4, nLoci), 0, dtype=np.float32) for i in range(nLoci): for j in range(nPat): for k in range(nMat): @@ -104,18 +143,24 @@ def getDiploidProbabilities(hapEst, paternalHaplotypes, maternalHaplotypes): def getDiploidSample(point_estimate, recombination_rate, paternal_haps, maternal_haps): """Sample a pair of haplotypes""" forward_probs = diploid_forward(point_estimate, recombination_rate, in_place=True) - haplotypes = diploidSampleHaplotypes(forward_probs, recombination_rate, paternal_haps, maternal_haps) + haplotypes = diploidSampleHaplotypes( + forward_probs, recombination_rate, paternal_haps, maternal_haps + ) return haplotypes @jit(nopython=True, nogil=True) -def get_viterbi(point_estimates, recombination_rate, paternal_haplotypes, maternal_haplotypes): +def get_viterbi( + point_estimates, recombination_rate, paternal_haplotypes, maternal_haplotypes +): """Get most likely haplotype pair using the Viterbi algorithm""" n_loci = point_estimates.shape[0] haplotypes = np.full((2, n_loci), 9, dtype=np.int8) forward_probs = diploid_forward(point_estimates, recombination_rate) - paternal_indices, maternal_indices = diploid_viterbi(forward_probs, recombination_rate) + paternal_indices, maternal_indices = diploid_viterbi( + forward_probs, recombination_rate + ) haplotypes[0] = haplotype_from_indices(paternal_indices, paternal_haplotypes) haplotypes[1] = haplotype_from_indices(maternal_indices, maternal_haplotypes) @@ -123,7 +168,9 @@ def get_viterbi(point_estimates, recombination_rate, paternal_haplotypes, matern @jit(nopython=True, nogil=True) -def getDiploidPointEstimates(indGeno, indPatHap, indMatHap, paternalHaplotypes, maternalHaplotypes, error): +def getDiploidPointEstimates( + indGeno, indPatHap, indMatHap, paternalHaplotypes, maternalHaplotypes, error +): nPat, nLoci = paternalHaplotypes.shape nMat, nLoci = maternalHaplotypes.shape @@ -136,48 +183,52 @@ def getDiploidPointEstimates(indGeno, indPatHap, indMatHap, paternalHaplotypes, if indPatHap[i] != 9 and indMatHap[i] != 9: value = 1 if indPatHap[i] == paternalHaplotypes[j, i]: - value *= (1-error[i]) + value *= 1 - error[i] else: value *= error[i] if indMatHap[i] == maternalHaplotypes[k, i]: - value *= (1-error[i]) + value *= 1 - error[i] else: value *= error[i] pointEst[i, j, k] = value else: - #I think this shouldn't be too horrible. + # I think this shouldn't be too horrible. sourceGeno = paternalHaplotypes[j, i] + maternalHaplotypes[k, i] if sourceGeno == indGeno[i]: - pointEst[i, j, k] = 1-error[i]*error[i] + pointEst[i, j, k] = 1 - error[i] * error[i] else: - pointEst[i, j, k] = error[i]*error[i] + pointEst[i, j, k] = error[i] * error[i] return pointEst + @jit(nopython=True, nogil=True) -def getDiploidPointEstimates_geno(indGeno, paternalHaplotypes, maternalHaplotypes, error, pointEst): +def getDiploidPointEstimates_geno( + indGeno, paternalHaplotypes, maternalHaplotypes, error, pointEst +): nPat, nLoci = paternalHaplotypes.shape nMat, nLoci = maternalHaplotypes.shape for i in range(nLoci): if indGeno[i] != 9: - error_2 = error[i]*error[i] + error_2 = error[i] * error[i] for j in range(nPat): for k in range(nMat): - - #I think this shouldn't be too horrible. + # I think this shouldn't be too horrible. sourceGeno = paternalHaplotypes[j, i] + maternalHaplotypes[k, i] if sourceGeno == indGeno[i]: - pointEst[i, j, k] = 1-error_2 + pointEst[i, j, k] = 1 - error_2 else: pointEst[i, j, k] = error_2 @jit(nopython=True) -def getDiploidPointEstimates_probs(indProbs, paternalHaplotypes, maternalHaplotypes, error): +def getDiploidPointEstimates_probs( + indProbs, paternalHaplotypes, maternalHaplotypes, error +): nPat, nLoci = paternalHaplotypes.shape nMat, nLoci = maternalHaplotypes.shape - pointEst = np.full((nPat, nMat, nLoci), 1, dtype = np.float32) + pointEst = np.full((nPat, nMat, nLoci), 1, dtype=np.float32) for i in range(nLoci): for j in range(nPat): for k in range(nMat): @@ -187,19 +238,35 @@ def getDiploidPointEstimates_probs(indProbs, paternalHaplotypes, maternalHaploty p_Aa = indProbs[2, i] p_AA = indProbs[3, i] e = error[i] - if paternalHaplotypes[j,i] == 0 and maternalHaplotypes[k, i] == 0: - value = p_aa*(1-e)**2 + (p_aA + p_Aa)*e*(1-e) + p_AA*e**2 + if paternalHaplotypes[j, i] == 0 and maternalHaplotypes[k, i] == 0: + value = ( + p_aa * (1 - e) ** 2 + + (p_aA + p_Aa) * e * (1 - e) + + p_AA * e**2 + ) - if paternalHaplotypes[j,i] == 1 and maternalHaplotypes[k, i] == 0: - value = p_Aa*(1-e)**2 + (p_aa + p_AA)*e*(1-e) + p_aA*e**2 + if paternalHaplotypes[j, i] == 1 and maternalHaplotypes[k, i] == 0: + value = ( + p_Aa * (1 - e) ** 2 + + (p_aa + p_AA) * e * (1 - e) + + p_aA * e**2 + ) - if paternalHaplotypes[j,i] == 0 and maternalHaplotypes[k, i] == 1: - value = p_aA*(1-e)**2 + (p_aa + p_AA)*e*(1-e) + p_Aa*e**2 + if paternalHaplotypes[j, i] == 0 and maternalHaplotypes[k, i] == 1: + value = ( + p_aA * (1 - e) ** 2 + + (p_aa + p_AA) * e * (1 - e) + + p_Aa * e**2 + ) - if paternalHaplotypes[j,i] == 1 and maternalHaplotypes[k, i] == 1: - value = p_AA*(1-e)**2 + (p_aA + p_Aa)*e*(1-e) + p_aa*e**2 + if paternalHaplotypes[j, i] == 1 and maternalHaplotypes[k, i] == 1: + value = ( + p_AA * (1 - e) ** 2 + + (p_aA + p_Aa) * e * (1 - e) + + p_aa * e**2 + ) - pointEst[j,k,i] = value + pointEst[j, k, i] = value return pointEst @@ -224,7 +291,7 @@ def diploid_normalize(array): sum_ += array[j, k, i] for j in range(n_pat): for k in range(n_mat): - array[j, k, i] = array[j, k, i]/sum_ + array[j, k, i] = array[j, k, i] / sum_ @jit(nopython=True) @@ -254,19 +321,19 @@ def transmit(previous_estimate, recombination_rate, output, pat, mat): mat[k] += previous_estimate[j, k] e = recombination_rate - e1e = (1-e)*e - e2m1 = (1-e)**2 + e1e = (1 - e) * e + e2m1 = (1 - e) ** 2 # Adding modifications to pat and mat to take into account number of haplotypes and recombination rate. - pat *= e1e/n_pat - mat *= e1e/n_mat + pat *= e1e / n_pat + mat *= e1e / n_mat - e2 = e*e/(n_mat*n_pat) + e2 = e * e / (n_mat * n_pat) # Account for recombinations for j in range(n_pat): for k in range(n_mat): - output[j, k] = previous_estimate[j, k]*e2m1 + pat[j] + mat[k] + e2 + output[j, k] = previous_estimate[j, k] * e2m1 + pat[j] + mat[k] + e2 @jit(nopython=True, nogil=True) @@ -292,7 +359,9 @@ def diploid_forward(point_estimate, recombination_rate, in_place=False): # Update estimates at this locus # Take the value at locus i-1 and transmit it forward. - transmit(combined[i-1, :, :], recombination_rate[i], forward_i, tmp_pat, tmp_mat) + transmit( + combined[i - 1, :, :], recombination_rate[i], forward_i, tmp_pat, tmp_mat + ) # Combine the forward estimate at locus i with the point estimate at i. # This is safe if in_place = True since we have not updated combined[i,:,:] yet and it will be still equal to point_estimate. @@ -316,10 +385,10 @@ def diploid_backward(point_estimate, recombination_rate): tmp_pat = np.empty(n_pat, dtype=np.float32) tmp_mat = np.empty(n_mat, dtype=np.float32) - for i in range(n_loci-2, -1, -1): + for i in range(n_loci - 2, -1, -1): # Skip the first loci. # Combine the backward estimate at i+1 with the point estimate at i+1 (unlike the forward pass, the backward estimate does not contain the point_estimate). - combined_i[:, :] = backward[i+1, :, :] * point_estimate[i+1, :, :] + combined_i[:, :] = backward[i + 1, :, :] * point_estimate[i + 1, :, :] combined_i[:, :] /= np.sum(combined_i) # Transmit the combined value forward. This is the backward estimate. @@ -346,7 +415,9 @@ def diploidForwardBackward(point_estimate, recombination_rate): @jit(nopython=True, nogil=True) -def diploidSampleHaplotypes(forward_probs, recombination_rate, paternal_haplotypes, maternal_haplotypes): +def diploidSampleHaplotypes( + forward_probs, recombination_rate, paternal_haplotypes, maternal_haplotypes +): """Sample a pair of paternal and maternal haplotypes from the forward and backward probability distributions and paternal and maternal haplotype libraries. Returns: @@ -354,7 +425,9 @@ def diploidSampleHaplotypes(forward_probs, recombination_rate, paternal_haplotyp """ n_loci = forward_probs.shape[0] haplotypes = np.full((2, n_loci), 9, dtype=np.int8) - paternal_indices, maternal_indices = diploidOneSample(forward_probs, recombination_rate) + paternal_indices, maternal_indices = diploidOneSample( + forward_probs, recombination_rate + ) haplotypes[0] = haplotype_from_indices(paternal_indices, paternal_haplotypes) haplotypes[1] = haplotype_from_indices(maternal_indices, maternal_haplotypes) @@ -371,17 +444,27 @@ def diploidOneSample(forward_probs, recombination_rate): n_loci, n_pat, n_mat = forward_probs.shape - pvals = np.empty((n_pat, n_mat), dtype=np.float32) # sampled probability distribution at one locus + pvals = np.empty( + (n_pat, n_mat), dtype=np.float32 + ) # sampled probability distribution at one locus paternal_indices = np.empty(n_loci, dtype=np.int64) maternal_indices = np.empty(n_loci, dtype=np.int64) # Backwards algorithm - for i in range(n_loci-1, -1, -1): # zero indexed then minus one since we skip the boundary + for i in range( + n_loci - 1, -1, -1 + ): # zero indexed then minus one since we skip the boundary # Sample at this locus - if i == n_loci-1: + if i == n_loci - 1: pvals[:, :] = forward_probs[i, :, :] else: - combine_backward_sampled_value(forward_probs[i, :, :], paternal_indices[i+1], maternal_indices[i+1], recombination_rate[i+1], pvals[:, :]) + combine_backward_sampled_value( + forward_probs[i, :, :], + paternal_indices[i + 1], + maternal_indices[i + 1], + recombination_rate[i + 1], + pvals[:, :], + ) j, k = NumbaUtils.multinomial_sample_2d(pvals=pvals) paternal_indices[i] = j @@ -409,22 +492,30 @@ def diploid_viterbi(forward_probs, recombination_rate): maternal_indices = np.empty(n_loci, dtype=np.int64) # Backwards algorithm - for i in range(n_loci-1, -1, -1): # zero indexed then minus one since we skip the boundary + for i in range( + n_loci - 1, -1, -1 + ): # zero indexed then minus one since we skip the boundary # Sample at this locus - if i == n_loci-1: + if i == n_loci - 1: pvals[:, :] = forward_probs[i, :, :] else: - combine_backward_sampled_value(forward_probs[i, :, :], paternal_indices[i+1], maternal_indices[i+1], recombination_rate[i+1], pvals[:, :]) + combine_backward_sampled_value( + forward_probs[i, :, :], + paternal_indices[i + 1], + maternal_indices[i + 1], + recombination_rate[i + 1], + pvals[:, :], + ) idx = np.argmax(pvals) - j, k = idx//n_mat, idx%n_mat + j, k = idx // n_mat, idx % n_mat paternal_indices[i] = j maternal_indices[i] = k # Last sample (at the first locus) idx = np.argmax(pvals) - j, k = idx//n_mat, idx%n_mat + j, k = idx // n_mat, idx % n_mat paternal_indices[0] = j maternal_indices[0] = k @@ -435,7 +526,8 @@ def diploid_viterbi(forward_probs, recombination_rate): @jit(nopython=True) def diploidIndices(sampled_probs): """Get paternal and maternal indices from sampled probability distribution - Intended to be used with sampled probabilities as returned from diploidOneSample()""" + Intended to be used with sampled probabilities as returned from diploidOneSample() + """ n_loci, n_pat, n_mat = sampled_probs.shape paternal_indices = np.empty(n_loci, dtype=np.int64) @@ -447,15 +539,17 @@ def diploidIndices(sampled_probs): for k in range(n_mat): # If sampled_probs[j, k, i] == 1 # set indices array to values j and k - if sampled_probs[i, j, k] > 1-eps: + if sampled_probs[i, j, k] > 1 - eps: paternal_indices[i] = j maternal_indices[i] = k - + return paternal_indices, maternal_indices @jit(nopython=True) -def combine_backward_sampled_value(previous_estimate, pat_hap, mat_hap, recombination_rate, output): +def combine_backward_sampled_value( + previous_estimate, pat_hap, mat_hap, recombination_rate, output +): """Includes information from the previous sampled locus into the estimate at the current sampled locus. previous_estimate combination of the forward + point estimate at the locus. @@ -470,21 +564,21 @@ def combine_backward_sampled_value(previous_estimate, pat_hap, mat_hap, recombin n_pat, n_mat = previous_estimate.shape e = recombination_rate - single_rec = (1-e)*e - no_rec = (1-e)**2 - double_rec = e*e + single_rec = (1 - e) * e + no_rec = (1 - e) ** 2 + double_rec = e * e # Haplotype is moving from pat_hap, mat_hap. # Double recombination -- both haplotypes change. - output[:, :] = double_rec/(n_mat*n_pat) + output[:, :] = double_rec / (n_mat * n_pat) # Maternal recombination -- pat_hap stays the same. for k in range(n_mat): - output[pat_hap, k] += single_rec/n_mat + output[pat_hap, k] += single_rec / n_mat # Paternal recombination -- mat_hap stays the same. for j in range(n_pat): - output[j, mat_hap] += single_rec/n_pat + output[j, mat_hap] += single_rec / n_pat # No recombinations -- both haplotypes stay the same. output[pat_hap, mat_hap] += no_rec diff --git a/HaploidHMM.py b/HaploidHMM.py index 5176faa..7c7a568 100644 --- a/HaploidHMM.py +++ b/HaploidHMM.py @@ -1,10 +1,17 @@ from numba import jit import numpy as np from . import NumbaUtils -from . HaplotypeLibrary import haplotype_from_indices +from .HaplotypeLibrary import haplotype_from_indices -def haploidHMM(individual, source_haplotypes, error, recombination_rate, threshold=0.9, calling_method='dosages'): +def haploidHMM( + individual, + source_haplotypes, + error, + recombination_rate, + threshold=0.9, + calling_method="dosages", +): target_haplotype = individual.haplotypes n_loci = len(target_haplotype) @@ -20,27 +27,31 @@ def haploidHMM(individual, source_haplotypes, error, recombination_rate, thresho recombination_rate = np.full(n_loci, recombination_rate, dtype=np.float32) # Construct penetrance values (point estimates) - point_estimates = getHaploidPointEstimates(target_haplotype, source_haplotypes, error) + point_estimates = getHaploidPointEstimates( + target_haplotype, source_haplotypes, error + ) # Run forward-backward algorithm on penetrance values # Note: don't need these probabilites if using the sample method - if calling_method != 'sample': + if calling_method != "sample": total_probs = haploidForwardBackward(point_estimates, recombination_rate) # Handle the different calling methods - if calling_method == 'callhaps': + if calling_method == "callhaps": # Call haplotypes called_haps = haploidCallHaps(total_probs, threshold) # Call genotypes called_genotypes = getHaploidGenotypes(called_haps, source_haplotypes) return called_genotypes - if calling_method == 'dosages': + if calling_method == "dosages": dosages = getHaploidDosages(total_probs, source_haplotypes) individual.dosages = dosages - if calling_method == 'sample': - haplotype = getHaploidSample(point_estimates, recombination_rate, source_haplotypes) + if calling_method == "sample": + haplotype = getHaploidSample( + point_estimates, recombination_rate, source_haplotypes + ) individual.imputed_haplotypes = haplotype - if calling_method == 'Viterbi': + if calling_method == "Viterbi": haplotype = get_viterbi(point_estimates, recombination_rate, source_haplotypes) individual.imputed_haplotypes = haplotype @@ -75,7 +86,9 @@ def get_viterbi(point_estimates, recombination_rate, haplotype_library): @jit(nopython=True) def haploidCallHaps(hapEst, threshold): nHaps, nLoci = hapEst.shape - calledHaps = np.full(nLoci, -1, dtype=np.int64) # These are haplotype ids. -1 is missing. + calledHaps = np.full( + nLoci, -1, dtype=np.int64 + ) # These are haplotype ids. -1 is missing. for i in range(nLoci): maxVal = -1 maxLoc = -1 @@ -90,7 +103,9 @@ def haploidCallHaps(hapEst, threshold): @jit(nopython=True) def getHaploidGenotypes(calledHaps, sourceHaplotypes): nHaps, nLoci = sourceHaplotypes.shape - calledGenotypes = np.full(nLoci, 9, dtype=np.int8) # These are haplotype ids. -1 is missing. + calledGenotypes = np.full( + nLoci, 9, dtype=np.int8 + ) # These are haplotype ids. -1 is missing. for i in range(nLoci): if calledHaps[i] != -1: calledGenotypes[i] = sourceHaplotypes[calledHaps[i], i] @@ -135,9 +150,9 @@ def haploidTransformProbs(previous, new, estimate, point_estimate, recombination # Account for recombination rate e = recombination_rate - e1 = 1-recombination_rate + e1 = 1 - recombination_rate for j in range(n_haps): - new[j] = new[j]*e1 + e/n_haps + new[j] = new[j] * e1 + e / n_haps # Update distributions (in place) for j in range(n_haps): @@ -164,16 +179,20 @@ def haploidOneSample(forward_probs, recombination_rate): sample_indices = np.empty(n_loci, dtype=np.int64) # Backwards algorithm - for i in range(n_loci-2, -1, -1): # zero indexed then minus one since we skip the boundary + for i in range( + n_loci - 2, -1, -1 + ): # zero indexed then minus one since we skip the boundary # Sample at this locus - j = NumbaUtils.multinomial_sample(pvals=est[i+1, :]) + j = NumbaUtils.multinomial_sample(pvals=est[i + 1, :]) sampled_probs[:] = 0 sampled_probs[j] = 1 - sample_indices[i+1] = j + sample_indices[i + 1] = j # Get estimate at this locus using the *sampled* distribution # (instead of the point estimates/emission probabilities) - haploidTransformProbs(prev, new, est[i, :], sampled_probs, recombination_rate[i+1]) + haploidTransformProbs( + prev, new, est[i, :], sampled_probs, recombination_rate[i + 1] + ) # No need to normalise at this locus as multinomial_sample() # handles un-normalized probabilities @@ -202,17 +221,21 @@ def haploid_viterbi(forward_probs, recombination_rate): indices = np.empty(n_loci, dtype=np.int64) # Backwards algorithm - for i in range(n_loci-2, -1, -1): # zero indexed then minus one since we skip the boundary + for i in range( + n_loci - 2, -1, -1 + ): # zero indexed then minus one since we skip the boundary # Choose the most likely state (i.e. max probability) at this locus - j = np.argmax(est[i+1, :]) + j = np.argmax(est[i + 1, :]) sampled_probs[:] = 0 sampled_probs[j] = 1 - indices[i+1] = j + indices[i + 1] = j # Get estimate at this locus using the most likely distribution # (instead of the point estimates/emission probabilities) - haploidTransformProbs(prev, new, est[i, :], sampled_probs, recombination_rate[i+1]) + haploidTransformProbs( + prev, new, est[i, :], sampled_probs, recombination_rate[i + 1] + ) # No need to normalise at this locus as argmax() does not depend on normalisation # Most likely state at the first locus @@ -241,7 +264,9 @@ def haploidForward(point_estimate, recombination_rate): for i in range(1, n_loci): # Update estimates at this locus - haploidTransformProbs(prev, new, est[i, :], point_estimate[i-1, :], recombination_rate[i]) + haploidTransformProbs( + prev, new, est[i, :], point_estimate[i - 1, :], recombination_rate[i] + ) return est @@ -255,9 +280,13 @@ def haploidBackward(point_estimate, recombination_rate): prev = np.ones(n_haps, dtype=np.float32) new = np.empty(n_haps, dtype=np.float32) - for i in range(n_loci-2, -1, -1): # zero indexed then minus one since we skip the boundary + for i in range( + n_loci - 2, -1, -1 + ): # zero indexed then minus one since we skip the boundary # Update estimates at this locus - haploidTransformProbs(prev, new, est[i, :], point_estimate[i+1, :], recombination_rate[i+1]) + haploidTransformProbs( + prev, new, est[i, :], point_estimate[i + 1, :], recombination_rate[i + 1] + ) return est @@ -266,8 +295,9 @@ def haploidBackward(point_estimate, recombination_rate): def haploidForwardBackward(point_estimate, recombination_rate): """Calculate normalized state probabilities at each loci using the forward-backward algorithm""" - est = (haploidForward(point_estimate, recombination_rate) * - haploidBackward(point_estimate, recombination_rate)) + est = haploidForward(point_estimate, recombination_rate) * haploidBackward( + point_estimate, recombination_rate + ) # Return normalized probabilities n_loci = point_estimate.shape[0] diff --git a/HaplotypeLibrary.py b/HaplotypeLibrary.py index 6ad46dd..368e3df 100644 --- a/HaplotypeLibrary.py +++ b/HaplotypeLibrary.py @@ -1,22 +1,26 @@ -import pickle import random import numpy as np from numba import njit, jit -from numba.experimental import jitclass + def profile(x): return x + # Helper functions def slices(start, length, n): """Return `n` slices starting at `start` of length `length`""" - return [slice(i, i+length) for i in range(start, length*n + start, length)] + return [slice(i, i + length) for i in range(start, length * n + start, length)] -def bin_slices(l, n): + +def bin_slices(total, n): """Return a list of slice() objects that split l items into n bins The first l%n bins are length l//n+1; the remaining n-l%n bins are length l//n Similar to np.array_split()""" - return slices(0, l//n+1, l%n) + slices((l//n+1)*(l%n), l//n, n-l%n) + return slices(0, total // n + 1, total % n) + slices( + (total // n + 1) * (total % n), total // n, n - total % n + ) + def topk_indices(genotype, haplotypes, n_topk): """Return top-k haplotype indices with fewest opposite homozygous markers compared to genotype @@ -24,21 +28,23 @@ def topk_indices(genotype, haplotypes, n_topk): is first shuffled so that tied values are effectively randomly sampled""" # Mask of homozygous loci in the genotype - homozygous_mask = (genotype == 0) | (genotype == 2) # note: this purposefully ignores missing loci + homozygous_mask = (genotype == 0) | ( + genotype == 2 + ) # note: this purposefully ignores missing loci # Select just these homozygous loci g = genotype[homozygous_mask] h = haplotypes[:, homozygous_mask] # Number of opposite homozygous loci for all haplotypes - opposite_homozygous = np.sum(g//2 != h, axis=1) + opposite_homozygous = np.sum(g // 2 != h, axis=1) # Shuffle indices indices = np.arange(len(opposite_homozygous)) np.random.shuffle(indices) # Stable argsort on the shuffled values - args = np.argsort(opposite_homozygous[indices], kind='stable') + args = np.argsort(opposite_homozygous[indices], kind="stable") # Top-k indices (fewest opposite homozygous loci) return indices[args[:n_topk]] @@ -61,27 +67,23 @@ def __init__(self, n_loci=None): self._identifiers = [] # index to identifier mapping self.dtype = None - def __repr__(self): - return repr(self._identifiers) + '\n' + repr(self._haplotypes) - + return repr(self._identifiers) + "\n" + repr(self._haplotypes) def __len__(self): """Number of haplotypes in the library""" return len(self._haplotypes) - def __iter__(self): """Iterate over tuple of (id, haplotype)""" for i in range(len(self)): yield self._identifiers[i], self._haplotypes[i] - def append(self, haplotype, identifier=None): """Append a single haplotype to the library. Note: a copy of the haplotype is taken""" if self._frozen: - raise RuntimeError('Cannot append to frozen library') + raise RuntimeError("Cannot append to frozen library") if self.dtype is None: self.dtype = haplotype.dtype @@ -93,68 +95,63 @@ def append(self, haplotype, identifier=None): self._identifiers.append(identifier) self._haplotypes.append(haplotype.copy()) - def freeze(self): """Freeze the library: convert identifier and haplotype lists to NumPy arrays""" if self._frozen: - raise RuntimeError('Cannot freeze an already frozen library') + raise RuntimeError("Cannot freeze an already frozen library") self._haplotypes = np.array(self._haplotypes) self._identifiers = np.array(self._identifiers) self._frozen = True - def unfreeze(self): """Unfreeze the library: convert identifiers and haplotypes to lists""" if not self._frozen: - raise RuntimeError('Cannot unfreeze an unfrozen library') + raise RuntimeError("Cannot unfreeze an unfrozen library") self._haplotypes = list(self._haplotypes) self._identifiers = list(self._identifiers) self._frozen = False - def update(self, haplotypes, identifier): """Update identifier's haplotypes 'haplotypes' can be a 1d array of loci or a 2d array of shape(#haps, #loci)""" if not self._frozen: - raise RuntimeError('Cannot update an unfrozen library') + raise RuntimeError("Cannot update an unfrozen library") self._check_haplotype_dtype(haplotypes) indices = self._indices(identifier) # Use Numpy's broadcasting checks to handle mismatch of shape in the following: self._haplotypes[indices] = haplotypes - def exclude_identifiers(self, identifiers): """Return a NumPy array of haplotypes excluding specified identifiers 'identifiers' can be a single identifier or iterable of identifiers""" if not self._frozen: - raise RuntimeError('Cannot exclude from an unfrozen library') + raise RuntimeError("Cannot exclude from an unfrozen library") mask = ~np.isin(self._identifiers, identifiers) return self._haplotypes[mask] - def sample_only_identifiers(self, identifiers): """Returns HaplotypeLibrary object of haplotypes only including specified identifiers 'identifiers' can be a single identifier or iterable of identifiers""" if not self._frozen: - raise RuntimeError('Cannot exclude from an unfrozen library') + raise RuntimeError("Cannot exclude from an unfrozen library") mask = np.isin(self._identifiers, identifiers) return self._sampled_library(mask) - def sample(self, n_haplotypes): """Return a NumPy array of randomly sampled haplotypes""" if not self._frozen: - raise RuntimeError('Cannot sample from an unfrozen library') + raise RuntimeError("Cannot sample from an unfrozen library") if n_haplotypes > len(self): n_haplotypes = len(self) - sampled_indices = np.sort(np.random.choice(len(self), size=n_haplotypes, replace=False)) + sampled_indices = np.sort( + np.random.choice(len(self), size=n_haplotypes, replace=False) + ) return self._haplotypes[sampled_indices] - def sample_targeted(self, n_haplotypes, genotype, n_bins, exclude_identifiers=None): """Sample haplotypes that 'closely match' genotype `genotype`""" if not self._frozen: - raise RuntimeError('Cannot sample from an unfrozen library') + raise RuntimeError("Cannot sample from an unfrozen library") if n_haplotypes > len(self): return self @@ -176,7 +173,6 @@ def sample_targeted(self, n_haplotypes, genotype, n_bins, exclude_identifiers=No # Return HaplotypeLibrary object return self._sampled_library(sampled_indices) - def exclude_identifiers_and_sample(self, identifiers, n_haplotypes): """Return a NumPy array of (n_haplotypes) randomly sampled haplotypes excluding specified identifiers. @@ -184,35 +180,35 @@ def exclude_identifiers_and_sample(self, identifiers, n_haplotypes): Note: A copy of the haplotypes are created because of fancy indexing""" # Exclude specified identifiers if not self._frozen: - raise RuntimeError('Cannot sample or exclude from an unfrozen library') + raise RuntimeError("Cannot sample or exclude from an unfrozen library") exclude_mask = ~np.isin(self._identifiers, identifiers) n_remaining_haplotypes = exclude_mask.sum() # Generate random sample if n_haplotypes > n_remaining_haplotypes: n_haplotypes = n_remaining_haplotypes - sampled_indices = np.random.choice(n_remaining_haplotypes, size=n_haplotypes, replace=False) + sampled_indices = np.random.choice( + n_remaining_haplotypes, size=n_haplotypes, replace=False + ) sampled_indices.sort() # Return HaplotypeLibrary object return self._sampled_library(sampled_indices) - def asMatrix(self): """Return the NumPy array - kept for backwards compatibility""" if self._frozen: return self._haplotypes.copy() return np.array(self._haplotypes) - def removeMissingValues(self): """Replace missing values randomly with 0 or 1 with 50 % probability kept for backwards compatibility""" for hap in self._haplotypes: removeMissingValues(hap) - - def get_called_haplotypes(self, threshold = 0.99): + def get_called_haplotypes(self, threshold=0.99): """Return "called" haplotypes -- these are haplotypes which only contain integer values (0,1,9). - For haplotypes where there is uncertainty, a threshold is used to determine whether the value is called as a value or is missing. """ + For haplotypes where there is uncertainty, a threshold is used to determine whether the value is called as a value or is missing. + """ if not self._frozen: self.freeze() @@ -220,23 +216,26 @@ def get_called_haplotypes(self, threshold = 0.99): return self._haplotypes else: - called_haplotypes = np.full(self._haplotypes.shape, 0, dtype = np.float32) + called_haplotypes = np.full(self._haplotypes.shape, 0, dtype=np.float32) for i in range(called_haplotypes.shape[0]): - called_haplotypes[i,:] = self.call_haplotypes(self._haplotypes[i,:], threshold) + called_haplotypes[i, :] = self.call_haplotypes( + self._haplotypes[i, :], threshold + ) return called_haplotypes - @staticmethod + @staticmethod @jit(nopython=True) def call_haplotypes(hap, threshold): nLoci = len(hap) - output = np.full(nLoci, 9, dtype = np.int8) + output = np.full(nLoci, 9, dtype=np.int8) for i in range(nLoci): - if hap[i] <= 1 : - if hap[i] > threshold : output[i] = 1 - if hap[i] < 1-threshold : output[i] = 0 + if hap[i] <= 1: + if hap[i] > threshold: + output[i] = 1 + if hap[i] < 1 - threshold: + output[i] = 0 return output - def get_haplotypes(self): if not self._frozen: self.freeze() @@ -247,7 +246,6 @@ def get_identifiers(self): """Get haplotype identifiers""" return list(self._identifiers) - def _indices(self, identifier): """Get row indices associated with an identifier. These can be used for fancy indexing""" # Return empty list if identifier == None @@ -257,19 +255,19 @@ def _indices(self, identifier): raise RuntimeError("Cannot get indices from an unfrozen library") if identifier not in self._identifiers: raise KeyError(f"Identifier '{identifier}' not in library") - return np.flatnonzero(self._identifiers == identifier).tolist() + return np.flatnonzero(self._identifiers == identifier).tolist() def _check_haplotype_dtype(self, haplotype): """Check the haplotype has expected dtype""" if haplotype.dtype != self.dtype: - raise TypeError('haplotype(s) not equal to library dtype, {self.dtype}') + raise TypeError("haplotype(s) not equal to library dtype, {self.dtype}") def _check_haplotype(self, haplotype, expected_shape): """Check haplotype has expected shape and dtype. Could extend to check values in {0,1,9}""" self._check_haplotype_dtype(haplotype) if haplotype.shape != expected_shape: - raise ValueError('haplotype(s) has unexpected shape') + raise ValueError("haplotype(s) has unexpected shape") def _sampled_library(self, indices): """Create a 'duplicated' HaplotypeLibrary consisting of specified indices only""" @@ -299,10 +297,11 @@ def haplotype_from_indices(indices, haplotype_library): @njit def removeMissingValues(hap): - for i in range(len(hap)) : + for i in range(len(hap)): if hap[i] == 9: hap[i] = random.randint(0, 1) + class ErrorLibrary(object): @profile def __init__(self, hap, haplotypes): @@ -313,43 +312,46 @@ def __init__(self, hap, haplotypes): def getWindowValue(self, k): return jit_getWindowValue(self.errors, k, self.hap) + @jit(nopython=True) -def jit_getWindowValue(errors, k, hap) : - window = np.full(errors.shape, 0, dtype = np.int8) +def jit_getWindowValue(errors, k, hap): + window = np.full(errors.shape, 0, dtype=np.int8) nHaps, nLoci = errors.shape - #Let me be silly here. - for i in range(k+1): - window[:,0] += errors[:,i] + # Let me be silly here. + for i in range(k + 1): + window[:, 0] += errors[:, i] for i in range(1, nLoci): - window[:,i] = window[:,i-1] + window[:, i] = window[:, i - 1] if i > k: - if hap[i-k-1] != 9: - window[:,i] -= errors[:,i-k-1] #This is no longer in the window. - if i < (nLoci-k): - if hap[i+k] != 9: - window[:,i] += errors[:,i+k] #This is now included in the window. + if hap[i - k - 1] != 9: + window[:, i] -= errors[:, i - k - 1] # This is no longer in the window. + if i < (nLoci - k): + if hap[i + k] != 9: + window[:, i] += errors[:, i + k] # This is now included in the window. return window + @jit(nopython=True) def jit_assessErrors(hap, haps): - errors = np.full(haps.shape, 0, dtype = np.int8) + errors = np.full(haps.shape, 0, dtype=np.int8) nHaps, nLoci = haps.shape for i in range(nLoci): if hap[i] != 9: if hap[i] == 0: - errors[:, i] = haps[:,i] + errors[:, i] = haps[:, i] if hap[i] == 1: - errors[:, i] = 1-haps[:,i] + errors[:, i] = 1 - haps[:, i] return errors -from collections import OrderedDict + class HaplotypeDict(object): def __init__(self): self.nHaps = 0 self.haps = [] self.tree = dict() + # @profile def append(self, haplotype): byteVal = haplotype.tobytes() @@ -359,12 +361,13 @@ def append(self, haplotype): self.tree[byteVal] = self.nHaps self.haps.append(haplotype) self.nHaps += 1 - return self.nHaps -1 + return self.nHaps - 1 return self.tree[byteVal] def get(self, index): return self.haps[index] + # hap = np.array([0, 0, 0, 0, 0, 0, 0, 0]) # hapLib = [np.array([1, 0, 0, 0, 0, 0, 1]), diff --git a/HaplotypeOperations.py b/HaplotypeOperations.py index 963fedc..a00021e 100644 --- a/HaplotypeOperations.py +++ b/HaplotypeOperations.py @@ -1,12 +1,12 @@ -import numba -from numba import jit, float32, int8, int64, optional, boolean -from numba.experimental import jitclass +from numba import jit import numpy as np + def setup_individual(ind): - fillInPhaseFromGenotypes(ind.haplotypes[0], ind.genotypes) - fillInPhaseFromGenotypes(ind.haplotypes[1], ind.genotypes) - fillInGenotypesFromPhase(ind.genotypes, ind.haplotypes[0], ind.haplotypes[1]) + fillInPhaseFromGenotypes(ind.haplotypes[0], ind.genotypes) + fillInPhaseFromGenotypes(ind.haplotypes[1], ind.genotypes) + fillInGenotypesFromPhase(ind.genotypes, ind.haplotypes[0], ind.haplotypes[1]) + def align_individual(ind): # Note: The note below is no longer true. @@ -21,33 +21,39 @@ def align_individual(ind): fillInCompPhase(ind.haplotypes[0], ind.genotypes, ind.haplotypes[1]) fillInCompPhase(ind.haplotypes[1], ind.genotypes, ind.haplotypes[0]) + def ind_fillInGenotypesFromPhase(ind): - #Note: We never directly set genotypes so no need to go from genotypes -> phase + # Note: We never directly set genotypes so no need to go from genotypes -> phase fillInGenotypesFromPhase(ind.genotypes, ind.haplotypes[0], ind.haplotypes[1]) + def fillFromParents(ind): if ind.sire is not None: if ind.sire.genotypes is not None: sirePhase = getPhaseFromGenotypes(ind.sire.genotypes) fillIfMissing(ind.haplotypes[0], sirePhase) - + if ind.dam is not None: if ind.dam.genotypes is not None: damPhase = getPhaseFromGenotypes(ind.dam.genotypes) fillIfMissing(ind.haplotypes[0], damPhase) + @jit(nopython=True) def fillIfMissing(orig, new): for i in range(len(orig)): if orig[i] == 9: orig[i] = new[i] + @jit(nopython=True) def fillInGenotypesFromPhase(geno, phase1, phase2): for i in range(len(geno)): if geno[i] == 9: if phase1[i] != 9 and phase2[i] != 9: geno[i] = phase1[i] + phase2[i] + + @jit(nopython=True) def fillInCompPhase(target, geno, compPhase): for i in range(len(geno)): @@ -55,21 +61,27 @@ def fillInCompPhase(target, geno, compPhase): if geno[i] != 9: if compPhase[i] != 9: target[i] = geno[i] - compPhase[i] - + + @jit(nopython=True) def fillInPhaseFromGenotypes(phase, geno): for i in range(len(geno)): - if phase[i] == 9 : - if geno[i] == 0: phase[i] = 0 - if geno[i] == 2: phase[i] = 1 + if phase[i] == 9: + if geno[i] == 0: + phase[i] = 0 + if geno[i] == 2: + phase[i] = 1 + @jit(nopython=True) def getPhaseFromGenotypes(geno): - phase = np.full(len(geno), 9, dtype = np.int8) + phase = np.full(len(geno), 9, dtype=np.int8) for i in range(len(geno)): - if phase[i] == 9 : - if geno[i] == 0: phase[i] = 0 - if geno[i] == 2: phase[i] = 1 + if phase[i] == 9: + if geno[i] == 0: + phase[i] = 0 + if geno[i] == 2: + phase[i] = 1 return phase @@ -99,4 +111,3 @@ def getPhaseFromGenotypes(geno): # e = -e # if e == -1: index += 1 # if index >= midpoint: changed = True - diff --git a/InputOutput.py b/InputOutput.py index bbd8840..38905aa 100644 --- a/InputOutput.py +++ b/InputOutput.py @@ -10,62 +10,121 @@ alphaplinkpython_avail = False try: - import alphaplinkpython from alphaplinkpython import PlinkWriter + alphaplinkpython_avail = True except ImportError: alphaplinkpython_avail = False -##Global: +# Global: args = None -def getParser(program) : - parser = argparse.ArgumentParser(description='') + +def getParser(program): + parser = argparse.ArgumentParser(description="") core_parser = parser.add_argument_group("Core arguments") - core_parser.add_argument('-out', required=True, type=str, help='The output file prefix.') + core_parser.add_argument( + "-out", required=True, type=str, help="The output file prefix." + ) addInputFileParser(parser) - #Genotype files. + # Genotype files. # output_options_parser = parser.add_argument_group("Output options") # output_options_parser.add_argument('-writekey', default="id", required=False, type=str, help='Determines the order in which individuals are ordered in the output file based on their order in the corresponding input file. Animals not in the input file are placed at the end of the file and sorted in alphanumeric order. These animals can be surpressed with the "-onlykeyed" option. Options: id, pedigree, genotypes, sequence, segregation. Defualt: id.') # output_options_parser.add_argument('-onlykeyed', action='store_true', required=False, help='Flag to surpress the animals who are not present in the file used with -writekey. Also surpresses "dummy" animals.') - + if program == "Default": pass - if program == "AlphaImpute" : + if program == "AlphaImpute": core_impute_parser = parser.add_argument_group("Impute options") - core_impute_parser.add_argument('-no_impute', action='store_true', required=False, help='Flag to read in the files but not perform imputation.') - core_impute_parser.add_argument('-no_phase', action='store_true', required=False, help='Flag to not do HD phasing initially.') - core_impute_parser.add_argument('-maxthreads',default=1, required=False, type=int, help='Number of threads to use. Default: 1.') - core_impute_parser.add_argument('-binaryoutput', action='store_true', required=False, help='Flag to write out the genotypes as a binary plink output.') + core_impute_parser.add_argument( + "-no_impute", + action="store_true", + required=False, + help="Flag to read in the files but not perform imputation.", + ) + core_impute_parser.add_argument( + "-no_phase", + action="store_true", + required=False, + help="Flag to not do HD phasing initially.", + ) + core_impute_parser.add_argument( + "-maxthreads", + default=1, + required=False, + type=int, + help="Number of threads to use. Default: 1.", + ) + core_impute_parser.add_argument( + "-binaryoutput", + action="store_true", + required=False, + help="Flag to write out the genotypes as a binary plink output.", + ) if program in ["AlphaPeel", "AlphaMGS", "AlphaCall"]: probability_parser = parser.add_argument_group("Genotype probability arguments") - add_arguments_from_dictionary(probability_parser, get_probability_options(), None) + add_arguments_from_dictionary( + probability_parser, get_probability_options(), None + ) - - if program in ["longreads"]: longread_parser = parser.add_argument_group("Long read arguments") - longread_parser.add_argument('-longreads', default=None, required=False, type=str, nargs="*", help='A read file.') - - if program == "AlphaPlantImpute" : + longread_parser.add_argument( + "-longreads", + default=None, + required=False, + type=str, + nargs="*", + help="A read file.", + ) + + if program == "AlphaPlantImpute": core_plant_parser = parser.add_argument_group("Mandatory arguments") - core_plant_parser.add_argument('-plantinfo', default=None, required=False, type=str, nargs="*", help='A plant info file.') - - - if program == "AlphaMGS" : + core_plant_parser.add_argument( + "-plantinfo", + default=None, + required=False, + type=str, + nargs="*", + help="A plant info file.", + ) + + if program == "AlphaMGS": core_assign_parser = parser.add_argument_group("Core assignement arguments") - core_assign_parser.add_argument('-potentialgrandsires', default=None, required=False, type=str, help='A list of potential dams for each individual.') - core_assign_parser.add_argument('-usemaf', action='store_true', required=False, help='A flag to use the minor allele frequency when constructing genotype estimates for the sire and maternal grandsire. Not recomended for small input pedigrees.') + core_assign_parser.add_argument( + "-potentialgrandsires", + default=None, + required=False, + type=str, + help="A list of potential dams for each individual.", + ) + core_assign_parser.add_argument( + "-usemaf", + action="store_true", + required=False, + help="A flag to use the minor allele frequency when constructing genotype estimates for the sire and maternal grandsire. Not recomended for small input pedigrees.", + ) if program == "AlphaCall": call_parser = parser.add_argument_group("AlphaCall arguments") - call_parser.add_argument('-threshold', default=None, required=False, type=float, help='Genotype calling threshold. Use. .3 for best guess genotype.') - call_parser.add_argument('-sexchrom', action='store_true', required=False, help='A flag to that this is a sex chromosome. Sex needs to be given in the pedigree file. This is currently an experimental option.') + call_parser.add_argument( + "-threshold", + default=None, + required=False, + type=float, + help="Genotype calling threshold. Use. .3 for best guess genotype.", + ) + call_parser.add_argument( + "-sexchrom", + action="store_true", + required=False, + help="A flag to that this is a sex chromosome. Sex needs to be given in the pedigree file. This is currently an experimental option.", + ) return parser @@ -79,45 +138,153 @@ def addInputFileParser(parser): def get_input_options(): - parse_dictionary = dict() - parse_dictionary["bfile"] = lambda parser: parser.add_argument('-bfile', default=None, required=False, type=str, nargs="*", help='A file in plink (binary) format. Only stable on Linux).') - parse_dictionary["genotypes"] = lambda parser: parser.add_argument('-genotypes', default=None, required=False, type=str, nargs="*", help='A file in AlphaGenes format.') - parse_dictionary["reference"] = lambda parser: parser.add_argument('-reference', default=None, required=False, type=str, nargs="*", help='A haplotype reference panel in AlphaGenes format.') - parse_dictionary["seqfile"] = lambda parser: parser.add_argument('-seqfile', default=None, required=False, type=str, nargs="*", help='A sequence data file.') - parse_dictionary["pedigree"] = lambda parser: parser.add_argument('-pedigree',default=None, required=False, type=str, nargs="*", help='A pedigree file in AlphaGenes format.') - parse_dictionary["phasefile"] = lambda parser: parser.add_argument('-phasefile',default=None, required=False, type=str, nargs="*", help='A phase file in AlphaGenes format.') - parse_dictionary["startsnp"] = lambda parser: parser.add_argument('-startsnp',default=None, required=False, type=int, help="The first marker to consider. The first marker in the file is marker '1'. Default: 1.") - parse_dictionary["stopsnp"] = lambda parser: parser.add_argument('-stopsnp',default=None, required=False, type=int, help='The last marker to consider. Default: all markers considered.') - parse_dictionary["seed"] = lambda parser: parser.add_argument('-seed',default=None, required=False, type=int, help='A random seed to use for debugging.') - + parse_dictionary["bfile"] = lambda parser: parser.add_argument( + "-bfile", + default=None, + required=False, + type=str, + nargs="*", + help="A file in plink (binary) format. Only stable on Linux).", + ) + parse_dictionary["genotypes"] = lambda parser: parser.add_argument( + "-genotypes", + default=None, + required=False, + type=str, + nargs="*", + help="A file in AlphaGenes format.", + ) + parse_dictionary["reference"] = lambda parser: parser.add_argument( + "-reference", + default=None, + required=False, + type=str, + nargs="*", + help="A haplotype reference panel in AlphaGenes format.", + ) + parse_dictionary["seqfile"] = lambda parser: parser.add_argument( + "-seqfile", + default=None, + required=False, + type=str, + nargs="*", + help="A sequence data file.", + ) + parse_dictionary["pedigree"] = lambda parser: parser.add_argument( + "-pedigree", + default=None, + required=False, + type=str, + nargs="*", + help="A pedigree file in AlphaGenes format.", + ) + parse_dictionary["phasefile"] = lambda parser: parser.add_argument( + "-phasefile", + default=None, + required=False, + type=str, + nargs="*", + help="A phase file in AlphaGenes format.", + ) + parse_dictionary["startsnp"] = lambda parser: parser.add_argument( + "-startsnp", + default=None, + required=False, + type=int, + help="The first marker to consider. The first marker in the file is marker '1'. Default: 1.", + ) + parse_dictionary["stopsnp"] = lambda parser: parser.add_argument( + "-stopsnp", + default=None, + required=False, + type=int, + help="The last marker to consider. Default: all markers considered.", + ) + parse_dictionary["seed"] = lambda parser: parser.add_argument( + "-seed", + default=None, + required=False, + type=int, + help="A random seed to use for debugging.", + ) + return parse_dictionary + def get_output_options(): parse_dictionary = dict() - parse_dictionary["writekey"] = lambda parser: parser.add_argument('-writekey', default="id", required=False, type=str, help='Determines the order in which individuals are ordered in the output file based on their order in the corresponding input file. Animals not in the input file are placed at the end of the file and sorted in alphanumeric order. These animals can be surpressed with the "-onlykeyed" option. Options: id, pedigree, genotypes, sequence, segregation. Defualt: id.') - parse_dictionary["onlykeyed"] = lambda parser: parser.add_argument('-onlykeyed', action='store_true', required=False, help='Flag to surpress the animals who are not present in the file used with -writekey. Also surpresses "dummy" animals.') - parse_dictionary["iothreads"] = lambda parser: parser.add_argument('-iothreads', default=1, required=False, type=int, help='Number of threads to use for io. Default: 1.') + parse_dictionary["writekey"] = lambda parser: parser.add_argument( + "-writekey", + default="id", + required=False, + type=str, + help='Determines the order in which individuals are ordered in the output file based on their order in the corresponding input file. Animals not in the input file are placed at the end of the file and sorted in alphanumeric order. These animals can be surpressed with the "-onlykeyed" option. Options: id, pedigree, genotypes, sequence, segregation. Defualt: id.', + ) + parse_dictionary["onlykeyed"] = lambda parser: parser.add_argument( + "-onlykeyed", + action="store_true", + required=False, + help='Flag to surpress the animals who are not present in the file used with -writekey. Also surpresses "dummy" animals.', + ) + parse_dictionary["iothreads"] = lambda parser: parser.add_argument( + "-iothreads", + default=1, + required=False, + type=int, + help="Number of threads to use for io. Default: 1.", + ) return parse_dictionary + def get_multithread_options(): parse_dictionary = dict() - parse_dictionary["iothreads"] = lambda parser: parser.add_argument('-iothreads', default=1, required=False, type=int, help='Number of threads to use for input and output. Default: 1.') - parse_dictionary["maxthreads"] = lambda parser: parser.add_argument('-maxthreads', default=1, required=False, type=int, help='Maximum number of threads to use for analysis. Default: 1.') + parse_dictionary["iothreads"] = lambda parser: parser.add_argument( + "-iothreads", + default=1, + required=False, + type=int, + help="Number of threads to use for input and output. Default: 1.", + ) + parse_dictionary["maxthreads"] = lambda parser: parser.add_argument( + "-maxthreads", + default=1, + required=False, + type=int, + help="Maximum number of threads to use for analysis. Default: 1.", + ) return parse_dictionary + def get_probability_options(): parse_dictionary = dict() - parse_dictionary["error"] = lambda parser: parser.add_argument('-error', default=0.01, required=False, type=float, help='Genotyping error rate. Default: 0.01.') - parse_dictionary["seqerror"] = lambda parser: parser.add_argument('-seqerror', default=0.001, required=False, type=float, help='Assumed sequencing error rate. Default: 0.001.') - parse_dictionary["recombination"] = lambda parser: parser.add_argument('-recomb', default=1, required=False, type=float, help='Recombination rate per chromosome. Default: 1.') + parse_dictionary["error"] = lambda parser: parser.add_argument( + "-error", + default=0.01, + required=False, + type=float, + help="Genotyping error rate. Default: 0.01.", + ) + parse_dictionary["seqerror"] = lambda parser: parser.add_argument( + "-seqerror", + default=0.001, + required=False, + type=float, + help="Assumed sequencing error rate. Default: 0.001.", + ) + parse_dictionary["recombination"] = lambda parser: parser.add_argument( + "-recomb", + default=1, + required=False, + type=float, + help="Recombination rate per chromosome. Default: 1.", + ) return parse_dictionary - -def add_arguments_from_dictionary(parser, arg_dict, options = None): +def add_arguments_from_dictionary(parser, arg_dict, options=None): if options is None: for key, value in arg_dict.items(): value(parser) @@ -129,9 +296,9 @@ def add_arguments_from_dictionary(parser, arg_dict, options = None): print("Option not found:", option, arg_dict) -def parseArgs(program, parser = None, no_args = False): +def parseArgs(program, parser=None, no_args=False): global args - args = rawParseArgs(program, parser, no_args = no_args) + args = rawParseArgs(program, parser, no_args=no_args) args.program = program # We want start/stop snp to be in python format (i.e. 0 to n-1). @@ -140,13 +307,14 @@ def parseArgs(program, parser = None, no_args = False): if args.startsnp is not None: args.startsnp -= 1 args.stopsnp -= 1 - ##Add any necessary code to check args here. + # Add any necessary code to check args here. except AttributeError as error: pass return args -def rawParseArgs(program, parser = None, no_args = False) : + +def rawParseArgs(program, parser=None, no_args=False): if parser is None: parser = getParser(program) @@ -155,11 +323,10 @@ def rawParseArgs(program, parser = None, no_args = False) : else: args = sys.argv[1:] - if len(args) == 0 : + if len(args) == 0: parser.print_help(sys.stderr) sys.exit(1) - if len(args) == 1: if args[0] in ["-h", "-help", "--help"]: parser.print_help(sys.stderr) @@ -168,8 +335,9 @@ def rawParseArgs(program, parser = None, no_args = False) : with open(args[0]) as f: args = [] for line in f: - if line[0] != "-": line = "-" + line - args.extend(re.split(r"[,|\s|\n]+",line)) + if line[0] != "-": + line = "-" + line + args.extend(re.split(r"[,|\s|\n]+", line)) for arg in args: if len(arg) == 0: args.remove(arg) @@ -181,14 +349,18 @@ def rawParseArgs(program, parser = None, no_args = False) : return parser.parse_args(args) -parseArgs("Default", parser = None, no_args = True) +parseArgs("Default", parser=None, no_args=True) + @jit(nopython=True) def setNumbaSeeds(seed): np.random.seed(seed) random.seed(seed) -def readInPedigreeFromInputs(pedigree, args, genotypes=True, haps=False, reads=False, update_coding=False): + +def readInPedigreeFromInputs( + pedigree, args, genotypes=True, haps=False, reads=False, update_coding=False +): # Try catch is incase the program does not have a seed option. seed = getattr(args, "seed", None) @@ -197,26 +369,25 @@ def readInPedigreeFromInputs(pedigree, args, genotypes=True, haps=False, reads=F random.seed(args.seed) setNumbaSeeds(args.seed) - startsnp = getattr(args, "startsnp", None) stopsnp = getattr(args, "stopsnp", None) pedigree.MainMetaFounder = getattr(args, "main_metafounder", None) if pedigree.MainMetaFounder[:3] != "MF_": - print(f"ERROR: The main_metafounder must start with MF_. \nExiting...") + print("ERROR: The main_metafounder must start with MF_. \nExiting...") sys.exit(2) pedigree.args = args pedigreeReadIn = False - pedigree_args = getattr(args, "pedigree", None) - if pedigree_args is not None: + pedigree_args = getattr(args, "pedigree", None) + if pedigree_args is not None: pedigreeReadIn = True for ped in args.pedigree: pedigree.readInPedigree(ped) # This gets the attribute from args, but returns None if the atribute is not valid. - genotypes = getattr(args, "genotypes", None) - if genotypes is not None: + genotypes = getattr(args, "genotypes", None) + if genotypes is not None: for geno in args.genotypes: pedigree.readInGenotypes(geno, startsnp, stopsnp) @@ -228,75 +399,84 @@ def readInPedigreeFromInputs(pedigree, args, genotypes=True, haps=False, reads=F phenotype = getattr(args, "phenotype", None) if phenotype is not None: if phenoPenetrance is None: - print("ERROR: To use phenotype information, please provide a phenotype penetrance via '-pheno_penetrance_prob_file'\nExiting...") + print( + "ERROR: To use phenotype information, please provide a phenotype penetrance via '-pheno_penetrance_file'\nExiting..." + ) sys.exit(2) if pedigree.nLoci > 1: # For now, this will be removed once mapping of phenotype to genotype is done. print( "ERROR: Currently phenotype information can only be used with a single locus genotype input. Please either remove the pheno_file or use a single locus genotype input.\nExiting..." - ) + ) sys.exit(2) for pheno in args.phenotype: pedigree.readInPhenotype(pheno) - reference = getattr(args, "reference", None) - if reference is not None: + reference = getattr(args, "reference", None) + if reference is not None: for ref in args.reference: pedigree.readInReferencePanel(ref, startsnp, stopsnp) - - seqfile = getattr(args, "seqfile", None) - if seqfile is not None: + + seqfile = getattr(args, "seqfile", None) + if seqfile is not None: for seq in args.seqfile: pedigree.readInSequence(seq, startsnp, stopsnp) - - phasefile = getattr(args, "phasefile", None) - if phasefile is not None: + + phasefile = getattr(args, "phasefile", None) + if phasefile is not None: if args.program == "AlphaPeel": - print("Use of an external phase file is not currently supported. Phase information will be translated to genotype probabilities. If absolutely necessary use a penetrance file instead.") + print( + "Use of an external phase file is not currently supported. Phase information will be translated to genotype probabilities. If absolutely necessary use a penetrance file instead." + ) for phase in args.phasefile: pedigree.readInPhase(phase, startsnp, stopsnp) - aapfile = getattr(args, "alt_allele_prob_file", None) if aapfile is not None: for aap in args.alt_allele_prob_file: pedigree.readInAAP(aap) - bfile = getattr(args, "bfile", None) - if bfile is not None: + if bfile is not None: global alphaplinkpython_avail if alphaplinkpython_avail: for plink in args.bfile: - if pedigreeReadIn == True: - print(f"Pedigree file read in from -pedigree option. Reading in binary plink file {plink}. Pedigree information in the .fam file will be ignored.") + if pedigreeReadIn is True: + print( + f"Pedigree file read in from -pedigree option. Reading in binary plink file {plink}. Pedigree information in the .fam file will be ignored." + ) readInGenotypesPlink(pedigree, plink, startsnp, stopsnp, pedigreeReadIn) else: - warnings.warn("The module alphaplinkpython was not found. Plink files cannot be read in and will be ignored.") + warnings.warn( + "The module alphaplinkpython was not found. Plink files cannot be read in and will be ignored." + ) # Note: need to read .bim before .ped as the .bim sets the allele coding to use for reading the .ped - bim = getattr(args, 'bim', None) + bim = getattr(args, "bim", None) if bim is not None: for file in args.bim: pedigree.readInBim(file, startsnp, stopsnp) - ped = getattr(args, 'ped', None) + ped = getattr(args, "ped", None) if ped is not None: for file in args.ped: - pedigree.readInPed(file, startsnp, stopsnp, haps=False, update_coding=update_coding) + pedigree.readInPed( + file, startsnp, stopsnp, haps=False, update_coding=update_coding + ) - #It's important that these happen after all the datafiles are read in. - #Each read in can add individuals. This information needs to be calculated on the final pedigree. + # It's important that these happen after all the datafiles are read in. + # Each read in can add individuals. This information needs to be calculated on the final pedigree. pedigree.fillIn(genotypes, haps, reads) pedigree.setUpGenerations() pedigree.setupFamilies() -def readMapFile(mapFile, start = None, stop = None) : + +def readMapFile(mapFile, start=None, stop=None): ids = [] chrs = [] positions = [] with open(mapFile) as f: for line in f: - parts = line.split(); + parts = line.split() try: positions.append(float(parts[2])) chrs.append(parts[1]) @@ -318,7 +498,7 @@ def readMapFile(mapFile, start = None, stop = None) : # positions = [] # with open(mapFileName) as f: # for line in f: -# parts = line.split(); +# parts = line.split(); # try: # positions.append(float(parts[1])) # except ValueError: @@ -329,49 +509,59 @@ def readMapFile(mapFile, start = None, stop = None) : # return np.array(positions, dtype = np.float32) -def readInSeg(pedigree, fileName, start=None, stop = None): +def readInSeg(pedigree, fileName, start=None, stop=None): print("Reading in seg file:", fileName) - if start is None: start = 0 - if stop is None: stop = pedigree.nLoci - nLoci = stop - start + 1 #Contains stop. - - + if start is None: + start = 0 + if stop is None: + stop = pedigree.nLoci + nLoci = stop - start + 1 # Contains stop. print(pedigree.maxIdn) - seg = np.full((pedigree.maxIdn, 4, nLoci), .25, dtype = np.float32) + seg = np.full((pedigree.maxIdn, 4, nLoci), 0.25, dtype=np.float32) index = 0 fileNColumns = 0 - indHit = np.full(pedigree.maxIdn, 0, dtype = np.int64) + indHit = np.full(pedigree.maxIdn, 0, dtype=np.int64) with open(fileName) as f: e = 0 currentInd = None for line in f: - parts = line.split(); - idx = parts[0]; + parts = line.split() + idx = parts[0] if fileNColumns == 0: fileNColumns = len(parts) if fileNColumns != len(parts): - raise ValueError(f"The length of the line is not the expected length. Expected {fileNColumns} got {len(parts)} on individual {idx} and line {e}.") + raise ValueError( + f"The length of the line is not the expected length. Expected {fileNColumns} got {len(parts)} on individual {idx} and line {e}." + ) - segLine=np.array([float(val) for val in parts[(start+1):(stop+2)]], dtype = np.float32) + segLine = np.array( + [float(val) for val in parts[(start + 1) : (stop + 2)]], + dtype=np.float32, + ) if len(segLine) != nLoci: - raise ValueError(f"The length of the line subsection is not the expected length. Expected {nLoci} got {len(segLine)} on individual {idx} and line {e}.") + raise ValueError( + f"The length of the line subsection is not the expected length. Expected {nLoci} got {len(segLine)} on individual {idx} and line {e}." + ) if idx not in pedigree.individuals: print(f"Individual {idx} not found in pedigree. Individual ignored.") else: ind = pedigree.individuals[idx] - if e == 0: + if e == 0: currentInd = ind.idx if currentInd != ind.idx: - raise ValueError(f"Unexpected individual. Expecting individual {currentInd}, but got ind {ind.idx} on value {e}") - seg[ind.idn,e,:] = segLine - e = (e+1)%4 - ind.fileIndex['segregation'] = index; index += 1 + raise ValueError( + f"Unexpected individual. Expecting individual {currentInd}, but got ind {ind.idx} on value {e}" + ) + seg[ind.idn, e, :] = segLine + e = (e + 1) % 4 + ind.fileIndex["segregation"] = index + index += 1 indHit[ind.idn] += 1 for ind in pedigree: if indHit[ind.idn] != 4: @@ -383,57 +573,56 @@ def readInSeg(pedigree, fileName, start=None, stop = None): def writeIdnIndexedMatrix(pedigree, matrix, outputFile): np.set_printoptions(suppress=True) print("Writing to ", outputFile) - with open(outputFile, 'w+') as f: - + with open(outputFile, "w+") as f: for idx, ind in pedigree.writeOrder(): - if len(matrix.shape) == 2 : - tmp = np.around(matrix[ind.idn, :], decimals = 4) - f.write(' '.join(map(str, tmp))) + if len(matrix.shape) == 2: + tmp = np.around(matrix[ind.idn, :], decimals=4) + f.write(" ".join(map(str, tmp))) # f.write('\n') - if len(matrix.shape) == 3 : - for i in range(matrix.shape[1]) : + if len(matrix.shape) == 3: + for i in range(matrix.shape[1]): f.write(idx + " ") - tmp2 = map("{:.4f}".format, matrix[ind.idn,i, :].tolist()) - tmp3 = ' '.join(tmp2) + tmp2 = map("{:.4f}".format, matrix[ind.idn, i, :].tolist()) + tmp3 = " ".join(tmp2) f.write(tmp3) - f.write('\n') + f.write("\n") # f.write('\n') + + def writeFamIndexedMatrix(pedigree, matrix, outputFile): np.set_printoptions(suppress=True) print("Writing to ", outputFile) - with open(outputFile, 'w+') as f: - + with open(outputFile, "w+") as f: for fam in pedigree.getFamilies(): - if len(matrix.shape) == 2 : - tmp = np.around(matrix[fam.idn, :], decimals = 4) - f.write(' '.join(map(str, tmp))) + if len(matrix.shape) == 2: + tmp = np.around(matrix[fam.idn, :], decimals=4) + f.write(" ".join(map(str, tmp))) # f.write('\n') - if len(matrix.shape) == 3 : - for i in range(matrix.shape[1]) : + if len(matrix.shape) == 3: + for i in range(matrix.shape[1]): f.write(str(fam.idn) + " ") - tmp2 = map("{:.4f}".format, matrix[fam.idn,i, :].tolist()) - tmp3 = ' '.join(tmp2) + tmp2 = map("{:.4f}".format, matrix[fam.idn, i, :].tolist()) + tmp3 = " ".join(tmp2) f.write(tmp3) - f.write('\n') + f.write("\n") # f.write('\n') -def writeOutGenotypesPlink(pedigree, fileName): +def writeOutGenotypesPlink(pedigree, fileName): global alphaplinkpython_avail if alphaplinkpython_avail: - import alphaplinkpython from alphaplinkpython import PlinkWriter ped = [getFamString(ind) for ind in pedigree] nLoci = pedigree.nLoci nInd = len(pedigree.individuals) - genotypes = np.full((nLoci, nInd), 0, dtype = np.int8) + genotypes = np.full((nLoci, nInd), 0, dtype=np.int8) for i, ind in enumerate(pedigree): - genotypes[:,i] = ind.genotypes + genotypes[:, i] = ind.genotypes - genotypeIds = ["snp" + str(i+1) for i in range(nLoci)] + genotypeIds = ["snp" + str(i + 1) for i in range(nLoci)] genotypePos = [i + 1 for i in range(nLoci)] if args.startsnp is not None: genotypeIds = ["snp" + str(i + args.startsnp + 1) for i in range(nLoci)] @@ -444,22 +633,24 @@ def writeOutGenotypesPlink(pedigree, fileName): writeSimpleBim(genotypeIds, genotypePos, fileName + ".bim") PlinkWriter.writeBedFile(genotypes, fileName + ".bed") else: - warnings.warn("The module alphaplinkpython was not found. Plink files cannot be written out and will be ignored.") + warnings.warn( + "The module alphaplinkpython was not found. Plink files cannot be written out and will be ignored." + ) -def writeSimpleBim(genotypeIds, genotypePos, fileName) : + +def writeSimpleBim(genotypeIds, genotypePos, fileName): with open(fileName, "w") as file: for i in range(len(genotypeIds)): line = f"1 {genotypeIds[i]} 0 {genotypePos[i]} A B \n" file.write(line) - -def readInGenotypesPlink(pedigree, fileName, startsnp, endsnp, externalPedigree = False): - bim = PlinkWriter.readBimFile(fileName + '.bim') - fam = PlinkWriter.readFamFile(fileName + '.fam') - bed = PlinkWriter.readBedFile(fileName + '.bed', bim, fam) +def readInGenotypesPlink(pedigree, fileName, startsnp, endsnp, externalPedigree=False): + bim = PlinkWriter.readBimFile(fileName + ".bim") + fam = PlinkWriter.readFamFile(fileName + ".fam") + bed = PlinkWriter.readBedFile(fileName + ".bed", bim, fam) if startsnp is not None: - bed = bed[startsnp:endsnp,:] + bed = bed[startsnp:endsnp, :] pedList = [[line.getId(), line.getSire(), line.getDam()] for line in fam] idList = [line.getId() for line in fam] @@ -478,6 +669,7 @@ def getFamString(ind): return [str(ind.idx), str(sireStr), str(damStr)] # return [str(ind.idx).encode('utf-8'), str(sireStr).encode('utf-8'), str(damStr).encode('utf-8')] + # @profile # def writeIdnIndexedMatrix2(pedigree, matrix, outputFile): # np.set_printoptions(suppress=True) @@ -497,20 +689,19 @@ def getFamString(ind): # f.write('\n') -def print_boilerplate(name, version = None, commit = None, date = None): +def print_boilerplate(name, version=None, commit=None, date=None): """Print software name, version and contact""" width = 42 # width of 'website' line - print('-' * width) - print(f'{name:^{width}}') # centre aligned - print('-' * width) + print("-" * width) + print(f"{name:^{width}}") # centre aligned + print("-" * width) if version is not None: - print(f'Version: {version}') + print(f"Version: {version}") if commit is not None: - print(f'Commit: {commit}') + print(f"Commit: {commit}") if date is not None: - print(f'Date: {date}') - - print('Email: alphagenes@roslin.ed.ac.uk') - print('Website: http://alphagenes.roslin.ed.ac.uk') - print('-' * width) - + print(f"Date: {date}") + + print("Email: alphagenes.dev@gmail.com") + print("Website: https://github.com/AlphaGenes") + print("-" * width) diff --git a/MultiThreadIO.py b/MultiThreadIO.py index 2577944..81c118c 100644 --- a/MultiThreadIO.py +++ b/MultiThreadIO.py @@ -4,15 +4,18 @@ import math from . import InputOutput -def convert_data_to_line(data_tuple, fmt) : + +def convert_data_to_line(data_tuple, fmt): idx, data = data_tuple - return idx + ' ' + ' '.join(map(fmt, data)) + '\n' + return idx + " " + " ".join(map(fmt, data)) + "\n" + def convert_data_to_line_plink(data_tuple, fmt): """Format data for PLINK plain text output""" idx, data = data_tuple return f"0 {idx} 0 0 0 0 {' '.join(data.astype(fmt))}\n" + def writeLines(fileName, data_list, fmt, converter=convert_data_to_line): # print(f"Writing results to: {fileName}") try: @@ -20,16 +23,22 @@ def writeLines(fileName, data_list, fmt, converter=convert_data_to_line): except AttributeError as error: iothreads = 1 - with open(fileName, 'w+') as f: - + with open(fileName, "w+") as f: if iothreads > 1: - with concurrent.futures.ProcessPoolExecutor(max_workers = iothreads) as executor: # The minus one is to account for the main thread. + with concurrent.futures.ProcessPoolExecutor( + max_workers=iothreads + ) as executor: # Break up into small-ish chunks to reduce overall memory cost. # Hard code: splits into 1k individuals. # These get then split up into one chunk per thread. subsets = split_by(data_list, 1000) for subset in subsets: - for result in executor.map(converter, subset, itertools.repeat(fmt), chunksize=math.ceil(1000/iothreads)): + for result in executor.map( + converter, + subset, + itertools.repeat(fmt), + chunksize=math.ceil(1000 / iothreads), + ): f.write(result) if iothreads <= 1: @@ -44,30 +53,30 @@ def writeLinesPlinkPlainTxt(fileName, data_list): def split_by(array, step): - output = [] i = 0 - while i*step < len(array): - start = i*step - stop = (i+1)*step + while i * step < len(array): + start = i * step + stop = (i + 1) * step output.append(array[start:stop]) i += 1 return output - def process_input_line(line, startsnp, stopsnp, dtype): - parts = line.split(); + parts = line.split() idx = parts[0] parts = parts[1:] - if startsnp is not None : - parts = parts[startsnp : stopsnp + 1] #Offset 1 for id and 2 for id + include stopsnp + if startsnp is not None: + parts = parts[ + startsnp : stopsnp + 1 + ] # Offset 1 for id and 2 for id + include stopsnp - if dtype in [np.float16, np.float32, np.float64] : + if dtype in [np.float16, np.float32, np.float64]: data = np.array([val for val in parts], dtype=dtype) - else : - data = np.array([int(val) for val in parts], dtype = dtype) + else: + data = np.array([int(val) for val in parts], dtype=dtype) return (idx, data) @@ -82,14 +91,16 @@ def process_input_line_plink(line, startsnp, stopsnp, dtype): 4 Sex code ('1' = male, '2' = female, '0' = unknown) 5 Phenotype value ('1' = control, '2' = case, '-9'/'0'/non-numeric = missing data if case/control) 6-end Genotypes as pairs of alleles (A, C, G or T) - + At present, this extracts individual's identifier as the within-family ID """ parts = line.split() - idx = parts[1] # Use within-family ID + idx = parts[1] # Use within-family ID genotypes = parts[6:] if startsnp is not None: - genotypes = genotypes[startsnp*2: stopsnp*2 + 2] # Each locus is represented by two alleles + genotypes = genotypes[ + startsnp * 2 : stopsnp * 2 + 2 + ] # Each locus is represented by two alleles data = np.array(genotypes, dtype=np.bytes_) return (idx, data) @@ -104,7 +115,6 @@ def readLines(fileName, startsnp, stopsnp, dtype, processor=process_input_line): output = [] with open(fileName) as f: - if iothreads > 1: # This could be more efficient, but it's dwarfed by some of the other stuff in the program. # i.e. line is roughly the same size as the haplotypes (2 bytes per genotype value, i.e. (space)(value); and two values per haplotype. @@ -112,19 +122,32 @@ def readLines(fileName, startsnp, stopsnp, dtype, processor=process_input_line): all_outputs = [] lines = list(itertools.islice(f, 1000)) while len(lines) > 0: - with concurrent.futures.ProcessPoolExecutor(max_workers = iothreads) as executor: - chunk_output = executor.map(processor, lines, itertools.repeat(startsnp), itertools.repeat(stopsnp), itertools.repeat(dtype), chunksize=math.ceil(1000/iothreads)) + with concurrent.futures.ProcessPoolExecutor( + max_workers=iothreads + ) as executor: + chunk_output = executor.map( + processor, + lines, + itertools.repeat(startsnp), + itertools.repeat(stopsnp), + itertools.repeat(dtype), + chunksize=math.ceil(1000 / iothreads), + ) all_outputs.append(chunk_output) lines = list(itertools.islice(f, 1000)) output = itertools.chain.from_iterable(all_outputs) if iothreads <= 1: for line in f: - output.append(processor(line, startsnp = startsnp, stopsnp = stopsnp, dtype = dtype)) + output.append( + processor(line, startsnp=startsnp, stopsnp=stopsnp, dtype=dtype) + ) return output def readLinesPlinkPlainTxt(fileName, startsnp, stopsnp, dtype): """Read lines in PLINK plain text format""" - return readLines(fileName, startsnp, stopsnp, dtype, processor=process_input_line_plink) + return readLines( + fileName, startsnp, stopsnp, dtype, processor=process_input_line_plink + ) diff --git a/NumbaUtils.py b/NumbaUtils.py index 2693306..75ea697 100644 --- a/NumbaUtils.py +++ b/NumbaUtils.py @@ -6,7 +6,8 @@ def multinomial_sample(pvals): """Draw one sample from a multinomial distribution (similar to np.random.multinomial) pvals - array of probabilities (np.float32) (that ideally should sum to one - but doesn't have to) - return - 1d index of the sampled value. Use multinomial_sample_2d() to convert to a 2d index""" + return - 1d index of the sampled value. Use multinomial_sample_2d() to convert to a 2d index + """ # Cumulative sum of the probabilities, total sum is cumsum[-1] cumsum = np.cumsum(pvals) @@ -29,4 +30,4 @@ def multinomial_sample_2d(pvals): index = multinomial_sample(pvals) # Convert 1d index to 2d and return indices tuple ncols = pvals.shape[1] - return (index//ncols, index%ncols) + return (index // ncols, index % ncols) diff --git a/Pedigree.py b/Pedigree.py index 202fbd8..71100b0 100644 --- a/Pedigree.py +++ b/Pedigree.py @@ -1,7 +1,7 @@ import numpy as np -import numba import sys -from numba import jit, int8, int64, boolean, deferred_type, optional, float32, double +import re +from numba import jit, int64 from numba.experimental import jitclass from collections import OrderedDict @@ -9,8 +9,10 @@ from . import ProbMath from . import MultiThreadIO + class Family(object): """Family is a container for fullsib families""" + def __init__(self, idn, sire, dam, offspring): self.idn = idn self.sire = sire @@ -22,7 +24,7 @@ def __init__(self, idn, sire, dam, offspring): self.sire.families.append(self) self.dam.families.append(self) - def addChild(self, child) : + def addChild(self, child): self.offspring.append(child) def toJit(self): @@ -32,10 +34,11 @@ def toJit(self): spec = OrderedDict() -spec['idn'] = int64 -spec['sire'] = int64 -spec['dam'] = int64 -spec['offspring'] = int64[:] +spec["idn"] = int64 +spec["sire"] = int64 +spec["dam"] = int64 +spec["offspring"] = int64[:] + @jitclass(spec) class jit_Family(object): @@ -45,10 +48,9 @@ def __init__(self, idn, sire, dam, offspring): self.dam = dam self.offspring = offspring.astype(np.int64) -class Individual(object): - - def __init__(self, idx, idn, MetaFounder=None) : +class Individual(object): + def __init__(self, idx, idn, MetaFounder=None): self.genotypes = None self.haplotypes = None self.dosages = None @@ -67,17 +69,20 @@ def __init__(self, idx, idn, MetaFounder=None) : # Info is here to provide other software to add in additional information to an Individual. self.info = None - #For plant impute. Inbred is either DH or heavily selfed. Ancestors is historical source of the cross (may be more than 2 way so can't handle via pedigree). + # For plant impute. Inbred is either DH or heavily selfed. Ancestors is historical source of the cross (may be more than 2 way so can't handle via pedigree). self.inbred = False - self.imputationAncestors = [] #This is a list of lists. Either length 0, length 1 or length 2. + self.imputationAncestors = ( + [] + ) # This is a list of lists. Either length 0, length 1 or length 2. self.selfingGeneration = None - self.sire = None self.dam = None - self.idx = idx # User inputed string identifier - self.idn = idn # ID number assigned by the pedigree - self.fileIndex = dict() # Position of an animal in each file when reading in. Used to make sure Input and Output order are the same. + self.idx = idx # User inputed string identifier + self.idn = idn # ID number assigned by the pedigree + self.fileIndex = ( + dict() + ) # Position of an animal in each file when reading in. Used to make sure Input and Output order are the same. self.fileIndex["id"] = idx self.dummy = False @@ -93,12 +98,12 @@ def __init__(self, idx, idn, MetaFounder=None) : # but need high_density_threshold to be set in the pedigree first self.is_high_density = False - self.genotypedFounderStatus = None #? + self.genotypedFounderStatus = None # ? self.MetaFounder = [MetaFounder] if MetaFounder is not None else None self.phenotype = None - + def __eq__(self, other): return self is other @@ -115,30 +120,38 @@ def subset(self, start, stop): # print(self.__class__) new_ind = self.__class__(self.idx, self.idn) if self.genotypes is not None: - new_ind.genotypes = self.genotypes[start:stop].copy() # Maybe could get away with not doing copies... doing them just to be safe. + new_ind.genotypes = self.genotypes[ + start:stop + ].copy() # Maybe could get away with not doing copies... doing them just to be safe. if self.haplotypes is not None: - new_ind.haplotypes = (self.haplotypes[0][start:stop].copy(), self.haplotypes[1][start:stop].copy()) + new_ind.haplotypes = ( + self.haplotypes[0][start:stop].copy(), + self.haplotypes[1][start:stop].copy(), + ) if self.reads is not None: - new_ind.reads = (self.reads[0][start:stop].copy(), self.reads[1][start:stop].copy()) + new_ind.reads = ( + self.reads[0][start:stop].copy(), + self.reads[1][start:stop].copy(), + ) return new_ind - + if self.phenotype is not None: new_ind.phenotype = self.phenotype.copy() - + def getPercentMissing(self): return np.mean(self.genotypes == 9) def getGeneration(self): + if self.generation is not None: + return self.generation - if self.generation is not None : return self.generation - - if self.dam is None: + if self.dam is None: damGen = -1 else: damGen = self.dam.getGeneration() - if self.sire is None: + if self.sire is None: sireGen = -1 else: sireGen = self.sire.getGeneration() @@ -146,68 +159,82 @@ def getGeneration(self): self.generation = max(sireGen, damGen) + 1 return self.generation - - def constructInfo(self, nLoci, genotypes = True, haps = False, reads = False) : + def constructInfo(self, nLoci, genotypes=True, haps=False, reads=False): if genotypes and self.genotypes is None: - self.genotypes = np.full(nLoci, 9, dtype = np.int8) - - if haps and self.haplotypes is None: - self.haplotypes = (np.full(nLoci, 9, dtype = np.int8), np.full(nLoci, 9, dtype = np.int8)) + self.genotypes = np.full(nLoci, 9, dtype=np.int8) + + if haps and self.haplotypes is None: + self.haplotypes = ( + np.full(nLoci, 9, dtype=np.int8), + np.full(nLoci, 9, dtype=np.int8), + ) if reads and self.reads is None: - self.reads = (np.full(nLoci, 0, dtype = np.int64), np.full(nLoci, 0, dtype = np.int64)) - + self.reads = ( + np.full(nLoci, 0, dtype=np.int64), + np.full(nLoci, 0, dtype=np.int64), + ) + def isFounder(self): return (self.sire is None) and (self.dam is None) def getGenotypedFounderStatus(self): # options: 1: "GenotypedFounder", 0:"ChildOfNonGenotyped", 2:"ChildOfGenotyped" if self.genotypedFounderStatus is None: - if self.isFounder() : + if self.isFounder(): if self.genotypes is None or np.all(self.genotypes == 9): - self.genotypedFounderStatus = 0 + self.genotypedFounderStatus = 0 else: self.genotypedFounderStatus = 1 else: - parentStatus = max(self.sire.getGenotypedFounderStatus(), self.dam.getGenotypedFounderStatus()) + parentStatus = max( + self.sire.getGenotypedFounderStatus(), + self.dam.getGenotypedFounderStatus(), + ) if parentStatus > 0: self.genotypedFounderStatus = 2 else: if self.genotypes is None or np.all(self.genotypes == 9): - self.genotypedFounderStatus = 0 + self.genotypedFounderStatus = 0 else: self.genotypedFounderStatus = 1 return self.genotypedFounderStatus + def isGenotypedFounder(self): - return (self.getGenotypedFounderStatus() == 1) + return self.getGenotypedFounderStatus() == 1 class PlantImputeIndividual(Individual): """Simple derived class for AlphaPlantImpute2 with some extra member variables""" + def __init__(self, idx, idn): super().__init__(idx, idn) self.founders = [] self.descendants = [] -# Not sure of the code source: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/ -# Slightly modified. -import re -def sorted_nicely( l , key): - """ Sort the given iterable in the way that humans expect.""" - return sorted(l, key = lambda k: alphanum_key(key(k))) +def sorted_nicely(list_to_sort, key): + """Sort the given iterable in the way that humans expect. + Modified from https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/ + + + """ + return sorted(list_to_sort, key=lambda k: alphanum_key(key(k))) + def alphanum_key(k): - convert = lambda text: int(text) if text.isdigit() else text - return [ convert(c) for c in re.split('([0-9]+)', str(k)) ] + """Modified from https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/""" + def convert(text): + return int(text) if text.isdigit() else text -class Generation(object): + return [convert(c) for c in re.split("([0-9]+)", str(k))] - def __init__(self, number): +class Generation(object): + def __init__(self, number): self.number = number self.families = [] @@ -229,11 +256,8 @@ def add_family(self, fam): # Note: Individuals are added seperately. - class Pedigree(object): - - def __init__(self, fileName = None, constructor = Individual): - + def __init__(self, fileName=None, constructor=Individual): self.maxIdn = 0 self.maxFam = 0 @@ -241,15 +265,17 @@ def __init__(self, fileName = None, constructor = Individual): self.families = None self.constructor = constructor self.nGenerations = 0 - self.generations = None #List of lists + self.generations = None # List of lists self.truePed = None self.nLoci = 0 - + self.startsnp = 0 self.endsnp = self.nLoci - self.referencePanel = [] #This should be an array of haplotypes. Or a dictionary? + self.referencePanel = ( + [] + ) # This should be an array of haplotypes. Or a dictionary? self.MainMetaFounder = None self.AAP = {} @@ -258,7 +284,7 @@ def __init__(self, fileName = None, constructor = Individual): self.nPheno = 0 # remove? - self.maf=None #Maf is the frequency of 2s. + self.maf = None # Maf is the frequency of 2s. # Threshold that determines if an individual is high-density genotyped self.high_density_threshold = 0.9 @@ -272,7 +298,6 @@ def __init__(self, fileName = None, constructor = Individual): self.allele_coding = None def reset_families(self): - for ind in self.individuals.values(): ind.families = [] ind.generation = None @@ -286,7 +311,7 @@ def reset_families(self): self.setupFamilies() def subset(self, start, stop): - new_pedigree = Pedigree(constructor = self.constructor) + new_pedigree = Pedigree(constructor=self.constructor) new_pedigree.nLoci = stop - start # Add all of the individuals. @@ -299,7 +324,6 @@ def subset(self, start, stop): new_ind = new_pedigree[ind.idx] new_sire = new_pedigree[ind.sire.idx] - # Add individuals new_ind.sire = new_sire new_sire.offspring.append(new_ind) @@ -315,46 +339,60 @@ def subset(self, start, stop): def merge(self, new_pedigree, start, stop): # This just merged genotype, haplotype, and dosage information (if availible). - + for ind in self: new_ind = new_pedigree[ind.idx] if new_ind.genotypes is not None: if ind.genotypes is None: - ind.genotypes = np.full(self.nLoci, 9, dtype = np.int8) + ind.genotypes = np.full(self.nLoci, 9, dtype=np.int8) ind.genotypes[start:stop] = new_ind.genotypes - + if new_ind.dosages is not None: if ind.dosages is None: - ind.dosages = np.full(self.nLoci, -1, dtype = np.float32) + ind.dosages = np.full(self.nLoci, -1, dtype=np.float32) ind.dosages[start:stop] = new_ind.dosages - + if new_ind.haplotypes is not None: if ind.haplotypes is None: - ind.haplotypes = (np.full(self.nLoci, 9, dtype = np.int8), np.full(self.nLoci, 9, dtype = np.int8)) + ind.haplotypes = ( + np.full(self.nLoci, 9, dtype=np.int8), + np.full(self.nLoci, 9, dtype=np.int8), + ) ind.haplotypes[0][start:stop] = new_ind.haplotypes[0] ind.haplotypes[1][start:stop] = new_ind.haplotypes[1] - - def __len__(self): return len(self.individuals) - def writeOrder(self): if self.writeOrderList is None: - inds = [ind for ind in self if (not ind.dummy) and (self.args.writekey in ind.fileIndex)] - self.writeOrderList = sorted_nicely(inds, key = lambda ind: ind.fileIndex[self.args.writekey]) - + inds = [ + ind + for ind in self + if (not ind.dummy) and (self.args.writekey in ind.fileIndex) + ] + self.writeOrderList = sorted_nicely( + inds, key=lambda ind: ind.fileIndex[self.args.writekey] + ) + if not self.args.onlykeyed: - indsNoDummyNoFileIndex = [ind for ind in self if (not ind.dummy) and (not self.args.writekey in ind.fileIndex)] - self.writeOrderList.extend(sorted_nicely(indsNoDummyNoFileIndex, key = lambda ind: ind.idx)) - + indsNoDummyNoFileIndex = [ + ind + for ind in self + if (not ind.dummy) and (self.args.writekey not in ind.fileIndex) + ] + self.writeOrderList.extend( + sorted_nicely(indsNoDummyNoFileIndex, key=lambda ind: ind.idx) + ) + dummys = [ind for ind in self if ind.dummy] - self.writeOrderList.extend(sorted_nicely(dummys, key = lambda ind: ind.idx)) + self.writeOrderList.extend( + sorted_nicely(dummys, key=lambda ind: ind.idx) + ) - for ind in self.writeOrderList : + for ind in self.writeOrderList: yield (ind.idx, ind) def setMaf(self): @@ -362,53 +400,51 @@ def setMaf(self): # The default values of 1 (maf) and 2 (counts) provide a sensible prior # For example, a locus where all individuals are missing, the MAF will be 0.5 - maf = np.full(self.nLoci, 1, dtype = np.float32) - counts = np.full(self.nLoci, 2, dtype = np.float32) + maf = np.full(self.nLoci, 1, dtype=np.float32) + counts = np.full(self.nLoci, 2, dtype=np.float32) for ind in self.individuals.values(): if ind.genotypes is not None: addIfNotMissing(maf, counts, ind.genotypes) - - self.maf = maf/counts + + self.maf = maf / counts def getMissingness(self): - missingness = np.full(self.nLoci, 1, dtype = np.float32) + missingness = np.full(self.nLoci, 1, dtype=np.float32) counts = 0 for ind in self.individuals.values(): if ind.genotypes is not None: counts += 1 addIfMissing(missingness, ind.genotypes) - return missingness/counts - + return missingness / counts def set_high_density(self): """Set whether each individual is high-density""" - for individual in self:#.individuals.values(): - is_high_density = np.mean(individual.genotypes != 9) >= self.high_density_threshold + for individual in self: # .individuals.values(): + is_high_density = ( + np.mean(individual.genotypes != 9) >= self.high_density_threshold + ) if is_high_density: individual.is_high_density = True - - def fillIn(self, genotypes = True, haps = False, reads = False): - + def fillIn(self, genotypes=True, haps=False, reads=False): for individual in self: - individual.constructInfo(self.nLoci, genotypes = True, haps = haps, reads = reads) + individual.constructInfo(self.nLoci, genotypes=True, haps=haps, reads=reads) - - def __getitem__(self, key) : + def __getitem__(self, key): return self.individuals[key] def __setitem__(self, key, value): self.individuals[key] = value - def __iter__(self) : + def __iter__(self): if self.generations is None: self.setUpGenerations() for gen in self.generations: for ind in gen.individuals: yield ind - def __reversed__(self) : + def __reversed__(self): if self.generations is None: self.setUpGenerations() for gen in reversed(self.generations): @@ -416,21 +452,25 @@ def __reversed__(self) : yield ind def sort_individuals(self, individuals): - - return {k:v for k, v in sorted(individuals.items(), key = lambda pair: alphanum_key(pair[0]))} + return { + k: v + for k, v in sorted( + individuals.items(), key=lambda pair: alphanum_key(pair[0]) + ) + } # Generation code - def setUpGenerations(self) : + def setUpGenerations(self): # Try and make pedigree independent. self.individuals = self.sort_individuals(self.individuals) self.nGenerations = 0 - #We can't use a simple iterator over self here, becuase __iter__ calls this function. + # We can't use a simple iterator over self here, becuase __iter__ calls this function. for idx, ind in self.individuals.items(): gen = ind.getGeneration() self.nGenerations = max(gen, self.nGenerations) - self.nGenerations += 1 #To account for generation 0. + self.nGenerations += 1 # To account for generation 0. self.generations = [Generation(i) for i in range(self.nGenerations)] @@ -438,9 +478,8 @@ def setUpGenerations(self) : gen = ind.getGeneration() self.generations[gen].add_individual(ind) - #This is really sloppy, but probably not important. - def setupFamilies(self) : - + # This is really sloppy, but probably not important. + def setupFamilies(self): if self.generations is None: self.setUpGenerations() @@ -448,35 +487,36 @@ def setupFamilies(self) : for ind in self: if not ind.isFounder(): parents = (ind.sire.idx, ind.dam.idx) - if parents in self.families : + if parents in self.families: self.families[parents].addChild(ind) else: - self.families[parents] = Family(self.maxFam, ind.sire, ind.dam, [ind]) + self.families[parents] = Family( + self.maxFam, ind.sire, ind.dam, [ind] + ) self.maxFam += 1 - for family in self.families.values(): self.generations[family.generation].add_family(family) - - def getFamilies(self, rev = False) : + def getFamilies(self, rev=False): if self.generations is None: self.setUpGenerations() if self.families is None: self.setupFamilies() gens = range(0, len(self.generations)) - if rev: gens = reversed(gens) + if rev: + gens = reversed(gens) for i in gens: for family in self.generations[i].families: yield family - - - def getIndividual(self, idx) : + def getIndividual(self, idx): if idx not in self.individuals: - self.individuals[idx] = self.constructor(idx, self.maxIdn, MetaFounder=self.MainMetaFounder) + self.individuals[idx] = self.constructor( + idx, self.maxIdn, MetaFounder=self.MainMetaFounder + ) self.maxIdn += 1 self.generations = None return self.individuals[idx] @@ -493,7 +533,7 @@ def readInPlantInfo(self, fileName): for line in lines: parts = line.split() - idx = parts[0]; + idx = parts[0] if idx not in self.individuals: self.individuals[idx] = self.constructor(idx, self.maxIdn) @@ -503,7 +543,7 @@ def readInPlantInfo(self, fileName): if len(parts) > 1: if parts[1] == "DH" or parts[1] == "INBRED": ind.inbred = True - elif parts[1][0] == "S" : + elif parts[1][0] == "S": ind.inbred = False ind.selfingGeneration = int(parts[1][1:]) else: @@ -517,7 +557,6 @@ def readInPlantInfo(self, fileName): else: self.addAncestors(ind, parts[2:]) - def addAncestors(self, ind, parts): ancestors = [] for idx in parts: @@ -528,7 +567,6 @@ def addAncestors(self, ind, parts): ancestors.append(ancestor) ind.imputationAncestors.append(ancestors) - def readInPedigreeFromList(self, pedList): index = 0 for parts in pedList: @@ -536,17 +574,22 @@ def readInPedigreeFromList(self, pedList): # check if the individual is a metafounder if idx is not None and idx[:3] == "MF_": - print(f"ERROR: Individual {idx} uses the prefix 'MF_' which is reserved for metafounders and cannot be used for an individual's id. \nExiting...") + print( + f"ERROR: Individual {idx} uses the prefix 'MF_' which is reserved for metafounders and cannot be used for an individual's id. \nExiting..." + ) sys.exit(2) # All individuals (and dummy individuals) in the pedigree are assigned the default metafounder. # This is then updated using the pedigree input for the individuals in the base population (founders). # But, all descendants will still have the default metafounder. # When required, we can work with the Metafounder just for a founding individual using a condition like - # if ind.MetaFounder == mfx and ind.isFounder(): - self.individuals[idx] = self.constructor(idx, self.maxIdn, MetaFounder=self.MainMetaFounder) + # if ind.MetaFounder == mfx and ind.isFounder(): + self.individuals[idx] = self.constructor( + idx, self.maxIdn, MetaFounder=self.MainMetaFounder + ) self.maxIdn += 1 - self.individuals[idx].fileIndex['pedigree'] = index; index += 1 + self.individuals[idx].fileIndex["pedigree"] = index + index += 1 for parts in pedList: idx = parts[0] @@ -559,18 +602,18 @@ def readInPedigreeFromList(self, pedList): sireID = None MFP = False else: - MFP = (sireID[:3] == "MF_") + MFP = sireID[:3] == "MF_" if damID == "0": damID = None MFM = False else: - MFM = (damID[:3] == "MF_") + MFM = damID[:3] == "MF_" if sireID or damID: # check if one of the parents is a metafounder - if (MFP or MFM): + if MFP or MFM: # check if both of the parents are metafounders - if (MFP and MFM): + if MFP and MFM: # check if the metafounders match if sireID == damID: # Overwrite the default metafounder. @@ -578,17 +621,21 @@ def readInPedigreeFromList(self, pedList): else: ind.MetaFounder = [sireID, damID] else: - print(f"ERROR: Both parents must be metafounders if one is a metafounder. For individual {idx} the parents were {sireID} and {damID}.\nConsider using a dummy individual for the metafounder.\nExiting...") + print( + f"ERROR: Both parents must be metafounders if one is a metafounder. For individual {idx} the parents were {sireID} and {damID}.\nConsider using a dummy individual for the metafounder.\nExiting..." + ) sys.exit(2) else: if sireID is None: sireID = "FatherOf" + idx elif damID is None: damID = "MotherOf" + idx - + if sireID and not MFP: if sireID not in self.individuals: - self.individuals[sireID] = self.constructor(sireID, self.maxIdn, MetaFounder=self.MainMetaFounder) + self.individuals[sireID] = self.constructor( + sireID, self.maxIdn, MetaFounder=self.MainMetaFounder + ) sire = self.individuals[sireID] self.maxIdn += 1 sire.dummy = True @@ -600,7 +647,9 @@ def readInPedigreeFromList(self, pedList): if damID and not MFM: if damID not in self.individuals: - self.individuals[damID] = self.constructor(damID, self.maxIdn, MetaFounder=self.MainMetaFounder) + self.individuals[damID] = self.constructor( + damID, self.maxIdn, MetaFounder=self.MainMetaFounder + ) dam = self.individuals[damID] self.maxIdn += 1 dam.dummy = True @@ -612,11 +661,13 @@ def readInPedigreeFromList(self, pedList): # Optional fourth column contains sex OR inbred/outbred status if len(parts) > 3: - male, female = {'m', '0', 'xy'}, {'f', '1', 'xx'} - inbred, outbred = {'dh', 'inbred'}, {'outbred'} + male, female = {"m", "0", "xy"}, {"f", "1", "xx"} + inbred, outbred = {"dh", "inbred"}, {"outbred"} expected_entries = male | female | inbred | outbred if parts[3].lower() not in expected_entries: - print(f"ERROR: unexpected entry in pedigree file, fourth field: '{parts[3]}'\nExiting...") + print( + f"ERROR: unexpected entry in pedigree file, fourth field: '{parts[3]}'\nExiting..." + ) sys.exit(2) # Sex if parts[3].lower() in male: @@ -629,20 +680,23 @@ def readInPedigreeFromList(self, pedList): elif parts[3].lower() in outbred: ind.inbred = False - - def readInFromPlink(self, idList, pedList, bed, externalPedigree = False): + def readInFromPlink(self, idList, pedList, bed, externalPedigree=False): index = 0 if not externalPedigree: self.readInPedigreeFromList(pedList) - + for i, idx in enumerate(idList): - genotypes=bed[:, i].copy() ##I think this is the right order. Doing the copy to be safe. + genotypes = bed[ + :, i + ].copy() # I think this is the right order. Doing the copy to be safe. nLoci = len(genotypes) if self.nLoci == 0: self.nLoci = nLoci if self.nLoci != nLoci: - print(f"ERROR: incorrect number of loci when reading in plink file. Expected {self.nLoci} got {nLoci}.\nExiting...") + print( + f"ERROR: incorrect number of loci when reading in plink file. Expected {self.nLoci} got {nLoci}.\nExiting..." + ) sys.exit(2) if idx not in self.individuals: self.individuals[idx] = self.constructor(idx, self.maxIdn) @@ -652,41 +706,55 @@ def readInFromPlink(self, idList, pedList, bed, externalPedigree = False): ind.constructInfo(nLoci, genotypes=True) ind.genotypes = genotypes - ind.fileIndex['plink'] = index; index += 1 + ind.fileIndex["plink"] = index + index += 1 - if np.mean(genotypes == 9) < .1 : + if np.mean(genotypes == 9) < 0.1: ind.initHD = True - - - def check_line(self, id_data, fileName, idxExpected=None, ncol=None, getInd=True, even_cols=False): + def check_line( + self, + id_data, + fileName, + idxExpected=None, + ncol=None, + getInd=True, + even_cols=False, + ): idx, data = id_data if idxExpected is not None and idx != idxExpected: - print(f"ERROR: Expected individual {idxExpected} but got individual {idx}.\nExiting...") + print( + f"ERROR: Expected individual {idxExpected} but got individual {idx}.\nExiting..." + ) sys.exit(2) if ncol is None: ncol = len(data) if ncol != len(data): - print(f"ERROR: incorrect number of columns in {fileName}. Expected {ncol} values but got {len(data)} for individual {idx}.\nExiting...") + print( + f"ERROR: incorrect number of columns in {fileName}. Expected {ncol} values but got {len(data)} for individual {idx}.\nExiting..." + ) sys.exit(2) if even_cols and ncol % 2 != 0: - print(f"ERROR: file {fileName} doesn't contain an even number of allele columns for individual {idx}.\nExiting...") + print( + f"ERROR: file {fileName} doesn't contain an even number of allele columns for individual {idx}.\nExiting..." + ) sys.exit(2) - + nLoci = len(data) if self.nLoci == 0: self.nLoci = nLoci if self.nLoci != nLoci: - print(f"ERROR: inconsistent number of markers or alleles in {fileName}. Expected {self.nLoci} got {nLoci}.") + print( + f"ERROR: inconsistent number of markers or alleles in {fileName}. Expected {self.nLoci} got {nLoci}." + ) sys.exit(2) ind = None - if getInd : + if getInd: ind = self.getIndividual(idx) - return ind, data, ncol - + return ind, data, ncol def update_allele_coding(self, alleles): """Update allele codings with new alleles @@ -709,33 +777,35 @@ def update_allele_coding(self, alleles): return # Update any missing entries in self.allele_coding[0] - mask = self.allele_coding[0] == b'0' + mask = self.allele_coding[0] == b"0" self.allele_coding[0, mask] = alleles[mask] # Update entries in self.allele_coding[1] if: # - the alleles have not already been seen in self.allele_coding[0] # - and the entry (in self.allele_coding[1]) is missing - mask = self.allele_coding[1] == b'0' + mask = self.allele_coding[1] == b"0" mask &= self.allele_coding[0] != alleles self.allele_coding[1, mask] = alleles[mask] # Check for > 2 alleles at any loci. These are: # - alleles that are not missing # - alleles that are not in either of self.allele_coding[0] or self.allele_coding[1] - mask = alleles != b'0' - mask &= ((alleles != self.allele_coding[0]) & (alleles != self.allele_coding[1])) # poss speedup alleles != self.allele_coding[0] done above + mask = alleles != b"0" + mask &= (alleles != self.allele_coding[0]) & ( + alleles != self.allele_coding[1] + ) # poss speedup alleles != self.allele_coding[0] done above if np.sum(mask) > 0: - print(f'ERROR: more than two alleles found in input file(s) at loci {np.flatnonzero(mask)}\nExiting...') + print( + f"ERROR: more than two alleles found in input file(s) at loci {np.flatnonzero(mask)}\nExiting..." + ) sys.exit(2) - def allele_coding_complete(self): """Check whether the allele coding is complete (contains no missing values)""" if self.allele_coding is None: return False else: - return np.sum(self.allele_coding == b'0') == 0 - + return np.sum(self.allele_coding == b"0") == 0 def decode_alleles(self, alleles): """Decode PLINK plain text alleles to AlphaGenes genotypes or haplotypes @@ -744,10 +814,10 @@ def decode_alleles(self, alleles): # 'Double' self.allele_coding as there are two allele columns at each locus coding = np.repeat(self.allele_coding, 2, axis=1) - decoded = np.full_like(alleles, b'0', dtype=np.int8) + decoded = np.full_like(alleles, b"0", dtype=np.int8) decoded[alleles == coding[0]] = 0 # set alleles coded as 0 decoded[alleles == coding[1]] = 1 # set alleles coded as 1 - decoded[alleles == b'0'] = 9 # convert missing (b'0' -> 9) + decoded[alleles == b"0"] = 9 # convert missing (b'0' -> 9) # Extract haplotypes decoded = np.atleast_2d(decoded) @@ -756,71 +826,79 @@ def decode_alleles(self, alleles): haplotypes = np.full((n_haps, n_loci), 9, dtype=np.int8) haplotypes[::2] = decoded[:, ::2] haplotypes[1::2] = decoded[:, 1::2] - + # Convert to genotypes genotypes = decoded[:, ::2] + decoded[:, 1::2] genotypes[genotypes > 9] = 9 # reset missing values return genotypes.squeeze(), haplotypes - def encode_alleles(self, haplotypes): """Encode haplotypes as PLINK plain text handles any even number of haplotypes with shape (n_individuals*2, n_loci)""" assert len(haplotypes) % 2 == 0 - assert len(haplotypes[0])== self.nLoci + assert len(haplotypes[0]) == self.nLoci # 'Double' self.allele_coding as there are two allele columns at each locus in PLINK format coding = np.repeat(self.allele_coding, 2, axis=1) # Encoded array is 'reshaped' - one individual per line, each locus is a pair of alleles - encoded = np.full((len(haplotypes)//2, self.nLoci*2), b'0', dtype=np.bytes_) + encoded = np.full((len(haplotypes) // 2, self.nLoci * 2), b"0", dtype=np.bytes_) # 'Splice' haplotypes into (adjacent) pairs of alleles encoded[:, ::2] = haplotypes[::2] encoded[:, 1::2] = haplotypes[1::2] # Encode - mask0 = encoded == b'0' # major alleles (0) - mask1 = encoded == b'1' # minor alleles (1) - mask9 = encoded == b'9' # missing (9) + mask0 = encoded == b"0" # major alleles (0) + mask1 = encoded == b"1" # minor alleles (1) + mask9 = encoded == b"9" # missing (9) encoded[mask0] = np.broadcast_to(coding[0], encoded.shape)[mask0] encoded[mask1] = np.broadcast_to(coding[1], encoded.shape)[mask1] - encoded[mask9] = b'0' + encoded[mask9] = b"0" return encoded.squeeze() - def check_allele_coding(self, filename): """Check coding is sensible""" - # Monoallelic loci: + # Monoallelic loci: # allele_coding[0] filled, but allele_coding[1] unfilled, i.e. coding[1] == b'0' - n_monoallelic = (self.allele_coding[1] == b'0').sum() + n_monoallelic = (self.allele_coding[1] == b"0").sum() # Unusual letters - unusual = ~np.isin(self.allele_coding, [b'A', b'C', b'G', b'T', b'0']) # unexpected letters + unusual = ~np.isin( + self.allele_coding, [b"A", b"C", b"G", b"T", b"0"] + ) # unexpected letters if np.sum(unusual) > 0: - letters = ' '.join(np.unique(self.allele_coding[unusual].astype(str))) - print(f'ERROR: unexpected values found in {filename}: [{letters}].\n' - f'Please check the file is in PLINK .ped format\nExiting...') + letters = " ".join(np.unique(self.allele_coding[unusual].astype(str))) + print( + f"ERROR: unexpected values found in {filename}: [{letters}].\n" + f"Please check the file is in PLINK .ped format\nExiting..." + ) sys.exit(2) elif n_monoallelic > 0: - print(f'WARNING: allele coding from {filename} has {n_monoallelic} monoallelic loci') + print( + f"WARNING: allele coding from {filename} has {n_monoallelic} monoallelic loci" + ) else: # Reassuring message if tests pass - print('Allele coding OK') - + print("Allele coding OK") - def readInPed(self, filename, startsnp=None, stopsnp=None, haps=False, update_coding=False): + def readInPed( + self, filename, startsnp=None, stopsnp=None, haps=False, update_coding=False + ): """Read in genotypes, and optionally haplotypes, from a PLINK plain text formated file, usually .ped If update_coding is True, the allele coding is interpreted from the .ped file and any coding in self.allele_coding is updated (if the coding is incomplete). - Note: to force a full read of the allele coding set self.allele_codine = None first""" + Note: to force a full read of the allele coding set self.allele_codine = None first + """ - print(f'Reading in PLINK .ped format: {filename}') + print(f"Reading in PLINK .ped format: {filename}") # Check the allele coding is to be got from file or is provided via self.allele_coding if not update_coding and self.allele_coding is None: - raise ValueError('readInPed () called with no allele coding') + raise ValueError("readInPed () called with no allele coding") - data_list = MultiThreadIO.readLinesPlinkPlainTxt(filename, startsnp=startsnp, stopsnp=stopsnp, dtype=np.bytes_) + data_list = MultiThreadIO.readLinesPlinkPlainTxt( + filename, startsnp=startsnp, stopsnp=stopsnp, dtype=np.bytes_ + ) index = 0 ncol = None if self.nLoci != 0: @@ -831,24 +909,29 @@ def readInPed(self, filename, startsnp=None, stopsnp=None, haps=False, update_co if not self.allele_coding_complete(): if self.allele_coding is None: - print(f'Interpreting allele coding from {filename}') + print(f"Interpreting allele coding from {filename}") else: - print(f'Updating allele coding with coding from {filename}') + print(f"Updating allele coding with coding from {filename}") for value in data_list: - ind, alleles, ncol = self.check_line(value, filename, idxExpected=None, ncol=ncol, even_cols=True) + ind, alleles, ncol = self.check_line( + value, filename, idxExpected=None, ncol=ncol, even_cols=True + ) ind.constructInfo(self.nLoci, genotypes=True) - ind.fileIndex['plink'] = index; index += 1 + ind.fileIndex["plink"] = index + index += 1 if update_coding: # Initialise allele coding array if none exists # read_or_create = 'Reading' if self.allele_coding is None else 'Updating' if self.allele_coding is None: - self.allele_coding = np.full((2, self.nLoci//2), b'0', dtype=np.bytes_) + self.allele_coding = np.full( + (2, self.nLoci // 2), b"0", dtype=np.bytes_ + ) # Update allele codes # print(f'{read_or_create} allele coding with coding from {filename}') - self.update_allele_coding(alleles[::2]) # first allele in each pair + self.update_allele_coding(alleles[::2]) # first allele in each pair self.update_allele_coding(alleles[1::2]) # second allele # Decode haplotypes and genotypes @@ -856,42 +939,47 @@ def readInPed(self, filename, startsnp=None, stopsnp=None, haps=False, update_co if haps: ind.haplotypes = haplotypes - if np.mean(ind.genotypes == 9) < .1: + if np.mean(ind.genotypes == 9) < 0.1: ind.initHD = True # Check allele coding self.check_allele_coding(filename) # Reset nLoci back to its undoubled state - self.nLoci = self.nLoci//2 - - - def readInGenotypes(self, fileName, startsnp=None, stopsnp = None): + self.nLoci = self.nLoci // 2 + def readInGenotypes(self, fileName, startsnp=None, stopsnp=None): print("Reading in AlphaGenes format:", fileName) index = 0 ncol = None - data_list = MultiThreadIO.readLines(fileName, startsnp = startsnp, stopsnp = stopsnp, dtype = np.int8) + data_list = MultiThreadIO.readLines( + fileName, startsnp=startsnp, stopsnp=stopsnp, dtype=np.int8 + ) for value in data_list: - ind, genotypes, ncol = self.check_line(value, fileName, idxExpected = None, ncol = ncol) + ind, genotypes, ncol = self.check_line( + value, fileName, idxExpected=None, ncol=ncol + ) ind.constructInfo(self.nLoci, genotypes=True) ind.genotypes = genotypes - ind.fileIndex['genotypes'] = index; index += 1 + ind.fileIndex["genotypes"] = index + index += 1 - if np.mean(genotypes == 9) < .1 : + if np.mean(genotypes == 9) < 0.1: ind.initHD = True def readInPhenotype(self, fileName): """function for reading in phenotype input - + :param fileName: The file path :type fileName: str """ - data_list = MultiThreadIO.readLines(fileName, startsnp=None, stopsnp=None, dtype = np.float32) + data_list = MultiThreadIO.readLines( + fileName, startsnp=None, stopsnp=None, dtype=np.float32 + ) for value in data_list: idx, pheno = value @@ -901,124 +989,145 @@ def readInPhenotype(self, fileName): if self.nPheno == 0: self.nPheno = nPheno if self.nPheno != nPheno: - print(f"ERROR: inconsistent number of phenotypes when reading in phenotype file. Expected {self.nPheno} got {nPheno}.\nExiting...") + print( + f"ERROR: inconsistent number of phenotypes when reading in phenotype file. Expected {self.nPheno} got {nPheno}.\nExiting..." + ) sys.exit(2) if idx not in self.individuals: self.individuals[idx] = self.constructor(idx, self.maxIdn) self.maxIdn += 1 ind = self.individuals[idx] - + # List to store repeated phenotype records for the same trait. - if ind.phenotype == None: + if ind.phenotype is None: ind.phenotype = [] - ind.phenotype.append(np.full(self.nPheno, pheno, dtype = np.int8)) - + ind.phenotype.append(np.full(self.nPheno, pheno, dtype=np.int8)) def readInPhenotypePenetrance(self, fileName): """ function for reading in the phenotype penetrance input - + :param fileName: The file path :type filename: str """ - self.phenoPenetrance = np.loadtxt(fileName, dtype = np.float32) - self.phenoPenetrance = self.phenoPenetrance / np.sum(self.phenoPenetrance, 1)[:,None] # Normalising of the phenotype penetrance input so all rows sum to 1. - - - def readInReferencePanel(self, fileName, startsnp=None, stopsnp = None): + self.phenoPenetrance = np.loadtxt(fileName, dtype=np.float32) + self.phenoPenetrance = ( + self.phenoPenetrance / np.sum(self.phenoPenetrance, 1)[:, None] + ) # Normalising of the phenotype penetrance input so all rows sum to 1. + def readInReferencePanel(self, fileName, startsnp=None, stopsnp=None): print("Reading in reference panel:", fileName) index = 0 ncol = None - data_list = MultiThreadIO.readLines(fileName, startsnp = startsnp, stopsnp = stopsnp, dtype = np.int8) + data_list = MultiThreadIO.readLines( + fileName, startsnp=startsnp, stopsnp=stopsnp, dtype=np.int8 + ) for value in data_list: - ind, haplotype, ncol = self.check_line(value, fileName, idxExpected = None, ncol = ncol, getInd=False) + ind, haplotype, ncol = self.check_line( + value, fileName, idxExpected=None, ncol=ncol, getInd=False + ) self.referencePanel.append(haplotype) - def readInPhase(self, fileName, startsnp=None, stopsnp = None): + def readInPhase(self, fileName, startsnp=None, stopsnp=None): index = 0 ncol = None - data_list = MultiThreadIO.readLines(fileName, startsnp = startsnp, stopsnp = stopsnp, dtype = np.int8) + data_list = MultiThreadIO.readLines( + fileName, startsnp=startsnp, stopsnp=stopsnp, dtype=np.int8 + ) e = 0 currentInd = None for value in data_list: - - if e == 0: + if e == 0: idxExpected = None else: idxExpected = currentInd.idx - ind, haplotype, ncol = self.check_line(value, fileName, idxExpected = idxExpected, ncol = ncol) + ind, haplotype, ncol = self.check_line( + value, fileName, idxExpected=idxExpected, ncol=ncol + ) currentInd = ind ind.constructInfo(self.nLoci, haps=True) ind.haplotypes[e][:] = haplotype - e = 1-e + e = 1 - e - ind.fileIndex['phase'] = index; index += 1 + ind.fileIndex["phase"] = index + index += 1 - - def readInSequence(self, fileName, startsnp=None, stopsnp = None): + def readInSequence(self, fileName, startsnp=None, stopsnp=None): index = 0 ncol = None print("Reading in sequence data :", fileName) - - data_list = MultiThreadIO.readLines(fileName, startsnp = startsnp, stopsnp = stopsnp, dtype = np.int64) + + data_list = MultiThreadIO.readLines( + fileName, startsnp=startsnp, stopsnp=stopsnp, dtype=np.int64 + ) e = 0 currentInd = None for value in data_list: - if e == 0: + if e == 0: idxExpected = None else: idxExpected = currentInd.idx - ind, reads, ncol = self.check_line(value, fileName, idxExpected = idxExpected, ncol = ncol) + ind, reads, ncol = self.check_line( + value, fileName, idxExpected=idxExpected, ncol=ncol + ) currentInd = ind ind.constructInfo(self.nLoci, reads=True) - ind.fileIndex['sequence'] = index; index += 1 + ind.fileIndex["sequence"] = index + index += 1 ind.reads[e][:] = reads - e = 1-e + e = 1 - e def readInAAP(self, fileName): """function for reading alternative allele probability input :param fileName: The file path :type fileName: str - """ + """ with open(fileName) as f: lines = [line.strip() for line in f if line.strip()] if not lines: - print("ERROR: The `alt_allele_prob_file` is empty. Expected a header row like 'MF_1 MF_2 ...'.\nExiting...") + print( + "ERROR: The `alt_allele_prob_file` is empty. Expected a header row like 'MF_1 MF_2 ...'.\nExiting..." + ) sys.exit(2) header = lines[0].split() if not header: - print("ERROR: First row of the `alt_allele_prob_file` is empty. Expected metafounder IDs (e.g., 'MF_1 MF_2 ...') or a single column of alternative allele probabilities for each locus.\nExiting...") + print( + "ERROR: First row of the `alt_allele_prob_file` is empty. Expected metafounder IDs (e.g., 'MF_1 MF_2 ...') or a single column of alternative allele probabilities for each locus.\nExiting..." + ) sys.exit(2) # If the first row does not start with MF_, treat the file as a single column of loci. if not header[0].startswith("MF_"): if len(header) != 1: - print(f"ERROR: First row of the `alt_allele_prob_file` starts with '{header[0]}' but is not a valid metafounder ID.\n" - f"Expected a header like 'MF_1 MF_2 ...' or a single column of loci values.\nExiting...") + print( + f"ERROR: First row of the `alt_allele_prob_file` starts with '{header[0]}' but is not a valid metafounder ID.\n" + f"Expected a header like 'MF_1 MF_2 ...' or a single column of loci values.\nExiting..." + ) sys.exit(2) - + nLoci = self.nLoci if len(lines) != nLoci: - print(f"ERROR: Incorrect number of locus rows in the `alt_allele_prob_file`. Expected {nLoci} rows but found {len(lines)}.\nExiting...") + print( + f"ERROR: Incorrect number of locus rows in the `alt_allele_prob_file`. Expected {nLoci} rows but found {len(lines)}.\nExiting..." + ) sys.exit(2) - + # Check for NA/NaN values per locus missing_loci = [] data_values = [] @@ -1030,14 +1139,18 @@ def readInAAP(self, fileName): try: data_values.append(float(value)) except ValueError: - print(f"ERROR: Non-numeric value found at locus {i + 1} in the `alt_allele_prob_file`.\nExiting...") + print( + f"ERROR: Non-numeric value found at locus {i + 1} in the `alt_allele_prob_file`.\nExiting..." + ) sys.exit(2) - + # Report missing values if found if missing_loci: - print(f"ERROR: Missing values (NA/NaN) found in the `alt_allele_prob_file` at loci: {', '.join(map(str, missing_loci))}.\nIf the alternative allele probability is unknown, please use default of 0.5\nExiting...") + print( + f"ERROR: Missing values (NA/NaN) found in the `alt_allele_prob_file` at loci: {', '.join(map(str, missing_loci))}.\nIf the alternative allele probability is unknown, please use default of 0.5\nExiting..." + ) sys.exit(2) - + data = np.array(data_values, dtype=np.float32) self.AAP[self.MainMetaFounder] = data return @@ -1058,41 +1171,56 @@ def readInAAP(self, fileName): nDataRows = len(lines) - 1 if nDataRows != nLoci: if nDataRows < nLoci: - print(f"ERROR: Not all loci have an alternative allele frequency in the `-alt_allele_prob_file` input. Expected {nLoci} rows but found {nDataRows}.\nExiting...") + print( + f"ERROR: Not all loci have an alternative allele frequency in the `-alt_allele_prob_file` input. Expected {nLoci} rows but found {nDataRows}.\nExiting..." + ) else: - print(f"ERROR: The `-alt_allele_prob_file` input has more alternative allele probabilities than loci. Expected {nLoci} rows but found {nDataRows}.\nExiting...") + print( + f"ERROR: The `-alt_allele_prob_file` input has more alternative allele probabilities than loci. Expected {nLoci} rows but found {nDataRows}.\nExiting..." + ) sys.exit(2) aap_matrix = np.zeros((n_meta, nLoci), dtype=np.float32) missing_values_per_mf = {mfx: [] for mfx in metafounders} - + for i, line in enumerate(lines[1:]): parts = line.split() if len(parts) != n_meta: - print(f"ERROR: in the `alt_allele_prob_file`, locus row {i + 1} has {len(parts)} values but header has {n_meta} metafounders. If the alternative allele probabaility is unknown, please use default of 0.5\nExiting...") + print( + f"ERROR: in the `alt_allele_prob_file`, locus row {i + 1} has {len(parts)} values but header has {n_meta} metafounders. If the alternative allele probabaility is unknown, please use default of 0.5\nExiting..." + ) sys.exit(2) - + # Check for NA/NaN values per metafounder has_missing = False for col, value in enumerate(parts): if value.lower() in ("na", "nan"): missing_values_per_mf[metafounders[col]].append(i + 1) has_missing = True - + # Skip conversion if row has missing values if has_missing: continue - + try: aap_matrix[:, i] = np.array(parts, dtype=np.float32) except ValueError: - print(f"ERROR: Non-numeric value found in locus row {i + 1} of the `alt_allele_prob_file`. If the alternative allele probabaility is unknown, please use default of 0.5\nExiting....") + print( + f"ERROR: Non-numeric value found in locus row {i + 1} of the `alt_allele_prob_file`. If the alternative allele probabaility is unknown, please use default of 0.5\nExiting...." + ) sys.exit(2) - + # Report metafounders with missing values mfs_with_missing = [mfx for mfx, loci in missing_values_per_mf.items() if loci] if mfs_with_missing: - missing_summary = '; '.join([f"{mfx} (loci: {', '.join(map(str, missing_values_per_mf[mfx]))})" for mfx in mfs_with_missing]) - print(f"ERROR: Missing values (NA/NaN) found in the `alt_allele_prob_file`:\n{missing_summary}\nIf the alternative allele probability is unknown, please use default of 0.5\nExiting...") + missing_summary = "; ".join( + [ + f"{mfx} (loci: {', '.join(map(str, missing_values_per_mf[mfx]))})" + for mfx in mfs_with_missing + ] + ) + print( + f"ERROR: Missing values (NA/NaN) found in the `alt_allele_prob_file`:\n{missing_summary}\nIf the alternative allele probability is unknown, please use default of 0.5\nExiting..." + ) sys.exit(2) # flag of whether adding a default alternative allele probability @@ -1106,21 +1234,22 @@ def readInAAP(self, fileName): if default_aap: self.AAP[self.MainMetaFounder] = np.full(nLoci, 0.5, dtype=np.float32) - def callGenotypes(self, threshold): for idx, ind in self.writeOrder(): matrix = ProbMath.getGenotypeProbabilities_ind(ind, InputOutput.args) - - matrixCollapsedHets = np.array([matrix[0,:], matrix[1,:] + matrix[2,:], matrix[3,:]], dtype=np.float32) - calledGenotypes = np.argmax(matrixCollapsedHets, axis = 0) + + matrixCollapsedHets = np.array( + [matrix[0, :], matrix[1, :] + matrix[2, :], matrix[3, :]], + dtype=np.float32, + ) + calledGenotypes = np.argmax(matrixCollapsedHets, axis=0) setMissing(calledGenotypes, matrixCollapsedHets, threshold) if InputOutput.args.sexchrom and ind.sex == 0: doubleIfNotMissing(calledGenotypes) ind.genotypes = calledGenotypes - def writePedigree(self, outputFile): - with open(outputFile, 'w+') as f: + with open(outputFile, "w+") as f: for ind in self: sire = "0" if ind.sire is not None: @@ -1128,54 +1257,50 @@ def writePedigree(self, outputFile): dam = "0" if ind.dam is not None: dam = ind.dam.idx - f.write(ind.idx + ' ' + sire + ' ' + dam + '\n') - + f.write(ind.idx + " " + sire + " " + dam + "\n") def writeGenotypes(self, outputFile): - data_list = [] - for ind in self : - data_list.append( (ind.idx, ind.genotypes) ) + for ind in self: + data_list.append((ind.idx, ind.genotypes)) MultiThreadIO.writeLines(outputFile, data_list, str) - def writePhase(self, outputFile): data_list = [] - for ind in self : - data_list.append( (ind.idx, ind.haplotypes[0]) ) - data_list.append( (ind.idx, ind.haplotypes[1]) ) + for ind in self: + data_list.append((ind.idx, ind.haplotypes[0])) + data_list.append((ind.idx, ind.haplotypes[1])) MultiThreadIO.writeLines(outputFile, data_list, str) - def writeDosages(self, outputFile): data_list = [] - for ind in self : + for ind in self: if ind.dosages is not None: - data_list.append( (ind.idx, ind.dosages) ) + data_list.append((ind.idx, ind.dosages)) else: dosages = ind.genotypes.copy() dosages[dosages == 9] = 1 - data_list.append( (ind.idx, dosages) ) + data_list.append((ind.idx, dosages)) MultiThreadIO.writeLines(outputFile, data_list, "{:.4f}".format) - def writeGenotypes_prefil(self, outputFile): # print("Output is using filled genotypes. Filling missing with a value of 1") # fillValues = np.full(1, self.nLoci) - print("Output is using filled genotypes. Filling missing with rounded allele frequency") + print( + "Output is using filled genotypes. Filling missing with rounded allele frequency" + ) self.setMaf() fillValues = np.round(self.maf) - with open(outputFile, 'w+') as f: + with open(outputFile, "w+") as f: for idx, ind in self.individuals.items(): fill(ind.genotypes, fillValues) self.writeLine(f, ind.idx, ind.genotypes, str) - def writeGenotypesPed(self, outputFile): """Write genotypes in PLINK plain text format""" data_list = [] @@ -1183,15 +1308,14 @@ def writeGenotypesPed(self, outputFile): # Split genotypes into 'pseudo' haplotypes such that # the first allele/haplotype of a heterozygous locus is always 0 missing = ind.genotypes == 9 - h1 = ind.genotypes//2 + h1 = ind.genotypes // 2 h2 = ind.genotypes - h1 h1[missing] = 9 h2[missing] = 9 alleles = self.encode_alleles(np.vstack([h1, h2])) - data_list.append( (ind.idx, alleles) ) + data_list.append((ind.idx, alleles)) MultiThreadIO.writeLinesPlinkPlainTxt(outputFile, data_list) - def writePhasePed(self, outputFile): """Write phased data (i.e. haplotypes) in PLINK plain text .ped format""" data_list = [] @@ -1201,13 +1325,12 @@ def writePhasePed(self, outputFile): elif ind.haplotypes.ndim == 1: # haploid diploid = np.vstack([ind.haplotypes, ind.haplotypes]) alleles = self.encode_alleles(diploid) - data_list.append( (ind.idx, alleles) ) + data_list.append((ind.idx, alleles)) MultiThreadIO.writeLinesPlinkPlainTxt(outputFile, data_list) - - def writeLine(self, f, idx, data, func) : - f.write(idx + ' ' + ' '.join(map(func, data)) + '\n') + def writeLine(self, f, idx, data, func): + f.write(idx + " " + " ".join(map(func, data)) + "\n") @jit(nopython=True) @@ -1216,6 +1339,7 @@ def fill(genotypes, fillValue): if genotypes[i] == 9: genotypes[i] = fillValue[i] + @jit(nopython=True) def addIfNotMissing(array1, counts, array2): for i in range(len(array1)): @@ -1230,6 +1354,7 @@ def addIfMissing(array1, array2): if array2[i] == 9: array1[i] += 1 + @jit(nopython=True) def doubleIfNotMissing(calledGenotypes): nLoci = len(calledGenotypes) @@ -1237,24 +1362,10 @@ def doubleIfNotMissing(calledGenotypes): if calledGenotypes[i] == 1: calledGenotypes[i] = 2 + @jit(nopython=True) -def setMissing(calledGenotypes, matrix, thresh) : +def setMissing(calledGenotypes, matrix, thresh): nLoci = len(calledGenotypes) for i in range(nLoci): - if matrix[calledGenotypes[i],i] < thresh: + if matrix[calledGenotypes[i], i] < thresh: calledGenotypes[i] = 9 - - - - - - - - - - - - - - - diff --git a/ProbMath.py b/ProbMath.py index 3817503..24866a1 100644 --- a/ProbMath.py +++ b/ProbMath.py @@ -14,22 +14,24 @@ def getGenotypesFromMaf(maf): return mafGenotypes -def getGenotypesFromMultiMaf(mafDict) : + +def getGenotypesFromMultiMaf(mafDict): nLoci = len(mafDict[list(mafDict.keys())[0]]) - mafGenotypes = np.full((4, nLoci), .25, dtype = np.float32) + mafGenotypes = np.full((4, nLoci), 0.25, dtype=np.float32) # Assumes two maf inputs. # maf1 from the sire, maf2 from the dam. maf1 = mafDict[list(mafDict.keys())[0]] maf2 = mafDict[list(mafDict.keys())[1]] - mafGenotypes[0,:] = (1-maf1)*(1-maf2) - mafGenotypes[1,:] = (1-maf1)*(maf2) - mafGenotypes[2,:] = (maf1)*(1-maf2) - mafGenotypes[3,:] = maf1*maf2 + mafGenotypes[0, :] = (1 - maf1) * (1 - maf2) + mafGenotypes[1, :] = (1 - maf1) * (maf2) + mafGenotypes[2, :] = (maf1) * (1 - maf2) + mafGenotypes[3, :] = maf1 * maf2 return mafGenotypes -def getGenotypeProbabilities_ind(ind, args = None, log = False): + +def getGenotypeProbabilities_ind(ind, args=None, log=False): if args is None: error = 0.01 seqError = 0.001 @@ -90,10 +92,10 @@ def getGenotypeProbabilities( valSeq = np.exp(valSeq) vals *= valSeq if (np.sum(vals, 0) == 0).any() and XChrMaleFlag: - index_test=np.where(np.sum(vals, 0) < 1)[0] + index_test = np.where(np.sum(vals, 0) < 1)[0] print( - f"Warning: Possible data issue: male genotype [position {index_test+1}] coded as '2'," - "which is biologically impossible." + f"Warning: Possible data issue: male genotype [position {index_test+1}] coded as '2'," + "which is biologically impossible." ) return vals / np.sum(vals, 0) @@ -299,6 +301,7 @@ def generateErrorMat(error): errorMat = errorMat / np.sum(errorMat, 1)[:, None] return errorMat + def updateGenoProbsFromPhenotype(geno_probs, phenotypes, phenoPenetrance): vals = geno_probs # Where there are repeated phenotype records, continue to multiply the penetrance as assumed independent. @@ -306,10 +309,10 @@ def updateGenoProbsFromPhenotype(geno_probs, phenotypes, phenoPenetrance): reps = 0 while reps < repPhenotypes: pheno = phenotypes[reps] - vals = vals*phenoPenetrance[:,pheno].reshape(-1,1) + vals = vals * phenoPenetrance[:, pheno].reshape(-1, 1) reps += 1 - - vals = vals/np.sum(vals, 0) + + vals = vals / np.sum(vals, 0) return vals @@ -353,7 +356,7 @@ def generateSegregationXXChrom(partial=False, mu=1e-08): father[fatherAlleleCoding[allele]], mother[motherAlleleCoding[allele]] ) - #segregationTensor = segregationTensor*(1-e) + e/4 #trace has 4 times as many elements as it should since it has 4 internal reps. + # segregationTensor = segregationTensor*(1-e) + e/4 #trace has 4 times as many elements as it should since it has 4 internal reps. if partial: segregationTensor = np.mean(segregationTensor, 3) segregationTensor = segregationTensor.astype(np.float32) @@ -387,7 +390,7 @@ def generateSegregationXYChrom(partial=False, mu=1e-08): motherAlleleCoding[allele] ] - #segregationTensor = segregationTensor*(1-e) + e/4 #trace has 4 times as many elements as it should since it has 4 internal reps. + # segregationTensor = segregationTensor*(1-e) + e/4 #trace has 4 times as many elements as it should since it has 4 internal reps. if partial: segregationTensor = np.mean(segregationTensor, 3) segregationTensor = segregationTensor.astype(np.float32) @@ -447,7 +450,7 @@ def generateSegregation(partial=False, mu=1e-08): # errorMat = errorMat/np.sum(errorMat, 1)[:,None] # return errorMat -## Not sure if below is ever used. +# Not sure if below is ever used. # def generateTransmission(error) : # paternalTransmission = np.array([ [1-error, 1.-error, error, error], # [error, error, 1-error, 1-error]]) diff --git a/README.md b/README.md index 4c2d05d..93c1d52 100644 --- a/README.md +++ b/README.md @@ -1,241 +1,6 @@ +`tinyhouse` +=========== Hello, -Welcome to tinyhouse, an increasingly non light-weight python module for processing genotypes and pedigrees, and for performing common operations for livestock breeding settings. This document is designed to give a brief overview of this software package, including some over-arching design philosophies, and some specifics about programs that use our software. +Welcome to `tinyhouse`, an increasingly non light-weight Python module for processing genotypes and pedigrees, and for performing common operations in selective breeding settings. -This document covers three main topics. - -1. Some general philosophy about writing code in python, and using tinyhouse as a library. -2. a bit about what each of the files in tinyhouse contains. This is not a full exhaustive list though. It will also include areas where improvement is needed (and hopefully will take place in the near future!). -3. It will include a brief overview of the software that depends on tinyhouse. The goal to give some idea of how it is currently being used, along with example programs that show how it might be used in the future. - -Some overarching ideas. -====== - -Python ------- - -When I started writing this library, most of the group (myself included) was using Fortran as their primary coding language. This worked well because most of our legacy code in AlphaImpute and AlphaHouse was written in Fortran, and John knew and understood the language. As time went on, working with Fortran was creating more problems then it was solving, and so we started looking for other languages. Python was an obvious choice -- it is widely used both within and outside of academia. There are a large number of mature third party modules and libraries that can be taken advantage of. It is easy to learn, a joy to code in, and simple to deploy and maintain. It also provides (through `numpy` and `numba`) fast computational libraries that give C or Fortran like speed increases, with minimal overhead. - -Many of these features stand in stark contrast to Fortran, which is no longer widely used, hard to code in, and lacks easy interfaces to common operations. Some simple data structures like linked lists and dictionaries are missing in Fortran, along with modern object-oriented programing principles like inheritance, and common operations like searching and sorting. Although it is possible (and AlphaImpute, AlphaPhase, and AlphaHouse impliment) these features, it takes a sizable number of developer-hours to do this, and porting them to different data structures is not always straightforward. These hurdles can make it expensive (in terms of developer-hours) to try out new ideas, or play around with new approaches and algorithms which hampered my ability to do research. - -Computational speed has remained one of the main concerns about moving away from Fortran. This group uses a large amount of CPU time every month, and any computational savings can increase the throughput we have getting jobs through Eddie. In addition, some of the commercial partners we work with, need to be able to run our software in a short time frame (generally 24 hours on their data), and so an order of magnitude speed difference will matter. - -To allow python to obtain Fortran level speeds for performing large scale matrix operations, much of the code is now parallelizable, and uses both `numpy` and `numba`. `numpy` is a commonly used matrix library in python. `numba` is a just in time (jit) compiler that is able to compile a subset of the core python language, along with some numpy features into faster code. In many cases, using `numba` and numpy requires little extra coding and provides competitive speed gains compared to Fortran. In some cases, working with `numba` can require some creative engineering, but the speed gains and convenience of using python more generally are worth it. - -Most importantly, I believe that concerns over choosing a language for speed are generally misplaced. In an academic environment the limiting factor for developing fast code tends to be developer time. More developer time means that the developer can spend more time profiling the code, and refining the algorithms used. This will generally give larger speed increases then re-implementing the same algorithm in a (slightly) faster language. I believe that this is particularly the case for non-speed critical operations, where the convenience of having a pre-existing data structure, like a non-fixed length list, or a dictionary, greatly outweighs the marginal speed losses for using one. - --Andrew Whalen - -Numpy ---- - -`numpy` is a commonly used matrix library for python. It provides sensible matrix algebra, and because of the C/Fortran/MKL framework that underlies it, allows these operations to be very high-performance particularly for large matracies. It also interfaces well with `numba`. We use `numpy` arrays to store most of our genotypic data across our programs. - -Because `numba` is type sensitive, and much of tinyhouse was developed to be `numba` aware, we tend to be explicit about the matrix type when declaring a numpy array. Failing to be explicit about types can lead to downstream compilation errors (which are generally not very informative, and can be tricky to debug). For example, if you want to allocate an empty array with 10 elements, use: -``` -newArray = np.full(10, 0, dtype = np.float32) -``` -or -``` -newArray = np.full(10, 0, dtype = np.int8) -``` -over -``` -newArray = np.full(10, 0) -``` - -As a general rule, use the following types for the following data structures: - -* `float32`: General purpose floating point number. Most of the computations we make are not influenced by floating point errors (particularly in AlphaImpute and AlphaPeel), where the error rates or margins of uncertainty are much larger than the floating point precision. Because of this, it makes sense to store information like phenotypes, genotype probabilities, or genotype dosages as a `float32` over a `float64`. -* `int64`: general purpose integer. Also used for sequence data read counts. There is a concern that an `int32` is not large enough to handle future datasets (e.g. for very large pedigrees), but an `int64` should be more than enough. -* `int8`: We store genotype values and haplotype values as an `int8`. Under a traditional coding scheme, Genotypes can take a value 0, 1, or 2. Haplotypes will take a value of 0 or 1. For historical reasons, we use 9 to denote a missing value. Potentially a more-informative missing value, like `NA` might be usable, but we would need to ensure `numba` and `numpy` compatibility. - -Some other notes: - -* When indexing matrices that have `nLoci` elements, I tend to have the last column be the loci, e.g. if we have a matrix of genotypes for three individuals, Use -``` -np.full((3, nLoci), 0, dtype = np.int8) -``` -over -``` -np.full((nLoci, 3), 0, dtype = np.int8). -``` - - -numba ---- - -`numba` is a just in time (jit) compiler for python. The idea is that you take a normal python function, and add on a decorator[^1]. `numba` will then compile the function and give potentially massive speed gains. The existence and usability of `numba` is one of the key features that makes python a feasible language for us in the long term. - -As an example, here is a function that adds two numbers together: - -``` -def add(a, b) : - return a + b -add(1, 2) -``` - -Here is a jit compiled version of the same function: - -``` -from numba import jit -@jit(nopython=True) -def add(a, b) : - return a + b -add(1, 2) -``` - -In the code we use the decorator `@jit(nopython=True)` or `@njit` to compile a function. If the `@jit` flag is given without arguments, then the code may not be compiled, particularly if it contains non-`numba` compatible elements. Using the `nopython=True` arguments, or `njit` forces the function to be compiled. An error will be thrown if the function cannot be compiled. - -The types of arguments that `numba` can accept is growing. Generally it can take most base python objects including numpy arrays. You can also write just-in-time classes. See the `jit_family` class in `pedigree.py` for an example. - -Some notes: - -* Generally in base python it is faster to use a numpy vectorized operation (e.g., for matrix addition). In `numba` this is not always the case. If the vector or matrix is small (under 100 elements) it is usually faster to write the explicit for loop. The speed increase for the for loop can be an order of magnitude greater than the vectorized operation. This occurs a lot in AlphaPeel since we are working with genotype probabilities that are of length 4. -* Auxiliary data structures: Although many objects can be passed to `numba`, our `individual` and `pedigree` objects cannot. Although it may be possible to make them `numba` compatible, I think this would take a lot of work and would decrease their usability in the long term. One way around this is to create wrappers around a `jit` function which take an individual (or set of individuals) and performs a common operation on e.g. their genotypes. Alternatively we can use `jit` versions of a class, where e.g., individuals are replaced with integer id numbers. -* For individual programs, it can often make sense to create "information" objects which are holders for a large number of matrices. These matrices are generally indexed by an individuals id number, `idn`. Unlike traditional pedigree objects, these information objects are usually much easier to turn into a `jit` class. It may seem a little bit weird that information on an individual is not directly linked to the individual object, but so far this has been an effective work around. - -[^1]: Decorators are functions that get applied to a function and return a function. They are denoted with the `@` symbol. We use two common decorators, `numba`'s `@jit` and `kernprof`'s `@profile`. See, e.g., https://realpython.com/primer-on-python-decorators/ - -Parallelization ---- - -There are a number of ways to parallelize python code. The most common are to use `numpy`'s internal parallelized functions, or to use `concurrent.futures.ThreadPoolExecutor` or `concurrent.futures.ProcessPoolExecutor`. `numpy`'s default parallelization is a good low-overhead option if most of the computational bottlenecks are large matrix operations. However it doesn't not always provide substantial improvements for complex matrix operations, and may not scale to a large number of processes. In contrast, both the `ThreadPoolExecutor` and `ProcessPoolExecutor` can parallelize arbitrary functions. The primary difference between them is: - -**ThreadPoolExecutor** This executes the function across several **threads**. By default these threads share memory. However Python is still bound by the global interpreter lock, which prevents multiple threads executing their commands simultaneously. This means that `ThreadPoolExecutor` will generally not lead to any performance gains since only a single thread is active at a given time, but may allow for some asynchronous tasks to be performed with python. It is possible to get around the global interpreter lock with `numba`. - -**ProcessPoolExecutor** This executes the functions across several python **processes**. These processes do not share memory, but have separate global interpreters. This means that you can get sizable speed gains by executing function calls across multiple processes, but may occur overhead for transferring data between processes. - -In our code we use both `ThreadPoolExecutor` and `ProcessPoolExecutor`, depending on what situation we are in. If we need to parallelize a large section of `jit` compiled `numba` code, we use a `ThreadPoolExecutor` and flag the function with `@jit(nopython=True, nogil=True)`. For example, see `Peeling.py` in `TinyPeel`. If we need to parallelize non-numba functions, we use a `ProcessPoolExecutor` instead. - -For more information about Python's global interpreter lock, see e.g., https://wiki.python.org/moin/GlobalInterpreterLock - -Profiling ----- - -For profiling, Kernprof seems to do a pretty good job. It requires adding the `@profile` decorator to functions. This will cause errors when not running the profiler. Because of this, many of the packages have the following lines in the header: -``` -try: - profile -except: - def profile(x): - return x - -``` -These lines look to see if profile is a valid function (i.e. if currently running with kernprof), otherwise it turns `profile` into the identity function. - -Style -=== - -We vaugley follow PEP8: https://www.python.org/dev/peps/pep-0008/ - -Some exceptions: - -* camelCase is generally used for variable names. -* underscores (\_) are inconsistently used to denote `@jit` functions and classes. - -Folder layouts -============== - -All of the python programs should follow a similar layout. For example, here is the layout for the program "AlphaCall". - -``` -. -├── build_pipeline.sh -├── docs -├── example -│   ├── out.called.0.98 -│   ├── reads.txt -│   └── runScript.sh -├── setup.py -├── src -│   └── alphacall -│   ├── AlphaCall-Convert.py -│   ├── AlphaCall.py -│   └── tinyhouse -│   ├── BasicHMM.py -│   ├── BurrowsWheelerLibrary.py -│   ├── HaplotypeLibrary.py -│   ├── HaplotypeOperations.py -│   ├── InputOutput.py -│   ├── Pedigree.py -│   ├── ProbMath.py -└── tests - ├── basedata - │   ├── reads.txt - │   └── target.txt - ├── checkResults.r - ├── outputs - │   └── out.called.0.98 - └── runTests.sh - -``` -In this folder, there are a number of mandatory files, and some optional files. - -**build_pipeline.sh** This bash script should contain all of the commands required to build and install the python module. It may be as simple as `python setup.py -bdist_wheel`, or may contain some additional code to clean and maintain the code base. Making sure this is up to date is highly recommended, since it is easy to forget exactly what arguments need to be passed to `setup.py`. - -**setup.py** This is a python file that contains instructions for how to compile the python program. For an example, see the `setup.py` in `AlphaCall`. There may be a broader explanation for how `setup.py` files work in the future. - -**src** This folder contains all of the python source code for the project. A version of `tinyhouse` is likely also included here. In this folder there should be a sub-folder with the name of the package, e.g. `alphacall`. The main scripts should be included in this sub-folder. Due to the way python handles relative imports, there may be some scripts in `src` to enable running the code directly, without having to first install the program. These are there to help with debugging and testing functionality without having to build and re-install first. - -**tests** This folder should contain scripts to test the functionality of the program. This may also include datasets required to run those tests (and potentially scripts to generate the data). The tests should be run-able with `./runTests.sh`, and should output success or failure. In the future there may be a more general way of running tests across multiple programs. - -**example:** This folder should contain a simple example of the program. The example should run by calling `./runScript.sh`. - -**docs:** This folder should contain all of the documentation for the project. If the documentation needs to be compiled, all of the files to compile it should be included. If there is a relevant paper, it may be a good idea to place it in here as well. - - - -Some files -========== - -InputOutput.py ----- - -Need a lot of words here. - -BasicHMM.py ----- - -This module contains code to run a simple Li and Stephens style HMM based on a set of reference haplotypes. This was originally generated for TinyPlantImpute. There is a haploid and a diploid version. Both need work. The primary functions are `haploidHMM` and `diploidHMM`. The haploid HMM takes in a single haplotype and a reference panel. The diploid HMM takes in an individual and a set of haplotype panels (for each sire/dam). There is an option to output either the called genotypes, dosages, or maximum likelihood (Viterbi) path, although not all of these algorithms are currently implemented for both options. - -Pedigree.py ----- -This module contains functions for storing data for individuals in a structured way. There are three main classes, `Individual`, `Pedigree`, and `Family`. The `Indivdiual` class provides a space for storing data related to an individual. This includes common data structures like genotypes, haplotypes, or read counts, and relationships with other individuals like sires, dams, or offspring. A `Family` object is a container for a full sib family (a single sire and dam, and their offspring). A `Pedigree` object is the default class for holding individuals. The primary container is a dictionary that contains all of the individuals indexed by ID number. It also includes an iterator to iterate over individuals in "generation" order (which places offspring after their parents). A large portion of the input/output is handled here and should be separated out. - - -ProbMath.py ----- -This module contains some probability math that is shared between programs. The key function is `getGenotypeProbabilities_ind` which takes in an individual and returns a `4 x nLoci` matrix of genotype probabilities based on the individuals genotype and read count data. This is also where the transmission matrix lives for AlphaPeel, AlphaAssign, and AlphaFamImpute. - -HaplotypeOperations.py ----- -This module contains some simple operations that can be performed on haplotypes. - - -BurrowsWheelerLibrary.py and HaplotypeLibrary.py ------ -These two modules include two different ways of constructing haplotype -libraries. These are solely used by AlphaImpute2, and will be better documented when AlphaImpute2 gets a bit more mature. - -Some programs -=== - -AlphaImpute2.0 ---- -AlphaImpute is an imputation program for going from a low density SNP array to a high density SNP array. It is currently a work in progress. - -AlphaPeel ---- -AlphaPeel is a pedigree based imputation program. It is designed - -AlphaCall ---- - - -TinyPlantImpute ---- - - -AlphaFamImpute ---- diff --git a/Utils.py b/Utils.py index aee6639..e9889f7 100644 --- a/Utils.py +++ b/Utils.py @@ -1,4 +1,6 @@ import datetime + + def time_func(text): # This creates a decorator with "text" set to "text" def timer_dec(func): @@ -6,8 +8,11 @@ def timer_dec(func): def timer(*args, **kwargs): start_time = datetime.datetime.now() values = func(*args, **kwargs) - print(f"{text}: {(datetime.datetime.now() - start_time).total_seconds():.2f} seconds") + print( + f"{text}: {(datetime.datetime.now() - start_time).total_seconds():.2f} seconds" + ) return values + return timer return timer_dec diff --git a/docs/Makefile b/docs/Makefile new file mode 100755 index 0000000..69fe55e --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..543c6b1 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst new file mode 100644 index 0000000..3b11687 --- /dev/null +++ b/docs/source/changelog.rst @@ -0,0 +1,11 @@ +========= +Changelog +========= + +[0.1.0] - 2026-03-?? +==================== + +Maintenance +----------- + +* Add the documentation, add pre-commit hooks. \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..cd5647d --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys + + +# -- Project information ----------------------------------------------------- + +project = "tinyhouse" +copyright = "2019, Andrew Whalen" +author = "Andrew Whalen" + +# The short X.Y version +version = "" +# The full version, including alpha/beta/rc tags +release = "" + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "sphinx.ext.autodoc", + "sphinx.ext.extlinks", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = ".rst" + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "python" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = [] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "tinyhousedoc" + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', + "extraclassoptions": "openany,oneside" +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, "tinyhouse.tex", "tinyhouse Documentation", "Andrew Whalen", "manual"), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "tinyhouse", "tinyhouse Documentation", [author], 1)] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "tinyhouse", + "tinyhouse Documentation", + author, + "tinyhouse", + "One line description of project.", + "Miscellaneous", + ), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ["search.html"] + + +# -- Extension configuration ------------------------------------------------- + +extlinks = { + "pr": ("https://github.com/AlphaGenes/tinyhouse/pull/%s", "PR #%s"), + "issue": ("https://github.com/AlphaGenes/tinyhouse/issues/%s", "Issue #%s"), + "user": ("https://github.com/%s", "@%s"), +} diff --git a/docs/source/contribute.rst b/docs/source/contribute.rst new file mode 100644 index 0000000..ee38f99 --- /dev/null +++ b/docs/source/contribute.rst @@ -0,0 +1,40 @@ +Contribution and Development Guide +================================== + +Welcome to the ``tinyhouse`` contribution and development guide. + +``tinyhouse`` is a collection of Python modules for processing genotypes and pedigrees. +It is used as a submodule in AlphaSuite programs such as ``AlphaPeel``, ``AlphaImpute2``, and others. + +This guide is intended for developers who want to contribute to ``tinyhouse``. +It covers best practices for working with submodules and how to run tests. + +1. Make your changes in your fork of ``tinyhouse``, or in the relevant branch of + AlphaSuite programs if the changes are specific to those programs. + +2. Push the changes to your fork of ``tinyhouse``. + +3. Test the changes by running the tests in AlphaSuite programs with the + updated ``tinyhouse`` submodule reference. For example: + + .. code-block:: bash + + cd AlphaPeel # navigate to the AlphaPeel directory + cd src/tinypeel/tinyhouse + git remote add fork https://github.com/yourusername/tinyhouse.git + git fetch fork + git checkout fork/your-branch + cd ../../../ + pytest + +4. If all tests pass, create a pull request. + If not, fix the issues and repeat steps 1–3 until all tests pass. + +5. In the pull request, include: + + - The commits of ``AlphaPeel`` and ``AlphaImpute2`` used for testing. + + - Any other relevant information about the changes. + + + diff --git a/docs/source/get_started.rst b/docs/source/get_started.rst new file mode 100644 index 0000000..202cff3 --- /dev/null +++ b/docs/source/get_started.rst @@ -0,0 +1,25 @@ +=============== +Getting Started +=============== + +An example +---------- + +You can call the ``tinyhouse`` function by the following: + +.. code-block:: python + + # require the installation of AlphaPeel + import tinypeel.tinyhouse.Pedigree as Pedigree + import tinypeel.tinyhouse.InputOutput as InputOutput + + pedigree = Pedigree.Pedigree() + + class args: + pedigree = ["example/simple_example/simple_pedigree.txt"] + genotypes = ["example/simple_example/simple_genotype.txt"] + main_metafounder = "MF_1" + + InputOutput.readInPedigreeFromInputs(pedigree, args) + + print(f"Individual {pedigree.individuals['C'].idx} has parents {pedigree.individuals['C'].sire.idx} and {pedigree.individuals['C'].dam.idx}.") \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..4b531da --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,16 @@ +``tinyhouse`` +============= + +Hello, + +Welcome to ``tinyhouse``, an increasingly non light-weight Python module for processing genotypes and pedigrees, and for performing common operations in selective breeding settings. + +.. toctree:: + :maxdepth: 2 + + get_started + contribute + philosophy + changelog + +.. highlight:: none \ No newline at end of file diff --git a/docs/source/philosophy.rst b/docs/source/philosophy.rst new file mode 100644 index 0000000..285fd5b --- /dev/null +++ b/docs/source/philosophy.rst @@ -0,0 +1,255 @@ +========== +Philosophy +========== + +This document is designed to give a brief overview of this software package, including some over-arching design philosophies, and some specifics about programs that use our software. + +This document covers three main topics. + +1. Some general philosophy about writing code in python, and using ``tinyhouse`` as a library. + +2. a bit about what each of the files in ``tinyhouse`` contains. This is not a full exhaustive list though. +It will also include areas where improvement is needed (and hopefully will take place in the near future!). + +3. It will include a brief overview of the software that depends on ``tinyhouse``. +The goal to give some idea of how it is currently being used, along with example programs that show how it might be used in the future. + +Some overarching ideas +---------------------- + +Python +~~~~~~ + +When I started writing this library, most of the group (myself included) was using Fortran as their primary coding language. +This worked well because most of our legacy code in ``AlphaImpute`` and ``AlphaHouse`` was written in Fortran, and John knew and understood the language. As time went on, working with Fortran was creating more problems then it was solving, and so we started looking for other languages. Python was an obvious choice -- it is widely used both within and outside of academia. There are a large number of mature third party modules and libraries that can be taken advantage of. It is easy to learn, a joy to code in, and simple to deploy and maintain. It also provides (through ``numpy`` and ``numba``) fast computational libraries that give C or Fortran like speed increases, with minimal overhead. + +Many of these features stand in stark contrast to Fortran, which is no longer widely used, hard to code in, and lacks easy interfaces to common operations. Some simple data structures like linked lists and dictionaries are missing in Fortran, along with modern object-oriented programing principles like inheritance, and common operations like searching and sorting. Although it is possible (and ``AlphaImpute``, ``AlphaPhase``, and ``AlphaHouse`` impliment) these features, it takes a sizable number of developer-hours to do this, and porting them to different data structures is not always straightforward. These hurdles can make it expensive (in terms of developer-hours) to try out new ideas, or play around with new approaches and algorithms which hampered my ability to do research. + +Computational speed has remained one of the main concerns about moving away from Fortran. This group uses a large amount of CPU time every month, and any computational savings can increase the throughput we have getting jobs through Eddie. In addition, some of the commercial partners we work with, need to be able to run our software in a short time frame (generally 24 hours on their data), and so an order of magnitude speed difference will matter. + +To allow python to obtain Fortran level speeds for performing large scale matrix operations, much of the code is now parallelizable, and uses both ``numpy`` and ``numba``. ``numpy`` is a commonly used matrix library in python. ``numba`` is a just in time (jit) compiler that is able to compile a subset of the core python language, along with some numpy features into faster code. In many cases, using ``numba`` and ``numpy`` requires little extra coding and provides competitive speed gains compared to Fortran. In some cases, working with ``numba`` can require some creative engineering, but the speed gains and convenience of using python more generally are worth it. + +Most importantly, I believe that concerns over choosing a language for speed are generally misplaced. In an academic environment the limiting factor for developing fast code tends to be developer time. More developer time means that the developer can spend more time profiling the code, and refining the algorithms used. This will generally give larger speed increases then re-implementing the same algorithm in a (slightly) faster language. I believe that this is particularly the case for non-speed critical operations, where the convenience of having a pre-existing data structure, like a non-fixed length list, or a dictionary, greatly outweighs the marginal speed losses for using one. + +-Andrew Whalen + +Numpy +~~~~~ + +``numpy`` is a commonly used matrix library for python. It provides sensible matrix algebra, and because of the C/Fortran/MKL framework that underlies it, allows these operations to be very high-performance particularly for large matracies. +It also interfaces well with ``numba``. We use ``numpy`` arrays to store most of our genotypic data across our programs. + +Because ``numba`` is type sensitive, and much of ``tinyhouse`` was developed to be ``numba`` aware, we tend to be explicit about the matrix type when declaring a numpy array. +Failing to be explicit about types can lead to downstream compilation errors (which are generally not very informative, and can be tricky to debug). For example, if you want to allocate an empty array with 10 elements, use: + +.. code-block:: python + + newArray = np.full(10, 0, dtype = np.float32) + +or + +.. code-block:: python + + newArray = np.full(10, 0, dtype = np.int8) + +over + +.. code-block:: python + + newArray = np.full(10, 0) + +As a general rule, use the following types for the following data structures: + +* ``float32``: General purpose floating point number. Most of the computations we make are not influenced by floating point errors (particularly in ``AlphaImpute`` and ``AlphaPeel``), where the error rates or margins of uncertainty are much larger than the floating point precision. Because of this, it makes sense to store information like phenotypes, genotype probabilities, or genotype dosages as a ``float32`` over a ``float64``. + +* ``int64``: general purpose integer. Also used for sequence data read counts. There is a concern that an ``int32`` is not large enough to handle future datasets (e.g. for very large pedigrees), but an ``int64`` should be more than enough. + +* ``int8``: We store genotype values and haplotype values as an ``int8``. Under a traditional coding scheme, Genotypes can take a value 0, 1, or 2. Haplotypes will take a value of 0 or 1. For historical reasons, we use 9 to denote a missing value. Potentially a more-informative missing value, like ``NA`` might be usable, but we would need to ensure ``numba`` and ``numpy`` compatibility. + +Some other notes: + +* When indexing matrices that have ``nLoci`` elements, I tend to have the last column be the loci, e.g. if we have a matrix of genotypes for three individuals, Use + +.. code-block:: python + + np.full((3, nLoci), 0, dtype = np.int8) + + +over + +.. code-block:: python + + np.full((nLoci, 3), 0, dtype = np.int8) + + +numba +~~~~~ + +``numba`` is a just in time (jit) compiler for python. The idea is that you take a normal python function, and add on a decorator [*]_. +``numba`` will then compile the function and give potentially massive speed gains. +The existence and usability of ``numba`` is one of the key features that makes python a feasible language for us in the long term. + +As an example, here is a function that adds two numbers together: + +.. code-block:: python + + def add(a, b) : + return a + b + add(1, 2) + +Here is a jit compiled version of the same function: + +.. code-block:: python + + from numba import jit + @jit(nopython=True) + def add(a, b) : + return a + b + add(1, 2) + + +In the code we use the decorator ``@jit(nopython=True)`` or ``@njit`` to compile a function. If the ``@jit`` flag is given without arguments, then the code may not be compiled, particularly if it contains non-``numba`` compatible elements. +Using the ``nopython=True`` arguments, or ``njit`` forces the function to be compiled. An error will be thrown if the function cannot be compiled. + +The types of arguments that ``numba`` can accept is growing. Generally it can take most base python objects including numpy arrays. You can also write just-in-time classes. See the ``jit_family`` class in ``pedigree.py`` for an example. + +Some notes: + +* Generally in base python it is faster to use a numpy vectorized operation (e.g., for matrix addition). In ``numba`` this is not always the case. If the vector or matrix is small (under 100 elements) it is usually faster to write the explicit for loop. The speed increase for the for loop can be an order of magnitude greater than the vectorized operation. This occurs a lot in ``AlphaPeel`` since we are working with genotype probabilities that are of length 4. + +* Auxiliary data structures: Although many objects can be passed to ``numba``, our ``individual`` and ``pedigree`` objects cannot. Although it may be possible to make them ``numba`` compatible, I think this would take a lot of work and would decrease their usability in the long term. One way around this is to create wrappers around a ``jit`` function which take an individual (or set of individuals) and performs a common operation on e.g. their genotypes. Alternatively we can use ``jit`` versions of a class, where e.g., individuals are replaced with integer id numbers. + +* For individual programs, it can often make sense to create "information" objects which are holders for a large number of matrices. These matrices are generally indexed by an individuals id number, ``idn``. Unlike traditional pedigree objects, these information objects are usually much easier to turn into a ``jit`` class. It may seem a little bit weird that information on an individual is not directly linked to the individual object, but so far this has been an effective work around. + +.. [*] Decorators are functions that get applied to a function and return a function. They are denoted with the ``@`` symbol. We use two common decorators, ``numba``'s ``@jit`` and ``kernprof``'s ``@profile``. See, e.g., https://realpython.com/primer-on-python-decorators/ + +Parallelization +~~~~~~~~~~~~~~~ + +There are a number of ways to parallelize python code. The most common are to use ``numpy``'s internal parallelized functions, or to use ``concurrent.futures.ThreadPoolExecutor`` or ``concurrent.futures.ProcessPoolExecutor``. +``numpy``'s default parallelization is a good low-overhead option if most of the computational bottlenecks are large matrix operations. +However it doesn't not always provide substantial improvements for complex matrix operations, and may not scale to a large number of processes. In contrast, both the ``ThreadPoolExecutor`` and ``ProcessPoolExecutor`` can parallelize arbitrary functions. +The primary difference between them is: + +- :strong:`ThreadPoolExecutor` This executes the function across several :strong:`threads`. By default these threads share memory. However Python is still bound by the global interpreter lock, which prevents multiple threads executing their commands simultaneously. This means that ``ThreadPoolExecutor`` will generally not lead to any performance gains since only a single thread is active at a given time, but may allow for some asynchronous tasks to be performed with python. It is possible to get around the global interpreter lock with ``numba``. + +- :strong:`ProcessPoolExecutor` This executes the functions across several python :strong:`processes`. These processes do not share memory, but have separate global interpreters. This means that you can get sizable speed gains by executing function calls across multiple processes, but may occur overhead for transferring data between processes. + +In our code we use both ``ThreadPoolExecutor`` and ``ProcessPoolExecutor``, depending on what situation we are in. If we need to parallelize a large section of ``jit`` compiled ``numba`` code, we use a ``ThreadPoolExecutor`` and flag the function with ``@jit(nopython=True, nogil=True)``. +For example, see ``Peeling.py`` in ``tinypeel``. If we need to parallelize non-numba functions, we use a ``ProcessPoolExecutor`` instead. + +For more information about Python's global interpreter lock, see e.g., https://wiki.python.org/moin/GlobalInterpreterLock + +Profiling +~~~~~~~~~ + + +For profiling, Kernprof seems to do a pretty good job. It requires adding the ``@profile`` decorator to functions. This will cause errors when not running the profiler. Because of this, many of the packages have the following lines in the header: + +.. code-block:: python + + try: + profile + except: + def profile(x): + return x + + +These lines look to see if profile is a valid function (i.e. if currently running with kernprof), otherwise it turns ``profile`` into the identity function. + +Style +~~~~~ + +We vaugley follow PEP8: https://www.python.org/dev/peps/pep-0008/ + +Some exceptions: + +* camelCase is generally used for variable names. +* underscores (\_) are inconsistently used to denote ``@jit`` functions and classes. + +Folder layouts +-------------- + +All of the python programs should follow a similar layout. For example, here is the layout for the program "AlphaCall". + +:: + + . + ├── build_pipeline.sh + ├── docs + ├── example + │   ├── out.called.0.98 + │   ├── reads.txt + │   └── runScript.sh + ├── setup.py + ├── src + │   └── alphacall + │   ├── AlphaCall-Convert.py + │   ├── AlphaCall.py + │   └── tinyhouse + │   ├── BasicHMM.py + │   ├── BurrowsWheelerLibrary.py + │   ├── HaplotypeLibrary.py + │   ├── HaplotypeOperations.py + │   ├── InputOutput.py + │   ├── Pedigree.py + │   ├── ProbMath.py + └── tests + ├── basedata + │   ├── reads.txt + │   └── target.txt + ├── checkResults.r + ├── outputs + │   └── out.called.0.98 + └── runTests.sh + + +In this folder, there are a number of mandatory files, and some optional files. + +- **build_pipeline.sh** This bash script should contain all of the commands required to build and install the python module. It may be as simple as ``python setup.py -bdist_wheel``, or may contain some additional code to clean and maintain the code base. Making sure this is up to date is highly recommended, since it is easy to forget exactly what arguments need to be passed to ``setup.py``. + +- **setup.py** This is a python file that contains instructions for how to compile the python program. For an example, see the ``setup.py`` in ``AlphaCall``. There may be a broader explanation for how ``setup.py`` files work in the future. + +- **src** This folder contains all of the python source code for the project. A version of ``tinyhouse`` is likely also included here. In this folder there should be a sub-folder with the name of the package, e.g. ``alphacall``. The main scripts should be included in this sub-folder. Due to the way python handles relative imports, there may be some scripts in ``src`` to enable running the code directly, without having to first install the program. These are there to help with debugging and testing functionality without having to build and re-install first. + +- **tests** This folder should contain scripts to test the functionality of the program. This may also include datasets required to run those tests (and potentially scripts to generate the data). The tests should be run-able with ``./runTests.sh``, and should output success or failure. In the future there may be a more general way of running tests across multiple programs. + +- **example:** This folder should contain a simple example of the program. The example should run by calling ``./runScript.sh``. + +- **docs:** This folder should contain all of the documentation for the project. If the documentation needs to be compiled, all of the files to compile it should be included. If there is a relevant paper, it may be a good idea to place it in here as well. + + +Some files +---------- + +``InputOutput.py`` +~~~~~~~~~~~~~~~~~~ + +Need a lot of words here. + +``BasicHMM.py`` +~~~~~~~~~~~~~~~ + +This module contains code to run a simple Li and Stephens style HMM based on a set of reference haplotypes. This was originally generated for TinyPlantImpute. There is a haploid and a diploid version. Both need work. The primary functions are ``haploidHMM`` and ``diploidHMM``. The haploid HMM takes in a single haplotype and a reference panel. The diploid HMM takes in an individual and a set of haplotype panels (for each sire/dam). There is an option to output either the called genotypes, dosages, or maximum likelihood (Viterbi) path, although not all of these algorithms are currently implemented for both options. + +``Pedigree.py`` +~~~~~~~~~~~~~~~ + +This module contains functions for storing data for individuals in a structured way. There are three main classes, ``Individual``, ``Pedigree``, and ``Family``. The ``Indivdiual`` class provides a space for storing data related to an individual. This includes common data structures like genotypes, haplotypes, or read counts, and relationships with other individuals like sires, dams, or offspring. A ``Family`` object is a container for a full sib family (a single sire and dam, and their offspring). A ``Pedigree`` object is the default class for holding individuals. The primary container is a dictionary that contains all of the individuals indexed by ID number. It also includes an iterator to iterate over individuals in "generation" order (which places offspring after their parents). A large portion of the input/output is handled here and should be separated out. + +``ProbMath.py`` +~~~~~~~~~~~~~~~ + +This module contains some probability math that is shared between programs. The key function is ``getGenotypeProbabilities_ind`` which takes in an individual and returns a ``4 x nLoci`` matrix of genotype probabilities based on the individuals genotype and read count data. This is also where the transmission matrix lives for ``AlphaPeel``, ``AlphaAssign``, and ``AlphaFamImpute``. + +``HaplotypeOperations.py`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This module contains some simple operations that can be performed on haplotypes. + +``BurrowsWheelerLibrary.py`` and ``HaplotypeLibrary.py`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These two modules include two different ways of constructing haplotype +libraries. These are solely used by ``AlphaImpute2``, and will be better documented when ``AlphaImpute2`` gets a bit more mature. + diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt new file mode 100644 index 0000000..637690e --- /dev/null +++ b/docs/source/requirements.txt @@ -0,0 +1,2 @@ +numpy>=1.19 +numba>=0.50.0 \ No newline at end of file diff --git a/example/simple_example/simple_genotype.txt b/example/simple_example/simple_genotype.txt new file mode 100644 index 0000000..1ba5edb --- /dev/null +++ b/example/simple_example/simple_genotype.txt @@ -0,0 +1,3 @@ +A 2 2 0 0 0 +B 0 0 2 2 2 +C 9 9 9 9 9 \ No newline at end of file diff --git a/example/simple_example/simple_pedigree.txt b/example/simple_example/simple_pedigree.txt new file mode 100644 index 0000000..3b5fdab --- /dev/null +++ b/example/simple_example/simple_pedigree.txt @@ -0,0 +1,3 @@ +A 0 0 +B 0 0 +C A B \ No newline at end of file