In [18]:
import sys, os
sys.path.append('/home/dnelson/project/msprime')
import msprime
import collections

import pandas as pd
import numpy as np

In [3]:
# Tab-separated table read below into two columns: proband ID and region.
regions_file = '/home/dnelson/project/pedigree_msp/data/BALSAC/BALSAC_proband_regions.txt'

In [4]:
# Read the headerless, tab-separated proband/region table.
df = pd.read_csv(
    regions_file,
    sep='\t',
    header=None,
    names=['ID', 'region'],
)
# Trim surrounding whitespace from every text (object-dtype) column; other
# columns are passed through unchanged.
df = df.assign(**{
    name: df[name].str.strip()
    for name in df.columns
    if df[name].dtype == "object"
})

In [7]:
# Distinct region labels to sample from. Missing regions are excluded in both
# of the forms they can take: pandas turns exact 'NA' fields into NaN while
# parsing (dropped here), and padded ones survive as the string 'NA' after
# stripping (removed by the set difference). A NaN left in the set would
# select no rows in the sampling step and make np.random.choice fail.
regions = set(df['region'].dropna()).difference(['NA'])

In [35]:
# Draw 100 distinct proband IDs from every region and pool them in one list.
sample_ids = []
for region_name in regions:
    in_region = df['region'] == region_name
    candidate_ids = df.loc[in_region, 'ID']
    # replace=False: an individual is picked at most once per region.
    sample_ids.extend(
        np.random.choice(candidate_ids, size=100, replace=False))

[6440972  249309 6441154 6440860 6439007 7951649  290441 6439471 6440633
 6438014]


In [36]:
# Pedigree file to load.
# NOTE(review): the file name is spelled 'BALasc' while the regions file uses
# 'BALSAC' -- confirm this is the intended path.
pedfile = '/home/dnelson/project/pedigree_msp/data/BALasc_probands1930.txt'

# Load the pedigree and flag the sampled probands. The Pedigree class is
# defined in a cell further down in this file, so that cell has to be run
# first.
P = Pedigree(pedfile, samples=sample_ids)

[0, 1, 2]


In [37]:
# Populate P.ll_ped_array (one int32 row per individual:
# ID, father index, mother index, time, is_sample).
P.build_ll_array()

In [38]:
# Sanity check: the is_sample flag of every row of the low-level array must
# be 1 exactly when the row's individual is one of the requested samples.
id_set = set(sample_ids)
num_samples = 0
for ID, _, _, _, is_sample in P.ll_ped_array:
    expected_flag = 1 if ID in id_set else 0
    assert is_sample == expected_flag
    num_samples += expected_flag

print(num_samples, "loaded")

In [41]:
# Write the low-level pedigree array to disk in NumPy .npy format.
np.save('/home/dnelson/project/pedigree_msp/data/BALasc_probands1930_100samples_per_region.npy', P.ll_ped_array)

In [20]:
class Pedigree(object):
    """
    Python class for loading pedigree into numpy for export to C library

    The pedigree is read from a whitespace-delimited text file with one
    header line and one individual per row. Individual ID 0 is reserved to
    mean "unknown parent".

    Attributes set by loading:
        inds: 1-D int array of individual IDs, in file order.
        parents: (ninds, ploidy) int array of parent IDs (0 = unknown).
        times: 1-D array of per-individual times (from the file, or
            assigned by ``assign_times``).
        is_sample: 1-D int array of 0/1 sample flags.
        ind_to_index_dict: maps individual ID -> row index; ID 0 maps to -1.
        ll_ped_array: int32 array built by ``build_ll_array``.
    """
    def __init__(self, pedfile, num_samples=None, samples=None, ploidy=2,
                 cols=None):
        """
        Args:
            pedfile: pedigree file (anything ``np.genfromtxt`` accepts), or
                None to skip loading.
            num_samples: number of probands to pick at random as samples.
            samples: explicit collection of sample IDs. At least one of
                ``num_samples`` / ``samples`` must be given.
            ploidy: number of parent columns per individual.
            cols: optional mapping of field name ("inds", "parents", "time",
                "sex", "is_sample") to file column index(es); None entries
                mean the field is absent from the file.
        """
        assert(num_samples is not None or samples is not None)
        self.pedfile = pedfile
        self.ploidy = ploidy

        self.inds = None
        self.parents = None
        self.times = None
        self.is_sample = None

        self.ll_ped_array = None
        self.num_children = None

        self.samples = samples
        self.num_samples = num_samples
        self.ninds = 0
        self.ind_to_index_dict = None

        self.cols = {
            "inds": 0,
            "parents": [1, 2],
            "time": None,
            "sex": None,
            "is_sample": None}
        if cols is not None:
            self.cols = cols
        assert(len(self.cols["parents"]) == ploidy)

        if pedfile is not None:
            self.load(self.pedfile)

            # Fields that were not present in the file are derived here.
            if self.is_sample is None:
                self.set_samples()
            if self.times is None:
                self.assign_times()

    def load(self, pedfile):
        """
        Read the configured columns of ``pedfile`` into numpy arrays.

        Raises:
            ValueError: if any individual ID is negative.
        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives
        # in ``collections.abc``. Imported locally so the class does not
        # depend on the surrounding file's import lines.
        from collections.abc import Iterable

        usecols = []
        for c in self.cols.values():
            if isinstance(c, Iterable):
                usecols.extend(c)
            elif c is not None:
                usecols.append(c)
        usecols = sorted(usecols)
        print(usecols)

        data = np.genfromtxt(pedfile, skip_header=1, usecols=usecols,
                             dtype=float)
        # A file with a single individual comes back 1-D.
        data = np.atleast_2d(data)

        # genfromtxt only returns the requested columns, packed in the order
        # of ``usecols``; translate file column numbers into positions in
        # ``data`` so that non-contiguous layouts are indexed correctly.
        position = {col: i for i, col in enumerate(usecols)}

        self.inds = data[:, position[self.cols["inds"]]].astype(int)

        if self.inds.min() < 0:
            raise ValueError("Individual array indices must be >= 0")

        parent_positions = [position[c] for c in self.cols["parents"]]
        self.parents = data[:, parent_positions].astype(int)

        time_col = self.cols.get("time")
        if time_col is not None:
            self.times = data[:, position[time_col]]
        sample_col = self.cols.get("is_sample")
        if sample_col is not None:
            self.is_sample = data[:, position[sample_col]].astype(int)

        self.ninds = len(self.inds)
        self.ind_to_index_dict = dict(zip(self.inds, range(self.ninds)))
        # Add special case for 0, which indicates an unknown individual
        assert 0 not in self.ind_to_index_dict
        self.ind_to_index_dict[0] = -1

    def set_samples(self, probands_only=True):
        """
        Fill ``is_sample`` from ``samples`` or from ``num_samples`` random
        probands (individuals that never appear as a parent).

        Args:
            probands_only: if True, assert that every sample is a proband.

        Raises:
            ValueError: if both ``samples`` and ``num_samples`` were given,
                or ``num_samples`` exceeds the number of probands.
        """
        if self.is_sample is not None:
            print("Samples already set.")
            return

        self.is_sample = np.zeros((self.ninds), dtype=int)
        if (self.num_samples is not None and self.samples is not None):
            raise ValueError("Cannot specify both samples and num_samples.")

        probands = set(self.inds).difference(self.parents.ravel())

        # Work on a private list: ``samples`` may be None (num_samples mode)
        # and the caller's own sequence must not be modified.
        chosen = [] if self.samples is None else list(self.samples)

        if self.num_samples is not None:
            if self.num_samples > len(probands):
                raise ValueError((
                        "Cannot specify more samples ({}) than there are "
                        "probands in the pedigree ({}) "
                        ).format(self.num_samples, len(probands)))
            chosen.extend(np.random.choice(
                list(probands), size=self.num_samples, replace=False))

        self.samples = chosen

        for s in self.samples:
            s_idx = self.ind_to_index_dict[s]
            if probands_only is True:
                assert(s in probands)
            self.is_sample[s_idx] = 1

    def get_ind_time(self, ind=None, ind_idx=None):
        """Return the time of an individual given by ID or by row index."""
        if ind is not None:
            assert ind_idx is None
            ind_idx = self.ind_to_index_dict[ind]

        return self.times[ind_idx]

    def set_ind_time(self, time, ind=None, ind_idx=None):
        """Set the time of an individual given by ID or by row index."""
        if ind is not None:
            assert ind_idx is None
            ind_idx = self.ind_to_index_dict[ind]

        self.times[ind_idx] = time

    def get_ind_parents(self, ind=None, ind_idx=None):
        """
        Return the list of parent IDs of an individual given by ID or by
        row index, with unknown parents (ID 0) reported as None.
        """
        if ind is not None:
            assert ind_idx is None
            ind_idx = self.ind_to_index_dict[ind]

        return [None if p == 0 else p for p in self.parents[ind_idx]]

    def assign_times(self, check=False):
        """
        For pedigrees without specified times, crudely assigns times to
        all individuals.

        Samples get time 0 and every ancestor gets the length of the longest
        parent-child path leading down to a sample. Individuals that are not
        ancestors of any sample keep time 0.

        Args:
            check: if True, run ``check_times`` afterwards.
        """
        if self.times is not None:
            print("Times already assigned.")
            if check is True:
                self.check_times()
            return

        self.times = np.zeros((self.ninds), dtype=int)
        assert len(self.samples) > 0

        # Climb one generation per iteration. A set is used so an ancestor
        # reached through several descendants is visited once per
        # generation; the resulting times are identical, but the work no
        # longer grows with the number of paths.
        climbers = set(self.samples)
        t = 0
        while len(climbers) > 0:
            next_climbers = set()
            for climber in climbers:
                if self.get_ind_time(climber) < t:
                    self.set_ind_time(t, climber)
                for parent in self.get_ind_parents(climber):
                    if parent is not None:
                        next_climbers.add(parent)
            climbers = next_climbers
            t += 1

        if check is True:
            self.check_times()

    def check_times(self):
        """
        Print a message for every sample whose time is not strictly smaller
        than the time of one of its known parents.
        """
        for ind in self.samples:
            for parent in self.get_ind_parents(ind):
                if parent is not None:
                    t1 = self.get_ind_time(ind)
                    t2 = self.get_ind_time(parent)
                    if t1 >= t2:
                        print("Error! Ind", ind, "time:", t1,
                              "parent", parent, "time:", t2)

    def build_ll_array(self):
        """
        Build ``ll_ped_array``, an int32 array with one row per individual:
        ID, father row index, mother row index, time, is_sample. Unknown
        parents are encoded as index -1.

        Raises:
            NotImplementedError: if ploidy is not 2.
            ValueError: if any individual ID is negative.
        """
        if self.ploidy != 2:
            raise NotImplementedError("Only ploidy=2 is currently supported")

        if self.ll_ped_array is not None:
            print("Low-level pedigree array already built.")
            return

        # Cols:
        # ind (1), parents (ploidy), time (1), is_sample (1)
        ncols = self.ploidy + 3

        # This is a bit awkward
        if self.inds.min() < 0:
            raise ValueError("Individual array indices must be >= 0")

        self.ll_ped_array = np.zeros((self.ninds, ncols), dtype=np.int32)
        self.ll_ped_array[:, 0] = self.inds
        self.ll_ped_array[:, 3] = self.times
        self.ll_ped_array[:, 4] = self.is_sample

        # Parents are stored as row indices rather than IDs.
        for i, (father, mother) in enumerate(self.parents):
            self.ll_ped_array[i, 1] = self.ind_to_index_dict[father]
            self.ll_ped_array[i, 2] = self.ind_to_index_dict[mother]