In [88]:
import sys, os
sys.path.append('/home/dnelson/project/msprime')
import msprime

import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
from collections import defaultdict

In [8]:
df.head()

Unnamed: 0,ind,father,mother,sex
0,10086,0,0,1
1,10087,0,0,2
2,10102,0,0,1
3,10103,0,0,2
4,10128,0,0,1


In [229]:
class PedFiller:
    def __init__(self, ped_df):
        self.ped_df = ped_df
        self.min_ID = 0
        self.probands = set()
        # TODO: Would be good to have 'parent_cohorts' where
        # we can choose from pre-existing couples rather than
        # just random individuals from each cohort
        self.node_cohorts = defaultdict(list)
        self.founder_cohorts = defaultdict(list)

    @staticmethod
    def read_ped(pedfile, header=0, sep=' ', dtype=int):
        df = pd.read_csv(pedfile, header=header, sep=sep, dtype=dtype)
        for expected_col in ['ind', 'mother', 'father']:
            assert(expected_col in df.columns)
        
        return PedFiller(df)

    def partial_order_times(self):
        ind_dict = dict(zip(self.ped_df['ind'], range(len(self.ped_df['ind']))))
        not_mothers = set(self.ped_df['ind']).difference(self.ped_df['mother'])
        self.probands = not_mothers.difference(self.ped_df['father'])

        climbers = self.probands
        times = np.zeros(len(self.ped_df['ind']))
        t = 0
        while len(climbers) > 0:
            next_climbers = set()
            for c in climbers:
                idx = ind_dict[c]
                time = times[idx]
                if t > time:
                    times[idx] = t

                mother = self.ped_df['mother'][idx]
                father = self.ped_df['father'][idx]
                if mother != 0:
                    next_climbers.add(mother)
                if father != 0:
                    next_climbers.add(father)

            climbers = next_climbers
            t += 1
        
        self.ped_df['time'] = times
        
    def build_cohorts(self):
        for i, row in self.ped_df.iterrows():
            if row.mother == 0 and row.father == 0:
                self.founder_cohorts[row.time].append(int(row.ind))
            else:
                self.node_cohorts[row.time].append((row.ind))
            
    def get_wf_parents(self, n, N, min_ID, monogamous=True):
        if monogamous is not True:
            raise NotImplementedError
            
        num_couples = N / 2
        mothers = np.arange(min_ID, min_ID + num_couples, dtype=int).reshape(-1, 1)
        fathers = np.arange(min_ID + num_couples, min_ID + (num_couples * 2), dtype=int).reshape(-1, 1)
        
        if monogamous is True:
            np.random.shuffle(fathers)
            couple_indices = np.random.randint(0, num_couples, size=n)
            couples = np.concatenate([mothers, fathers], axis=1)[couple_indices]
            
        return couples
    
    def complete_pedigree(self, N=100, reconnection_rate=0.05, max_gen_diff=1):
        assert(len(self.founder_cohorts) > 0)
        assert(len(self.node_cohorts) > 0)
        
        max_time = int(np.max(self.ped_df['time']))
        max_ID = np.max(self.ped_df['ind'])
        self.new_ped_rows = []
        
        
        for time in range(max_time - 1):
            founders = self.founder_cohorts[time]
            if len(set(founders)) != len(founders):
                print("Error at time", time)
                print(founders)
                break
            if len(founders) == 0:
                continue
                
            possible_parents = []
            for i in range(1, max_gen_diff + 1):
                possible_parents.extend(self.node_cohorts[time + i])
            assert(len(possible_parents) == len(set(possible_parents)))

            num_to_reconnect = np.random.binomial(len(founders), reconnection_rate)
            np.random.shuffle(founders)
            to_reconnect = founders[:num_to_reconnect]
            not_reconnected = founders[num_to_reconnect:]
            
            new_parents = self.get_wf_parents(len(not_reconnected), N, max_ID)
            max_ID = np.max(new_parents)
            for founder, parents in zip(not_reconnected, new_parents):
                mother, father = parents
                row = [founder, mother, father, time]
                self.new_ped_rows.append(row)
                
            # Add new parents to next founder generation
            self.founder_cohorts[time + 1].extend(set(list(new_parents.ravel())))
                
            for founder in to_reconnect:
                mother, father = np.random.choice(possible_parents, size=2)
                row = [founder, mother, father, time]
                self.new_ped_rows.append(row)
                
        # Now make pandas dataframe out of new rows and merge/replace with original


In [230]:
P = PedFiller.read_ped('/Users/dnelson/project/anc_finder/data/pedEx.txt')

In [231]:
# P.get_wf_parents(10, 4, 0)

In [232]:
P.partial_order_times()

In [233]:
P.build_cohorts()

In [234]:
P.complete_pedigree()

In [236]:
pd.DataFrame(P.new_ped_rows)

Unnamed: 0,0,1,2,3
0,136981,900527.0,900560.0,2
1,861894,900524.0,900587.0,2
2,297723,900537.0,900581.0,2
3,863186,900554.0,900578.0,2
4,714720,900521.0,900576.0,2
5,136980,900524.0,900587.0,2
6,863185,900526.0,900572.0,2
7,454426,900524.0,900587.0,2
8,861893,900535.0,900592.0,2
9,717640,900551.0,900589.0,2


In [46]:
P.ped_df.head()

Unnamed: 0,ind,father,mother,sex,time
0,10086,0,0,1,9.0
1,10087,0,0,2,9.0
2,10102,0,0,1,7.0
3,10103,0,0,2,7.0
4,10128,0,0,1,8.0


In [15]:
cohorts = {}
for i in range(20):
    cohorts[i] = df['mother'][(df['cohort'] == i) & (df['mother'] != 0)]

In [16]:
[len(v) for v in cohorts.values()]

[1504150,
 615427,
 339214,
 215890,
 143180,
 93538,
 59021,
 37927,
 24690,
 15764,
 9445,
 5384,
 2989,
 1585,
 457,
 96,
 9,
 1,
 0,
 0]

In [17]:
naive_partner = dict(zip(df['mother'], df['father']))

In [18]:
naive_partner[8232207]

1868793

In [19]:
max_time = 12
ped_list = []
num_assigned = 0
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    mother = row.mother
    father = row.father
    if row.mother == 0 and row.cohort <= max_time:
        num_assigned += 1
        parent_choices = np.concatenate([cohorts[t] for t in range(int(row['first']), int(row['last'])+1)])
        try:
            mother = np.random.choice(parent_choices)
        except:
            print(i)
            print(row)
            raise
        father = naive_partner[mother]
        
    ped_list.append([row.ind, mother, father])
    

100%|██████████| 3423179/3423179 [06:56<00:00, 8221.20it/s] 


In [20]:
num_assigned

349601

In [21]:
with open('/Users/dnelson/project/pedigree_msp/data/BALSAC/BALasc_probands1930_12gens_reconnected.txt', 'w') as f:
    f.write('ind\tfather\tmother\n')
    for row in tqdm(ped_list, total=len(ped_list)):
        f.write('\t'.join([str(x) for x in row]) + '\n')

100%|██████████| 3423179/3423179 [00:08<00:00, 394751.93it/s]


In [22]:
[r for r in ped_list if 2331548 in r]

[[2340939, 2331548, 2331549],
 [2750947, 2331548, 2385356],
 [2331548, 8232298, 2381228]]

In [23]:
[r for r in ped_list if 1104914 in r]

[[1104914, 2333034, 2333035], [1104916, 1104914, 1104915]]

In [25]:
Counter(df['cohort'].values)

Counter({0: 1504150,
         1: 825970,
         2: 409270,
         3: 246316,
         4: 159167,
         5: 101958,
         6: 62748,
         7: 39983,
         8: 26434,
         9: 17826,
         10: 11239,
         11: 6770,
         12: 4389,
         13: 3413,
         14: 2533,
         15: 817,
         16: 176,
         17: 18,
         18: 2})