# Counting ancestral histories


This note grew out of a working notebook I developed to count histories. It now serves to give a brief overview of how the code in **count_histories.py** works.

## Step 1: setting the stage

In [1]:
#%matplotlib inline
import sys

#simulations_dir = '../../coalescent-simulations' ## update if coalescent-simulations at different path on local machine
#sys.path.append(simulations_dir)
# Put the parent directory on the module search path so that the later
# `import count_histories` can find count_histories.py (expected to live there).
count_histories_dir = '../'
sys.path.append(count_histories_dir)

In [2]:
import numpy as np

In [3]:
import count_histories as ch

In [4]:
#reload(ch)

## Step 2: playing around a bit.

We create some simple trees and show how to count the number of ancestral histories. The first tree considered (**T**) corresponds to the example from figure 2.10 in Hein et al. 2004.

In [5]:
# Build the example tree from nested tuples: integers are leaf labels and each
# tuple groups the children of one internal node (a 1-tuple such as (2,) appears
# to be a node with a single child -- confirm against count_histories.py).
T = ch.unordered_rooted_leaf_labelled_tree_from_nested_tuples( (((1,(2,)),),(3,4),5) )
#T3 = ch.unordered_rooted_leaf_labelled_tree_from_nested_tuples( (((1,(2,)),),(3,4),5,) )

In [6]:
str(T)

'(((1, (2))), 5, (3, 4))'

In [7]:
#count_histories returns a number (the number of histories of the input argument)
#and a dictionary containing all trees considered when computing the number of histories
#(each tree is a key; the value stored for it is printed in the table below).
hist, hist_dict = ch.count_histories(T,dict())

In [8]:
print 'Tree          Histories'
print '-----------------------'
for T,hist in hist_dict.items():
    print T,'\t'*4,hist

Tree          Histories
-----------------------
1 				1
2 				1
5 				1
(((1, (2))), 5, (3, 4)) 				71
(2) 				1
((1, (2))) 				1
(1, (2)) 				1
(3, 4) 				1


We can also try on a slightly larger tree:

In [9]:
T2 = ch.unordered_rooted_leaf_labelled_tree_from_nested_tuples( ((1,4,(7,(9,))),(((((2,3),8),),),5,6),(((((10,),),),),11),(((((((12,13),14),15),16,20,21),17),18),19)) )
#T4 = ch.unordered_rooted_leaf_labelled_tree_from_nested_tuples( ((0,0,(0,(0,))),(((((0,0),0),),),0,0),(((((0,),),),),0),(((((((0,0),0),0),0,0,0),0),0),0)) )

hist, hist_dict = ch.count_histories(T2,dict())

print 'Tree          Histories'
print '-----------------------'
for T,hist in hist_dict.items():
    print T,'\t'*4,hist

Tree          Histories
-----------------------
1 				1
(19, (18, ((16, 20, 21, (((12, 13), 14), 15)), 17))) 				135
4 				1
5 				1
6 				1
7 				1
8 				1
9 				1
10 				1
11 				1
14 				1
15 				1
16 				1
17 				1
18 				1
((9), 7) 				1
20 				1
21 				1
(((12, 13), 14), 15) 				1
(((10))) 				1
(6, 5, (((8, (2, 3))))) 				9
(1, ((9), 7), 4) 				6
((((10)))) 				1
((16, 20, 21, (((12, 13), 14), 15)), 17) 				135
((19, (18, ((16, 20, 21, (((12, 13), 14), 15)), 17))), (1, ((9), 7), 4), (6, 5, (((8, (2, 3))))), (11, ((((10)))))) 				1185892528292222567909400
(10) 				1
((12, 13), 14) 				1
(((8, (2, 3)))) 				1
(9) 				1
(11, ((((10))))) 				1
((8, (2, 3))) 				1
(18, ((16, 20, 21, (((12, 13), 14), 15)), 17)) 				135
(16, 20, 21, (((12, 13), 14), 15)) 				135
(12, 13) 				1
(2, 3) 				1
19 				1
(8, (2, 3)) 				1
((10)) 				1


In [10]:
#ch.count_histories(T4,dict())

The case when the tree has height 2 (i.e. when every non-root node is a leaf) is handled separately. Below, we verify that histories are easy to count in this special case.

In [11]:
for i in range(10):
    Tn = ch.unordered_rooted_leaf_labelled_tree_from_nested_tuples(tuple(range(i)))
    #print [t.rootDegree() for t in Tn.getSubtrees_unique()]
    print str(Tn),ch.count_histories(Tn,dict())[0]

 1
(0) 1
(0, 1) 1
(0, 1, 2) 3
(0, 1, 2, 3) 18
(0, 1, 2, 3, 4) 180
(0, 1, 2, 3, 4, 5) 2700
(0, 1, 2, 3, 4, 5, 6) 56700
(0, 1, 2, 3, 4, 5, 6, 7) 1587600
(0, 1, 2, 3, 4, 5, 6, 7, 8) 57153600


# Step 3: Running basic tests

We consider two basic tests: that the dataset from Hein et al. should have 71 histories, and that a tree consisting of $n$ leaves directly below the root should have $\prod_{i = 2}^n \binom{i}{2}$ histories.

In [12]:
# Run the module's built-in check on the Hein et al. example tree (expected: 71 histories).
ch.testcase_hein_et_al()

considering tree T = (((1, (2))), 5, (3, 4))
...
Passed!


In [13]:
# Run the built-in check on trees whose leaves all hang directly below the root,
# for up to 200 leaves; presumably supress_individual_tests=True reduces the
# per-tree report to one dot each -- confirm against count_histories.py.
ch.testcase_single_root(200,supress_individual_tests=True)

Considering trees  T = {1,2}, {1,2,3}, ... , {1,...,200}
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
All cases passed!


# Step 4: Count histories given snp-data

We represent an observed dataset by a matrix $S \in \{ 0, 1 \}^{n \times s}$. Each row corresponds to an observed sequence, each column to a genetic locus, and $S_{ij} = 1$ holds if and only if the $i$th sequence is segregating at the $j$th locus. Given such a dataset, we wish to determine the number of ancestral histories.

In [14]:
#dataset corresponding to Hein et al. data
#(5 rows = observed sequences, 4 columns = loci)
S = np.array([[1, 1, 0, 0] ,[1, 1, 0, 1] , [0, 0, 0, 0] , [0, 0, 1, 0] , [0, 0, 1, 0]], dtype = int)

In [15]:
S

array([[1, 1, 0, 0],
       [1, 1, 0, 1],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [16]:
# Build the tree from the haplotype matrix S, labelling the four columns a-d
# and the five sequences (rows) 1-5.
T = ch.unordered_rooted_leaf_labelled_tree_from_haplotype_matrix(S,column_labels=['a','b','c','d'],seq_labels=[1,2,3,4,5])

In [17]:
print(T)

(((1, (2)d)a)b, 3, (4, 5)c)


# Step 5: Count histories for simulated data.

In [None]:
np.random.seed(1729) #seed the random number generator with a special number (R.I.P. Ramanujan)

In [22]:
# Grid of simulation settings. Judging from the recorded output, n is the
# sample size and s the number of segregating sites handed to `ms`.
# NOTE(review): the meaning of N is not visible in this notebook -- confirm
# against ch.simulate_data_and_count_histories.
N = 20
ns = [2,3,4,5,6,7,8,9,10]
ss = [20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]

for n in ns:
    for s in ss:
        #count histories
        history_counts = ch.simulate_data_and_count_histories(n,s,N,add_header = False)

        #write output
        # The file is re-opened in append mode for every (n, s) pair, exactly as
        # before; the with-block guarantees the handle is closed even if the
        # write raises.
        with open('history_counts.csv','a') as f:
            f.write(history_counts + '\n')

2 ;	20 ;	184756 ;	(((((((((((1)))))))))), ((((((((((0))))))))))) ;	ms 2 1 -seeds 1491453794 -s 20 -rho 0.0 1000000 
2 ;	19 ;	92378 ;	(((((((((((1)))))))))), (((((((((0)))))))))) ;	ms 2 1 -seeds 1491453794 -s 19 -rho 0.0 1000000 
2 ;	18 ;	43758 ;	(((((((((((1)))))))))), ((((((((0))))))))) ;	ms 2 1 -seeds 1491453794 -s 18 -rho 0.0 1000000 
2 ;	17 ;	24310 ;	(((((((((1)))))))), (((((((((0)))))))))) ;	ms 2 1 -seeds 1491453794 -s 17 -rho 0.0 1000000 
2 ;	16 ;	8008 ;	(((((((((((0)))))))))), ((((((1))))))) ;	ms 2 1 -seeds 1491453794 -s 16 -rho 0.0 1000000 
2 ;	15 ;	5005 ;	(((((((1)))))), (((((((((0)))))))))) ;	ms 2 1 -seeds 1491453794 -s 15 -rho 0.0 1000000 
2 ;	14 ;	2002 ;	((((((((((0))))))))), (((((1)))))) ;	ms 2 1 -seeds 1491453794 -s 14 -rho 0.0 1000000 
2 ;	13 ;	1287 ;	(((((((((0)))))))), (((((1)))))) ;	ms 2 1 -seeds 1491453794 -s 13 -rho 0.0 1000000 
2 ;	12 ;	495 ;	(((((1)))), ((((((((0))))))))) ;	ms 2 1 -seeds 1491453794 -s 12 -rho 0.0 1000000 
2 ;	11 ;	330 ;	(((((1)))), (((((((0)))))))