# Setting the Stage

In [2]:
import numpy as np
import cProfile, pstats, StringIO
import sys
#import re
sys.path.append('..')
import formula_outline as fo

In [3]:
# Directory containing the .csv test cases (relative to the working directory).
path_prefix = '../testdata/'

# Names of the test-case files to run. Entries that are commented out are
# currently disabled. Every active entry ends with a comma so that enabling
# or appending an entry can never merge two adjacent string literals.
file_names_of_test_cases_csv = [
    'fromHeinsBook_L_1000.csv',
    'fromHeinsBook_incompatibleColumn-pair_L_1000.csv',
    'fromHeinsBook_with_a_2_L_1000.csv',
    'hammer1995_0123_L_1000.csv',
    'noMutations_N_5_L_1000000000.csv',
    'noMutations_N_5_L_1000.csv',
    'noMutations_N_5_L_10.csv',
    'noMutations_N_5_L_4.csv',
    'aquardoAndGreenberg_fig1_0123_L_1000.csv',
    #'WardEtAl1991_0123_L_1000.csv',
    #'WardEtAl1991_0123_L_27.csv'
]

# Full path of every test case: the directory prefix followed by the file name.
file_paths = []
for file_name in file_names_of_test_cases_csv:
    file_paths.append(path_prefix + file_name)

In [4]:
def psiFromCSV(fileName,tidyUp = True):
    """
    Read a data set (S, nr, nc) from a comma-separated file.

    Expected layout of the file:

    * The first ROW holds the multiplicity of each column; its last entry
      is only a placeholder and is ignored.
    * Every following row encodes one haplotype. The last entry of such a
      row is the number of times this haplotype occurs (the "n"-vector);
      the remaining entries are the haplotype itself.

    Parameters
    ----------
    fileName : anything accepted by np.genfromtxt (typically a file path)
    tidyUp : bool, optional
        If True (default), identical rows of S are merged (adding up their
        entries in nr) and afterwards identical columns of S are merged
        (adding up their entries in nc), both via removeDuplicateRows.
        The order of rows/columns after tidying is the order produced by
        removeDuplicateRows. Tidying is skipped when the file contains no
        haplotype rows, since there is nothing to merge.

    Returns
    -------
    S : 2-d int array, one row per haplotype, one column per site pattern
    nr : 1-d int array, multiplicity of each row of S
    nc : 1-d int array, multiplicity of each column of S

    Example
    -------
    If the below were the contents of mycsv:
    "
    5, 1, 1, 2, 0
    0, 1, 1, 3, 22
    0, 0, 2, 1, 18
    1, 0, 1, 0, 12
    "
    then psiFromCSV("mycsv", tidyUp=False) returns S,nr,nc whereby:

    >>> S
    array([[0, 1, 1, 3],
           [0, 0, 2, 1],
           [1, 0, 1, 0]])
    >>> nr
    array([22, 18, 12])
    >>> nc
    array([5, 1, 1, 2])
    """
    raw = np.genfromtxt(fileName, delimiter=',')

    # genfromtxt drops the row axis when the file has a single line (only
    # the multiplicity row, no haplotypes). Restore it so that the 2-d
    # slicing below works instead of raising an IndexError.
    # NOTE: a file consisting of a single column also arrives as a 1-d array
    # and cannot be told apart here; files are assumed to have >= 2 columns.
    if raw.ndim < 2:
        raw = raw.reshape(1, -1)

    nr = np.array(raw[1:,-1],  dtype = int)   # last column, without the first row
    nc = np.array(raw[0,:-1],  dtype = int)   # first row, without the last column
    S  = np.array(raw[1:,:-1], dtype = int)   # everything else

    if tidyUp and S.shape[0] > 0:
        # merge duplicate rows first, then duplicate columns (via transpose)
        S,nr = removeDuplicateRows(S,nr)
        St, nc = removeDuplicateRows(np.transpose(S),nc)
        S = np.transpose(St)

    return S,nr,nc

# def withoutNullColumns(S):
#     rows,columns = S.shape
#     nonNullColumns = [i for i in range(columns) if max(S[:,i]) > 0]
#     columns_new = len(nonNullColumns)

#     if columns_new > 0:
#         S_new = S[:,nonNullColumns]
#         S_new.shape = (rows,columns_new)
#     else:
#         S_new = np.zeros((rows,1))

#     return S_new


def removeDuplicateRows(S,n):
    """
    Merge identical rows of S and add up the counts that belong to them.

    Parameters
    ----------
    S : 2-d array
        Matrix whose rows are compared for equality.
    n : sequence
        n[i] is the count associated with row i of S.

    Returns
    -------
    S_new : 2-d int array
        The distinct rows of S, in order of first appearance. It has the
        same number of columns as S (also when S has no rows at all).
    n_vec : 1-d int array
        n_vec[k] is the sum of n[i] over all rows i of S equal to S_new[k].
    """
    rows, columns = S.shape

    row_counts = {}   # row (as tuple) -> accumulated count
    row_order = []    # distinct rows in order of first appearance

    for row_index in range(rows):
        rowAsTuple = tuple(S[row_index])
        if rowAsTuple in row_counts:
            row_counts[rowAsTuple] += n[row_index]
        else:
            row_counts[rowAsTuple] = n[row_index]
            row_order.append(rowAsTuple)

    # Build the result from the explicit order list rather than from
    # dict.keys()/dict.values(): this keeps the output deterministic and
    # works no matter whether those return lists or views.
    # The reshape keeps the column count intact even if no row is left.
    S_new = np.array(row_order, dtype = int).reshape(len(row_order), columns)
    n_vec = np.array([row_counts[row] for row in row_order], dtype = int)

    return S_new,n_vec

def countMinMutationsWithoutRegardForIncompatiabilities(S,nc):
    """
    Count, weighted by column multiplicity, the distinct non-zero states
    (1, 2 or 3) that occur in each column of S.

    For every column j the number of different values among 1, 2 and 3
    appearing in S[:,j] is multiplied by nc[j]; the products are added up.
    The notebook uses the result as the smallest mutation count b that it
    tries (presumably 0 denotes the ancestral state, so each other state
    seen in a column needs at least one mutation -- confirm with the model).

    Parameters
    ----------
    S : 2-d array
    nc : sequence with one multiplicity per column of S

    Returns
    -------
    The weighted count described above (0 if S has no columns).
    """
    deviants = (1, 2, 3)
    minMutations = 0
    # plain range and column slices work on both Python 2 and Python 3
    for j in range(S.shape[1]):
        column = S[:, j]
        statesPresent = sum(1 for state in deviants if (column == state).any())
        minMutations += statesPresent * nc[j]
    return minMutations


# p1 = re.compile('_L_\d+')
# p2 = re.compile('\d+')
# def getL(fileName):
#     m1 = p1.search(fileName)
#     m2 = p2.search(m1.group())
#     return int(m2.group())

# Test Cases

# Code for testing

In [44]:
# For every test case: load it, print it, and profile fo.prob_External for the
# three smallest mutation counts b (the lower bound and the next two integers).
for file_path in file_paths:
    S,nr,nc = psiFromCSV(file_path)
    L = sum(nc)
    theta = 10**-3 * L   # 0.001 per unit of L (presumably a per-site rate -- confirm)
    b_lowerBound = countMinMutationsWithoutRegardForIncompatiabilities(S,nc)

    print('Now considering testdata loaded from file \'%s\' ' % file_path)
    # Show the data in the layout of the csv files: column multiplicities on
    # top, row multiplicities on the right, a 0 in the top-right corner.
    print(np.c_[np.r_[np.matrix(nc),S], np.transpose(np.matrix([0] + list(nr)))])
    #print S,nr,nc
    print('L = %i' % L)
    print('b_lowerBound = %i' % b_lowerBound)
    print('\n')
    b_list = [b_lowerBound + i for i in range(3)]

    for b in b_list:
        print('CASE: b = %i' % b)
        pr = cProfile.Profile()
        pr.enable()
        try:
            prob = fo.prob_External(S,nr,nc,b,theta,returnTable=False)
        finally:
            # always switch the profiler off again, also when the computation
            # fails or is interrupted, so it does not keep tracing later code
            pr.disable()

        s = StringIO.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats()
        print(s.getvalue())

    print('='*80)
    print('\n\n')
        

Now considering testdata loaded from file '../testdata/fromHeinsBook_L_1000.csv' 
[[  2   1 996   1   0]
 [  1   0   0   1   1]
 [  1   0   0   0   1]
 [  0   0   0   0   1]
 [  0   1   0   0   2]]
L = 1000
b_lowerBound = 4


CASE: b = 4
         765678 function calls (758851 primitive calls) in 2.727 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.182    3.182 ../formula_outline.py:28(prob_External)
    426/1    0.022    0.000    3.182    3.182 ../formula_outline.py:49(prob)
    95393    1.591    0.000    2.047    0.000 {sum}
      393    0.008    0.000    1.803    0.005 ../configurationTable.py:273(add)
     1499    0.096    0.000    1.548    0.001 ../configurationTable.py:29(__eq__)
     1664    0.011    0.000    1.255    0.001 ../configurations.py:23(__hash__)
     3328    0.017    0.000    1.237    0.000 {map}
    12380    0.039    0.000    1.160    0.000 ../configurationTable.py:310(

KeyboardInterrupt: 