In [1]:
# Requires PySnpTools, e.g. https://github.com/fastlmm/PySnpTools/commit/23236b80 (but any version should do)
#    and
#       sgkit-plink at https://github.com/CarlKCarlK/sgkit-plink/commit/309684b9,
#            built in place with: python setup.py build_ext --inplace

In [1]:
# util.mapreduce1: Run loops in parallel on multiple processes, threads, or clusters.
#       API Docs: https://fastlmm.github.io/PySnpTools/#module-pysnptools.util.mapreduce1

from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import LocalMultiThread, Local
import time

def slow_square(x):
    """Return ``x`` multiplied by itself after pausing for one second.

    The pause stands in for slow work so that the effect of running
    many calls in parallel is visible in the timings.
    """
    time.sleep(1)
    squared = x * x
    return squared

def square_sum(count, runner=None):
    """Sum ``slow_square(i)`` for every ``i`` in ``range(count)``.

    The squares are computed through ``map_reduce`` so that ``runner``
    decides how the calls are executed; ``None`` leaves the choice to
    ``map_reduce``'s default.
    """
    return map_reduce(
        range(count),
        mapper=slow_square,
        reducer=sum,
        runner=runner,
    )

# Sanity check with the default runner: 0*0 + 1*1 + 2*2 == 5.
square_sum(3)

5

In [2]:
%%time

# Baseline with the Local runner: ten one-second tasks take about ten seconds of wall time.
square_sum(10,runner=Local())

Wall time: 10.1 s


285

In [3]:
%%time

# Multithreading is faster: with LocalMultiThread(10) the ten one-second tasks
# complete in about one second of wall time instead of about ten.

square_sum(10,runner=LocalMultiThread(10))

Wall time: 1.01 s


285

In [9]:
# Read from a Bed file in (perhaps parallel) batches and return the mean

import os
import numpy as np
from sgkit_plink._open_bed import open_bed

# NOTE(review): machine-specific absolute path. The relative 'datasets/...' path used
# in the example call below is resolved against this working directory, so adjust it
# before running on another machine.
os.chdir(r'D:\OneDrive\programs\pstsgkit\tests')

def thread_read(filename, sid_end=None, runner=None, verbose=False, batch_size=1000):
    """Read a Bed file in batches of variants and summarize each batch.

    Parameters
    ----------
    filename
        Path of the .bed file; passed straight to ``open_bed``.
    sid_end : int or None
        Variants ``[0, sid_end)`` are read. ``None`` (the default) means all
        variants, i.e. ``bed.sid_count``. ``0`` is honored and reads nothing.
    runner
        Passed to ``map_reduce``; ``None`` leaves the choice to its default.
    verbose : bool
        If true, print the start index of each batch as it is read.
    batch_size : int
        Number of variants per batch. Defaults to 1000, the value that was
        previously hard-coded.

    Returns
    -------
    The result of ``map_reduce`` over the per-batch ``(mean, shape)`` tuples
    (one tuple per batch; the last batch may be narrower than ``batch_size``).
    """
    with open_bed(filename) as bed:
        bed.shape # causes the lazy metadata to read the number of lines in the two metadata files

        def read_and_report(start):
            """Read one batch of variants starting at ``start``; return (mean, shape)."""
            if verbose:
                print(start)
            val = bed.read(index=np.s_[:, start:start + batch_size], dtype='int8')
            # NOTE(review): np.nanmean only skips NaN, which an int8 array cannot hold.
            # If missing genotypes are encoded as a negative sentinel for int8
            # (presumably so -- TODO confirm against the reader's docs), they are
            # included in this mean rather than ignored.
            return np.nanmean(val), val.shape

        # Compare against None explicitly: the former `sid_end or bed.sid_count`
        # silently turned sid_end=0 into "read every variant".
        if sid_end is None:
            sid_end = bed.sid_count
        report = map_reduce(range(0, sid_end, batch_size),
                   mapper=read_and_report,
                   runner=runner)
        return report
    

# Small example with the default runner and all variants; the recorded output shows
# one full batch of 1000 variants followed by a final batch of 15.
thread_read('datasets/all_chr.maf0.001.N300.bed')

Loading fam file datasets\all_chr.maf0.001.N300.fam
Loading bim file datasets\all_chr.maf0.001.N300.bim


[(0.18768, (300, 1000)), (0.4428888888888889, (300, 15))]

In [10]:
%%time

# A much larger dataset, used for the timing cells that follow.
# NOTE(review): machine-specific absolute path, and the open_bed handle created here
# is never closed explicitly.
bigfile = r'M:\deldir\genbgen\2\merged_487400x220000.1.bed'
open_bed(bigfile).shape

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
Wall time: 1.81 s


(487400, 220000)

In [15]:
%%time

# Baseline: the first 10,000 variants (10 batches of 1000) with the Local runner.
thread_read(bigfile,sid_end=10000,runner=Local(),verbose=False)

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
Wall time: 19.2 s


[(-27.73091854944604, (487400, 1000)),
 (-25.430085713992614, (487400, 1000)),
 (-26.12273082273287, (487400, 1000)),
 (-26.46684416085351, (487400, 1000)),
 (-27.040890730406236, (487400, 1000)),
 (-26.0071763377103, (487400, 1000)),
 (-27.04188338736151, (487400, 1000)),
 (-24.859726891670086, (487400, 1000)),
 (-24.741750547804678, (487400, 1000)),
 (-25.892419505539596, (487400, 1000))]

In [16]:
%%time
# Same read as the Local baseline, but with LocalMultiThread(10).
# NOTE: verbose is False here, so no per-batch messages are printed. With verbose=True
# the start indices arrive out of order, which is how we know it's multithreading.
thread_read(bigfile,sid_end=10000,runner=LocalMultiThread(10),verbose=False)

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
Wall time: 14.2 s


[(-27.73091854944604, (487400, 1000)),
 (-25.430085713992614, (487400, 1000)),
 (-26.12273082273287, (487400, 1000)),
 (-26.46684416085351, (487400, 1000)),
 (-27.040890730406236, (487400, 1000)),
 (-26.0071763377103, (487400, 1000)),
 (-27.04188338736151, (487400, 1000)),
 (-24.859726891670086, (487400, 1000)),
 (-24.741750547804678, (487400, 1000)),
 (-25.892419505539596, (487400, 1000))]

In [17]:
%%time

# Scale up: the first 100,000 variants (100 batches of 1000) with LocalMultiThread(10).
thread_read(bigfile,sid_end=100*1000,runner=LocalMultiThread(10),verbose=False)

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
Wall time: 2min 8s


[(-27.73091854944604, (487400, 1000)),
 (-25.430085713992614, (487400, 1000)),
 (-26.12273082273287, (487400, 1000)),
 (-26.46684416085351, (487400, 1000)),
 (-27.040890730406236, (487400, 1000)),
 (-26.0071763377103, (487400, 1000)),
 (-27.04188338736151, (487400, 1000)),
 (-24.859726891670086, (487400, 1000)),
 (-24.741750547804678, (487400, 1000)),
 (-25.892419505539596, (487400, 1000)),
 (-27.73091854944604, (487400, 1000)),
 (-25.430085713992614, (487400, 1000)),
 (-26.12273082273287, (487400, 1000)),
 (-26.46684416085351, (487400, 1000)),
 (-27.040890730406236, (487400, 1000)),
 (-26.0071763377103, (487400, 1000)),
 (-27.04188338736151, (487400, 1000)),
 (-24.859726891670086, (487400, 1000)),
 (-24.741750547804678, (487400, 1000)),
 (-25.892419505539596, (487400, 1000)),
 (-27.73091854944604, (487400, 1000)),
 (-25.430085713992614, (487400, 1000)),
 (-26.12273082273287, (487400, 1000)),
 (-26.46684416085351, (487400, 1000)),
 (-27.040890730406236, (487400, 1000)),
 (-26.007176337

In [10]:
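# Timing notes (these figures do not match the wall times recorded in the cells
# above, so they are presumably from a different run):
# Single-threaded:  1,000 variants: 4.5 seconds
# Multi-threaded:   1,000 variants: 3.8 seconds
# Multi-threaded:  10,000 variants: 20 seconds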