In [1]:
# Using https://github.com/fastlmm/PySnpTools/commit/23236b80 (but any version should do)
#    and
#       https://github.com/CarlKCarlK/sgkit-plink/commit/309684b9
#       built in place with: python setup.py build_ext --inplace

In [2]:
# util.mapreduce1: Run loops in parallel on multiple processes, threads, or clusters.
#       API Docs: https://fastlmm.github.io/PySnpTools/#module-pysnptools.util.mapreduce1

from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import LocalMultiThread, Local
import time

def slow_square(x, delay=1):
    """Return x squared after sleeping for `delay` seconds.

    The sleep stands in for slow work, so that different runners can be
    compared by wall time.

    x: value to square.
    delay: seconds to sleep before returning. The default of 1 keeps the
        original one-second pause; pass 0 to skip the wait.
    """
    time.sleep(delay)
    return x*x

def square_sum(count, runner=None):
    """Sum slow_square(i) for i in range(count) using map_reduce.

    count: number of consecutive integers, starting at 0, to square and sum.
    runner: forwarded unchanged to map_reduce (for example Local() or
        LocalMultiThread(n)); None leaves the choice to map_reduce.
    """
    inputs = range(count)
    return map_reduce(inputs, mapper=slow_square, reducer=sum, runner=runner)

square_sum(10)

285

In [3]:
%%time

square_sum(10,runner=Local())

Wall time: 10.1 s


285

In [4]:
%%time

#Multithreading is faster

square_sum(10,runner=LocalMultiThread(10))

Wall time: 1.01 s


285

In [10]:
# Read from a Bed file in (perhaps parallel) batches and return each batch's mean and shape

import os
import numpy as np
from sgkit_plink._open_bed import open_bed

os.chdir(r'D:\OneDrive\programs\pstsgkit\tests')

def thread_read(filename, sid_end=None, runner=None, verbose=False, batch_size=100):
    """Read the leading variants of a Bed file in batches and summarize each batch.

    filename: path of the .bed file, opened with open_bed.
    sid_end: number of leading variants (columns) to read. None means all
        bed.sid_count of them; an explicit 0 is honored and yields no batches.
    runner: forwarded unchanged to map_reduce (for example Local() or
        LocalMultiThread(n)); None leaves the choice to map_reduce.
    verbose: when true, print each batch's starting column index as it is read.
    batch_size: number of variants per batch. Defaults to 100, the size that
        was previously fixed inside the function.

    Returns the result of map_reduce with no reducer. In the runs recorded in
    this notebook that is a list holding one
    (np.nanmean of the batch, shape of the batch) tuple per batch.
    """
    with open_bed(filename) as bed:
        bed.shape # causes the lazy metadata to read the number of lines in the two metadata files

        # Test against None instead of using `or`, so that sid_end=0 is not
        # silently replaced by the full variant count.
        if sid_end is None:
            sid_end = bed.sid_count

        def read_and_report(start):
            if verbose:
                print(start)
            # Cap the final batch at sid_end; otherwise a sid_end that is not a
            # multiple of batch_size would read variants past the requested end.
            stop = min(start + batch_size, sid_end)
            val = bed.read(index=np.s_[:,start:stop],force_python_only=True)
            return np.nanmean(val), val.shape

        report = map_reduce(range(0, sid_end, batch_size),
                   mapper=read_and_report,
                   runner=runner)
        return report
    

thread_read('datasets/all_chr.maf0.001.N300.bed')

Loading fam file datasets\all_chr.maf0.001.N300.fam
Loading bim file datasets\all_chr.maf0.001.N300.bim


[(0.13873333333333332, (300, 100)),
 (0.22336666666666666, (300, 100)),
 (0.1944, (300, 100)),
 (0.20983333333333334, (300, 100)),
 (0.21043333333333333, (300, 100)),
 (0.18643333333333334, (300, 100)),
 (0.19606666666666667, (300, 100)),
 (0.13453333333333334, (300, 100)),
 (0.19976666666666668, (300, 100)),
 (0.18323333333333333, (300, 100)),
 (0.4428888888888889, (300, 15))]

In [6]:
%%time

bigfile = r'M:\deldir\genbgen\2\merged_487400x220000.1.bed'
# Open the file in a `with` block, the same way thread_read does, so the reader
# is handed back to its context manager once the shape has been fetched.
with open_bed(bigfile) as bed:
    bigfile_shape = bed.shape
# Last expression of the cell, so the shape is still displayed as output.
bigfile_shape

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
Wall time: 3.16 s


(487400, 220000)

In [7]:
%%time

thread_read(bigfile,sid_end=1000,runner=Local(),verbose=True)

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
0
100
200
300
400
500
600
700
800
900
Wall time: 10.7 s


[(-29.79690145670907, (487400, 100)),
 (-24.055449343455066, (487400, 100)),
 (-34.39397287648748, (487400, 100)),
 (-29.801632334837915, (487400, 100)),
 (-32.10032076323348, (487400, 100)),
 (-37.84243836684448, (487400, 100)),
 (-19.455485945835044, (487400, 100)),
 (-22.905076590069758, (487400, 100)),
 (-24.05008110381617, (487400, 100)),
 (-22.907826713171932, (487400, 100))]

In [8]:
%%time
#The verbose messages arrive interleaved or out of order, so we know it's multithreading
thread_read(bigfile,sid_end=1000,runner=LocalMultiThread(10),verbose=True)

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
0100

200
300
400
500
600
700
800
900
Wall time: 5.48 s


[(-29.79690145670907, (487400, 100)),
 (-24.055449343455066, (487400, 100)),
 (-34.39397287648748, (487400, 100)),
 (-29.801632334837915, (487400, 100)),
 (-32.10032076323348, (487400, 100)),
 (-37.84243836684448, (487400, 100)),
 (-19.455485945835044, (487400, 100)),
 (-22.905076590069758, (487400, 100)),
 (-24.05008110381617, (487400, 100)),
 (-22.907826713171932, (487400, 100))]

In [13]:
%%time

#10K variants
thread_read(bigfile,sid_end=10*1000,runner=LocalMultiThread(10),verbose=True)

Loading fam file M:\deldir\genbgen\2\merged_487400x220000.1.fam
Loading bim file M:\deldir\genbgen\2\merged_487400x220000.1.bim
01000

20003000

4000
5000
6000
7000
8000
9000
41005100

1100
1002100

8100610091007100

3100


42005200

1200
200
6200
72009200

3200
2200
8200
5300
4300
1300
300
2300
93003300

7300
63008300

5400
4400
1400
400
9400
7400
3400
2400
84006400

5500
4500
1500
9500
2500
35008500

500
7500
6500
5600
4600
1600
2600
9600
3600
8600
5700
4700
600
7600
6600
1700
2700
9700
58008700

3700
4800
7700
700
6700
1800
2800
9800
4900
5900
8800
3800
800
7800
6800
1900
2900
9900
89006900

3900
900
7900
Wall time: 23.8 s


[(-29.79690145670907, (487400, 100)),
 (-24.055449343455066, (487400, 100)),
 (-34.39397287648748, (487400, 100)),
 (-29.801632334837915, (487400, 100)),
 (-32.10032076323348, (487400, 100)),
 (-37.84243836684448, (487400, 100)),
 (-19.455485945835044, (487400, 100)),
 (-22.905076590069758, (487400, 100)),
 (-24.05008110381617, (487400, 100)),
 (-22.907826713171932, (487400, 100)),
 (-20.599556114074684, (487400, 100)),
 (-20.60552800574477, (487400, 100)),
 (-27.499685227739022, (487400, 100)),
 (-24.048973553549445, (487400, 100)),
 (-25.19879571194091, (487400, 100)),
 (-30.94615664751744, (487400, 100)),
 (-21.753124969224455, (487400, 100)),
 (-33.246497496922444, (487400, 100)),
 (-29.798468178087813, (487400, 100)),
 (-20.604071235125154, (487400, 100)),
 (-28.648547004513745, (487400, 100)),
 (-27.498296819860485, (487400, 100)),
 (-21.759837566680346, (487400, 100)),
 (-30.948608904390642, (487400, 100)),
 (-27.497819798933115, (487400, 100)),
 (-27.50512839556832, (487400, 10

In [None]:
#Single threaded (Local()):            1000 variants: 10.7 seconds
#Multi (LocalMultiThread(10)):         1000 variants:  5.5 seconds
#Multi (LocalMultiThread(10)):       10,000 variants: 23.8 seconds