# The Immune System Challenge

A group of immunologists wants to explore the Complement System in the human body. They have access to a supercomputer and plan to iteratively simulate different Complement System configurations.

In [27]:
# Component names. The integer lists in the compression dictionaries are
# 0-based positions in this list.
compliment_system = [
    'C3b', 'C3a', 'Bb', 'Ba', 'C4b', 'C4a', 'C2b', 'C2a',
    'D', 'P', 'C1q', 'C1r', 'C1s', 'MBL', 'MASP-1', 'MASP-2',
    'C5b', 'C5a', 'C6', 'C7', 'C8', 'C9', 'C1INH', 'MCP',
    'DAF', 'H', 'C4bp', 'CD59', 'CR1', 'CR2', 'CR3', 'CR4',
]

# Small example configuration: each role maps to the list of component
# indexes that may fill it.
compression1 = dict(
    detectors=[1, 2],
    responders=[4, 22],
    activators=[5, 23, 31],
    enablers=[1, 9, 10],
    catalyzer=[12, 29],
    upgraders=[23, 24, 25],
    chains=[27],
    trappers=[13],
    finishers=[16],
)

The following code has been made for this purpose:

In [46]:
def expand(array, pair, system=None):
    """
      Expand a list of dictionaries over every option of one [key, value] pair.

      Each dictionary in ``array`` produces one new dictionary per index in
      ``pair[1]``; in that new dictionary ``pair[0]`` maps to the name found
      at that index. The dictionaries passed in are never modified.

      :param list array: list of all dictionaries such as [{'detectors': 1, ...}, {'detectors': 2, ...}]
      :param list pair: the pair such as ['responders', [4,22]]
      :param list system: names the indexes refer to; defaults to the
          module-level ``compliment_system`` when omitted
      :returns output: an expanded list with len(array) * len(pair[1]) dictionaries
      :rtype list:
    """
    if system is None:
        # Resolved at call time so the default follows the module-level list.
        system = compliment_system

    key = pair[0]
    option_indexes = pair[1]

    # Build a fresh dictionary per (candidate, option) instead of writing into
    # the candidate: the caller's dictionaries stay intact, and an existing
    # key keeps its position in the new dictionary.
    return [
        {**candidate, key: system[option_index]}
        for candidate in array          # every previously expanded dictionary
        for option_index in option_indexes  # every allowed index for this key
    ]

def process(in1):
    """
      Individualize a dictionary of option lists.

      Calls expand once per key of ``in1`` so that every returned dictionary
      holds exactly one value per key; together the returned dictionaries
      cover every combination of the options.

      ``in1`` itself is not modified: the expansion starts from a shallow copy.

      :param dict in1: maps each key to a list of indexes, such as {'detectors': [1,2], ...}
      :returns out1: individualized items of the dictionary.
      :rtype list:
    """
    # Seed the expansion with a shallow copy so the caller's dictionary is
    # never the object being expanded.
    out1 = [in1.copy()]

    for k, v in in1.items():  # [('detectors', [1,2])...]
        out1 = expand(out1, [k, v])  # one expansion step per [key, value] pair

    return out1

In [24]:
# Example call. A shallow copy is passed so that compression1 itself is not
# the object handed to process.
process(compression1.copy())  # this is what the output looks like

[{'detectors': 'C3a',
  'responders': 'C4b',
  'activators': 'C4a',
  'enablers': 'C3a',
  'catalyzer': 'C1s',
  'upgraders': 'MCP',
  'chains': 'CD59',
  'trappers': 'MBL',
  'finishers': 'C5b'},
 {'detectors': 'C3a',
  'responders': 'C4b',
  'activators': 'C4a',
  'enablers': 'C3a',
  'catalyzer': 'C1s',
  'upgraders': 'DAF',
  'chains': 'CD59',
  'trappers': 'MBL',
  'finishers': 'C5b'},
 {'detectors': 'C3a',
  'responders': 'C4b',
  'activators': 'C4a',
  'enablers': 'C3a',
  'catalyzer': 'C1s',
  'upgraders': 'H',
  'chains': 'CD59',
  'trappers': 'MBL',
  'finishers': 'C5b'},
 {'detectors': 'C3a',
  'responders': 'C4b',
  'activators': 'C4a',
  'enablers': 'C3a',
  'catalyzer': 'CR2',
  'upgraders': 'MCP',
  'chains': 'CD59',
  'trappers': 'MBL',
  'finishers': 'C5b'},
 {'detectors': 'C3a',
  'responders': 'C4b',
  'activators': 'C4a',
  'enablers': 'C3a',
  'catalyzer': 'CR2',
  'upgraders': 'DAF',
  'chains': 'CD59',
  'trappers': 'MBL',
  'finishers': 'C5b'},
 {'detectors': 'C

The scientists want to perform a more comprehensive test, but the size of the test soon proves to be an issue. The first time they try `compression2`, they quickly run out of RAM.

Even after buying lots of RAM and significantly reducing the size of the lists, there is still a chance of a failure while the individualized files are being sent to the supercomputer. They have no way of finding out which run was the last successful one, so they cannot resume from there.

In [7]:
# Large configuration: every role may use any of the 32 component indexes
# (0-31) except the few listed as excluded for that role.
compression2 = {
    'detectors': [i for i in range(32) if i not in (9, 19, 20)],
    'responders': list(range(32)),
    'activators': [i for i in range(32) if i != 23],
    'enablers': list(range(32)),
    'catalyzer': [i for i in range(32) if i not in (2, 3, 4)],
    'upgraders': list(range(32)),
    'chains': [i for i in range(32) if i not in (2, 13, 21)],
    'trappers': [i for i in range(32) if i != 6],
    'finishers': [i for i in range(32) if i != 2],
}

In [20]:
# Gather each role's index list from compression2, in dictionary order.
# (No `global` statement is needed: a name assigned at module level is
# already global.)
out = []

for indexes in compression2.values():  # the keys are not needed here
    print(indexes)  # echo each list as it is collected
    out.append(indexes)

out

array([list([0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
       list([0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 

**Homework: Create a function called `bigDataProcess` that makes the same output as `process` but does two things:**


1.   Can give you only a small batch of the total results every time you call it. For example, it can give you the first 1000 (or any RAM-friendly number) upon request, then another 1000 with the next request.
2.   Can tell you which was the last batch or individualized dictionary that was processed; in case there was a failure.

Make sure you use a docstring of your choice to explain your code. Use a debugger in case of any issues.


In [4]:
# Using Dask 
import numpy as np
import dask.array as da   

# Use arange to create a dask array with the values 0 to 10 (11 elements),
# split into chunks of at most 5 elements.
X = da.arange(11, chunks=5)
X.compute() 

# Inspect the size of each chunk; the recorded cell output is ((5, 5, 1),).
X.chunks

x = np.arange(10)  # NOTE(review): x is not used anywhere else in this cell
# Wrap `out` (built in an earlier cell) as a dask array with chunk size 5.
# NOTE(review): if `out` holds the compression2 lists, they differ in length,
# so this is presumably an object array of lists - confirm.
y = da.from_array(out, chunks=5)
y.compute()  # NOTE(review): compute() evaluates the lazy array; the result is presumably a NumPy array rather than a dask array - confirm


((5, 5, 1),)

In [25]:
def batchLength(dictionary):
    """
      Count how many individualized dictionaries a configuration expands to.

      Walks the values of ``dictionary`` in order and keeps a running product
      of their lengths. The results are also stored in the module-level
      names ``total_len`` and ``lens``.

      :param dict dictionary: maps each key to a list of options
      :returns: ``(total_len, lens)`` where ``total_len`` is the product of all
          list lengths and ``lens`` holds one tuple per key:
          (1-based position, length of that key's list, running product so far)
      :rtype tuple:
    """
    global lens, total_len

    total_len = 1
    rows = []

    for position, options in enumerate(dictionary.values(), start=1):
        option_count = len(options)
        total_len *= option_count
        rows.append((position, option_count, total_len))

    lens = rows

    return total_len, lens

In [28]:
batchLength(compression1)

(216,
 [(1, 2, 2),
  (2, 2, 4),
  (3, 3, 12),
  (4, 3, 36),
  (5, 2, 72),
  (6, 3, 216),
  (7, 1, 216),
  (8, 1, 216),
  (9, 1, 216)])

In [31]:
# Demo arguments for the bigDataProcess call below: the window bounds that
# the cumulative combination counts are compared against, and the batch size.
start, end = 0, 20
batchsize = 5

def bigDataProcess(dictionary, start, end, batchsize):
    """
      Report which keys have a cumulative combination count inside a window.

      Walks the keys of ``dictionary`` in order while multiplying the lengths
      of their value lists. Whenever that running product lies between
      ``start`` and ``end`` (both inclusive), two lines are printed: the
      number of iterations ``(end - start) / batchsize`` and the key reached.

      The running product is computed from ``dictionary`` itself, so the
      report always matches the dictionary passed in and needs no earlier
      call to another function.

      Producing the actual batches is not implemented yet (see the TODO).

      :param dict dictionary: maps each key to a list of options, like ``compression1``
      :param start: lower bound of the window (inclusive)
      :param end: upper bound of the window (inclusive)
      :param batchsize: size of one batch; must be non-zero when a key matches
      :returns: None, the report is printed
    """
    running_total = 1

    for part, options in dictionary.items():
        running_total *= len(options)

        if start <= running_total <= end:
            # True division on purpose: the printed value keeps its float
            # form (for example 4.0).
            iterations = (end - start) / batchsize
            print(f'number of iterations for start end, batchsize {iterations}')
            print(f'parts that will process {part}')

            ##TODO look up index

# Demo run on compression1 using the start, end and batchsize values set above.
bigDataProcess(compression1, start, end, batchsize)

number of iterations for start end, batchsize 4.0
parts that will process detectors
number of iterations for start end, batchsize 4.0
parts that will process responders
number of iterations for start end, batchsize 4.0
parts that will process activators
