In [1]:
import numpy as np
import pandas as pd
import math
import multiprocessing as mp
import datetime
from simple_slurm import Slurm
from waiting import wait
import os

### **Step 0. Formulate your research objective and understand your data**

**Research Objective:** Identify which rice varieties are identical or closely related genetically to other rice varieties from the 3000 Rice Genomes Project  
**Problem Type:** Sequence marker comparison  
**Study Period:** 2017-2021   
**Data:**

- Marker data across approximately 3000 rice varieties



### Step 1: Read in the dataframe that has an additive recessive allele matrix for chromosome 1
In this structure, the rows are labeled by "**chromosomal position**"_"**minor allele**", and the columns are labeled by **variety name**. The values in each cell represent the **number of recessive alleles present (0, 1, or 2)** at this position in each variety.

In [2]:
# Read the chromosome-1 matrix, using the first CSV column as the row index.
df = pd.read_csv("chr1_A_matrix.csv", index_col=0)

In [3]:
df.head()

Unnamed: 0,B001,B002,B003,B004,B005,B006,B007,B008,B009,B010,...,IRIS_313-15901,IRIS_313-15902,IRIS_313-15903,IRIS_313-15904,IRIS_313-15905,IRIS_313-15906,IRIS_313-15907,IRIS_313-15908,IRIS_313-15909,IRIS_313-15910
1178_T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0
1203_C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,2.0,0.0,2.0,0.0,0.0,2.0,2.0,0.0
1248_A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1282_A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1299_C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0


In [4]:
df.shape

(42466, 3024)

### Step 2: Subset input data to run in-class exercises quickly

In this full dataset for Chromosome 1, we clearly have entries for 42,466 markers across 3024 rice varieties/samples.

Out of the 3024 varieties that have been genotyped we will use the first **200** to demonstrate the advantage of vectorized code and parallel processing.

In [5]:
# Keep every row but only the first 200 columns (positions 0-199).
df_slice = df.iloc[:, 0:200]

In [6]:
df_slice.head()

Unnamed: 0,B001,B002,B003,B004,B005,B006,B007,B008,B009,B010,...,B205,B207,B208,B210,B212,B213,B214,B215,B216,B217
1178_T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1203_C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
1248_A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1282_A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1299_C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df_slice.shape

(42466, 200)

In [8]:
# Save the 200-column subset to the working directory as df_slice.csv.
df_slice.to_csv("df_slice.csv")

### Step 3: Create pairwise column index comparison matrix

We need to compare each sample with every other sample. So we will generate all unique pairs of columns taking care not to repeat as that will be computationally expensive and redundant!

In [9]:
# Number of unordered sample pairs among the 200 selected varieties.
# (`math` is already imported in the first cell, so it is not re-imported here.)
math.comb(200,2)

19900

For the original dataset that contains 3024 samples that would mean a much larger number

In [10]:
math.comb(3024,2)

4570776

Define the function that compares elements of the two columns (**combn**, e.g., [1, 3] or [4, 7]) and calculates percentage difference in the columns. In this function, we will fix the dataframe argument to take in the data frame (**df**) of interest and let combination indices (**combn**) remain a variable to iterate over.


In [11]:
def Perc_diff(combn, df = df):
    """Compare two columns of ``df`` and report how often their values differ.

    Parameters
    ----------
    combn : list of int
        Positions of the two columns to compare, e.g. ``[1, 3]``.
    df : pandas.DataFrame
        Frame holding the columns. The default is the module-level ``df`` as it
        was bound when this cell was executed.

    Returns
    -------
    list
        ``[first column name, second column name, rows compared,
        rows that differ, percentage that differ rounded to 2 decimals]``.
    """
    selected = df.iloc[:, combn]
    # Only rows where neither of the two columns is missing are compared.
    complete = selected.dropna()
    first_name, second_name = complete.columns[0], complete.columns[1]
    n_sites = complete.shape[0]
    n_diff = np.sum(complete[first_name] != complete[second_name])
    pct_diff = round((n_diff * 100)/ n_sites, 2)
    return [selected.columns[0], selected.columns[1], n_sites, n_diff, pct_diff]


In [12]:
Perc_diff([1,3], df_slice)

['B002', 'B004', 42466, 2077, 4.89]

### Step 4: Performance comparisons!
#### (1) Brute force nested loops

In our first approach, let's ignore the function above and just run the calculations directly as nested for loops. Beware that this is probably the slowest and most inefficient way to do this!

In [13]:
def SNP_compare_loops(df):
    """Compare every pair of columns in ``df`` with explicit nested loops.

    This is the deliberately naive baseline for the timing comparison: the
    result table is grown one row at a time with ``pd.concat``.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sample, one row per marker.

    Returns
    -------
    pandas.DataFrame
        One row per unordered column pair with the columns ``Sample1``,
        ``Sample2``, ``Total_sites``, ``Diff_Sites`` and ``Percent_Diff``.
    """
    col_names = ["Sample1", "Sample2", "Total_sites", "Diff_Sites", "Percent_Diff"]
    results = pd.DataFrame(columns = col_names)
    # Iterate over the frame that was passed in, not the notebook-global df_slice.
    n_cols = len(df.columns)
    for i in range(n_cols-1):
        for j in range(i+1, n_cols):
            pair = df.iloc[:, [i,j]]
            # Only rows where both samples have a value are compared.
            pairNoNA = pair.dropna()
            TotalSites = pairNoNA.shape[0]
            DiffSites = np.sum(pairNoNA[pairNoNA.columns[0]] != pairNoNA[pairNoNA.columns[1]])
            PercDiff = round((DiffSites * 100)/ TotalSites, 2)
            tmp = pd.DataFrame([[pair.columns[0], pair.columns[1], TotalSites, DiffSites, PercDiff]], columns=col_names)
            # Intentionally slow: re-concatenating on every pair is part of the brute-force baseline.
            results = pd.concat([results, tmp], ignore_index=True)
    return results


In [14]:
%%time
SNP_compare_loops(df=df_slice)

CPU times: user 1min 26s, sys: 68.7 ms, total: 1min 26s
Wall time: 1min 27s


Unnamed: 0,Sample1,Sample2,Total_sites,Diff_Sites,Percent_Diff
0,B001,B002,42466,2437,5.74
1,B001,B003,42466,3278,7.72
2,B001,B004,42466,1994,4.70
3,B001,B005,42466,1933,4.55
4,B001,B006,42466,8492,20.00
...,...,...,...,...,...
19895,B214,B216,42466,6299,14.83
19896,B214,B217,42466,4563,10.75
19897,B215,B216,42466,8478,19.96
19898,B215,B217,42466,8405,19.79


#### (2) Replacing for loops with vectorization

We will try to vectorize this by removing the explicit for loops, and then check the execution time.

In [15]:
def SNP_compare_noloops(df):
    """Compare every pair of columns in ``df`` by mapping ``Perc_diff`` over the pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sample, one row per marker.

    Returns
    -------
    pandas.DataFrame
        One row per unordered column pair with the columns ``Sample1``,
        ``Sample2``, ``Total_sites``, ``Diff_Sites`` and ``Percent_Diff``.
    """
    col_names = ["Sample1", "Sample2", "Total_sites", "Diff_Sites", "Percent_Diff"]
    n_cols = len(df.columns)
    # All unique unordered pairs of column positions, each as a two-element list.
    pairs = [[i, j] for i in range(n_cols-1) for j in range(i+1, n_cols)]

    # Compare against the frame that was passed in, not the notebook-global df_slice.
    results = [Perc_diff(combn, df) for combn in pairs]
    return pd.DataFrame(results, columns=col_names)


In [16]:
%%time
SNP_compare_noloops(df=df_slice)

CPU times: user 36.3 s, sys: 86.4 ms, total: 36.3 s
Wall time: 36.6 s


Unnamed: 0,Sample1,Sample2,Total_sites,Diff_Sites,Percent_Diff
0,B001,B002,42466,2437,5.74
1,B001,B003,42466,3278,7.72
2,B001,B004,42466,1994,4.70
3,B001,B005,42466,1933,4.55
4,B001,B006,42466,8492,20.00
...,...,...,...,...,...
19895,B214,B216,42466,6299,14.83
19896,B214,B217,42466,4563,10.75
19897,B215,B216,42466,8478,19.96
19898,B215,B217,42466,8405,19.79


#### Profiling to see nitty-gritty of where the time is spent

In [17]:
%%prun
SNP_compare_noloops(df=df_slice)

 

#### (3) Speedup using multiprocessing

Now we will try to use parallel processing along with the vectorized code to speed this up even further. Notice if you only have just a few (e.g., 2-4) processors, all the overhead of mp will **slow you down**, and not speed things up!

In [18]:
# Data frame used by the current worker process; set by _init_perc_diff_worker.
_PERC_DIFF_WORKER_DF = None


def _init_perc_diff_worker(df):
    """Pool initializer: remember the data frame to compare in this worker process."""
    global _PERC_DIFF_WORKER_DF
    _PERC_DIFF_WORKER_DF = df


def _perc_diff_in_worker(combn):
    """Run Perc_diff for one column pair against the worker's data frame."""
    return Perc_diff(combn, _PERC_DIFF_WORKER_DF)


def SNP_compare_mprocess(df):
    """Compare every pair of columns in ``df`` using a pool of worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sample, one row per marker.

    Returns
    -------
    pandas.DataFrame
        One row per unordered column pair with the columns ``Sample1``,
        ``Sample2``, ``Total_sites``, ``Diff_Sites`` and ``Percent_Diff``.
    """
    n_cols = len(df.columns)
    # Generate all unique combinations of column positions, each as a two-element list
    pairs = [[i, j] for i in range(n_cols-1) for j in range(i+1, n_cols)]

    # One worker per core reported by mp.cpu_count(). The frame is handed to each
    # worker once through the initializer, so the comparison runs on the `df`
    # argument (not on Perc_diff's default frame) and only the index pairs are
    # sent with each task.
    # The context manager shuts the pool down even if a worker raises.
    with mp.Pool(mp.cpu_count(), initializer=_init_perc_diff_worker, initargs=(df,)) as pool:
        # Apply the function to every pair; pool.map blocks until all results are in
        results = pool.map(_perc_diff_in_worker, pairs)

    # Convert the results list into a dataframe
    return pd.DataFrame(results, columns = ['Sample1', 'Sample2', 'Total_sites', 'Diff_Sites', 'Percent_Diff'])


In [19]:
%%time
SNP_compare_mprocess(df=df_slice)

CPU times: user 662 ms, sys: 5.54 s, total: 6.2 s
Wall time: 14.6 s


Unnamed: 0,Sample1,Sample2,Total_sites,Diff_Sites,Percent_Diff
0,B001,B002,42466,2437,5.74
1,B001,B003,42466,3278,7.72
2,B001,B004,42466,1994,4.70
3,B001,B005,42466,1933,4.55
4,B001,B006,42466,8492,20.00
...,...,...,...,...,...
19895,B214,B216,42466,6299,14.83
19896,B214,B217,42466,4563,10.75
19897,B215,B216,42466,8478,19.96
19898,B215,B217,42466,8405,19.79


#### (4) An embarrassingly parallel solution

Now we do the exact same thing, but we use job arrays to break down a large task into multiple smaller tasks and submit them one after the other automatically. The Python code for this task is already available in your workspace in the file called SNP_compare_parallel.py. You can go through the file before you run this code block. <br>

We use the job submission manager, Slurm, to generate and submit the job array to the supercomputer. We use a wrapper called simple_slurm so we can execute it from within our notebook here. <br>

We have saved the smaller slice of our dataset as df_slice.csv in our workspace. We will try the solution first using this subset and then further also demonstrate the rapidity of this method using the full dataset.

In [20]:
def job_is_done(outprefix, n_tasks=10):
    """Return True once every expected per-task result file exists.

    Parameters
    ----------
    outprefix : str
        Name used both for the results directory and as the file-name prefix;
        the files checked are ``<outprefix>/<outprefix>_<i>.csv``.
    n_tasks : int, optional
        Number of array tasks, i.e. files ``0 .. n_tasks-1`` are expected.
        Defaults to 10.

    Returns
    -------
    bool
        False if even one of the expected files is missing, True otherwise.
    """
    # NOTE: this only checks that the files exist, not that they are completely written.
    return all(
        os.path.exists(outprefix+'/'+outprefix+'_'+str(i)+'.csv')
        for i in range(n_tasks)
    )

def SNP_compare_slurm(outprefix, df_file):
    """Run the pairwise comparison as a 10-task Slurm job array and gather the results.

    Parameters
    ----------
    outprefix : str
        Name of the results directory, also used as the result file prefix;
        the files read back are ``<outprefix>/<outprefix>_<i>.csv`` for i in 0-9.
        Files with these names left over from an earlier run are deleted first.
    df_file : str
        Path of the CSV file passed to ``SNP_compare_parallel.py``.

    Returns
    -------
    pandas.DataFrame
        The ten per-task result files concatenated with a fresh index.

    Raises
    ------
    Whatever ``wait`` raises if the result files do not all appear within
    600 seconds.
    """
    # Array indices 0-9; job_is_done() looks for the same ten output files.
    task_ids = range(0,10)
    result_paths = [outprefix+'/'+outprefix+"_%s.csv" %i for i in task_ids]

    # Remove outputs left by an earlier run. Otherwise the existence check below
    # succeeds immediately and stale results are read back.
    for path in result_paths:
        if os.path.exists(path):
            os.remove(path)

    # Configure the SLURM request
    slurm = Slurm(
        array=task_ids,
        cpus_per_task=8,
        job_name='SNP_parallel',
        output=f'{Slurm.JOB_NAME}_{Slurm.JOB_ARRAY_ID}.out',
        time=datetime.timedelta(days=0, hours=0, minutes=3, seconds=0),
    )
    # Be careful: our system still loads python2 by default, and that won't work!
    slurm.add_cmd('module load python3')

    # Submit the SLURM batch request
    # NOTE(review): the task id is already part of the command string; the extra
    # positional argument is kept as in the original. Confirm how the installed
    # simple_slurm version interprets it.
    slurm.sbatch('python SNP_compare_parallel.py '+outprefix+' '+df_file+' $SLURM_ARRAY_TASK_ID', Slurm.SLURM_ARRAY_TASK_ID)

    # Each array task writes its own result file into the results folder.
    # We have to wait until the slurm job is done before reading them.
    # NOTE: a file that exists is treated as finished; a partially written file
    # would not be detected here.
    wait(lambda: job_is_done(outprefix), timeout_seconds=600, waiting_for="slurm job to be done")

    # Concatenate the per-task files into a single dataframe.
    frames = [pd.read_csv(path, index_col = None, header = 0) for path in result_paths]
    return pd.concat(frames, axis=0, ignore_index=True)


##### First, with the smaller slice of the dataset.

In [21]:
outprefix = 'Results'
# Create the results directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(outprefix, exist_ok=True)

In [24]:
%%time
df_results = SNP_compare_slurm(outprefix=outprefix, df_file='df_slice.csv')

Submitted batch job 17883812

CPU times: user 38.7 ms, sys: 80.7 ms, total: 119 ms
Wall time: 15.3 s


In [25]:
df_results.shape

(19900, 5)

##### Now with the full dataset, which would have been prohibitively slow any other way!

In [26]:
outprefix = 'Results2'
# Create the results directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(outprefix, exist_ok=True)

In [27]:
%%time
df_results = SNP_compare_slurm(outprefix=outprefix, df_file='chr1_A_matrix.csv')

Submitted batch job 17883828

CPU times: user 2.6 s, sys: 739 ms, total: 3.34 s
Wall time: 2min 41s


In [28]:
df_results.shape

(4570776, 5)