# GenoSurf API Example Use Case: 
## Extract ENCODE pairs of items belonging to a same sample but mapped to different assemblies

###Import necessary libraries

In [0]:
import requests
from ast import literal_eval
import pandas as pd
from functools import reduce

###Call API with query

The ```data``` variable contains the GenoSurf query in JSON format (it can be downloaded from the interface clicking on MODIFY, then 'COPY TO CLIPBOARD').

The ```payload``` variable contains the JSON payload of the API POST request.  For advanced use, note that: 
* ```agg``` allows to switch from aggregated to replicated mode,
* ```order_col``` defines the ordering w.r.t. a specific column, 
* ```order_dir``` switches between 'asc' and 'desc' values, 
* ```rel_distance``` defines the ontological depth set for the query (at the moment we only support up to 3 levels)






In [0]:
url = 'http://geco.deib.polimi.it/genosurf/api/query/table'

data = '{"gcm":{"source":["encode"],"disease":["metastatic neuroblastoma from bone marrow"]},"type":"original","kv":{}}'

data_dict = literal_eval(data)

payload = {'agg': True}
#payload = {'agg': True, 'order_col': 'item_source_id', 'order_dir':'asc', 'rel_distance':3}

###Load result dataset into Pandas DataFrame

Use ```len(response_json)``` to visualize the number of extracted items.

Use ```df.shape[1]``` to visualize the number of columns

Use ```df.columns``` for the complete list of columns


In [0]:
response = requests.post(url, json=data_dict, params=payload)
response_json = response.json()

df = pd.DataFrame(response_json)

#optionally predefine a set of columns interesting for your query
#df = df[['assembly','biosample_source_id','file_format','item_source_id',]]

**Define the role of columns**

*   ```class_column```: used for distinguishing data in classes (e.g., ```assembly``` or ```is_healthy```)
*   ```grouped_columns```: used for grouping
*   ```other_columns```: includes the class column (i.e., ```assembly```), those derived from it (i.e., ```dataset_name```, all columns that are unique for items (e.g. id, URLs), and all columns that have only null values for the considered itemset (e.g., ```tissue```, ```ethnicity```, ```alt_biosample_source_id```)



In [0]:
class_column = 'assembly' 
other_columns = ['assembly',
                 'dataset_name',
                 'item_source_id','source_url','local_url','source_page','date','size',
                 'tissue','ethnicity','alt_biosample_source_id','alt_donor_source_id','alt_item_source_id',]
grouped_columns = list(filter(lambda x: not x in other_columns, df.columns))

**Compute desired output**

Use ```full_result.head(10)``` to visualize first rows of the final dataframe.

In the dataframe:
*   First columns are from ```grouped_columns```
*   Then we have columns used for distinguishing classes (column names are formed as ```column_name_<class_value>``` to separate classes, e.g. 
```assembly_GRCh38``` and ```assembly_hg19```)


In [5]:
df = df[grouped_columns + other_columns]

#define possible values of class column
classes = df[class_column].sort_values().unique()

#perform group by on the dataframe
grouped_df = df.groupby(grouped_columns)

result = []

#compute the pairs
for group_key, group_df in grouped_df:

  inner_grouped_df = group_df.groupby(class_column)
  
  has_all_classes = reduce((lambda x, y: x and y), map(lambda x: x in inner_grouped_df.groups.keys(), classes))
    
  if has_all_classes:
    class_result = []
    for c in classes:
      inner_group_df = inner_grouped_df.get_group(c) 
      inner_group_df.columns = map(lambda x: x if x in grouped_columns else x + "_" + c.lower() ,inner_group_df.columns)
      class_result.append(inner_group_df)
    
    merged = reduce(lambda left,right: pd.merge(left,right,on=grouped_columns), class_result)

    result.append(merged)
    
result

#builds the whole final dataframe result    
full_result = pd.concat(result, axis=0, join='outer')

print("Number of pairs: ",full_result.shape[0])

full_result.head(10)

Number of pairs:  197


Unnamed: 0,age,antibody,biological_replicate_count,biological_replicate_number,biosample_source_id,biosample_type,cell,content_type,data_type,disease,donor_source_id,feature,file_format,gender,is_annotation,is_healthy,pipeline,platform,project_name,source,source_site,species,target,technical_replicate_count,technical_replicate_number,technique,assembly_grch38,dataset_name_grch38,item_source_id_grch38,source_url_grch38,local_url_grch38,source_page_grch38,date_grch38,size_grch38,tissue_grch38,ethnicity_grch38,alt_biosample_source_id_grch38,alt_donor_source_id_grch38,alt_item_source_id_grch38,assembly_hg19,dataset_name_hg19,item_source_id_hg19,source_url_hg19,local_url_hg19,source_page_hg19,date_hg19,size_hg19,tissue_hg19,ethnicity_hg19,alt_biosample_source_id_hg19,alt_donor_source_id_hg19,alt_item_source_id_hg19
0,1460,AR:AB_10845793,1,1,ENCBS324TZB,cell line,SK-N-SH,peaks,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"broad histone mark, histone",narrowPeak,female,False,False,Histone ChIP-seq,Illumina HiSeq 2000,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,H3F3A,1,1_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF741ZLK,https://www.encodeproject.org/files/ENCFF741ZL...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF741ZLK,2016-10-19,4503668,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF607WBJ,https://www.encodeproject.org/files/ENCFF607WB...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF607WBJ,2016-10-19,4487405,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10845793,1,2,ENCBS500EWH,cell line,SK-N-SH,peaks,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"broad histone mark, histone",narrowPeak,female,False,False,Histone ChIP-seq,Illumina HiSeq 2500,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,H3F3A,1,2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF052QOR,https://www.encodeproject.org/files/ENCFF052QO...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF052QOR,2016-10-19,3425748,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF379HKR,https://www.encodeproject.org/files/ENCFF379HK...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF379HKR,2016-10-19,3403473,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10845793,2,1 | 2,ENCBS324TZB | ENCBS500EWH,cell line,SK-N-SH,peaks,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"broad histone mark, histone",narrowPeak,female,False,False,Histone ChIP-seq,"Illumina HiSeq 2000, Illumina HiSeq 2500",ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,H3F3A,2,1_1 | 2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF996FFY,https://www.encodeproject.org/files/ENCFF996FF...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF996FFY,2016-10-19,4469076,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF801FXF,https://www.encodeproject.org/files/ENCFF801FX...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF801FXF,2016-10-19,4439926,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10845793,2,1 | 2,ENCBS324TZB | ENCBS500EWH,cell line,SK-N-SH,replicated peaks,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"broad histone mark, histone",narrowPeak,female,False,False,Histone ChIP-seq,"Illumina HiSeq 2000, Illumina HiSeq 2500",ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,H3F3A,2,1_1 | 2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF549AYZ,https://www.encodeproject.org/files/ENCFF549AY...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF549AYZ,2016-10-19,2682337,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF202WFY,https://www.encodeproject.org/files/ENCFF202WF...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF202WFY,2016-10-19,2677996,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10860536,1,1,ENCBS500EWH,cell line,SK-N-SH,peaks and background as input for IDR,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"chromatin remodeller, other post-translational...",narrowPeak,female,False,False,Transcription factor ChIP-seq,Illumina HiSeq 2500,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,EZH2phosphoT487,1,1_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF948DTB,https://www.encodeproject.org/files/ENCFF948DT...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF948DTB,2017-01-15,4977524,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF139FDW,https://www.encodeproject.org/files/ENCFF139FD...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF139FDW,2017-01-15,5120884,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10860536,1,2,ENCBS018DDZ,cell line,SK-N-SH,peaks and background as input for IDR,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"chromatin remodeller, other post-translational...",narrowPeak,female,False,False,Transcription factor ChIP-seq,Illumina HiSeq 2500,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,EZH2phosphoT487,1,2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF093XEF,https://www.encodeproject.org/files/ENCFF093XE...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF093XEF,2017-01-15,6618039,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF978OGV,https://www.encodeproject.org/files/ENCFF978OG...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF978OGV,2017-01-15,6664109,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10860536,2,1 | 2,ENCBS018DDZ | ENCBS500EWH,cell line,SK-N-SH,conservative idr thresholded peaks,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"chromatin remodeller, other post-translational...",narrowPeak,female,False,False,Transcription factor ChIP-seq,Illumina HiSeq 2500,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,EZH2phosphoT487,2,1_1 | 2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF280JJA,https://www.encodeproject.org/files/ENCFF280JJ...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF280JJA,2017-01-15,26030,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF507HLD,https://www.encodeproject.org/files/ENCFF507HL...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF507HLD,2017-01-15,25403,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10860536,2,1 | 2,ENCBS018DDZ | ENCBS500EWH,cell line,SK-N-SH,optimal idr thresholded peaks,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"chromatin remodeller, other post-translational...",narrowPeak,female,False,False,Transcription factor ChIP-seq,Illumina HiSeq 2500,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,EZH2phosphoT487,2,1_1 | 2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF643ZSI,https://www.encodeproject.org/files/ENCFF643ZS...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF643ZSI,2017-01-15,34275,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF640LEB,https://www.encodeproject.org/files/ENCFF640LE...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF640LEB,2017-01-15,35158,N/D,N/D,N/D,N/D,
0,1460,AR:AB_10860536,2,1 | 2,ENCBS018DDZ | ENCBS500EWH,cell line,SK-N-SH,peaks and background as input for IDR,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"chromatin remodeller, other post-translational...",narrowPeak,female,False,False,Transcription factor ChIP-seq,Illumina HiSeq 2500,ENCODE,ENCODE,"Bradley Bernstein, Broad",Homo sapiens,EZH2phosphoT487,2,1_1 | 2_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF090MFL,https://www.encodeproject.org/files/ENCFF090MF...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF090MFL,2017-01-15,6898090,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF252AKV,https://www.encodeproject.org/files/ENCFF252AK...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF252AKV,2017-01-15,6943679,N/D,N/D,N/D,N/D,
0,1460,AR:AB_1603773,1,1,ENCBS007RAH,cell line,SK-N-SH,peaks and background as input for IDR,peaks,Metastatic neuroblastoma from bone marrow,ENCDO000ABD,"RNA binding protein, transcription factor",narrowPeak,female,False,False,Transcription factor ChIP-seq,Illumina Genome Analyzer IIx,ENCODE,ENCODE,"Michael Snyder, Stanford",Homo sapiens,CHD2,1,1_1,ChIP-seq,GRCh38,GRCh38_ENCODE_NARROW_2019_01,ENCFF958AXK,https://www.encodeproject.org/files/ENCFF958AX...,http://www.gmql.eu/gmql-rest/datasets/public.G...,https://www.encodeproject.org/files/ENCFF958AXK,2018-01-26,5210174,N/D,N/D,N/D,N/D,,hg19,HG19_ENCODE_NARROW_2019_01,ENCFF506WYU,https://www.encodeproject.org/files/ENCFF506WY...,http://www.gmql.eu/gmql-rest/datasets/public.H...,https://www.encodeproject.org/files/ENCFF506WYU,2018-01-26,5386237,N/D,N/D,N/D,N/D,


**Visualize the list of pairs**

If you are interested only in some of the the pairs list, extract if from the ```full_result``` and exploit it further.

In [6]:
pairs = full_result[['biosample_source_id', 'assembly_grch38', 'item_source_id_grch38', 'local_url_grch38', 'assembly_hg19', 'item_source_id_hg19', 'local_url_hg19']]

pairs.head(10)

Unnamed: 0,biosample_source_id,assembly_grch38,item_source_id_grch38,local_url_grch38,assembly_hg19,item_source_id_hg19,local_url_hg19
0,ENCBS324TZB,GRCh38,ENCFF741ZLK,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF607WBJ,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS500EWH,GRCh38,ENCFF052QOR,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF379HKR,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS324TZB | ENCBS500EWH,GRCh38,ENCFF996FFY,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF801FXF,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS324TZB | ENCBS500EWH,GRCh38,ENCFF549AYZ,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF202WFY,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS500EWH,GRCh38,ENCFF948DTB,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF139FDW,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS018DDZ,GRCh38,ENCFF093XEF,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF978OGV,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS018DDZ | ENCBS500EWH,GRCh38,ENCFF280JJA,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF507HLD,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS018DDZ | ENCBS500EWH,GRCh38,ENCFF643ZSI,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF640LEB,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS018DDZ | ENCBS500EWH,GRCh38,ENCFF090MFL,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF252AKV,http://www.gmql.eu/gmql-rest/datasets/public.H...
0,ENCBS007RAH,GRCh38,ENCFF958AXK,http://www.gmql.eu/gmql-rest/datasets/public.G...,hg19,ENCFF506WYU,http://www.gmql.eu/gmql-rest/datasets/public.H...


**Save the list of pairs**

In [0]:
pairs.to_csv("GenoSurf_rest_ENCODE_assemblies.csv")