# GenoSurf API Example Use Case: 
## Extract TCGA pairs of items aligned to hg19 legacy assembly and to new GRCh38 assembly (from Genomic Data Commons portal)

### Import necessary libraries

In [0]:
import json
from ast import literal_eval
from functools import reduce

import pandas as pd
import requests

### Call API with query

The ```data``` variable contains the GenoSurf query in JSON format (it can be copied from the interface by clicking on MODIFY, then 'COPY TO CLIPBOARD').

The ```payload``` variable contains the query-string parameters sent along with the API POST request (the query itself travels in the JSON body). For advanced use refer to the first notebook (GenoSurf_rest_ENCODE_assemblies.ipynb).


In [0]:
# GenoSurf API endpoint used for table queries.
url = 'http://geco.deib.polimi.it/genosurf/api/query/table'

# Query as a JSON document (here: all items of the TCGA-CHOL project).
data = '{"gcm":{"project_name":["tcga-chol"]},"type":"original","kv":{}}'

# Parse the query as JSON rather than as a Python literal: JSON-only tokens
# such as true, false and null are not valid Python literals, so a query
# containing them would make ast.literal_eval raise.
data_dict = json.loads(data)

# Parameters sent in the query string of the request.
payload = {'agg': True}

### Load result dataset into Pandas DataFrame

Use ```len(response_json)``` to visualize the number of extracted items.

Use ```df.shape[1]``` to visualize the number of columns

Use ```df.columns``` for the complete list of columns

In [0]:
# POST the query: the query goes in the JSON body, `payload` in the query string.
# The timeout (in seconds) keeps the notebook from hanging forever if the
# server never answers.
response = requests.post(url, json=data_dict, params=payload, timeout=120)

# Fail right here with an explicit HTTP error instead of trying to build a
# dataframe out of an error response.
response.raise_for_status()
response_json = response.json()

df = pd.DataFrame(response_json)

#restrict the dataframe to only interesting columns
df = df[['item_source_id','alt_item_source_id','local_url','source_url', 'assembly','data_type']]

**Define the role of columns**

*   ```class_column```: used for distinguishing data in classes
*   ```grouped_columns```: used for grouping
*   ```other_columns```: all columns that are not used for grouping 



In [0]:
# Column whose values split the items into classes.
class_column = 'assembly'

# Column(s) whose values identify the items that belong together.
grouped_columns = ['alt_item_source_id']

# Every column that is not a grouping column, in the dataframe's own order.
other_columns = [column for column in df.columns if column not in grouped_columns]

**Compute desired output**

Use ```full_result.head(10)``` to visualize first rows of the final dataframe.

In the dataframe:
*   First columns are from ```grouped_columns```
*   Then we have columns used for distinguishing classes (column names are formed as ```column_name_<class_value>```, with the class value in lowercase, to separate classes, e.g. ```assembly_grch38``` and ```assembly_hg19```)


In [5]:
# Put the grouping column(s) first, followed by all the other columns.
df = df[grouped_columns + other_columns]

#define possible values of class column
# Sorted so the suffixed output columns always come in the same order.
# Missing values are dropped because groupby ignores them as well: keeping a
# NaN here would make every group look incomplete.
classes = df[class_column].dropna().sort_values().unique()

# One merged dataframe per group key that is present in every class.
result = []

#compute the pairs
for _group_key, group_df in df.groupby(grouped_columns):

  inner_grouped_df = group_df.groupby(class_column)

  # A key yields pairs only if it has at least one item in each class.
  has_all_classes = len(classes) > 0 and all(c in inner_grouped_df.groups for c in classes)
  if not has_all_classes:
    continue

  class_result = []
  for c in classes:
    suffix = "_" + str(c).lower()
    # rename() builds a new dataframe instead of overwriting the column labels
    # of the slice returned by get_group(). Grouping columns keep their names
    # so they can still be used as merge keys.
    class_result.append(
        inner_grouped_df.get_group(c).rename(
            columns=lambda col: col if col in grouped_columns else col + suffix))

  # Joining on the grouping columns produces every combination of this key's
  # items across the classes.
  merged = reduce(lambda left, right: pd.merge(left, right, on=grouped_columns), class_result)

  result.append(merged)

#builds the whole final dataframe result
if result:
  # Row labels restart from 0 for every group key and are kept as they are.
  full_result = pd.concat(result, axis=0, join='outer')
else:
  # pd.concat() raises ValueError on an empty list: when no key appears in
  # every class, fall back to an empty dataframe with the expected columns.
  full_result = pd.DataFrame(
      columns=grouped_columns
      + [col + "_" + str(c).lower() for c in classes for col in other_columns])

print("Number of pairs: ",full_result.shape[0])

full_result.head(10)

Number of pairs:  573


Unnamed: 0,alt_item_source_id,item_source_id_grch38,local_url_grch38,source_url_grch38,assembly_grch38,data_type_grch38,item_source_id_hg19,local_url_hg19,source_url_hg19,assembly_hg19,data_type_hg19
0,TCGA-3X-AAV9-01A-72D-A416-01,f81a2b3c-372a-4bd9-91d0-231fe860a9f2-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Copy Number Segment,TCGA-3X-AAV9-01A-72D-A416-01__cnv,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/cnv/TCGA-3X-...,HG19,cnv
1,TCGA-3X-AAV9-01A-72D-A416-01,f81a2b3c-372a-4bd9-91d0-231fe860a9f2-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Masked Copy Number Segment,TCGA-3X-AAV9-01A-72D-A416-01__cnv,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/cnv/TCGA-3X-...,HG19,cnv
0,TCGA-3X-AAV9-01A-72D-A417-09,dd621b60-9752-48be-967c-43ee49990150-msm,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Masked Somatic Mutation,TCGA-3X-AAV9-01A-72D-A417-09__dnaseq,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/dnaseq/TCGA-...,HG19,dna seq
0,TCGA-3X-AAV9-01A-72D-A418-05,a1d1c50a-efa8-431c-8a42-088150184c74-mbv,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Methylation Beta Value,TCGA-3X-AAV9-01A-72D-A418-05__dnamethylation,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/dnamethylati...,HG19,dna methylation
0,TCGA-3X-AAV9-01A-72R-A41D-13,23f90507-8e6f-4ce6-9d7c-232ce5bee650-ieq,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Isoform Expression Quantification,TCGA-3X-AAV9-01A-72R-A41D-13__mirnaseq_isoform,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/mirnaseq/iso...,HG19,isoform quantification
1,TCGA-3X-AAV9-01A-72R-A41D-13,23f90507-8e6f-4ce6-9d7c-232ce5bee650-ieq,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Isoform Expression Quantification,TCGA-3X-AAV9-01A-72R-A41D-13__mirnaseq_mirna,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/mirnaseq/mir...,HG19,mirna quantification
2,TCGA-3X-AAV9-01A-72R-A41D-13,23f90507-8e6f-4ce6-9d7c-232ce5bee650-meq,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,miRNA Expression Quantification,TCGA-3X-AAV9-01A-72R-A41D-13__mirnaseq_isoform,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/mirnaseq/iso...,HG19,isoform quantification
3,TCGA-3X-AAV9-01A-72R-A41D-13,23f90507-8e6f-4ce6-9d7c-232ce5bee650-meq,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,miRNA Expression Quantification,TCGA-3X-AAV9-01A-72R-A41D-13__mirnaseq_mirna,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/mirnaseq/mir...,HG19,mirna quantification
0,TCGA-3X-AAV9-01A-72R-A41I-07,59012a78-0e8f-4b99-af97-0dbb1d3d0513-geq,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Gene Expression Quantification,TCGA-3X-AAV9-01A-72R-A41I-07__rnaseqv2_exon,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/rnaseqv2/exo...,HG19,exon quantification
1,TCGA-3X-AAV9-01A-72R-A41I-07,59012a78-0e8f-4b99-af97-0dbb1d3d0513-geq,http://www.gmql.eu/gmql-rest/datasets/public.G...,ftp://bioinformatics.iasi.cnr.it/opengdc/bed/t...,GRCh38,Gene Expression Quantification,TCGA-3X-AAV9-01A-72R-A41I-07__rnaseqv2_gene,http://www.gmql.eu/gmql-rest/datasets/public.H...,ftp://bioinf.iasi.cnr.it/bed/chol/rnaseqv2/gen...,HG19,gene quantification


**Visualize and save the list of pairs**

If you are interested only in some of the pairs, extract them from the ```full_result``` dataframe and exploit them further.

In [0]:
# For each pair keep the shared identifier and the two local URLs.
pair_columns = ['alt_item_source_id', 'local_url_grch38', 'local_url_hg19']
pairs = full_result[pair_columns]

# Write the list of pairs to a CSV file in the current working directory.
pairs.to_csv("GenoSurf_rest_TCGA_assemblies.csv")