# GenoSurf API Example Use Case: 
## Extract pairs of TCGA items from same patient and corresponding to normal/tumoral tissue

### Import necessary libraries

In [0]:
import json
from ast import literal_eval
from functools import reduce

import pandas as pd
import requests

### Call API with query

The ```data``` variable contains the GenoSurf query in JSON format (it can be copied from the interface by clicking on MODIFY, then 'COPY TO CLIPBOARD').

The ```payload``` variable contains the JSON payload of the API POST request. For advanced use refer to the first notebook (GenoSurf_rest_ENCODE_assemblies.ipynb)




In [0]:
# GenoSurf API endpoint queried by this notebook
url = 'http://geco.deib.polimi.it/genosurf/api/query/table'

# GenoSurf query as a JSON string, exactly as copied from the web interface
data = '{"gcm":{"source":["tcga"],"disease":["lung adenocarcinoma"]},"type":"original","kv":{}}'

# Parse with a real JSON parser: ast.literal_eval only understands Python
# literals, so a query containing JSON true/false/null would raise ValueError.
data_dict = json.loads(data)

# Extra parameters sent with the request (see the first notebook for advanced use)
payload = {'agg': True}

### Load result dataset into Pandas DataFrame

Use ```len(response_json)``` to visualize the number of extracted items.

Use ```df.shape[1]``` to visualize the number of columns

Use ```df.columns``` for the complete list of columns

In this example we are interested in:
* ```item_source_id``` and ```local_url``` to completely identify the item
* ```assembly```, ```donor_source_id```, ```data_type```, ```pipeline```, ```platform``` to group items
* ```is_healthy``` to distinguish items

In [0]:
# Submit the query. The timeout keeps the cell from hanging indefinitely
# when the service is unreachable or stalls.
response = requests.post(url, json=data_dict, params=payload, timeout=120)

# Fail loudly on an HTTP error instead of trying to parse an error body as the result
response.raise_for_status()
response_json = response.json()

# One row per extracted item
df = pd.DataFrame(response_json)

#restrict the dataframe to only interesting columns
df = df[['item_source_id','local_url','donor_source_id','is_healthy', 'assembly','data_type','pipeline','platform']]

**Define the role of columns**

*   ```class_column```: used for distinguishing data in classes (e.g., ```assembly``` or ```is_healthy```)
*   ```grouped_columns```: used for grouping
*   ```other_columns```: includes the class column (and those derived from it) plus all columns that are unique to each item (e.g., IDs, URLs); in practice, it is every column that is not listed in ```grouped_columns```



In [0]:
# Column whose values split the items into classes
class_column = 'is_healthy'
# Columns whose combined values define a group of items
grouped_columns = ['assembly','donor_source_id','data_type','pipeline','platform']
# Every remaining column, in the dataframe's own column order
other_columns = [column for column in df.columns if column not in grouped_columns]

**Compute desired output**

Use ```full_result.head(10)``` to visualize first rows of the final dataframe.

In the dataframe:
*   First columns are from ```grouped_columns```
*   Then we have columns used for distinguishing classes (column names are formed as ```column_name_<class_value>``` to separate classes, e.g. 
```is_healthy_false``` and ```is_healthy_true```)


In [5]:
# Put the grouping columns first, followed by the per-item columns
df = df[grouped_columns + other_columns]

#define possible values of class column
# Sorted so the order of the output columns is deterministic. Missing values
# are dropped: groupby ignores NaN keys, so a NaN "class" could never be found
# in any group and would suppress every pair.
classes = df[class_column].dropna().sort_values().unique()

#perform group by on the dataframe
grouped_df = df.groupby(grouped_columns)

result = []

#compute the pairs
for group_key, group_df in grouped_df:

  inner_grouped_df = group_df.groupby(class_column)

  # Keep the group only if it holds at least one item of every class
  if not all(c in inner_grouped_df.groups for c in classes):
    continue

  class_result = []
  for c in classes:
    # str() makes the suffix work for non-string class values (e.g. real booleans)
    suffix = "_" + str(c).lower()
    class_df = inner_grouped_df.get_group(c)
    # rename() returns a new frame, so the frame handed out by get_group is
    # left untouched; only the non-grouping columns receive the class suffix
    renamed = class_df.rename(
        columns={col: col + suffix for col in class_df.columns if col not in grouped_columns})
    class_result.append(renamed)

  # Join the per-class frames on the grouping columns: one row per combination
  # of items taken from each class
  merged = reduce(lambda left, right: pd.merge(left, right, on=grouped_columns), class_result)

  result.append(merged)

#builds the whole final dataframe result
if result:
  # ignore_index gives every pair its own row label instead of restarting at 0
  # for each group
  full_result = pd.concat(result, axis=0, join='outer', ignore_index=True)
else:
  # No group contains all classes: return an empty frame with the expected
  # columns rather than letting pd.concat fail on an empty list
  full_result = pd.DataFrame(
      columns=grouped_columns + [col + "_" + str(c).lower() for c in classes for col in other_columns])

print("Number of pairs: ",full_result.shape[0])

full_result.head(10)

Number of pairs:  1436


Unnamed: 0,assembly,donor_source_id,data_type,pipeline,platform,item_source_id_false,local_url_false,is_healthy_false,item_source_id_true,local_url_true,is_healthy_true
0,GRCh38,0075437e-ba1a-46be-86d6-9773209a2b5e,Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,5f224fa1-13d6-4310-a238-7482cd3bdbc6-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,d3a4622d-65ae-44c2-85ba-c0d7bffdc551-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,0075437e-ba1a-46be-86d6-9773209a2b5e,Masked Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,5f224fa1-13d6-4310-a238-7482cd3bdbc6-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,d3a4622d-65ae-44c2-85ba-c0d7bffdc551-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,009be09b-f9f6-43b7-8f45-4a648f8123ce,Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,124240f8-701b-4f9e-95ea-6e30c3e3209a-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,c754420d-d8b2-4f07-b9f5-8f6b7b8b5223-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,009be09b-f9f6-43b7-8f45-4a648f8123ce,Masked Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,124240f8-701b-4f9e-95ea-6e30c3e3209a-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,c754420d-d8b2-4f07-b9f5-8f6b7b8b5223-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,01e9888d-b5b9-48f1-8ba6-8a89af108a04,Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,2ffac0f4-2223-40a2-a315-ce26ec2244e4-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,5ed63725-ae72-4157-bcec-c082cc0048c0-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,01e9888d-b5b9-48f1-8ba6-8a89af108a04,Masked Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,2ffac0f4-2223-40a2-a315-ce26ec2244e4-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,5ed63725-ae72-4157-bcec-c082cc0048c0-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,0232d299-4cdf-4fd7-9a5e-8d13c208b40c,Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,17c02f48-b544-45ad-8f10-62292b26d04d-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,427300d7-c99a-42bf-8fc0-e3c234309737-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,0232d299-4cdf-4fd7-9a5e-8d13c208b40c,Masked Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,17c02f48-b544-45ad-8f10-62292b26d04d-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,427300d7-c99a-42bf-8fc0-e3c234309737-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,028e99e9-5b9a-4954-bb6e-6d4709a3cea8,Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,1253f3d7-92ff-419b-946a-6962d84d8fb6-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,6ef5a769-1330-46fb-9e3d-8608295701f9-cns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True
0,GRCh38,028e99e9-5b9a-4954-bb6e-6d4709a3cea8,Masked Copy Number Segment,DNAcopy,Affymetrix SNP 6.0,1253f3d7-92ff-419b-946a-6962d84d8fb6-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,False,6ef5a769-1330-46fb-9e3d-8608295701f9-mcns,http://www.gmql.eu/gmql-rest/datasets/public.G...,True


**Visualize and save the list of pairs**

If you are interested in only some of the pairs, extract them from ```full_result``` and use them further.

In [0]:
# Columns kept for each pair: the grouping keys plus, for each class,
# the item identifier and its class value
pair_columns = grouped_columns + [
    'item_source_id_false', 'is_healthy_false',
    'item_source_id_true', 'is_healthy_true',
]
pairs = full_result[pair_columns]

# Save the list of pairs as a CSV file in the current working directory
pairs.to_csv("GenoSurf_rest_TCGA_healthy_vs_tumor.csv")