# Example Data Requests

Exploring the data availabe from the Champions API

In [1]:
import requests
import pandas as pd
import string
import random
import pprint
import json
import time
import os
pp = pprint.PrettyPrinter(width=79, compact=True)

### Define your credentials

Credentials can be found on [your Lumin Workspace page](https://database.championsoncology.com/lumin/workstation/)

In [2]:
# Define user credentials
username = 'user@example.com'
client_id = 0
user_id = 0
token = ''

Below is a function we use to retrieve data.

We use this to abstract away the work of a data request.

You shouldn't need to change anything here.

Just run the cell so the function is available in the next step

In [3]:
def request_data(table="expression", cancer_type = ["all"], genes = [], samples = [], drugs = None, source = "PDX", data_set_name = None):
    
    if data_set_name is None:
        # generate a folder name if none was specified
        res = ''.join(random.choices(string.ascii_uppercase +
                             string.digits, k = 6))
        data_set_name = 'Data-'+res
    else:
        # Remove any special characters
        data_set_name = ''.join(e for e in data_set_name if e.isalnum())

    # Define the data request dictionary
    d = {
        "request_data_type": table,
        "request_cancer_type": cancer_type,
        "request_genes": genes,
        "request_models": samples,
        "request_agents": None,
        "request_dataset": source,
        "request_workspace_id": username,
        "request_client": client_id,
        "request_user": user_id,
        "request_mode": True,
        "request_display_error": False,
        "preview": True
    }

    # create the request
    headers={"authorization":"Bearer "+token}
    response = requests.post('https://lumin-fast-api.championsoncology.com/workstation/',json=d, headers=headers)
    task_id = None
    task_id = response.json()['task_id']

    if task_id is None:
        print("Error: No task_id returned. Please check the request details")
        return None
    else:
        print('')
        print("Data Request ID: {}".format(task_id))

        # check the status of the request until it's complete
        while True:
            response = requests.get('https://lumin-fast-api.championsoncology.com/tasks/poll/'+task_id+'?old_progress=0',json=d, headers=headers)
            state = response.json()['state']
            if state == 'SUCCESS':
                result = response.json()['result']
                rd = json.loads(result)
                #pp.pprint(rd)
                folder = rd['name']
                break
            elif state == 'PROGRESS':
                print("Still loading...")
            elif state == 'error':
                print("Error: {}".format(response.json()['error']))
                return None, None
            else:
                print("Status: {}".format(state))
                time.sleep(20)
            #pp.pprint(response.json())

        # Change the folder name to the data set name
        directory = os.path.expanduser("~")
        #print(directory)
        os.rename(directory+'/'+folder, directory+'/'+data_set_name)

        # combine the files and save as a CSV
        folder = directory+"/"+data_set_name
        data = pd.DataFrame()
        i = 0
        for file in os.listdir(folder):
            i = i+1
            print(f'reading file {i} {file} ...')
            df = pd.read_json(folder+'/'+file)
            data = pd.concat([data, df])
    
        #print(data)
        data.to_csv(folder+'/'+data_set_name+'.csv', encoding='utf-8')

        # return the file name and location to the user
        return data,folder+'/'+data_set_name+'.csv'

print("Function loaded")

Function loaded


### 2. Request Data

#### Load list of gene/samples of interest from from a file

In [4]:
# Load csv with genes of interest
# Either treated or untreated with taxane
genes_df = pd.read_csv('../Lists/100_random_oncogenes.csv', names=['gene'])
# sort by model id
genes_df.sort_values('gene')
print(genes_df)

      gene
0     MEN1
1    FGFR3
2    KIF5B
3     SDC4
4    FOXA1
..     ...
95   KDM6A
96   FANCA
97  STAT5B
98   ASXL1
99     ELN

[100 rows x 1 columns]


#### Request RNAseq with genes of interest loaded from list

In [None]:
# First Run
# Load RNAseq data for 100 random oncogenes
df, location = request_data(table="expression", genes=list(genes_df['gene']), data_set_name="Example_genes")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

# Subsequent Runs
# Example of loading from CSV next time
#location = '../../100genesRNAseq/100genesRNAseq.csv'
#data_df = pd.read_csv(location)
#data_df = data_df[['model_name','log.tpm','tumor_type','gene']]
#print(data_df.head(5))
#print(data_df.shape)

#### Request RNAseq with sample IDs

In [None]:
df, location = request_data(table="expression", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### Request Mutations with sample IDs

In [None]:
df, location = request_data(table="mutations", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### Request Copy Number Variations with sample IDs

In [None]:
df, location = request_data(table="copynumber", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### Request Fusions with sample IDs

In [None]:
df, location = request_data(table="fusions", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### Clinical metadata

In [None]:
df, location = request_data(table="clinical", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### Clinical response data

In [None]:
df, location = request_data(table="clinical_treatments", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### TGI

In [None]:
df, location = request_data(table="TGI", samples=['CTG-0009'], data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### TCGA

In [None]:
df, location = request_data(source="TCGA", genes=list(genes_df['gene']), table="", data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

#### CPTAC

In [None]:
df, location = request_data(source="TCGA", genes=list(genes_df['gene']), table="", data_set_name="Example_samples")
print("Data saved to {} and available as df".format(location))
print("Data preview:")
print(df)

### Coming Soon

#### GTEX data and example notebook