In [2]:
import pandas as pd
from google.cloud import storage
import os

In [3]:
# Notebook-wide configuration.
# BUCKET_NAME: GCS bucket that download_data() lists and downloads from.
BUCKET_NAME = 'build_hackathon_dnanyc'
# PROJECT_ID: default GCP project used by configure_gcs() to build the client.
PROJECT_ID = 'gcp-nyc'
# Unused prefix kept for reference (commented out).
# GCS_PREFIX = 'pivot_data_v2/'

In [4]:
def configure_gcs(project_id=PROJECT_ID):
    """Create a Google Cloud Storage client for the given project.

    Args:
        project_id: GCP project the client is created for. Defaults to the
            notebook-level ``PROJECT_ID``.

    Returns:
        The ``storage.Client`` instance.
    """
    return storage.Client(project=project_id)

In [5]:
def download_data(client, bucket_name, gcs_prefix, nb_partition=10,
                  destination='input_data/', debug=False):
    """Download the objects listed under a GCS prefix into a local folder.

    Args:
        client: Storage client exposing ``list_blobs(bucket_name, prefix=...)``.
        bucket_name: Name of the bucket to list.
        gcs_prefix: Only objects whose name starts with this prefix are listed.
        nb_partition: Maximum number of *listed entries* that are considered.
            Entries skipped because their name ends with '/' still count
            towards this limit. ``None`` means no limit.
        destination: Local directory the files are written to. It is created
            (including missing parents) when absent; a trailing slash is
            optional.
        debug: When true, print one confirmation line per downloaded file.

    Returns:
        List of local paths of the downloaded files, in listing order. Each
        file is stored under the last '/'-separated component of its object
        name, so objects sharing that component overwrite each other.
    """
    # List once and download from the listed blob objects directly; looking
    # each one up again by name costs one extra request per file.
    blobs = list(client.list_blobs(bucket_name, prefix=gcs_prefix))
    print([blob.name for blob in blobs])

    # makedirs handles nested destinations and is a no-op when the folder
    # already exists. An empty destination means the current directory.
    if destination:
        os.makedirs(destination, exist_ok=True)

    downloaded_files = []
    for blob in blobs[:nb_partition]:
        # A name ending in '/' has no file-name component to write to.
        if blob.name.endswith('/'):
            continue
        # os.path.join tolerates a destination with or without trailing slash.
        local_path = os.path.join(destination, blob.name.split('/')[-1])
        print(local_path)
        blob.download_to_filename(local_path)
        downloaded_files.append(local_path)
        if debug:
            print("Downloaded {} to {}".format(blob.name, local_path))

    return downloaded_files

In [6]:
def read_columns_to_keep(files):
    """Collect the column names stored one per line in a set of text files.

    Args:
        files: Iterable of local file paths. Files whose *name* contains
            '_SUCCESS' are ignored (presumably job-completion markers that
            hold no column names — confirm against the producer).

    Returns:
        A flat list of the whitespace-stripped, non-empty lines of all
        remaining files, in the order the files and lines are read.
        Duplicates are kept.
    """
    columns = []
    for path in files:
        # Look at the file name only, so a directory component that happens
        # to contain '_SUCCESS' does not cause every file to be skipped.
        if '_SUCCESS' in os.path.basename(path):
            continue
        with open(path, 'r') as handle:
            stripped = (line.strip() for line in handle)
            # Blank lines would otherwise become '' entries in the list that
            # is later used to build the SQL IN (...) clause.
            columns.extend(name for name in stripped if name)
    return columns

def get_patient_list(project_id, project_name):
    """Fetch the case barcodes recorded for one project.

    Args:
        project_id: GCP project passed to ``pd.read_gbq``.
        project_name: Value matched against ``project_short_name``. It is
            interpolated into the SQL text verbatim (no escaping).

    Returns:
        The ``case_barcode`` column of the query result as an array
        (``Series.values``).
    """
    sql = f"""
    SELECT case_barcode
    from `build_hackathon_dnanyc.patient_cancer_stage_v4`
    where project_short_name = '{project_name}'
    """
    return pd.read_gbq(sql, project_id=project_id)['case_barcode'].values

def read_labels(project_id, project_name):
    """Fetch the staging labels recorded for one project.

    Args:
        project_id: GCP project passed to ``pd.read_gbq``.
        project_name: Value matched against ``project_short_name``. It is
            interpolated into the SQL text verbatim (no escaping).

    Returns:
        DataFrame with the columns ``case_barcode``, ``pathologic_stage``,
        ``pathologic_T``, ``pathologic_N`` and ``pathologic_M`` as returned
        by the query.
    """
    sql = f"""
    SELECT case_barcode, pathologic_stage, pathologic_T, pathologic_N, pathologic_M
    from `build_hackathon_dnanyc.patient_cancer_stage_v4`
    where project_short_name = '{project_name}'
    """
    return pd.read_gbq(sql, project_id=project_id)

In [7]:
def download_from_bigquery(project_id, list_of_columns, list_of_patients):
    """Query all columns of the beta table for given probes and participants.

    Args:
        project_id: GCP project passed to ``pd.read_gbq``.
        list_of_columns: Iterable of strings matched against ``CpG_probe_id``.
        list_of_patients: Iterable of strings matched against
            ``participant_id``.

    Both lists are joined into quoted SQL ``IN`` lists verbatim (no escaping).

    Returns:
        DataFrame holding the query result.
    """
    probe_ids = "', '".join(list_of_columns)
    patient_ids = "', '".join(list_of_patients)
    sql = f"""
    SELECT *
    from `build_hackathon_dnanyc.brca_betas_clustered_efficient`
    where CpG_probe_id in ('{probe_ids}') and
          participant_id in ('{patient_ids}')
    """
    return pd.read_gbq(sql, project_id=project_id)

def download_from_bigquery2(project_id, list_of_columns):
    """Query beta values and sample identifiers for the given probes.

    Unlike ``download_from_bigquery`` this applies no participant filter and
    selects an explicit set of columns.

    Args:
        project_id: GCP project passed to ``pd.read_gbq``.
        list_of_columns: Iterable of strings matched against ``CpG_probe_id``;
            joined into a quoted SQL ``IN`` list verbatim (no escaping).

    Returns:
        DataFrame with the columns ``beta_value``, ``CpG_probe_id``,
        ``participant_id``, ``sample_id`` and ``aliquot_barcode`` as returned
        by the query.
    """
    probe_ids = "', '".join(list_of_columns)
    sql = f"""
    SELECT beta_value, CpG_probe_id, participant_id, sample_id, aliquot_barcode
    from `build_hackathon_dnanyc.brca_betas_clustered_efficient`
    where CpG_probe_id in ('{probe_ids}')
    """
    return pd.read_gbq(sql, project_id=project_id)

In [8]:
def merge_and_pivot(df_betas, df_patients, label_to_keep):
    """Pivot long-format beta values to one row per aliquot and attach a label.

    Args:
        df_betas: Long-format frame with the columns ``aliquot_barcode``,
            ``CpG_probe_id`` and ``beta_value``. Each
            (aliquot_barcode, CpG_probe_id) pair must be unique, otherwise
            ``DataFrame.pivot`` raises.
        df_patients: Frame containing ``case_barcode`` and the column named
            by ``label_to_keep``; other columns are ignored.
        label_to_keep: Name of the label column of ``df_patients`` to append.

    Returns:
        DataFrame indexed by ``label_barcode`` (the first 16 characters of
        the aliquot barcode) with one column per probe plus the label column.
        Aliquots without a matching patient get a missing label (left join).
    """
    # De-duplicate on exactly the columns that take part in the merge.
    # Rows that differ only in an unused column would otherwise survive and
    # duplicate every matching sample. Patients with genuinely conflicting
    # labels still yield one output row per distinct label.
    labels = df_patients[['case_barcode', label_to_keep]].drop_duplicates()

    df_p = df_betas.pivot(index="aliquot_barcode", columns='CpG_probe_id',
                          values='beta_value')
    # The first 16 characters of the aliquot barcode become the row label,
    # its first 12 characters are matched against case_barcode.
    df_p['label_barcode'] = df_p.index.str[:16]
    df_p = df_p.reset_index(drop=True)
    df_p['patient_id'] = df_p['label_barcode'].str[:12]

    df_final = df_p.merge(labels, how='left', left_on='patient_id',
                          right_on='case_barcode')
    # The join keys are no longer needed once the label is attached.
    df_final = df_final.drop(['patient_id', 'case_barcode'], axis=1)
    return df_final.set_index('label_barcode')

In [9]:
# Label column. It is used twice in this notebook: appended to the
# 'columns_to_keep/' GCS prefix below, and later passed to merge_and_pivot()
# as the name of the label column to keep (and into the output file name).
# NOTE(review): with the empty string the prefix matches every object under
# 'columns_to_keep/', but '' is presumably not a column of the labels frame,
# so the later merge would fail on a fresh Restart & Run All. The saved
# preview shows 'pathologic_M', i.e. the value was changed between runs.
# TODO confirm the intended value and set it here.
# column = 'pathologic_M'
column = ''
# Create the GCS client (uses the default PROJECT_ID)
client = configure_gcs()
# Download the column-list files from GCS (at most 20 listed entries)
files = download_data(client, BUCKET_NAME,
              'columns_to_keep/' + column,
              nb_partition=20,
              destination='columns_to_download/')
# Read the column names to keep from the downloaded files
columns = read_columns_to_keep(files)
# Get the list of patients to keep (currently disabled)
# patients = get_patient_list('gcp-nyc', 'TCGA-BRCA')



TransportError: HTTPSConnectionPool(host='oauth2.googleapis.com', port=443): Max retries exceeded with url: /token (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1108)')))

In [19]:
# Download beta values from BQ for the selected probe columns.
# Use the notebook-level PROJECT_ID instead of repeating the literal, so the
# project is configured in exactly one place.
df = download_from_bigquery2(PROJECT_ID, columns)

AttributeError: module 'google.api_core' has no attribute 'gapic_v1'

In [26]:
# Download labels from BQ (project taken from the notebook-level PROJECT_ID)
patient_df = read_labels(PROJECT_ID, 'TCGA-BRCA')
# Merge betas and labels and pivot data; `column` must name a label column
# of patient_df (e.g. 'pathologic_M').
df_final = merge_and_pivot(df, patient_df, column)

Downloading: 100%|██████████| 2196/2196 [00:00<00:00, 7645.42rows/s]


In [27]:
# Preview the first rows of the merged dataset (one row per label_barcode,
# one column per probe plus the label column)
df_final.head()

Unnamed: 0_level_0,cg00000292,cg00005847,cg00008493,cg00011459,cg00013618,cg00020533,cg00022866,cg00025991,cg00031162,cg00033773,...,cg27625732,cg27626424,cg27631256,cg27631817,cg27641018,cg27643859,cg27651218,cg27652350,cg27653134,pathologic_M
label_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-3C-AAAU-01A,0.678483,0.875122,0.954225,0.940438,0.936751,0.902312,0.88877,0.809445,0.366665,0.683956,...,0.912783,0.717687,0.936664,0.899484,0.423288,0.91791,0.965003,0.866693,0.913666,MX
TCGA-3C-AALI-01A,0.261045,0.68086,0.93386,0.943097,0.921116,0.907548,0.801373,0.703536,0.849201,0.842764,...,0.262924,0.884109,0.681356,0.818097,0.363782,0.941199,0.874776,0.696772,0.933253,M0
TCGA-3C-AALJ-01A,0.850628,0.774736,0.941596,0.9519,0.897482,0.908703,0.825329,0.719674,0.82189,0.509204,...,0.811744,0.928437,0.867457,0.850609,0.812567,0.934334,0.959235,0.715644,0.856873,M0
TCGA-3C-AALK-01A,0.733004,0.626697,0.936462,0.954865,0.883727,0.932274,0.78214,0.695371,0.814834,0.739096,...,0.822574,0.752031,0.894706,0.854904,0.873687,0.927892,0.971607,0.640974,0.850226,M0
TCGA-4H-AAAK-01A,0.796488,0.638573,0.94529,0.923485,0.913926,0.931722,0.784301,0.635154,0.747426,0.528892,...,0.534087,0.924514,0.916629,0.803331,0.820642,0.941901,0.959459,0.761174,0.817014,M0


In [28]:
# Save dataset to GCS.
# The bucket comes from the notebook-level BUCKET_NAME so it is configured in
# one place; the resulting URI is unchanged.
gcs_path = f'gs://{BUCKET_NAME}/training_data/columns_to_keep_v4/'
# The label column name is part of the file name, so `column` must be set to
# the label that was actually merged into df_final.
file_name = f'tcga-brca-data-{column}.csv'
df_final.to_csv(gcs_path + file_name)