In [1]:
import pandas as pd
import numpy as np

In [2]:
from data_engineering.get_data import read_dataset
from data_engineering.preprocessing import preprocessing

# Get data from GCS

In [3]:
# Load the dataset through the project helper `read_dataset`.
# NOTE(review): presumably `nb_partition` is the number of partition files to read - confirm in read_dataset.
# NOTE(review): `labels` is not referenced again in the visible part of this notebook.
betas, labels, cpg_sites, index = read_dataset('input/', gcs_prefix='pivot_data_v2/', nb_partition=110)

Using already downloaded data
Loaded dataset. Shape = (11122, 5000)


Then, let's build a Pandas DataFrame holding the beta values, with one row per sample (indexed by sample ID) and one column per CpG site.

In [4]:
# Wrap the `betas` matrix in a DataFrame: rows are labelled by `index`, columns by `cpg_sites`.
df = pd.DataFrame(betas, columns=cpg_sites, index=index)

In [5]:
df.shape

(11122, 5000)

In [6]:
# Copy the index into a regular column; it is turned back into the index after the merge
# with the staging table (see the later set_index('sample_id') call).
df['sample_id'] = df.index
df.head(5)

Unnamed: 0,cg23563234,cg23317501,cg23326689,cg15873301,cg18722841,cg25612480,cg06906435,cg04598121,cg14011639,cg21624282,...,cg26372517,cg05846716,cg26516759,cg02506908,cg07221454,cg17491456,cg06254453,cg26764244,cg19166347,sample_id
TCGA-AB-2911-03A,0.113222,0.452157,0.77849,0.803926,0.222304,0.500368,0.091369,0.385591,0.055292,0.948478,...,0.898581,0.181463,0.228306,0.959315,0.924679,0.412821,0.023767,0.293016,0.063232,TCGA-AB-2911-03A
TCGA-B4-5835-01A,0.239118,0.144815,0.237799,0.135512,0.102661,0.766249,0.465791,0.191826,0.113149,0.952183,...,0.358637,0.119976,0.15573,0.559,0.724539,0.101908,0.028191,0.178989,0.127335,TCGA-B4-5835-01A
TCGA-CJ-4887-01A,0.183911,0.099618,0.556424,0.192873,0.075017,0.906606,0.4931,0.525695,0.056658,0.572657,...,0.45256,,0.056214,0.480438,0.735401,0.074271,0.022588,0.11981,0.072984,TCGA-CJ-4887-01A
TCGA-BR-4194-01A,0.620407,0.614873,0.622979,0.503012,0.613254,0.44084,0.381805,0.661913,0.517967,0.656719,...,0.240547,0.072926,0.157968,0.541921,0.95805,0.075851,0.018219,0.139576,0.057367,TCGA-BR-4194-01A
TCGA-97-7937-01A,0.395937,0.576372,0.479536,0.287898,0.481378,0.279648,0.269111,0.42599,0.342929,0.424294,...,0.210352,0.178541,0.362542,0.40884,0.819064,0.192656,0.50924,0.095587,0.139774,TCGA-97-7937-01A


# Add cancer stage data

Add a participant id column

In [7]:
# The participant barcode is the first three dash-separated fields of the sample barcode
# (e.g. 'TCGA-AB-2911-03A' -> 'TCGA-AB-2911').
barcode_fields = df.index.str.split('-')
df['participant_id'] = barcode_fields.str[:3].str.join('-')

# Tuple of participant barcodes, one entry per sample row (duplicates included).
participant_ids = tuple(df['participant_id'].to_numpy())

Query the BigQuery table containing the cancer stage information for these participants

In [8]:
# Build the IN (...) list explicitly instead of interpolating the Python tuple repr:
# - the same participant barcode can appear once per sample, so de-duplicate (order preserved)
#   to keep the query text small;
# - a one-element tuple renders as ('X',) with a trailing comma and an empty one as (),
#   so the SQL text should not depend on how Python prints tuples.
# NOTE(review): barcodes are assumed to contain no quote characters; switch to BigQuery
# query parameters if these IDs can ever come from an untrusted source.
unique_participant_ids = list(dict.fromkeys(participant_ids))
if not unique_participant_ids:
    raise ValueError('participant_ids is empty; there is nothing to query')
in_list = ', '.join(f"'{participant_id}'" for participant_id in unique_participant_ids)

query = f"""
    SELECT *
    FROM `gcp-nyc.build_hackathon_dnanyc.patient_cancer_stage_v3`
    WHERE case_barcode IN ({in_list})
    """

In [9]:
# Run the query against BigQuery and load the result into a DataFrame.
# NOTE(review): pandas deprecated `pd.read_gbq` in 2.2 in favour of `pandas_gbq.read_gbq`;
# consider switching when the environment is upgraded.
clinical_stages = pd.read_gbq(query, project_id='gcp-nyc')

Downloading: 100%|██████████████████████████████████████████████████████████| 19922/19922 [00:01<00:00, 11799.96rows/s]


In [10]:
def combine_values(series):
    """Return the first non-null value of ``series``, or ``np.nan`` if there is none.

    Used as a groupby aggregation to collapse several records of the same case
    into a single value per column.

    Parameters
    ----------
    series : pandas.Series
        Values of one column for one group.

    Returns
    -------
    object
        The first non-null element in the original order, or ``np.nan`` when the
        series is empty or entirely null.
    """
    # De-duplicating first is unnecessary: the first non-null value is the same
    # with or without duplicates, so skip that extra hashing pass.
    non_null = series.dropna()
    if non_null.empty:
        return np.nan
    return non_null.iloc[0]

In [11]:
# Columns that carry staging information in the query result.
stage_columns = ['clinical_stage', 'clinical_T', 'clinical_N', 'clinical_M',
                 'pathologic_stage', 'pathologic_T', 'pathologic_N', 'pathologic_M']

# Discard records that have no staging value at all ...
clinical_stages = clinical_stages.dropna(axis=0, how='all', subset=stage_columns)

# ... then collapse to one row per case_barcode, letting combine_values pick the value
# kept for each staging column.
clinical_stages = clinical_stages.groupby('case_barcode').agg(
    {column: combine_values for column in stage_columns})

In [12]:
clinical_stages.head()

Unnamed: 0_level_0,clinical_stage,clinical_T,clinical_N,clinical_M,pathologic_stage,pathologic_T,pathologic_N,pathologic_M
case_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TCGA-04-1331,Stage IIIC,,,,,,,
TCGA-04-1332,Stage IIIC,,,,,,,
TCGA-04-1335,Stage IB,,,,,,,
TCGA-04-1336,Stage IIIB,,,,,,,
TCGA-04-1337,Stage IIIC,,,,,,,


Merge the two datasets

In [13]:
# Left join: every sample row is kept and receives the staging columns of its participant;
# samples whose participant has no staging record get NaN in those columns.
labels_merged = pd.merge(df, clinical_stages, how='left',
                         left_on='participant_id', right_on='case_barcode')

# Drop NAs

As there are patients for whom no staging information is known, we drop those rows

In [14]:
# A sample is kept as soon as at least one of these staging columns is filled in.
staging_columns = ['clinical_stage', 'clinical_T', 'clinical_N', 'clinical_M',
                   'pathologic_stage', 'pathologic_T', 'pathologic_N', 'pathologic_M']
labels_merged = labels_merged.dropna(axis='index', how='all', subset=staging_columns)
labels_merged.shape

(9378, 5010)

In [15]:
# Restore the sample barcode as the index and discard the helper join key.
labels_merged = labels_merged.set_index('sample_id')
labels_merged = labels_merged.drop(columns='participant_id')

In [16]:
labels_merged.head()

Unnamed: 0_level_0,cg23563234,cg23317501,cg23326689,cg15873301,cg18722841,cg25612480,cg06906435,cg04598121,cg14011639,cg21624282,...,cg26764244,cg19166347,clinical_stage,clinical_T,clinical_N,clinical_M,pathologic_stage,pathologic_T,pathologic_N,pathologic_M
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-B4-5835-01A,0.239118,0.144815,0.237799,0.135512,0.102661,0.766249,0.465791,0.191826,0.113149,0.952183,...,0.178989,0.127335,,,,,Stage I,T1,N0,M0
TCGA-CJ-4887-01A,0.183911,0.099618,0.556424,0.192873,0.075017,0.906606,0.4931,0.525695,0.056658,0.572657,...,0.11981,0.072984,,,,,Stage IV,T3a,NX,M1
TCGA-97-7937-01A,0.395937,0.576372,0.479536,0.287898,0.481378,0.279648,0.269111,0.42599,0.342929,0.424294,...,0.095587,0.139774,,,,,Stage IB,T2a,N0,MX
TCGA-EB-A5UL-06A,0.3257,0.201301,0.632854,0.475798,0.421415,0.334104,0.291824,0.173058,0.256726,0.456166,...,0.150554,0.093429,,,,,Stage III,TX,N1,M0
TCGA-V4-A9EC-01A,0.749972,0.080037,0.615426,0.035988,0.025803,0.394041,0.924438,0.045183,0.09521,0.747002,...,0.020322,0.067649,Stage IIB,T3a,N0,M0,Stage IIB,T3a,N0,M0


# Upload to GCS

In [17]:
labels_merged.shape

(9378, 5008)

Create partitions of 100 rows and upload the dataframe to GCS as partitioned CSV files

In [18]:
# Start offset of every 100-row partition. Derived from the actual frame length rather than
# a hard-coded row count, so no rows are dropped (and no empty files are written) when the
# input data changes.
idx = np.arange(0, len(labels_merged), 100)

In [19]:
# pandas 1.5 renamed the `line_terminator` keyword of to_csv to `lineterminator`, and
# pandas 2.0 removed the old spelling; pick whichever name the installed version accepts.
pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
newline_kwarg = 'lineterminator' if pandas_major_minor >= (1, 5) else 'line_terminator'

for i in idx:
    # Rows i .. i+99 (fewer in the last partition); the file name carries the 1-based
    # position of the partition's first row.
    sub_df = labels_merged.iloc[i:i+100, :]
    sub_df.to_csv(f'gs://build_hackathon_dnanyc/pivot_data_v4/betas_partition_{i+1}.csv',
                  **{newline_kwarg: '\n'})

# Old Transformation

Mapping cancer stage text to cancer stage label (keep only the stage and discard the substage)

In [20]:
# Lookup from the raw stage text to a coarse numeric stage (0-4); sub-stage suffixes are
# folded into their parent stage and values that cannot be placed on that scale map to NaN.
cancer_stages = {
    # No usable stage
    None: np.nan,
    'IS': np.nan,
    'Stage IS': np.nan,
    'Stage X': np.nan,
    'I/II NOS': np.nan,
    # Stage 0
    'Stage 0': 0,
    # Stage I
    'Stage I': 1,
    'Stage IA': 1, 'Stage IA1': 1, 'Stage IA2': 1,
    'Stage IB': 1, 'Stage IB1': 1, 'Stage IB2': 1,
    'Stage IC': 1,
    # Stage II
    'Stage II': 2,
    'Stage IIA': 2, 'Stage IIA1': 2, 'Stage IIA2': 2,
    'Stage IIB': 2,
    'Stage IIC': 2,
    # Stage III
    'Stage III': 3,
    'Stage IIIA': 3,
    'Stage IIIB': 3,
    'Stage IIIC': 3, 'Stage IIIC1': 3, 'Stage IIIC2': 3,
    # Stage IV
    'Stage IV': 4,
    'Stage IVA': 4, 'Stage IVB': 4, 'Stage IVC': 4,
}

In [21]:
# Prefer the clinical stage and fall back to the pathologic stage where it is missing,
# then translate the stage text into its numeric label via the cancer_stages lookup.
stage_text = clinical_stages['clinical_stage'].fillna(clinical_stages['pathologic_stage'])
clinical_stages['cancer_stage'] = stage_text.map(cancer_stages)