In [1]:
import pandas as pd
import numpy as np

In [2]:
from data_engineering.get_data import read_dataset
from data_engineering.preprocessing import preprocessing

# Get data from GCS

In [20]:
# Load the dataset through the project helper imported from
# data_engineering.get_data.
# NOTE(review): presumably 'input/' is the local working directory and
# nb_partition the number of partition files to read — confirm against
# read_dataset.
betas, labels, cpg_sites, index = read_dataset(
    'input/',
    gcs_prefix='pivot_data_v2/',
    nb_partition=110,
)

Downloading data...
Loaded dataset. Shape = (1000, 5000)


Next, let's build a pandas DataFrame containing only the sample IDs and their labels.

In [4]:
# One row per sample: its identifier alongside its label.
df_labels = pd.DataFrame(dict(sample_id=index, sample_label=labels))

In [5]:
# Preview the first five rows (the default for head()).
df_labels.head()

Unnamed: 0,sample_id,sample_label
0,TCGA-AB-2911-03A,tumor
1,TCGA-B4-5835-01A,tumor
2,TCGA-CJ-4887-01A,tumor
3,TCGA-BR-4194-01A,tumor
4,TCGA-97-7937-01A,tumor


# Add cancer stage data

Add a participant id column

In [6]:
# The participant id is the first three dash-separated fields of the sample id
# (e.g. 'TCGA-AB-2911-03A' -> 'TCGA-AB-2911').
df_labels['participant_id'] = (
    df_labels['sample_id']
    .str.split('-')
    .str[:3]
    .str.join('-')
)
# One entry per sample, so a participant with several samples appears several times.
participant_ids = tuple(df_labels['participant_id'])

Query the BigQuery table containing the cancer stage information for these participants

In [7]:
# Build the IN list explicitly instead of interpolating the tuple's repr:
#   * a one-element tuple renders as ('X',) whose trailing comma is not a valid
#     SQL list;
#   * participant_ids holds one entry per sample, so de-duplicating (order
#     preserved) keeps the query text as small as possible.
# repr() quotes each id exactly as the tuple repr did.
if not participant_ids:
    raise ValueError('participant_ids is empty: nothing to query')
participant_ids_sql = ', '.join(repr(pid) for pid in dict.fromkeys(participant_ids))

query = f"""
    SELECT *
    FROM `gcp-nyc.build_hackathon_dnanyc.patient_cancer_stage_v2`
    WHERE case_barcode in ({participant_ids_sql})
    """

In [8]:
# Run the query in the 'gcp-nyc' project and load the result into a DataFrame.
clinical_stages = pd.read_gbq(query=query, project_id='gcp-nyc')

Downloading: 100%|████████████████████████████████████████████████████████████| 9943/9943 [00:00<00:00, 13583.20rows/s]


Map the cancer stage text to a numeric stage label (keep only the main stage and discard the substage)

In [9]:
# Lookup table from the stage text to a numeric main stage (0-4).
# Sub-stages (A/B/C, 1/2) collapse onto their main stage; entries that carry no
# usable main stage map to NaN.
cancer_stages = {
    # No usable stage
    **dict.fromkeys([None, 'IS', 'Stage IS', 'Stage X', 'I/II NOS'], np.nan),
    # Stage 0
    **dict.fromkeys(['Stage 0'], 0),
    # Stage I and its sub-stages
    **dict.fromkeys(['Stage I', 'Stage IA', 'Stage IA1', 'Stage IA2',
                     'Stage IB', 'Stage IB1', 'Stage IB2', 'Stage IC'], 1),
    # Stage II and its sub-stages
    **dict.fromkeys(['Stage II', 'Stage IIA', 'Stage IIA1', 'Stage IIA2',
                     'Stage IIB', 'Stage IIC'], 2),
    # Stage III and its sub-stages
    **dict.fromkeys(['Stage III', 'Stage IIIA', 'Stage IIIB', 'Stage IIIC',
                     'Stage IIIC1', 'Stage IIIC2'], 3),
    # Stage IV and its sub-stages
    **dict.fromkeys(['Stage IV', 'Stage IVA', 'Stage IVB', 'Stage IVC'], 4),
}

In [10]:
# Prefer the clinical stage and fall back to the pathologic stage when the
# clinical one is missing, then translate the stage text to its numeric label.
# Text absent from cancer_stages becomes NaN.
clinical_stages['cancer_stage'] = pd.Series(
    np.where(clinical_stages['clinical_stage'].notna(),
             clinical_stages['clinical_stage'],
             clinical_stages['pathologic_stage']),
    index=clinical_stages.index,
).map(cancer_stages)

Merge the two datasets

In [11]:
# Attach each participant's stage to every sample.
# A left join keeps every sample, in its original order, even when the
# participant has no clinical record. An inner join followed by a reindex would
# turn such rows into all-NaN rows, so a 'normal' sample without a clinical
# record would lose its label 0 and be dropped later on.
# validate='many_to_one' fails loudly if a participant has several clinical
# rows, which would otherwise duplicate samples.
labels_merged = df_labels.merge(
    clinical_stages[['case_barcode', 'cancer_stage']],
    how='left',
    left_on='participant_id',
    right_on='case_barcode',
    validate='many_to_one',
).drop(columns='case_barcode')

# Normal tissue is class 0 regardless of the participant's cancer stage;
# tumor samples take the participant's stage (NaN when unknown).
labels_merged['stage_to_predict'] = np.where(
    labels_merged['sample_label'] == 'normal',
    0,
    labels_merged['cancer_stage'],
)
labels_merged = labels_merged[['sample_id', 'stage_to_predict']]

# Row-aligned with `index` (and therefore with the rows of `betas`).
new_labels = labels_merged['stage_to_predict'].values

In [12]:
# Preview the first five rows (the default for head()).
labels_merged.head()

Unnamed: 0,sample_id,stage_to_predict
0,TCGA-AB-2911-03A,
1,TCGA-B4-5835-01A,1.0
2,TCGA-CJ-4887-01A,4.0
3,TCGA-BR-4194-01A,
4,TCGA-97-7937-01A,1.0


In [13]:
# Class distribution, including the samples whose stage is unknown (NaN).
labels_merged.loc[:, 'stage_to_predict'].value_counts(dropna=False)

NaN    2377
1.0    2367
2.0    2221
3.0    2209
0.0     999
4.0     949
Name: stage_to_predict, dtype: int64

# Drop NAs

Some samples come from patients whose stage is not known, so we drop those rows

In [14]:
# 0-based positions of the rows whose stage is missing, as a plain list of ints.
idx_to_drop = np.flatnonzero(labels_merged['stage_to_predict'].isna().values).tolist()

In [15]:
# Remove the unknown-stage rows from the features, the labels and the sample
# ids so the three stay row-aligned.
# The labels are taken from labels_merged rather than from new_labels itself:
# the self-referential form deletes rows again each time the cell is re-run,
# silently misaligning labels with betas.
new_betas = np.delete(betas, idx_to_drop, axis=0)
new_labels = np.delete(labels_merged['stage_to_predict'].values, idx_to_drop, axis=0)
new_index = np.delete(index, idx_to_drop, axis=0)

# Upload to GCS

In [16]:
# Features as columns (one per CpG site), one row per retained sample, with the
# target appended as a final 'label' column.
df = pd.DataFrame(
    data=new_betas,
    index=new_index,
    columns=cpg_sites,
).assign(label=new_labels)

In [17]:
# Sanity check: (rows kept after dropping unknown stages, CpG columns + 'label').
df.shape

(8745, 5001)

Create partitions of 100 rows and upload the DataFrame to GCS as partitioned CSV files

In [18]:
# Start offset of each 100-row partition. The upper bound is the actual row
# count of df rather than a hard-coded number, so no rows are skipped (and no
# empty partitions are produced) when the data changes.
idx = np.arange(0, len(df), 100)

In [19]:
# Write each 100-row slice of df to its own CSV object in the bucket. Files are
# named after the 1-based position of their first row (i + 1).
# NOTE(review): pandas 1.5+ renamed `line_terminator` to `lineterminator` —
# confirm the pandas version before upgrading.
for i in idx:
    destination = f'gs://build_hackathon_dnanyc/pivot_data_v3/betas_partition_{i+1}.csv'
    sub_df = df.iloc[i:i+100, :]
    sub_df.to_csv(destination, line_terminator='\n')