# TCGA Data Preprocessing
---

Preprocessing the TCGA dataset from the Pancancer paper (https://www.ncbi.nlm.nih.gov/pubmed/29625048).

The Cancer Genome Atlas (TCGA), a landmark cancer genomics program, molecularly characterized over 20,000 primary cancer and matched normal samples spanning 33 cancer types. This joint effort between the National Cancer Institute and the National Human Genome Research Institute began in 2006, bringing together researchers from diverse disciplines and multiple institutions.

## Importing the necessary packages

In [None]:
import os                                  # Path building and output directory creation
import yaml                                # Save and load YAML files
import numpy as np                         # NumPy to handle numeric and NaN operations
from functools import reduce               # Fold a list into one value (used to chain dataframe merges)

In [None]:
import os
import pandas

# Folder that holds the raw TCGA Pancancer downloads
data_path = '/Users/elenalickel/Desktop/Thesis/IRR_Code/TCGA-Pancancer/'

# Names of the three raw input files inside `data_path`
rna_file = 'EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv'
abs_file = 'TCGA_mastercalls.abs_segtabs.fixed.txt'
cdr_file = 'TCGA-CDR-SupplementalTableS1.xlsx'

# Resolve every file name against the base folder in one go
rna_path, abs_path, cdr_path = (
    os.path.join(data_path, file_name)
    for file_name in (rna_file, abs_file, cdr_file)
)

# RNA expression table: tab-separated, re-oriented so each `gene_id` becomes a column
rna_df = pandas.read_csv(rna_path, sep='\t').set_index('gene_id').T
print("RNA shape:", rna_df.shape)
display(rna_df.head())

# ABSOLUTE copy-number segment table (tab-separated)
abs_df = pandas.read_csv(abs_path, sep='\t')
print("ABSOLUTE shape:", abs_df.shape)
display(abs_df.head())

# TCGA Clinical Data Resource (CDR) spreadsheet
cdr_df = pandas.read_excel(cdr_path)
print("CDR shape:", cdr_df.shape)
display(cdr_df.head())

In [None]:
import modin.pandas as pd                  # Optimized distributed version of Pandas

Allow pandas to show more columns:

In [None]:
# Raise both display limits to 1000 so wide/long tables are not truncated.
# NOTE(review): `pd` is modin.pandas while the dataframes in this notebook are
# built with plain `pandas` — confirm the option reaches plain pandas too.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

Set the random seed for reproducibility:

In [None]:
# Fix NumPy's global random state so any NumPy-based randomness is repeatable
np.random.seed(42)

## RNA

### Normalizing data

In [None]:
# Summary statistics, flipped so that every column of `rna_df` is shown as a row
rna_df.describe().T

The data is not (well) normalized yet. All columns should have 0 mean and 1 standard deviation.

Save the dataframe before normalizing:

In [None]:
import os

# Destination folder for the pre-normalization snapshots (created when missing)
output_dir = os.path.join(data_path, 'cleaned/unnormalized')
os.makedirs(output_dir, exist_ok=True)

# Write the raw (transposed) RNA table, index included
unnormalized_rna_csv = os.path.join(output_dir, 'rna.csv')
rna_df.to_csv(unnormalized_rna_csv)

Normalize the data into a new dataframe:

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of the RNA table
scaler = StandardScaler()
scaled_values = scaler.fit_transform(rna_df)

# Wrap the scaled array back into a dataframe with the original labels
rna_df_norm = pandas.DataFrame(
    scaled_values,
    columns=rna_df.columns,
    index=rna_df.index,
)

rna_df_norm.head()


Confirm that everything is ok through the `describe` method:

In [None]:
# Per-column summary of the scaled table (one row per column)
rna_df_norm.describe().T

Save the normalized dataframe:

In [None]:
# Destination folder for the normalized outputs (created when missing)
normalized_dir = os.path.join(data_path, 'cleaned/normalized')
os.makedirs(normalized_dir, exist_ok=True)

# Write the standardized RNA table, index included
normalized_rna_csv = os.path.join(normalized_dir, 'rna.csv')
rna_df_norm.to_csv(normalized_rna_csv)


In [None]:
# Preview the first rows of the normalized RNA table
rna_df_norm.head()

## ABSOLUTE-annotated seg data

This dataframe contains copy-number and copy-ratio related data.

Copy number alterations/aberrations (CNAs) are changes in copy number that have arisen in somatic tissue (for example, just in a tumor), whereas copy number variations (CNVs) originate from changes in copy number in germline cells (and are thus present in all cells of the organism).

The rows correspond to contiguous chunks along the chromosome with the same DNA copy-number. "Chromosome" is the chromosome, can be 1-22, X or Y (see human genome). Start is the physical start location for the segment along said linear chromosome, end is the end coordinate. Num_probes is the number of SNP-array probes falling within the segment (these were used to call copy numbers). Reference: https://www.biostars.org/p/244374/

### Loading the data

In [None]:
# Work on a copy: a plain assignment would only alias `abs_df`, so the in-place
# column edits made further down would also change the raw table.
abs_anttd_seg_df = abs_df.copy()

In [None]:
# Number of rows in the table
len(abs_anttd_seg_df)

### Converting categorical features to numeric

In [None]:
# Frequency of each distinct value in the `solution` column
abs_anttd_seg_df['solution'].value_counts()

In [None]:
# Rename first: `rename` hands back a new dataframe, so the encoding below is
# not written into the frame that `abs_df` still refers to.
abs_anttd_seg_df = abs_anttd_seg_df.rename(columns={'solution': 'new_solution'})
# Binary flag: 1 where the solution is 'new', 0 for anything else (missing included)
abs_anttd_seg_df['new_solution'] = (abs_anttd_seg_df['new_solution'] == 'new').astype('int64')
abs_anttd_seg_df.new_solution.value_counts()

### Removing unneeded features

Columns `Start`, `End`, `Num_Probes` and `Length` don't seem to be relevant as we don't need to know so much detail of each chromosome nor experiment specific information.

In [None]:
# Segment-level details that are not used downstream
unneeded_cols = ['Start', 'End', 'Num_Probes', 'Length']
abs_anttd_seg_df = abs_anttd_seg_df.drop(columns=unneeded_cols)
abs_anttd_seg_df.head()

### Normalizing data

In [None]:
# Summary statistics, one row per numeric column
abs_anttd_seg_df.describe().T

The data is not (well) normalized yet. All columns should have 0 mean and 1 standard deviation.

Save the dataframe before normalizing:

In [None]:
# Build the path with os.path.join (as the RNA export does) so it does not rely
# on `data_path` ending with a separator, and make sure the folder exists even
# when this cell runs before the RNA export.
output_dir = os.path.join(data_path, 'cleaned/unnormalized')
os.makedirs(output_dir, exist_ok=True)
abs_anttd_seg_df.to_csv(os.path.join(output_dir, 'copy_number_ratio.csv'))

Normalize the numeric columns in place (the unnormalized version was saved in the previous step):

In [None]:
# Exclude the categorical column(s)
categorical_cols = ['Chromosome']
numeric_cols = abs_anttd_seg_df.select_dtypes(include='number').columns.difference(categorical_cols)

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform numeric columns
abs_anttd_seg_df[numeric_cols] = scaler.fit_transform(abs_anttd_seg_df[numeric_cols])

# The categorical column stays unchanged
abs_anttd_seg_df.head()


Confirm that everything is ok through the `describe` method:

In [None]:
# Summary statistics after scaling, one row per numeric column
abs_anttd_seg_df.describe().T

### Aggregating sample data

#### Missing value imputation

We can't join rows correctly if there are missing values, so they are filled in first.

In [None]:
# Index labels of the rows where `Ccf_ci95_high_a2` is missing
is_missing = abs_anttd_seg_df['Ccf_ci95_high_a2'].isnull()
nan_idx = abs_anttd_seg_df.loc[is_missing].index
nan_idx

In [None]:
# `nan_idx` holds index labels, so select with the label-based `.loc`;
# `.iloc` would only be right while labels and positions happen to coincide.
abs_anttd_seg_df.loc[nan_idx].head()

In [None]:
# Show the last 25 of the first 125 rows (before imputation)
abs_anttd_seg_df.head(125).tail(25)

In [None]:
# Set index if 'Sample' should be preserved during interpolation
abs_anttd_seg_df = abs_anttd_seg_df.set_index('Sample')

# Interpolate numeric values (column-wise)
abs_anttd_seg_df = abs_anttd_seg_df.interpolate(method='linear', axis=0)

# Fill any remaining NaNs with forward fill, then backward fill
abs_anttd_seg_df = abs_anttd_seg_df.fillna(method='ffill', axis=0).fillna(method='bfill', axis=0)

# Reset index if needed
abs_anttd_seg_df = abs_anttd_seg_df.reset_index()

# Display the head
abs_anttd_seg_df.head()


In [None]:
# Percentage of missing values per column
missing_percent = abs_anttd_seg_df.isnull().mean().sort_values(ascending=False) * 100

# Display as a DataFrame
missing_percent_df = missing_percent[missing_percent > 0].to_frame(name='missing_percentage')
missing_percent_df


In [None]:
# Show the last 25 of the first 125 rows again (after imputation)
abs_anttd_seg_df.head(125).tail(25)

#### Average groupby aggregation

Join all the data of each sample's chromosome through an average groupby aggregation:

In [None]:
# One row per (sample, chromosome) pair, holding the mean of its segments
group_keys = ['Sample', 'Chromosome']
abs_anttd_seg_df = abs_anttd_seg_df.groupby(group_keys).mean()
abs_anttd_seg_df.head(25)

#### Dividing chromosome data into different columns

Separate each chromosome's information into their own features.

OR

Create lists for each feature, containing each chromosome's value, and then apply an embedding bag on it.

In [None]:
# Preview the rows whose `Chromosome` index level equals 1
is_chromosome_1 = abs_anttd_seg_df.index.get_level_values('Chromosome') == 1
abs_anttd_seg_df[is_chromosome_1].head()

In [None]:
# List that will contain multiple dataframes, one for each chromosome
df_list = []
# Go through each chromosome and create its own dataframe, with properly labeled columns
for chrom in range(1, 23):
    # Filter for the current chromosome's dataframe
    tmp_df = abs_anttd_seg_df[abs_anttd_seg_df.index.get_level_values('Chromosome') == chrom]
    # Change the column names to identify the chromosome
    tmp_df.columns = [f'{col}_chromosome_{chrom}' for col in tmp_df.columns]
    # Remove now redundant `Chromosome` column
    tmp_df = tmp_df.reset_index().drop(columns='Chromosome', axis=1)
    # Add to the dataframes list
    df_list.append(tmp_df)

In [None]:
# Inspect the fourth entry of the list (built for chromosome 4)
df_list[3]

In [None]:
def _merge_on_sample(left, right):
    """Join two per-chromosome frames on their shared `Sample` column."""
    return pandas.merge(left, right, on='Sample')


# Chain the joins over the whole list to get one wide row per sample
abs_anttd_seg_df = reduce(_merge_on_sample, df_list)
abs_anttd_seg_df.head()

In [None]:
# Number of distinct samples in the merged table
abs_anttd_seg_df['Sample'].nunique()

In [None]:
# Row count of the merged table, to compare with the number of distinct samples
len(abs_anttd_seg_df)

Remove duplicate columns (redundant features that are independent of the chromosome):

In [None]:
# Base feature names: everything before the `_chromosome` tag of each column
unique_features = {col.split('_chromosome')[0] for col in abs_anttd_seg_df.columns}
unique_features

In [None]:
# List the columns whose name contains 'Cancer_cell_frac_a1'
[col for col in abs_anttd_seg_df.columns if 'Cancer_cell_frac_a1' in col]

In [None]:
# Save the feature names that are redundant (i.e. no difference between chromosomes)
redundant_features = []

for feature in unique_features:
    # Flag that indicates if all of the feature's columns are equal
    all_cols_equal = True
    # List of column names that are part of the same unique feature
    chrom_cols = [col for col in abs_anttd_seg_df.columns if feature in col]
    
    for i in range(len(chrom_cols) - 1):
        # Check if the current pair of columns are completely equal
        if not (abs_anttd_seg_df[chrom_cols[i]] == abs_anttd_seg_df[chrom_cols[i+1]]).all():
            all_cols_equal = False
            break

    if all_cols_equal:
        redundant_features.append(feature)

redundant_features


In [None]:
# Every `new_solution` column except the chromosome-1 copy, which is kept
kept_column = 'new_solution_chromosome_1'
column_duplicates = [name for name in abs_anttd_seg_df.columns if 'new_solution' in name]
column_duplicates.remove(kept_column)
column_duplicates

In [None]:
# Drop the duplicated flags and give the remaining one a chromosome-free name
abs_anttd_seg_df = (
    abs_anttd_seg_df
    .drop(columns=column_duplicates)
    .rename(columns={'new_solution_chromosome_1': 'new_solution'})
)
abs_anttd_seg_df.head()

In [None]:
# List the columns whose name still contains 'new_solution'
[col for col in abs_anttd_seg_df.columns if 'new_solution' in col]

### Setting the index

Set the `Sample` column to be the index (it is renamed to `sample_id` in the next step):

In [None]:
# Use the `Sample` column as the row index
abs_anttd_seg_df = abs_anttd_seg_df.set_index('Sample')
abs_anttd_seg_df.head()

Fix the index name:

In [None]:
# Rename the index from `Sample` to `sample_id`
abs_anttd_seg_df.index.name = 'sample_id'
abs_anttd_seg_df.head()


Save the normalized dataframe:

In [None]:
# Build the path with os.path.join (as the RNA export does) so it does not rely
# on `data_path` ending with a separator, and make sure the folder exists.
normalized_dir = os.path.join(data_path, 'cleaned/normalized')
os.makedirs(normalized_dir, exist_ok=True)
abs_anttd_seg_df.to_csv(os.path.join(normalized_dir, 'copy_number_ratio.csv'))

## Clinical outcome (TCGA-CDR) data

Description

### Loading the data

In [None]:
# Data type of every column in the clinical table
cdr_df.dtypes

### Setting the index

Set the `bcr_patient_barcode` column to be the index (it is renamed to `sample_id` in the next step):

In [None]:
# Use the `bcr_patient_barcode` column as the row index
# NOTE(review): presumably a patient-level barcode — verify that it lines up
# with the identifiers of the other tables before joining on it.
cdr_df = cdr_df.set_index('bcr_patient_barcode')
cdr_df.head()

Fix the index name:

In [None]:
# Rename the index from `bcr_patient_barcode` to `sample_id`
cdr_df.index.name = 'sample_id'
cdr_df.head()


### Checking for missing values

Considerable percentage of missing values on `ajcc_pathologic_tumor_stage` (\~37%) and `clinical_stage` (\~76%).
Considering the real percentages of missing values, which are higher than what we got before standardizing the missing values representation, the main features to use from this table should be `gender`, `vital_status`, `age_at_initial_pathologic_diagnosis`, `tumor_status`, `race` and `ajcc_pathologic_tumor_stage`.

### Removing unneeded features

Remove columns that have more than 40% missing values:

In [None]:
# Set the threshold percentage
nan_percent_thrsh = 40  # percent
threshold = nan_percent_thrsh / 100.0

# Drop columns with more than threshold% NaNs
cdr_df = cdr_df.loc[:, cdr_df.isnull().mean() <= threshold]


In [None]:
# Show count and percentage of missing values per column
missing_values = cdr_df.isnull().sum()
missing_percent = (missing_values / len(cdr_df)) * 100

missing_df = pandas.DataFrame({
    'Missing Count': missing_values,
    'Missing Percent': missing_percent
})

# Display only columns with missing values
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(by='Missing Percent', ascending=False)
print(missing_df)


Features such as overall survival (`OS`), progression-free interval (`PFI`), disease-specific survival (`DSS`), their corresponding `.time` columns, `vital_status`, `tumor_status`, `initial_pathologic_dx_year`, `birth_days_to` and `last_contact_days_to` might not be relevant for this use case. Also, `histological_type` is redundant with `type`, which is our intended label, while having more missing values, so we must remove it. The leftover `Unnamed: 0` column is dropped as well.

In [None]:
# Columns removed from the clinical table before modelling
columns_to_drop = [
    'Unnamed: 0',
    'OS', 'PFI', 'DSS',
    'OS.time', 'DSS.time', 'PFI.time',
    'vital_status', 'tumor_status',
    'initial_pathologic_dx_year', 'birth_days_to',
    'last_contact_days_to', 'histological_type',
]
cdr_df = cdr_df.drop(columns=columns_to_drop)
cdr_df.head()

Change label name to a more intuitive one:

In [None]:
# `type` becomes `tumor_type_label`
cdr_df = cdr_df.rename(columns={'type': 'tumor_type_label'})
cdr_df.head()

### Converting categorical features to numeric

In [None]:
# Frequency of each distinct `gender` value
cdr_df['gender'].value_counts()

In [None]:
# Frequency of each distinct `race` value
cdr_df['race'].value_counts()

In [None]:
# Frequency of each distinct `ajcc_pathologic_tumor_stage` value
cdr_df['ajcc_pathologic_tumor_stage'].value_counts()

Encode gender:

In [None]:
# 1 when the lower-cased value is 'male', 0 for every other value
cdr_df.gender = cdr_df.gender.map(lambda value: int(value.lower() == 'male'))
cdr_df.gender.value_counts()

Encode race and tumor stage:

In [None]:
# Categorical columns that get an integer encoding
features_to_encode = [
    'race',
    'ajcc_pathologic_tumor_stage',
]

In [None]:
# Dictionary that will contain the mapping between the categories and their encodings
encod_dict = dict([('gender', dict([('male' , 1), ('female', 0)]))])

In [None]:
# Start from the gender encoding instead of an empty dict, so that mapping is
# not lost when the dictionary is saved later on
encod_dict = {'gender': {'male': 1, 'female': 0}}

for feature in features_to_encode:
    # `factorize` returns the integer codes and the distinct categories;
    # missing values get the code -1 and have no entry in `uniques`
    codes, uniques = pandas.factorize(cdr_df[feature])
    cdr_df[feature] = codes
    # Store category -> code, the same direction as the gender entry
    encod_dict[feature] = {category: code for code, category in enumerate(uniques)}


In [None]:
# Preview the clinical table after encoding
cdr_df.head()

In [None]:
# Show the encoding dictionary built above
encod_dict

### Normalizing data

In this table, we only need to normalize the age.

In [None]:
# Summary statistics, one row per numeric column
cdr_df.describe().T

Save the dataframe before normalizing:

In [None]:
# Ensure the directory exists
unnorm_dir = os.path.join(data_path, 'cleaned/unnormalized')
os.makedirs(unnorm_dir, exist_ok=True)

# Now save the file
cdr_df.to_csv(os.path.join(unnorm_dir, 'clinical_outcome.csv'), index=False)


Normalize the data into a new dataframe:

In [None]:
# Only this column is standardized
column = 'age_at_initial_pathologic_diagnosis'

# Leave `cdr_df` untouched and write the scaled values into a copy
cdr_df_norm = cdr_df.copy()

scaler = StandardScaler()
scaled_age = scaler.fit_transform(cdr_df[[column]])
cdr_df_norm[column] = scaled_age

cdr_df_norm.head()


Confirm that everything is ok through the `describe` method:

In [None]:
# Summary statistics after scaling, one row per numeric column
cdr_df_norm.describe().T

Save the normalized dataframe:

In [None]:
# Build the path with os.path.join (as the RNA export does) so it does not rely
# on `data_path` ending with a separator, and make sure the folder exists.
normalized_dir = os.path.join(data_path, 'cleaned/normalized')
os.makedirs(normalized_dir, exist_ok=True)
cdr_df_norm.to_csv(os.path.join(normalized_dir, 'clinical_outcome.csv'))

## Saving enumeration encoding mapping

Save the dictionary that maps from the original categories/strings to the new numerical encodings.

In [None]:
# Write the encoding dictionary as block-style YAML; the context manager makes
# sure the file is flushed and closed once the dump is done
with open(f'{data_path}cleaned/encod_dict.yaml', 'w') as stream:
    yaml.dump(encod_dict, stream, default_flow_style=False)