In [None]:
# Mount Google Drive (Colab only) so the project folder under /content/drive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import pickle

#### Data Loading

In [None]:
# Whether to use gene-mapped data (True) or raw probe-based data (False).
# The flag is passed to RNA_seq_and_CNV_data_load() and selects which RNA-seq / CNV files are read.

gene_mapping = False

In [None]:
# Define the base folder path to your project directory in Google Drive
Ds_project_folder_path = '/content/drive/MyDrive/DS_project/'

# Every folder below is relative to Ds_project_folder_path and ends with '/',
# so full paths are built by plain string concatenation.

# Folder where raw input data is stored (the methylation .tsv.gz files are read from here)
raw_data_path = 'Data/1_raw_data/'

# Folder holding the gene-name-mapped CNV tables (read only when gene_mapping is True)
gene_name_mapping_data_path = 'Data/2_gene_name_mapping_data/'

# Clinical tables converted to CSV -- not referenced by the cells visible in this notebook
clinical_data_path = 'Data/0.clinical_rdata_to_csv/'

# Plate-filtered tables: RNA-seq, SNV, protein and (when gene_mapping is False) CNV CSVs are read from here
filtered_by_plate_rna_seq_data_path = 'Data/3_RNA_seq_sample_filtering_by_plate/'

# Presumably the output of the intra-cohort cleaning step -- not referenced by the cells visible here
data_clearning_path = 'Data/4_Intra-cohort-preprocessing/4.1.Data_clearning/'

# Folder containing the pickled split definitions (train / validation / test sample IDs)
data_spliting_path = 'Data/5_Inter-cohort_preprocessing/5.1.data_spliting/'

# Folder where the feature-filtered split CSVs produced by this notebook are written
splited_data_save_path = 'Data/5_Inter-cohort_preprocessing/5.2.data_cleaning(Feature_filtering)/'

In [None]:
# Absolute output folder used as the prefix of every CSV written by this notebook.
save_dir = f'{Ds_project_folder_path}{splited_data_save_path}'

In [None]:
# File names of the pickled split definitions, one per cohort (loaded from data_spliting_path).
GBM_data_split_result_name = 'GBM_data_split_result.pickle'
LGG_data_split_result_name = 'LGG_data_split_result.pickle'

In [None]:
# Load the per-cohort split definitions. Later cells index them with the keys
# 'Train', 'Validation', 'External Test' and 'Internal Test'.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this project.
split_dir = Ds_project_folder_path + data_spliting_path

with open(split_dir + GBM_data_split_result_name, 'rb') as gbm_split_file:
    GBM_data_split_result = pickle.load(gbm_split_file)

with open(split_dir + LGG_data_split_result_name, 'rb') as lgg_split_file:
    LGG_data_split_result = pickle.load(lgg_split_file)

In [None]:
def RNA_seq_and_CNV_data_load(gene_mapping = True):
  """
  Load the RNA-seq and CNV tables of the GBM and LGG cohorts.

  Parameters
  ----------
  gene_mapping : bool, default True
      If True, read the gene-mapped RNA-seq files from the plate-filtered folder and
      the gene-mapped CNV files from `gene_name_mapping_data_path`.
      If False, read the `*_filtered_chromosome.csv` RNA-seq and CNV files, all of
      which are read from the plate-filtered folder.

  Returns
  -------
  tuple of pandas.DataFrame
      (GBM_rna_seq_data, LGG_rna_seq_data, GBM_CNV_data, LGG_CNV_data); every table
      is read with its first CSV column as the index.
  """
  # Despite the name of the config variable, this folder holds more than RNA-seq files.
  filtered_dir = Ds_project_folder_path + filtered_by_plate_rna_seq_data_path

  if gene_mapping:
    cnv_dir = Ds_project_folder_path + gene_name_mapping_data_path

    GBM_rna_seq_data = pd.read_csv(filtered_dir + 'TCGA-GBM_mRNA_gene_map_filtered_by_plate.csv', index_col=0)
    LGG_rna_seq_data = pd.read_csv(filtered_dir + 'TCGA-LGG_mRNA_gene_map_filtered_by_plate.csv', index_col=0)

    GBM_CNV_data = pd.read_csv(cnv_dir + 'TCGA-GBM_CNV_gene_mapping.csv', index_col=0)
    LGG_CNV_data = pd.read_csv(cnv_dir + 'TCGA-LGG_CNV_gene_mapping.csv', index_col=0)

  else:
    # The unmapped CNV tables are read from the same plate-filtered folder as RNA-seq;
    # the raw-data folder assignment that used to sit here was never used and is removed.
    GBM_rna_seq_data = pd.read_csv(filtered_dir + 'TCGA-GBM_mRNA_filtered_chromosome.csv', index_col=0)
    LGG_rna_seq_data = pd.read_csv(filtered_dir + 'TCGA-LGG_mRNA_filtered_chromosome.csv', index_col=0)

    GBM_CNV_data = pd.read_csv(filtered_dir + 'TCGA-GBM_CNV_filtered_chromosome.csv', index_col=0)
    LGG_CNV_data = pd.read_csv(filtered_dir + 'TCGA-LGG_CNV_filtered_chromosome.csv', index_col=0)

  return GBM_rna_seq_data, LGG_rna_seq_data, GBM_CNV_data, LGG_CNV_data

In [None]:
# SNV and protein tables are read from the plate-filtered folder, first CSV column as index.
filtered_dir = Ds_project_folder_path + filtered_by_plate_rna_seq_data_path

GBM_SNV_data = pd.read_csv(filtered_dir + 'TCGA-GBM_SNV_filtered_chromosome.csv', index_col=0)
LGG_SNV_data = pd.read_csv(filtered_dir + 'TCGA-LGG_SNV_filtered_chromosome.csv', index_col=0)

GBM_protein_data = pd.read_csv(filtered_dir + 'TCGA-GBM_protein_filtered_chromosome.csv', index_col=0)
LGG_protein_data = pd.read_csv(filtered_dir + 'TCGA-LGG_protein_filtered_chromosome.csv', index_col=0)

In [None]:
# Methylation tables are gzip-compressed TSV files stored in the raw-data folder.
# NOTE(review): unlike the other tables, no index_col is passed here, so the feature
# identifiers are only used as row labels if pandas infers an index from the header --
# confirm the row labels before relying on the index intersections further down.
raw_dir = Ds_project_folder_path + raw_data_path

with gzip.open(raw_dir + 'TCGA_GBM_mval_methylation.tsv.gz', 'rt') as gbm_methylation_file:
    GBM_methlylation_data = pd.read_csv(gbm_methylation_file, sep='\t')

with gzip.open(raw_dir + 'TCGA_LGG_mval_methylation.tsv.gz', 'rt') as lgg_methylation_file:
    LGG_methlylation_data = pd.read_csv(lgg_methylation_file, sep='\t')

# RNA-seq and CNV tables; which files are read depends on the `gene_mapping` flag.
(GBM_rna_seq_data,
 LGG_rna_seq_data,
 GBM_CNV_data,
 LGG_CNV_data) = RNA_seq_and_CNV_data_load(gene_mapping)

In [None]:
# Use the 'Ensembl_ID' column as the row index of the CNV tables (the column itself is removed).
# set_index() returns a new frame instead of mutating in place, and the index-name guard makes
# the cell safe to re-run: once the index is already 'Ensembl_ID' nothing is done, whereas a
# table that genuinely lacks the column still raises KeyError on the first run.
if GBM_CNV_data.index.name != 'Ensembl_ID':
    GBM_CNV_data = GBM_CNV_data.set_index('Ensembl_ID')

if LGG_CNV_data.index.name != 'Ensembl_ID':
    LGG_CNV_data = LGG_CNV_data.set_index('Ensembl_ID')

#### Data splitting (train / validation / external test / internal test)

In [None]:
def data_split(df, split_info):
  """
  Slice a features-by-samples table into the four predefined sample subsets.

  Parameters
  ----------
  df : pandas.DataFrame
      Table whose columns are sample IDs.
  split_info : mapping
      Must provide the keys 'Train', 'Validation', 'External Test' and
      'Internal Test', each mapping to an iterable of sample IDs.

  Returns
  -------
  tuple of pandas.DataFrame
      (train, validation, external test, internal test). Sample IDs that are not
      columns of `df` are skipped silently; column order follows `split_info`.
  """
  def _present(split_name):
    # Keep only the samples of this split that actually exist in the table.
    return [sample for sample in split_info[split_name] if sample in df.columns]

  train_cols = _present('Train')
  val_cols = _present('Validation')
  external_cols = _present('External Test')
  internal_cols = _present('Internal Test')

  return df[train_cols], df[val_cols], df[external_cols], df[internal_cols]

In [None]:
# Split every omics table of each cohort; data_split returns
# (train, validation, external test, internal test) in that order.

# RNA-seq
(GBM_rna_seq_train, GBM_rna_seq_val,
 GBM_rna_seq_external_test, GBM_rna_seq_internal_test) = data_split(GBM_rna_seq_data, GBM_data_split_result)
(LGG_rna_seq_train, LGG_rna_seq_val,
 LGG_rna_seq_external_test, LGG_rna_seq_internal_test) = data_split(LGG_rna_seq_data, LGG_data_split_result)

# CNV
(GBM_CNV_train, GBM_CNV_val,
 GBM_CNV_external_test, GBM_CNV_internal_test) = data_split(GBM_CNV_data, GBM_data_split_result)
(LGG_CNV_train, LGG_CNV_val,
 LGG_CNV_external_test, LGG_CNV_internal_test) = data_split(LGG_CNV_data, LGG_data_split_result)

# Methylation
(GBM_methylation_train, GBM_methylation_val,
 GBM_methylation_external_test, GBM_methylation_internal_test) = data_split(GBM_methlylation_data, GBM_data_split_result)
(LGG_methylation_train, LGG_methylation_val,
 LGG_methylation_external_test, LGG_methylation_internal_test) = data_split(LGG_methlylation_data, LGG_data_split_result)

# Protein
(GBM_protein_train, GBM_protein_val,
 GBM_protein_external_test, GBM_protein_internal_test) = data_split(GBM_protein_data, GBM_data_split_result)
(LGG_protein_train, LGG_protein_val,
 LGG_protein_external_test, LGG_protein_internal_test) = data_split(LGG_protein_data, LGG_data_split_result)

# SNV
(GBM_SNV_train, GBM_SNV_val,
 GBM_SNV_external_test, GBM_SNV_internal_test) = data_split(GBM_SNV_data, GBM_data_split_result)
(LGG_SNV_train, LGG_SNV_val,
 LGG_SNV_external_test, LGG_SNV_internal_test) = data_split(LGG_SNV_data, LGG_data_split_result)

In [None]:
# Print the number of samples in each split, followed by the cohort total.
for cohort, split_result in (('GBM', GBM_data_split_result), ('LGG', LGG_data_split_result)):
  cohort_total = 0
  for split_name, sample_ids in split_result.items():
    cohort_total += len(sample_ids)
    print(cohort, split_name, len(sample_ids))
  print(cohort_total)

GBM Train 293
GBM Validation 55
GBM External Test 64
GBM Internal Test 83
495
LGG Train 312
LGG Validation 65
LGG External Test 66
LGG Internal Test 72
515


In [None]:
# Collect the sample IDs (columns) of every split of every omics layer and take their union,
# to count how many distinct samples each cohort has across all layers.
GBM_sample_sets = [
    split.columns
    for split in (
        GBM_CNV_train, GBM_CNV_val, GBM_CNV_external_test, GBM_CNV_internal_test,
        GBM_methylation_train, GBM_methylation_val, GBM_methylation_external_test, GBM_methylation_internal_test,
        GBM_rna_seq_train, GBM_rna_seq_val, GBM_rna_seq_external_test, GBM_rna_seq_internal_test,
        GBM_protein_train, GBM_protein_val, GBM_protein_external_test, GBM_protein_internal_test,
        GBM_SNV_train, GBM_SNV_val, GBM_SNV_external_test, GBM_SNV_internal_test,
    )
]
GBM_all_samples = sorted(set().union(*GBM_sample_sets))

LGG_sample_sets = [
    split.columns
    for split in (
        LGG_CNV_train, LGG_CNV_val, LGG_CNV_external_test, LGG_CNV_internal_test,
        LGG_methylation_train, LGG_methylation_val, LGG_methylation_external_test, LGG_methylation_internal_test,
        LGG_rna_seq_train, LGG_rna_seq_val, LGG_rna_seq_external_test, LGG_rna_seq_internal_test,
        LGG_protein_train, LGG_protein_val, LGG_protein_external_test, LGG_protein_internal_test,
        LGG_SNV_train, LGG_SNV_val, LGG_SNV_external_test, LGG_SNV_internal_test,
    )
]
LGG_all_samples = sorted(set().union(*LGG_sample_sets))

print("Total # of GBM samples:", len(GBM_all_samples))
print("Total # of LGG samples:", len(LGG_all_samples))

Total # of GBM samples: 495
Total # of LGG samples: 515


#### Remove features with a high proportion of NA values

In [None]:
def filter_features_by_missing(df, threshold=0.8, missing_value = 'NA'):
    """
    Keep the features (rows) whose fraction of NA values is strictly below `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Features-by-samples table; the NA fraction of a row is computed over all columns.
    threshold : float, default 0.8
        Rows with an NA fraction equal to or above this value are dropped.
    missing_value : str, default 'NA'
        Definition of "missing". Only 'NA' (pandas `isna`) is supported; zeros are
        not counted as missing.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that pass the filter, with all columns kept.

    Raises
    ------
    ValueError
        If `missing_value` is anything other than 'NA'.
    """
    if missing_value != 'NA':
        raise ValueError("missing_value must be 'NA'")

    n_samples = df.shape[1]
    na_fraction = df.isna().sum(axis=1) / n_samples
    keep_row = na_fraction < threshold
    return df[keep_row]

In [None]:
# NA-ratio cut-off passed to filter_features_by_missing(): a feature is removed when the
# fraction of NA values across its training samples is equal to or above this value.
threshold = 0.5

**Protein**

In [None]:
# Apply the NA-ratio filter to the training split of each cohort separately.
GBM_protein_train_filtered = filter_features_by_missing(
    GBM_protein_train,
    threshold=threshold,
)
LGG_protein_train_filtered = filter_features_by_missing(
    LGG_protein_train,
    threshold=threshold,
)

In [None]:
# Features that survived the NA filter in BOTH cohorts' training data.
protein_common_features = GBM_protein_train_filtered.index.intersection(
    LGG_protein_train_filtered.index
)

In [None]:
# Number of features dropped relative to the unfiltered GBM training table.
n_protein_eliminated = GBM_protein_train.shape[0] - len(protein_common_features)
print('# of eliminated features:', n_protein_eliminated)

# of eliminated features: 23


In [None]:
# Same quantity expressed as a percentage of the unfiltered GBM feature count.
n_protein_features = len(GBM_protein_train.index)
(n_protein_features - len(protein_common_features)) / n_protein_features * 100

5.912596401028278

In [None]:
# Restrict every split to the common feature set. The feature list was derived from the
# training splits only and is applied unchanged to validation and test splits.
(GBM_protein_train_final,
 GBM_protein_val_final,
 GBM_protein_external_test_final,
 GBM_protein_internal_test_final) = (
    split.loc[protein_common_features]
    for split in (GBM_protein_train_filtered, GBM_protein_val,
                  GBM_protein_external_test, GBM_protein_internal_test)
)

(LGG_protein_train_final,
 LGG_protein_val_final,
 LGG_protein_external_test_final,
 LGG_protein_internal_test_final) = (
    split.loc[protein_common_features]
    for split in (LGG_protein_train_filtered, LGG_protein_val,
                  LGG_protein_external_test, LGG_protein_internal_test)
)

In [None]:
# Number of protein features kept after filtering.
print(GBM_protein_train_final.shape[0])

366


In [None]:
# Write the filtered protein splits to save_dir, one CSV per cohort and split.
protein_exports = [
    (GBM_protein_train_final, f'{save_dir}GBM_protein_train.csv'),
    (GBM_protein_val_final, f'{save_dir}GBM_protein_val.csv'),
    (GBM_protein_external_test_final, f'{save_dir}GBM_protein_external_test.csv'),
    (GBM_protein_internal_test_final, f'{save_dir}GBM_protein_internal_test.csv'),
    (LGG_protein_train_final, f'{save_dir}LGG_protein_train.csv'),
    (LGG_protein_val_final, f'{save_dir}LGG_protein_val.csv'),
    (LGG_protein_external_test_final, f'{save_dir}LGG_protein_external_test.csv'),
    (LGG_protein_internal_test_final, f'{save_dir}LGG_protein_internal_test.csv'),
]
for frame, csv_path in protein_exports:
    frame.to_csv(csv_path)

**Methylation**

In [None]:
# Apply the NA-ratio filter to the training split of each cohort separately.
GBM_methylation_train_filtered = filter_features_by_missing(
    GBM_methylation_train,
    threshold=threshold,
)
LGG_methylation_train_filtered = filter_features_by_missing(
    LGG_methylation_train,
    threshold=threshold,
)

In [None]:
# Features that survived the NA filter in BOTH cohorts' training data.
methylation_common_features = GBM_methylation_train_filtered.index.intersection(
    LGG_methylation_train_filtered.index
)

In [None]:
# Number of features dropped relative to the unfiltered GBM training table.
n_methylation_eliminated = GBM_methylation_train.shape[0] - len(methylation_common_features)
print('# of eliminated features:', n_methylation_eliminated)

# of eliminated features: 18


In [None]:
# Same quantity expressed as a percentage of the unfiltered GBM feature count.
n_methylation_features = len(GBM_methylation_train.index)
(n_methylation_features - len(methylation_common_features)) / n_methylation_features * 100

0.09235979270357637

In [None]:
# Restrict every split to the common feature set. The feature list was derived from the
# training splits only and is applied unchanged to validation and test splits.
(GBM_methylation_train_final,
 GBM_methylation_val_final,
 GBM_methylation_external_test_final,
 GBM_methylation_internal_test_final) = (
    split.loc[methylation_common_features]
    for split in (GBM_methylation_train_filtered, GBM_methylation_val,
                  GBM_methylation_external_test, GBM_methylation_internal_test)
)

(LGG_methylation_train_final,
 LGG_methylation_val_final,
 LGG_methylation_external_test_final,
 LGG_methylation_internal_test_final) = (
    split.loc[methylation_common_features]
    for split in (LGG_methylation_train_filtered, LGG_methylation_val,
                  LGG_methylation_external_test, LGG_methylation_internal_test)
)

In [None]:
# Write the filtered methylation splits to save_dir, one CSV per cohort and split.
methylation_exports = [
    (GBM_methylation_train_final, f'{save_dir}GBM_methylation_train.csv'),
    (GBM_methylation_val_final, f'{save_dir}GBM_methylation_val.csv'),
    (GBM_methylation_external_test_final, f'{save_dir}GBM_methylation_external_test.csv'),
    (GBM_methylation_internal_test_final, f'{save_dir}GBM_methylation_internal_test.csv'),
    (LGG_methylation_train_final, f'{save_dir}LGG_methylation_train.csv'),
    (LGG_methylation_val_final, f'{save_dir}LGG_methylation_val.csv'),
    (LGG_methylation_external_test_final, f'{save_dir}LGG_methylation_external_test.csv'),
    (LGG_methylation_internal_test_final, f'{save_dir}LGG_methylation_internal_test.csv'),
]
for frame, csv_path in methylation_exports:
    frame.to_csv(csv_path)

**RNA_seq**

In [None]:
# Apply the NA-ratio filter to the training split of each cohort separately.
GBM_rna_seq_train_filtered = filter_features_by_missing(
    GBM_rna_seq_train,
    threshold=threshold,
)
LGG_rna_seq_train_filtered = filter_features_by_missing(
    LGG_rna_seq_train,
    threshold=threshold,
)

In [None]:
# Features that survived the NA filter in BOTH cohorts' training data.
rna_seq_common_features = GBM_rna_seq_train_filtered.index.intersection(
    LGG_rna_seq_train_filtered.index
)

In [None]:
# NOTE(review): this counts features removed by the GBM filter alone; the protein and
# methylation cells subtract the size of the GBM/LGG common feature set instead --
# confirm which definition is intended.
n_rna_seq_eliminated = GBM_rna_seq_train.shape[0] - len(GBM_rna_seq_train_filtered)
print('# of eliminated features:', n_rna_seq_eliminated)

# of eliminated features: 0


In [None]:
# Restrict every split to the common feature set. The feature list was derived from the
# training splits only and is applied unchanged to validation and test splits.
# The doubled "train_train" in the first GBM name is kept because later cells reference it.
(GBM_rna_seq_train_train_final,
 GBM_rna_seq_val_final,
 GBM_rna_seq_external_test_final,
 GBM_rna_seq_internal_test_final) = (
    split.loc[rna_seq_common_features]
    for split in (GBM_rna_seq_train_filtered, GBM_rna_seq_val,
                  GBM_rna_seq_external_test, GBM_rna_seq_internal_test)
)

(LGG_rna_seq_train_final,
 LGG_rna_seq_val_final,
 LGG_rna_seq_external_test_final,
 LGG_rna_seq_internal_test_final) = (
    split.loc[rna_seq_common_features]
    for split in (LGG_rna_seq_train_filtered, LGG_rna_seq_val,
                  LGG_rna_seq_external_test, LGG_rna_seq_internal_test)
)

In [None]:
# Write the filtered RNA-seq splits to save_dir, one CSV per cohort and split.
rna_seq_exports = [
    (GBM_rna_seq_train_train_final, f'{save_dir}GBM_rna_seq_train.csv'),
    (GBM_rna_seq_val_final, f'{save_dir}GBM_rna_seq_val.csv'),
    (GBM_rna_seq_external_test_final, f'{save_dir}GBM_rna_seq_external_test.csv'),
    (GBM_rna_seq_internal_test_final, f'{save_dir}GBM_rna_seq_internal_test.csv'),
    (LGG_rna_seq_train_final, f'{save_dir}LGG_rna_seq_train.csv'),
    (LGG_rna_seq_val_final, f'{save_dir}LGG_rna_seq_val.csv'),
    (LGG_rna_seq_external_test_final, f'{save_dir}LGG_rna_seq_external_test.csv'),
    (LGG_rna_seq_internal_test_final, f'{save_dir}LGG_rna_seq_internal_test.csv'),
]
for frame, csv_path in rna_seq_exports:
    frame.to_csv(csv_path)

**CNV**

In [None]:
# Apply the NA-ratio filter to the training split of each cohort separately.
GBM_CNV_train_filtered = filter_features_by_missing(
    GBM_CNV_train,
    threshold=threshold,
)
LGG_CNV_train_filtered = filter_features_by_missing(
    LGG_CNV_train,
    threshold=threshold,
)

In [None]:
# Features that survived the NA filter in BOTH cohorts' training data.
CNV_common_features = GBM_CNV_train_filtered.index.intersection(
    LGG_CNV_train_filtered.index
)

In [None]:
# NOTE(review): this counts features removed by the GBM filter alone; the protein and
# methylation cells subtract the size of the GBM/LGG common feature set instead --
# confirm which definition is intended.
n_CNV_eliminated = GBM_CNV_train.shape[0] - len(GBM_CNV_train_filtered)
print('# of eliminated features:', n_CNV_eliminated)

# of eliminated features: 4983


In [None]:
# Percentage of GBM CNV features removed by the GBM training filter
# (GBM-only, like the count printed for CNV; not relative to the common feature set).
n_CNV_features = len(GBM_CNV_train.index)
(n_CNV_features - len(GBM_CNV_train_filtered)) / n_CNV_features * 100

8.219652607096318

In [None]:
# Restrict every split to the common feature set. The feature list was derived from the
# training splits only and is applied unchanged to validation and test splits.
# The doubled "train_train" in the first GBM name is kept because later cells reference it.
(GBM_CNV_train_train_final,
 GBM_CNV_val_final,
 GBM_CNV_external_test_final,
 GBM_CNV_internal_test_final) = (
    split.loc[CNV_common_features]
    for split in (GBM_CNV_train_filtered, GBM_CNV_val,
                  GBM_CNV_external_test, GBM_CNV_internal_test)
)

(LGG_CNV_train_final,
 LGG_CNV_val_final,
 LGG_CNV_external_test_final,
 LGG_CNV_internal_test_final) = (
    split.loc[CNV_common_features]
    for split in (LGG_CNV_train_filtered, LGG_CNV_val,
                  LGG_CNV_external_test, LGG_CNV_internal_test)
)

In [None]:
# Write the filtered CNV splits to save_dir, one CSV per cohort and split.
CNV_exports = [
    (GBM_CNV_train_train_final, f'{save_dir}GBM_CNV_train.csv'),
    (GBM_CNV_val_final, f'{save_dir}GBM_CNV_val.csv'),
    (GBM_CNV_external_test_final, f'{save_dir}GBM_CNV_external_test.csv'),
    (GBM_CNV_internal_test_final, f'{save_dir}GBM_CNV_internal_test.csv'),
    (LGG_CNV_train_final, f'{save_dir}LGG_CNV_train.csv'),
    (LGG_CNV_val_final, f'{save_dir}LGG_CNV_val.csv'),
    (LGG_CNV_external_test_final, f'{save_dir}LGG_CNV_external_test.csv'),
    (LGG_CNV_internal_test_final, f'{save_dir}LGG_CNV_internal_test.csv'),
]
for frame, csv_path in CNV_exports:
    frame.to_csv(csv_path)

**SNV**

In [None]:
# Apply the NA-ratio filter to the training split of each cohort separately.
GBM_SNV_train_filtered = filter_features_by_missing(
    GBM_SNV_train,
    threshold=threshold,
)
LGG_SNV_train_filtered = filter_features_by_missing(
    LGG_SNV_train,
    threshold=threshold,
)

In [None]:
# Features that survived the NA filter in BOTH cohorts' training data.
SNV_common_features = GBM_SNV_train_filtered.index.intersection(
    LGG_SNV_train_filtered.index
)

In [None]:
# NOTE(review): this counts features removed by the GBM filter alone; the protein and
# methylation cells subtract the size of the GBM/LGG common feature set instead --
# confirm which definition is intended.
n_SNV_eliminated = GBM_SNV_train.shape[0] - len(GBM_SNV_train_filtered)
print('# of eliminated features:', n_SNV_eliminated)

# of eliminated features: 0


In [None]:
# Restrict every split to the common feature set. The feature list was derived from the
# training splits only and is applied unchanged to validation and test splits.
# The doubled "train_train" in the first GBM name is kept because later cells reference it.
(GBM_SNV_train_train_final,
 GBM_SNV_val_final,
 GBM_SNV_external_test_final,
 GBM_SNV_internal_test_final) = (
    split.loc[SNV_common_features]
    for split in (GBM_SNV_train_filtered, GBM_SNV_val,
                  GBM_SNV_external_test, GBM_SNV_internal_test)
)

(LGG_SNV_train_final,
 LGG_SNV_val_final,
 LGG_SNV_external_test_final,
 LGG_SNV_internal_test_final) = (
    split.loc[SNV_common_features]
    for split in (LGG_SNV_train_filtered, LGG_SNV_val,
                  LGG_SNV_external_test, LGG_SNV_internal_test)
)

In [None]:
# Write the filtered SNV splits to save_dir, one CSV per cohort and split.
SNV_exports = [
    (GBM_SNV_train_train_final, f'{save_dir}GBM_SNV_train.csv'),
    (GBM_SNV_val_final, f'{save_dir}GBM_SNV_val.csv'),
    (GBM_SNV_external_test_final, f'{save_dir}GBM_SNV_external_test.csv'),
    (GBM_SNV_internal_test_final, f'{save_dir}GBM_SNV_internal_test.csv'),
    (LGG_SNV_train_final, f'{save_dir}LGG_SNV_train.csv'),
    (LGG_SNV_val_final, f'{save_dir}LGG_SNV_val.csv'),
    (LGG_SNV_external_test_final, f'{save_dir}LGG_SNV_external_test.csv'),
    (LGG_SNV_internal_test_final, f'{save_dir}LGG_SNV_internal_test.csv'),
]
for frame, csv_path in SNV_exports:
    frame.to_csv(csv_path)

In [None]:
# Sanity check after feature filtering: the filter selects rows (features) only, so the
# union of sample IDs (columns) over all *_final tables should match the pre-filter totals.
GBM_sample_sets = [
    GBM_CNV_train_train_final.columns, GBM_CNV_val_final.columns, GBM_CNV_external_test_final.columns, GBM_CNV_internal_test_final.columns,
    GBM_methylation_train_final.columns, GBM_methylation_val_final.columns, GBM_methylation_external_test_final.columns, GBM_methylation_internal_test_final.columns,
    GBM_rna_seq_train_train_final.columns, GBM_rna_seq_val_final.columns, GBM_rna_seq_external_test_final.columns, GBM_rna_seq_internal_test_final.columns,
    GBM_protein_train_final.columns, GBM_protein_val_final.columns, GBM_protein_external_test_final.columns, GBM_protein_internal_test_final.columns,
    GBM_SNV_train_train_final.columns, GBM_SNV_val_final.columns, GBM_SNV_external_test_final.columns, GBM_SNV_internal_test_final.columns
]

GBM_all_samples = sorted(set().union(*GBM_sample_sets))

# The LGG CNV entries now use the *_final tables like every other entry; previously the
# unfiltered val / external-test / internal-test CNV tables were checked by mistake.
LGG_sample_sets = [
    LGG_CNV_train_final.columns, LGG_CNV_val_final.columns, LGG_CNV_external_test_final.columns, LGG_CNV_internal_test_final.columns,
    LGG_methylation_train_final.columns, LGG_methylation_val_final.columns, LGG_methylation_external_test_final.columns, LGG_methylation_internal_test_final.columns,
    LGG_rna_seq_train_final.columns, LGG_rna_seq_val_final.columns, LGG_rna_seq_external_test_final.columns, LGG_rna_seq_internal_test_final.columns,
    LGG_protein_train_final.columns, LGG_protein_val_final.columns, LGG_protein_external_test_final.columns, LGG_protein_internal_test_final.columns,
    LGG_SNV_train_final.columns, LGG_SNV_val_final.columns, LGG_SNV_external_test_final.columns, LGG_SNV_internal_test_final.columns
]

LGG_all_samples = sorted(set().union(*LGG_sample_sets))

print("Total # of GBM samples:", len(GBM_all_samples))
print("Total # of LGG samples:", len(LGG_all_samples))

Total # of GBM samples: 495
Total # of LGG samples: 515
