In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; every project
# path used below is built underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gzip
import warnings

import numpy as np
import pandas as pd

In [None]:
# Base folder of the project inside the mounted Google Drive.
# All sub-folder paths below are relative to it and are joined by plain string
# concatenation, so each one must keep its trailing slash.
Ds_project_folder_path = '/content/drive/MyDrive/DS_project/'

# Sub-folder holding the raw gzip-compressed .tsv inputs (CNV, SNV, protein, methylation)
raw_data_path = 'Data/1_raw_data/'

# Sub-folder the probe/RPPA -> gene mapping tables and the gene-mapped CNV tables are read from
gene_name_mapping_data_path = 'Data/2_gene_name_mapping_data/'

# Sub-folder for clinical data; not referenced by the cells shown in this part of the notebook
clinical_data_path = 'Data/0.clinical_rdata_to_csv/'

# Sub-folder the plate-filtered RNA-seq tables are read from; the chromosome-filtered
# tables produced by this notebook are written here as well
filtered_by_plate_rna_seq_data_path = 'Data/3_RNA_seq_sample_filtering_by_plate/'


In [None]:
# Both mapping tables live in the same gene-name-mapping folder.
_mapping_dir = Ds_project_folder_path + gene_name_mapping_data_path

gene_id_to_name_mapping_url = _mapping_dir + 'chromosome_probe_gene.csv'
rppa_id_to_name_mapping_url = _mapping_dir + 'RPPA_mapping_gene.csv'

In [None]:
# Load the probe -> gene table first, then the RPPA target -> gene table.
gene_id_to_name_mapping, rppa_id_to_name_mapping = (
    pd.read_csv(mapping_csv)
    for mapping_csv in (gene_id_to_name_mapping_url, rppa_id_to_name_mapping_url)
)

In [None]:
# Selects which RNA-seq / CNV inputs the loader below reads:
#   True  -> gene-mapped tables
#   False -> raw probe-based tables

gene_mapping = False

In [None]:
def RNA_seq_and_CNV_data_load(gene_mapping = True):
  """Load the GBM and LGG RNA-seq and CNV tables.

  Args:
    gene_mapping: if truthy, read the gene-mapped CSV files (first column used
      as the index); otherwise read the probe-based RNA-seq CSV files (first
      column used as the index) and the raw gzip-compressed CNV .tsv files.

  Returns:
    Tuple ``(GBM_rna_seq, LGG_rna_seq, GBM_CNV, LGG_CNV)`` of DataFrames.

  NOTE(review): the raw CNV files are read without ``index_col``, so those two
  frames keep pandas' default index rather than an ID index, unlike every other
  frame returned here. Index-based filtering downstream will not match on them;
  confirm this is intended.
  """
  rna_dir = Ds_project_folder_path + filtered_by_plate_rna_seq_data_path

  def read_indexed_csv(file_path):
    # Comma-separated table whose first column becomes the index
    return pd.read_csv(file_path, index_col=0)

  def read_gzipped_tsv(file_path):
    # Tab-separated table stored gzip-compressed; opened in text mode
    with gzip.open(file_path, 'rt') as handle:
      return pd.read_csv(handle, sep='\t')

  if not gene_mapping:
    cnv_dir = Ds_project_folder_path + raw_data_path
    gbm_rna = read_indexed_csv(rna_dir + 'TCGA-GBM_mRNA_filtered_by_plate.csv')
    lgg_rna = read_indexed_csv(rna_dir + 'TCGA-LGG_mRNA_filtered_by_plate.csv')
    gbm_cnv = read_gzipped_tsv(cnv_dir + 'TCGA-GBM.CNV.tsv.gz')
    lgg_cnv = read_gzipped_tsv(cnv_dir + 'TCGA-LGG.CNV.tsv.gz')
    return gbm_rna, lgg_rna, gbm_cnv, lgg_cnv

  cnv_dir = Ds_project_folder_path + gene_name_mapping_data_path
  gbm_rna = read_indexed_csv(rna_dir + 'TCGA-GBM_mRNA_gene_map_filtered_by_plate.csv')
  lgg_rna = read_indexed_csv(rna_dir + 'TCGA-LGG_mRNA_gene_map_filtered_by_plate.csv')
  gbm_cnv = read_indexed_csv(cnv_dir + 'TCGA-GBM_CNV_gene_mapping.csv')
  lgg_cnv = read_indexed_csv(cnv_dir + 'TCGA-LGG_CNV_gene_mapping.csv')
  return gbm_rna, lgg_rna, gbm_cnv, lgg_cnv

In [None]:
def _read_raw_gz_tsv(file_name):
    """Read one gzip-compressed, tab-separated table from the raw-data folder."""
    with gzip.open(Ds_project_folder_path + raw_data_path + file_name, 'rt') as handle:
        return pd.read_csv(handle, sep='\t')


# SNV tables
GBM_SNV_data = _read_raw_gz_tsv('TCGA_GBM_SNV.tsv.gz')
LGG_SNV_data = _read_raw_gz_tsv('TCGA_LGG_SNV.tsv.gz')

# Protein tables
GBM_protein_data = _read_raw_gz_tsv('TCGA-GBM.protein.tsv.gz')
LGG_protein_data = _read_raw_gz_tsv('TCGA-LGG.protein.tsv.gz')

# Methylation tables
GBM_methlylation_data = _read_raw_gz_tsv('TCGA_GBM_mval_methylation.tsv.gz')
LGG_methlylation_data = _read_raw_gz_tsv('TCGA_LGG_mval_methylation.tsv.gz')

In [None]:
def _ensure_indexed_by(table, column):
    """Return `table` indexed by `column`, leaving it untouched if it already is.

    This cell rebinds the same variable names it reads from. Without the guard,
    running the cell a second time raises KeyError because `column` has already
    been moved out of the columns and into the index.
    """
    if table.index.name == column:
        return table
    # Still raises KeyError when `column` is genuinely missing.
    return table.set_index(column)


GBM_protein_data = _ensure_indexed_by(GBM_protein_data, "peptide_target")
LGG_protein_data = _ensure_indexed_by(LGG_protein_data, "peptide_target")

GBM_SNV_data = _ensure_indexed_by(GBM_SNV_data, "gene")
LGG_SNV_data = _ensure_indexed_by(LGG_SNV_data, "gene")

In [None]:
# Load RNA-seq and CNV for both cohorts according to the notebook-level flag.
(
    GBM_rna_seq_data,
    LGG_rna_seq_data,
    GBM_CNV_data,
    LGG_CNV_data,
) = RNA_seq_and_CNV_data_load(gene_mapping=gene_mapping)

In [None]:
# Work on a copy so the original mapping table keeps its raw chromosome labels.
chr_df = gene_id_to_name_mapping.copy()
_chromosome_labels = chr_df['Chromosome'].str.upper()
chr_df['Chromosome'] = _chromosome_labels.str.replace('CHR', '')

# Chromosomes whose probes / genes are excluded downstream
remove_chr = ['X', 'Y']

# Build the row mask once and reuse it for both ID lists.
_on_removed_chr = chr_df['Chromosome'].isin(remove_chr)
remove_probes = chr_df.loc[_on_removed_chr, 'Probe_ID'].tolist()
remove_genes = chr_df.loc[_on_removed_chr, 'Gene_Symbol'].tolist()

In [None]:
# The RNA-seq / CNV tables come from different files depending on `gene_mapping`:
# probe-based files, or gene-mapped files that are presumably keyed by gene symbol
# (TODO confirm against the files). Filtering by probe ID alone silently removes
# nothing from gene-keyed tables, so match on either identifier. Missing values
# are left out so that rows with a missing index are never dropped by accident.
_sex_chr_row_ids = [
    row_id for row_id in (remove_probes + remove_genes) if pd.notna(row_id)
]

# NOTE(review): when gene_mapping is False the raw CNV tables are loaded without
# an index column, so their index holds no IDs and the two CNV filters below
# drop nothing. Confirm which column identifies a row before relying on them.
filtered_GBM_rna_seq_data = GBM_rna_seq_data[~GBM_rna_seq_data.index.isin(_sex_chr_row_ids)]
filtered_LGG_rna_seq_data = LGG_rna_seq_data[~LGG_rna_seq_data.index.isin(_sex_chr_row_ids)]
filtered_GBM_CNV_data = GBM_CNV_data[~GBM_CNV_data.index.isin(_sex_chr_row_ids)]
filtered_LGG_CNV_data = LGG_CNV_data[~LGG_CNV_data.index.isin(_sex_chr_row_ids)]

# The SNV tables are indexed by their "gene" column, so they are filtered by gene symbol.
filtered_GBM_SNV_data = GBM_SNV_data[~GBM_SNV_data.index.isin(remove_genes)]
filtered_LGG_SNV_data = LGG_SNV_data[~LGG_SNV_data.index.isin(remove_genes)]

In [None]:
# Every chromosome-filtered table is written to the same output folder.
_filtered_out_dir = Ds_project_folder_path + filtered_by_plate_rna_seq_data_path

_filtered_outputs = [
    ('TCGA-GBM_mRNA_filtered_chromosome.csv', filtered_GBM_rna_seq_data),
    ('TCGA-LGG_mRNA_filtered_chromosome.csv', filtered_LGG_rna_seq_data),
    ('TCGA-GBM_CNV_filtered_chromosome.csv', filtered_GBM_CNV_data),
    ('TCGA-LGG_CNV_filtered_chromosome.csv', filtered_LGG_CNV_data),
    ('TCGA-GBM_SNV_filtered_chromosome.csv', filtered_GBM_SNV_data),
    ('TCGA-LGG_SNV_filtered_chromosome.csv', filtered_LGG_SNV_data),
]

for _csv_name, _filtered_table in _filtered_outputs:
    _filtered_table.to_csv(_filtered_out_dir + _csv_name)

### Protein target_id -> gene name mapping

In [None]:
# Gene symbols present in the probe -> gene mapping table. Built once as a set:
# the previous version rebuilt a list from the column and scanned it linearly
# for every single lookup inside the loop.
known_gene_symbols = set(gene_id_to_name_mapping['Gene_Symbol'])

# Maps each RPPA target to the list of known gene symbols it corresponds to
# (possibly empty when none of its symbols appear in the mapping table).
rppa_gene_dic = {}
for rppa_target, gene_name in zip(rppa_id_to_name_mapping['RPPA_Target'], rppa_id_to_name_mapping['Gene_Symbol']):
  # A cell may list several symbols separated by ', '
  candidate_genes = str(gene_name).split(', ')
  if candidate_genes[0] == 'nan':
    # No symbol recorded for this target: fall back to the target name itself
    candidate_genes = [rppa_target]
  rppa_gene_dic[rppa_target] = [gene for gene in candidate_genes if gene in known_gene_symbols]

In [None]:
from collections import defaultdict

def collapse_protein_by_gene(df: pd.DataFrame, peptide_to_genes: dict) -> pd.DataFrame:
    """Collapse a peptide-indexed table to one row per gene by NaN-aware averaging.

    Args:
        df: table whose index holds peptide/target labels; every row is
            averaged column by column.
        peptide_to_genes: maps a peptide label to the list of genes it is
            assigned to. A peptide may contribute to several genes.

    Returns:
        DataFrame indexed by gene with the same columns as ``df``. Each value is
        the mean over the gene's peptides found in ``df.index``, ignoring NaN.
        A column in which all of those peptides are NaN yields NaN. Genes with
        no peptide present in ``df`` are omitted.
    """
    # Invert the mapping: gene -> peptides assigned to it
    gene_to_peptides = defaultdict(list)
    for peptide, genes in peptide_to_genes.items():
        for gene in genes:
            gene_to_peptides[gene].append(peptide)

    gene_rows = {}
    for gene, peptides in gene_to_peptides.items():
        valid_peptides = [p for p in peptides if p in df.index]
        if not valid_peptides:
            continue
        stacked = np.vstack([df.loc[p].values for p in valid_peptides])
        # An all-NaN column is expected here and correctly averages to NaN;
        # silence only numpy's "Mean of empty slice" RuntimeWarning for this
        # call instead of letting it leak into the notebook output.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=RuntimeWarning)
            gene_rows[gene] = np.nanmean(stacked, axis=0)

    return pd.DataFrame.from_dict(gene_rows, orient='index', columns=df.columns)

In [None]:
# Collapse both cohorts' protein tables to gene level with the same target -> gene mapping.
mapping_GBM_protein_data, mapping_LGG_protein_data = (
    collapse_protein_by_gene(protein_table, rppa_gene_dic)
    for protein_table in (GBM_protein_data, LGG_protein_data)
)

  mean_row = np.nanmean(stacked, axis=0)


In [None]:
# Drop gene rows whose symbol is in the removal list built from the mapping table.
_gbm_protein_removed = mapping_GBM_protein_data.index.isin(remove_genes)
_lgg_protein_removed = mapping_LGG_protein_data.index.isin(remove_genes)

filtered_GBM_protein_data = mapping_GBM_protein_data.loc[~_gbm_protein_removed]
filtered_LGG_protein_data = mapping_LGG_protein_data.loc[~_lgg_protein_removed]

In [None]:
# Write the gene-level, chromosome-filtered protein tables to the shared output folder.
_protein_out_dir = Ds_project_folder_path + filtered_by_plate_rna_seq_data_path

for _protein_csv_name, _protein_table in (
    ('TCGA-GBM_protein_filtered_chromosome.csv', filtered_GBM_protein_data),
    ('TCGA-LGG_protein_filtered_chromosome.csv', filtered_LGG_protein_data),
):
    _protein_table.to_csv(_protein_out_dir + _protein_csv_name)