In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so the project
# files stored under MyDrive can be accessed through regular file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Description: Mapping Probe IDs to Gene Names

This script converts `probe x sample` matrices from CNV and RNA-seq data into `gene x sample` matrices using probe-to-gene mapping files.

#### Input:
- Raw data located in `Data/1_raw_data/` for both **GBM** and **LGG**
- Probe-to-gene mapping files (Note: One probe can map to multiple genes)

#### Mapping Strategy:
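- Each probe ID may correspond to **multiple gene names (1:N)**. Note that the lookup dictionary built in this notebook stores one gene name per probe ID, so when a probe ID appears more than once in the mapping file only its last gene name is used.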
- If multiple probes map to the same gene, their expression values are **averaged** to generate a single row per gene.

#### Output:
- Converted files are saved in `Data/2_gene_name_mapping_data/`

#### Notice
- If you want to keep the probe IDs instead of gene names, you can skip this notebook.

In [None]:
import pandas as pd
import gzip

In [None]:
# Define paths

# Base folder of the project in Google Drive. The trailing '/' is required
# because the full file paths are built by plain string concatenation.
Ds_project_folder_path = '/content/drive/MyDrive/DS_project/'

# Folder (relative to the base folder) where raw input data is stored
raw_data_path = 'Data/1_raw_data/'

# Folder (relative to the base folder) where processed output data is saved;
# the probe-to-gene mapping file is also read from this folder
gene_name_mapping_data_path = 'Data/2_gene_name_mapping_data/'

In [None]:
# Raw RNA-seq tables (CSV), one per cohort
GBM_rna_seq_url = f"{Ds_project_folder_path}{raw_data_path}TCGA-GBM_mRNA.csv"
LGG_rna_seq_url = f"{Ds_project_folder_path}{raw_data_path}TCGA-LGG_mRNA.csv"

# Raw CNV tables (gzip-compressed TSV), one per cohort
GBM_CNV_url = f"{Ds_project_folder_path}{raw_data_path}TCGA-GBM.CNV.tsv.gz"
LGG_CNV_url = f"{Ds_project_folder_path}{raw_data_path}TCGA-LGG.CNV.tsv.gz"

# Probe-to-gene mapping table; it is stored in the output folder, not the raw-data folder
gene_id_to_name_mapping_url = f"{Ds_project_folder_path}{gene_name_mapping_data_path}gene_id_to_name_mapping.csv"

In [None]:
# Read the GBM and LGG CNV tables (tab-separated, gzip-compressed).
# pandas decompresses the files itself, so no manual gzip file handle is
# needed, and the text is decoded with read_csv's default UTF-8 encoding
# rather than the locale-dependent default of gzip.open(..., 'rt').
GBM_CNV = pd.read_csv(GBM_CNV_url, sep='\t', compression='gzip')
LGG_CNV = pd.read_csv(LGG_CNV_url, sep='\t', compression='gzip')

# Read GBM and LGG RNA-seq data (index_col=0 means the first column will be used as row labels)
GBM_rna_seq = pd.read_csv(GBM_rna_seq_url, index_col=0)
LGG_rna_seq = pd.read_csv(LGG_rna_seq_url, index_col=0)

# Read the probe ID to gene name mapping file
# (the 'gene_id' and 'gene_name' columns are used later to build the lookup)
gene_id_to_name_mapping = pd.read_csv(gene_id_to_name_mapping_url)

In [None]:
# Build the lookup used to relabel rows: probe ID ('gene_id') -> gene name.
# A dict holds one value per key, so when a probe ID occurs on several rows
# of the mapping table only the gene name from its last row is kept.
id_to_name = dict(
    zip(gene_id_to_name_mapping['gene_id'], gene_id_to_name_mapping['gene_name'])
)

In [None]:
# Replace probe IDs with gene names using the dictionary.
# If multiple probes map to the same gene, take the average of their expression values.
#
# Probes that are missing from `id_to_name` get NaN as their label, and groupby
# leaves NaN keys out of the result (default dropna=True). The number of such
# probes is printed so that this loss is visible instead of silent; it also
# makes an accidental re-run of this cell obvious, because the already
# converted gene-name index no longer matches any probe ID.

gbm_rna_gene_labels = GBM_rna_seq.index.to_series().map(id_to_name)
print(f"GBM RNA-seq: {gbm_rna_gene_labels.isna().sum()} of {len(gbm_rna_gene_labels)} probes have no gene name and are dropped")
# set_axis returns a relabelled copy, so the frame is not mutated in place
GBM_rna_seq = GBM_rna_seq.set_axis(gbm_rna_gene_labels, axis=0).groupby(level=0).mean()

lgg_rna_gene_labels = LGG_rna_seq.index.to_series().map(id_to_name)
print(f"LGG RNA-seq: {lgg_rna_gene_labels.isna().sum()} of {len(lgg_rna_gene_labels)} probes have no gene name and are dropped")
LGG_rna_seq = LGG_rna_seq.set_axis(lgg_rna_gene_labels, axis=0).groupby(level=0).mean()

In [None]:
# Use the 'Ensembl_ID' column as the row index of both CNV tables.
# set_index removes the column from the data once it has become the index.

GBM_CNV = GBM_CNV.set_index('Ensembl_ID')
LGG_CNV = LGG_CNV.set_index('Ensembl_ID')

In [None]:
# Map probe IDs to gene names for GBM and LGG CNV data.
# Rows that end up with the same gene name are averaged into a single row.
#
# Probes that are missing from `id_to_name` get NaN as their label, and groupby
# leaves NaN keys out of the result (default dropna=True). The number of such
# probes is printed so that this loss is visible instead of silent; it also
# makes an accidental re-run of this cell obvious, because the already
# converted gene-name index no longer matches any probe ID.

gbm_cnv_gene_labels = GBM_CNV.index.to_series().map(id_to_name)
print(f"GBM CNV: {gbm_cnv_gene_labels.isna().sum()} of {len(gbm_cnv_gene_labels)} probes have no gene name and are dropped")
# set_axis returns a relabelled copy, so the frame is not mutated in place
GBM_CNV = GBM_CNV.set_axis(gbm_cnv_gene_labels, axis=0).groupby(level=0).mean()

lgg_cnv_gene_labels = LGG_CNV.index.to_series().map(id_to_name)
print(f"LGG CNV: {lgg_cnv_gene_labels.isna().sum()} of {len(lgg_cnv_gene_labels)} probes have no gene name and are dropped")
LGG_CNV = LGG_CNV.set_axis(lgg_cnv_gene_labels, axis=0).groupby(level=0).mean()

In [None]:
# Label the row index of every gene x sample table as 'gene_name'
# (this becomes the header of the first column in the saved CSV files).
GBM_rna_seq = GBM_rna_seq.rename_axis(index='gene_name')
LGG_rna_seq = LGG_rna_seq.rename_axis(index='gene_name')
GBM_CNV = GBM_CNV.rename_axis(index='gene_name')
LGG_CNV = LGG_CNV.rename_axis(index='gene_name')

In [None]:
# Write the four gene x sample tables as CSV files into the output folder
GBM_rna_seq.to_csv(f"{Ds_project_folder_path}{gene_name_mapping_data_path}TCGA-GBM_mRNA_gene_mapping.csv")
LGG_rna_seq.to_csv(f"{Ds_project_folder_path}{gene_name_mapping_data_path}TCGA-LGG_mRNA_gene_mapping.csv")
GBM_CNV.to_csv(f"{Ds_project_folder_path}{gene_name_mapping_data_path}TCGA-GBM_CNV_gene_mapping.csv")
LGG_CNV.to_csv(f"{Ds_project_folder_path}{gene_name_mapping_data_path}TCGA-LGG_CNV_gene_mapping.csv")