In [1]:
import pandas as pd

In [2]:
# Probe ID list for GSE21815; the probe IDs are in the "ID_REF" column
# that the merge below joins on.
gene_data = pd.read_csv("GSE21815_geneID.csv")

# Platform annotation table. The file name indicates GPL6480 (not GPL3921).
# It is tab-separated and every field is read as text.
# NOTE(review): comment="#" skips lines that start with '#', but pandas also
# stops parsing the rest of any data line at a '#' character.
gpl_data = pd.read_csv(
    "GPL6480_9577.txt",
    sep="\t",
    dtype=str,
    comment="#",
)

# Print the annotation column names so the join keys can be checked by eye.
print(gpl_data.columns)

# Left join on the probe ID: every row of gene_data is kept, and probes with
# no match in the annotation get NaN in "ID" and "GENE_SYMBOL".
merged_data = gene_data.merge(
    gpl_data.loc[:, ["ID", "GENE_SYMBOL"]],
    how="left",
    left_on="ID_REF",
    right_on="ID",
)


Index(['ID', 'SPOT_ID', 'CONTROL_TYPE', 'REFSEQ', 'GB_ACC', 'GENE',
       'GENE_SYMBOL', 'GENE_NAME', 'UNIGENE_ID', 'ENSEMBL_ID', 'TIGR_ID',
       'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION',
       'GO_ID', 'SEQUENCE'],
      dtype='object')


In [3]:
# Preview the first five merged rows (probe ID, annotation ID, gene symbol).
merged_data.head(n=5)

Unnamed: 0,ID_REF,ID,GENE_SYMBOL
0,A_23_P100001,A_23_P100001,FAM174B
1,A_23_P100011,A_23_P100011,AP3S2
2,A_23_P100022,A_23_P100022,SV2B
3,A_23_P100056,A_23_P100056,RBPMS2
4,A_23_P100074,A_23_P100074,AVEN


In [4]:
# Drop the annotation-side join key; for matched probes it mirrors "ID_REF".
# errors="ignore" keeps this cell safe to re-run after "ID" is already gone
# (a plain drop would raise KeyError on the second execution).
merged_data = merged_data.drop(columns="ID", errors="ignore")
merged_data

Unnamed: 0,ID_REF,GENE_SYMBOL
0,A_23_P100001,FAM174B
1,A_23_P100011,AP3S2
2,A_23_P100022,SV2B
3,A_23_P100056,RBPMS2
4,A_23_P100074,AVEN
...,...,...
40995,A_32_P9986,
40996,A_32_P99864,
40997,A_32_P99902,C15orf40
40998,A_32_P99933,


In [5]:
# Add a "Gene_Symbol" column holding one symbol per row: split each
# "GENE_SYMBOL" value on " /// " and give every resulting piece its own row.
# Exploded rows repeat the index label and the other columns of their source row.
merged_data = (
    merged_data
    .assign(Gene_Symbol=lambda frame: frame["GENE_SYMBOL"].str.split(" /// "))
    .explode("Gene_Symbol")
)

In [6]:
# Turn the " /// " separator inside "GENE_SYMBOL" into a plain comma so
# multi-symbol annotations read as a comma-separated list.
merged_data = merged_data.assign(
    GENE_SYMBOL=lambda frame: frame["GENE_SYMBOL"].str.replace(" /// ", ",")
)

In [7]:
# The comma-separated symbol column is exported under the name
# "Cleaned_Gene_Symbol".
merged_data = merged_data.rename(
    {"GENE_SYMBOL": "Cleaned_Gene_Symbol"},
    axis="columns",
)

In [8]:
# Keep only the probes whose ID does not start with "AFFX-".
# na=False makes a missing ID_REF evaluate to False instead of NaN, so the
# mask stays strictly boolean and the `~` negation cannot fail on NaN.
# NOTE(review): the probe IDs displayed in this notebook look like
# "A_23_P..." / "A_32_P...", so this filter may be a no-op here -- confirm.
merged_data = merged_data[
    ~merged_data["ID_REF"].str.startswith("AFFX-", na=False)
]

In [9]:
# Preview the first five rows after the separator clean-up and rename.
merged_data.head(n=5)

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol,Gene_Symbol
0,A_23_P100001,FAM174B,FAM174B
1,A_23_P100011,AP3S2,AP3S2
2,A_23_P100022,SV2B,SV2B
3,A_23_P100056,RBPMS2,RBPMS2
4,A_23_P100074,AVEN,AVEN


In [10]:
# Discard every row that has a missing value in any column, e.g. probes
# that received no gene symbol from the annotation.
merged_data = merged_data.dropna(axis="index", how="any")

In [11]:
# Display the frame that remains after rows with missing values were dropped.
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol,Gene_Symbol
0,A_23_P100001,FAM174B,FAM174B
1,A_23_P100011,AP3S2,AP3S2
2,A_23_P100022,SV2B,SV2B
3,A_23_P100056,RBPMS2,RBPMS2
4,A_23_P100074,AVEN,AVEN
...,...,...,...
40986,A_32_P99638,LOC339316,LOC339316
40987,A_32_P99648,EIF4E3,EIF4E3
40988,A_32_P99690,NLN,NLN
40992,A_32_P99753,C3orf25,C3orf25


In [12]:
# Remove the helper "Gene_Symbol" column; only the probe ID and the
# comma-separated "Cleaned_Gene_Symbol" are exported.
# The earlier explode step produced one row per individual symbol, so a probe
# annotated with several symbols is left with identical rows once the helper
# column is gone. drop_duplicates() collapses them so each
# (probe, symbol list) pair appears once.
# errors="ignore" keeps this cell safe to re-run after the column is removed.
merged_data = (
    merged_data
    .drop(columns="Gene_Symbol", errors="ignore")
    .drop_duplicates()
)
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,A_23_P100001,FAM174B
1,A_23_P100011,AP3S2
2,A_23_P100022,SV2B
3,A_23_P100056,RBPMS2
4,A_23_P100074,AVEN
...,...,...
40986,A_32_P99638,LOC339316
40987,A_32_P99648,EIF4E3
40988,A_32_P99690,NLN
40992,A_32_P99753,C3orf25


In [13]:
# Write the probe-to-symbol table to CSV, leaving out the DataFrame index.
merged_data.to_csv(path_or_buf="mapped_gene_list.csv", index=False)

# Tell the user where the result was written.
print("Gene name mapping completed! Check 'mapped_gene_list.csv'")

Gene name mapping completed! Check 'mapped_gene_list.csv'
