In [1]:
import pandas as pd

In [2]:
# Load your Probe ID list
gene_data = pd.read_csv("GSE113513_geneID.csv")

# Load the GPL3921 annotation file (tab-separated format)
gpl_data = pd.read_csv("GPL15207_17536.txt", sep="\t", dtype=str, comment="#")

# Display column names to check structure
print(gpl_data.columns)

# Merge based on Probe ID
merged_data = gene_data.merge(gpl_data[['ID', 'Gene Symbol']], left_on="ID_REF", right_on="ID", how="left")


Index(['ID', 'GeneChip Array', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Transcript ID(Array Design)',
       'Target Description', 'GB_ACC', 'GI', 'Representative Public ID',
       'Archival UniGene Cluster', 'UniGene ID', 'Genome Version',
       'Alignments', 'Gene Title', 'Gene Symbol', 'Chromosomal Location',
       'Unigene Cluster Type', 'Ensembl', 'Entrez Gene', 'SwissProt', 'EC',
       'OMIM', 'RefSeq Protein ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function', 'Pathway', 'InterPro',
       'Annotation Description', 'Annotation Transcript Cluster',
       'Transcript Assignments', 'Annotation Notes', 'SPOT_ID'],
      dtype='object')


In [3]:
merged_data.head()

Unnamed: 0,ID_REF,ID,Gene Symbol
0,11715100_at,11715100_at,HIST1H3G
1,11715101_s_at,11715101_s_at,HIST1H3G
2,11715102_x_at,11715102_x_at,HIST1H3G
3,11715103_x_at,11715103_x_at,TNFAIP8L1
4,11715104_s_at,11715104_s_at,OTOP2


In [4]:
merged_data = merged_data.drop(columns="ID")
merged_data

Unnamed: 0,ID_REF,Gene Symbol
0,11715100_at,HIST1H3G
1,11715101_s_at,HIST1H3G
2,11715102_x_at,HIST1H3G
3,11715103_x_at,TNFAIP8L1
4,11715104_s_at,OTOP2
...,...,...
49390,AFFX-ThrX-5_at,---
49391,AFFX-ThrX-M_at,---
49392,AFFX-TrpnX-3_at,---
49393,AFFX-TrpnX-5_at,---


In [5]:
merged_data = merged_data.assign(Gene_Symbol=merged_data['Gene Symbol'].str.split(' /// ')).explode('Gene_Symbol')

In [6]:
# replace ' /// ' with ','
merged_data["Gene Symbol"] = merged_data["Gene Symbol"].str.replace(" /// ", ",")

In [7]:
merged_data= merged_data.rename(columns={"Gene Symbol": "Cleaned_Gene_Symbol"})

In [8]:
merged_data = merged_data[~merged_data["ID_REF"].str.startswith("AFFX-")]

In [9]:
merged_data.head()

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol,Gene_Symbol
0,11715100_at,HIST1H3G,HIST1H3G
1,11715101_s_at,HIST1H3G,HIST1H3G
2,11715102_x_at,HIST1H3G,HIST1H3G
3,11715103_x_at,TNFAIP8L1,TNFAIP8L1
4,11715104_s_at,OTOP2,OTOP2


In [10]:
merged_data = merged_data.dropna()

In [11]:
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol,Gene_Symbol
0,11715100_at,HIST1H3G,HIST1H3G
1,11715101_s_at,HIST1H3G,HIST1H3G
2,11715102_x_at,HIST1H3G,HIST1H3G
3,11715103_x_at,TNFAIP8L1,TNFAIP8L1
4,11715104_s_at,OTOP2,OTOP2
...,...,...,...
49289,200096_PM_s_at,ATP6V0E1,ATP6V0E1
49290,200097_PM_s_at,HNRNPK,HNRNPK
49291,200098_PM_s_at,ANAPC5,ANAPC5
49292,200099_PM_s_at,"RPS3A,RPS3AP5",RPS3A


In [12]:
merged_data = merged_data.drop(columns="Gene_Symbol")
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,11715100_at,HIST1H3G
1,11715101_s_at,HIST1H3G
2,11715102_x_at,HIST1H3G
3,11715103_x_at,TNFAIP8L1
4,11715104_s_at,OTOP2
...,...,...
49289,200096_PM_s_at,ATP6V0E1
49290,200097_PM_s_at,HNRNPK
49291,200098_PM_s_at,ANAPC5
49292,200099_PM_s_at,"RPS3A,RPS3AP5"


In [13]:
merged_data.shape

(52826, 2)

In [14]:
# Save the mapped results
merged_data.to_csv("mapped_gene_list.csv", index=False)

print("Gene name mapping completed! Check 'mapped_gene_list.csv'")

Gene name mapping completed! Check 'mapped_gene_list.csv'
