In [1]:
import pandas as pd

In [2]:
# Load your Probe ID list
gene_data = pd.read_csv("GSE44861_geneID.csv")

# Load the GPL3921 annotation file (tab-separated format)
gpl_data = pd.read_csv("GPL3921_N.txt", sep="\t", dtype=str, comment="#")

# Display column names to check structure
print(gpl_data.columns)

# Merge based on Probe ID
merged_data = gene_data.merge(gpl_data[['ID', 'Gene Symbol']], left_on="ID_REF", right_on="ID", how="left")


Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')


In [3]:
merged_data.head()

Unnamed: 0,ID_REF,ID,Gene Symbol
0,1007_s_at,1007_s_at,DDR1
1,1053_at,1053_at,RFC2
2,117_at,117_at,HSPA6
3,121_at,121_at,PAX8
4,1255_g_at,1255_g_at,GUCA1A


In [4]:
merged_data = merged_data.drop(columns="ID")
merged_data

Unnamed: 0,ID_REF,Gene Symbol
0,1007_s_at,DDR1
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
22272,AFFX-ThrX-5_at,
22273,AFFX-ThrX-M_at,
22274,AFFX-TrpnX-3_at,
22275,AFFX-TrpnX-5_at,


In [5]:
merged_data = merged_data.assign(Gene_Symbol=merged_data['Gene Symbol'].str.split(' /// ')).explode('Gene_Symbol')

In [6]:
# replace ' /// ' with ','
merged_data["Gene Symbol"] = merged_data["Gene Symbol"].str.replace(" /// ", ",")

In [7]:
merged_data= merged_data.rename(columns={"Gene Symbol": "Cleaned_Gene_Symbol"})

In [8]:
merged_data = merged_data[~merged_data["ID_REF"].str.startswith("AFFX-")]

In [9]:
merged_data.head()

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol,Gene_Symbol
0,1007_s_at,DDR1,DDR1
1,1053_at,RFC2,RFC2
2,117_at,HSPA6,HSPA6
3,121_at,PAX8,PAX8
4,1255_g_at,GUCA1A,GUCA1A


In [10]:
merged_data = merged_data.dropna()

In [11]:
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol,Gene_Symbol
0,1007_s_at,DDR1,DDR1
1,1053_at,RFC2,RFC2
2,117_at,HSPA6,HSPA6
3,121_at,PAX8,PAX8
4,1255_g_at,GUCA1A,GUCA1A
...,...,...,...
22210,91703_at,"MGC15523,EHBP1L1",EHBP1L1
22211,91816_f_at,RKHD1,RKHD1
22212,91826_at,EPS8L1,EPS8L1
22213,91920_at,BCAN,BCAN


In [12]:
merged_data = merged_data.drop(columns="Gene_Symbol")
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,1007_s_at,DDR1
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
22210,91703_at,"MGC15523,EHBP1L1"
22211,91816_f_at,RKHD1
22212,91826_at,EPS8L1
22213,91920_at,BCAN


In [14]:
merged_data.shape

(23086, 2)

In [13]:
# Save the mapped results
merged_data.to_csv("mapped_gene_list.csv", index=False)

print("Gene name mapping completed! Check 'mapped_gene_list.csv'")

Gene name mapping completed! Check 'mapped_gene_list.csv'
