In [None]:
import pandas as pd

In [None]:
# Probe IDs to annotate; the merge below expects them in an "ID_REF" column.
gene_data = pd.read_csv("GSE41328_geneID.csv")

# Platform annotation table (the file read here is GPL570_55999.txt, tab-separated).
# Every column is read as text and "#" is treated as the comment character.
# NOTE(review): with comment="#", pandas also discards the remainder of any data
# row that contains "#" — confirm that no annotation field includes that character.
gpl_data = pd.read_csv(
    "GPL570_55999.txt",
    sep="\t",
    comment="#",
    dtype=str,
)

# Show the annotation columns so the names used in the merge can be verified.
print(gpl_data.columns)

# Left join: keep every row of gene_data and attach the matching gene symbol
# (rows without a match in the annotation table get missing values).
merged_data = pd.merge(
    gene_data,
    gpl_data[["ID", "Gene Symbol"]],
    how="left",
    left_on="ID_REF",
    right_on="ID",
)

Index(['ID', 'GB_ACC', 'SPOT_ID', 'Species Scientific Name', 'Annotation Date',
       'Sequence Type', 'Sequence Source', 'Target Description',
       'Representative Public ID', 'Gene Title', 'Gene Symbol',
       'ENTREZ_GENE_ID', 'RefSeq Transcript ID',
       'Gene Ontology Biological Process', 'Gene Ontology Cellular Component',
       'Gene Ontology Molecular Function'],
      dtype='object')


In [None]:
# Preview the first five rows of the merged table.
merged_data.head(n=5)

Unnamed: 0,ID_REF,ID,Gene Symbol
0,1007_s_at,1007_s_at,DDR1 /// MIR4640
1,1053_at,1053_at,RFC2
2,117_at,117_at,HSPA6
3,121_at,121_at,PAX8
4,1255_g_at,1255_g_at,GUCA1A


In [None]:
# The annotation-side join key "ID" is no longer needed once the merge is done;
# "ID_REF" remains as the probe identifier.
merged_data = merged_data.drop(labels="ID", axis="columns")
merged_data

Unnamed: 0,ID_REF,Gene Symbol
0,1007_s_at,DDR1 /// MIR4640
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
54670,AFFX-ThrX-5_at,
54671,AFFX-ThrX-M_at,
54672,AFFX-TrpnX-3_at,
54673,AFFX-TrpnX-5_at,


In [None]:
# Split each "Gene Symbol" entry on " /// " into a list stored in a new
# "Gene_Symbol" column, then expand that list so every symbol gets its own row.
# The original "Gene Symbol" column is kept unchanged next to it.
merged_data = (
    merged_data
    .assign(Gene_Symbol=lambda frame: frame["Gene Symbol"].str.split(" /// "))
    .explode("Gene_Symbol")
)

In [None]:
# In the original annotation column, use "," instead of " /// " as the separator.
# The pattern is matched as a literal string (regex=False).
merged_data["Gene Symbol"] = merged_data["Gene Symbol"].str.replace(
    " /// ", ",", regex=False
)

In [None]:
# Give the one-symbol-per-row column a more descriptive name.
merged_data = merged_data.rename(
    {"Gene_Symbol": "Cleaned_Gene_Symbol"},
    axis="columns",
)

In [None]:
# Remove every row whose ID_REF starts with "AFFX-"
# (presumably Affymetrix control probe sets — confirm against the platform notes).
# na=False makes a missing ID_REF evaluate to False instead of NaN, so the mask
# is always boolean and the `~` negation cannot raise on missing identifiers.
merged_data = merged_data[~merged_data["ID_REF"].str.startswith("AFFX-", na=False)]

In [None]:
# Preview the first five rows of the table in its current state.
merged_data.head(n=5)

Unnamed: 0,ID_REF,Gene Symbol,Cleaned_Gene_Symbol
0,1007_s_at,"DDR1,MIR4640",DDR1
0,1007_s_at,"DDR1,MIR4640",MIR4640
1,1053_at,RFC2,RFC2
2,117_at,HSPA6,HSPA6
3,121_at,PAX8,PAX8


In [None]:
# Discard every row that has a missing value in any column.
merged_data = merged_data.dropna(axis="index", how="any")

In [None]:
# Display the table (bare last expression, so the notebook renders it).
merged_data

Unnamed: 0,ID_REF,Gene Symbol,Cleaned_Gene_Symbol
0,1007_s_at,"DDR1,MIR4640",DDR1
0,1007_s_at,"DDR1,MIR4640",MIR4640
1,1053_at,RFC2,RFC2
2,117_at,HSPA6,HSPA6
3,121_at,PAX8,PAX8
...,...,...,...
54608,91703_at,EHBP1L1,EHBP1L1
54609,91816_f_at,MEX3D,MEX3D
54610,91826_at,EPS8L1,EPS8L1
54611,91920_at,BCAN,BCAN


In [None]:
# Drop the combined "Gene Symbol" column; "Cleaned_Gene_Symbol" stays in the table.
merged_data = merged_data.drop(labels="Gene Symbol", axis="columns")
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,1007_s_at,DDR1
0,1007_s_at,MIR4640
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
...,...,...
54608,91703_at,EHBP1L1
54609,91816_f_at,MEX3D
54610,91826_at,EPS8L1
54611,91920_at,BCAN


In [None]:
# Write the mapped table to CSV; index=False leaves the row index out of the file.
merged_data.to_csv(path_or_buf="mapped_gene_list.csv", index=False)

# Confirmation message naming the output file.
print("Gene name mapping completed! Check 'mapped_gene_list.csv'")

Gene name mapping completed! Check 'mapped_gene_list.csv'
