In [1]:
import pandas as pd

In [4]:
# Read the probe ID list (the merge below expects an "ID_REF" column)
gene_data = pd.read_csv("GSE25070_geneID.csv")

# Read the platform annotation table GPL6883_11606.txt: tab-delimited,
# every column kept as a string, text following a '#' treated as a comment
gpl_data = pd.read_csv(
    "GPL6883_11606.txt",
    sep="\t",
    dtype=str,
    comment="#",
)

# Show the annotation column names so the 'ID' / 'Symbol' fields can be verified
print(gpl_data.columns)

# Left-join on probe ID: every row of gene_data is kept, and probes missing
# from the annotation get NaN in 'ID' and 'Symbol'
merged_data = pd.merge(
    gene_data,
    gpl_data.loc[:, ['ID', 'Symbol']],
    how="left",
    left_on="ID_REF",
    right_on="ID",
)


Index(['ID', 'Species', 'Source', 'Search_Key', 'Transcript', 'ILMN_Gene',
       'Source_Reference_ID', 'RefSeq_ID', 'Entrez_Gene_ID', 'GI', 'Accession',
       'Symbol', 'Protein_Product', 'Array_Address_Id', 'Probe_Type',
       'Probe_Start', 'SEQUENCE', 'Chromosome', 'Probe_Chr_Orientation',
       'Probe_Coordinates', 'Cytoband', 'Definition', 'Ontology_Component',
       'Ontology_Process', 'Ontology_Function', 'Synonyms', 'GB_ACC'],
      dtype='object')


In [5]:
# Preview the first five rows of the probe-to-symbol join
merged_data.head(5)

Unnamed: 0,ID_REF,ID,Symbol
0,ILMN_1343291,ILMN_1343291,EEF1A1
1,ILMN_1651209,ILMN_1651209,SLC35E2
2,ILMN_1651228,ILMN_1651228,RPS28
3,ILMN_1651229,ILMN_1651229,IPO13
4,ILMN_1651235,ILMN_1651235,AFAP


In [6]:
# 'ID' repeats 'ID_REF' wherever the annotation matched, so drop it.
# errors="ignore" keeps this cell re-runnable: on a second run there is no
# 'ID' column left and the drop is a no-op instead of raising KeyError.
merged_data = merged_data.drop(columns="ID", errors="ignore")
merged_data

Unnamed: 0,ID_REF,Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1651209,SLC35E2
2,ILMN_1651228,RPS28
3,ILMN_1651229,IPO13
4,ILMN_1651235,AFAP
...,...,...
24521,ILMN_2415911,COVA1
24522,ILMN_2415926,THOC3
24523,ILMN_2415949,MRRF
24524,ILMN_2415979,KIAA1751


In [8]:
# Break each 'Symbol' entry on the ' /// ' separator into a list, then
# explode so every resulting symbol sits on its own row in 'Gene_Symbol'
merged_data = (
    merged_data
    .assign(Gene_Symbol=lambda frame: frame['Symbol'].str.split(' /// '))
    .explode('Gene_Symbol')
)

In [9]:
# Replace every ' /// ' separator in 'Symbol' with ','.
# regex=False forces a literal match; the default for `regex` differs between
# pandas 1.x (True) and 2.x (False), so spelling it out keeps the behavior
# the same on either version.
merged_data["Symbol"] = merged_data["Symbol"].str.replace(" /// ", ",", regex=False)

In [10]:
# Give the exploded per-gene column its final name
merged_data = merged_data.rename({"Gene_Symbol": "Cleaned_Gene_Symbol"}, axis="columns")

In [11]:
# Remove probes whose ID starts with "AFFX-".
# na=False makes a missing ID_REF count as "does not start with AFFX-";
# without it such rows yield NaN in the mask, which the ~ inversion /
# boolean filter can reject.
merged_data = merged_data[~merged_data["ID_REF"].str.startswith("AFFX-", na=False)]

In [12]:
# Preview the first five rows after the symbol split and probe filter
merged_data.head(5)

Unnamed: 0,ID_REF,Symbol,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1,EEF1A1
1,ILMN_1651209,SLC35E2,SLC35E2
2,ILMN_1651228,RPS28,RPS28
3,ILMN_1651229,IPO13,IPO13
4,ILMN_1651235,AFAP,AFAP


In [13]:
# Discard every row that still has a missing value in any column
# (axis=0 / how="any" are the dropna defaults, written out for clarity)
merged_data = merged_data.dropna(axis=0, how="any")

In [14]:
# Display the current state of the mapping table
merged_data

Unnamed: 0,ID_REF,Symbol,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1,EEF1A1
1,ILMN_1651209,SLC35E2,SLC35E2
2,ILMN_1651228,RPS28,RPS28
3,ILMN_1651229,IPO13,IPO13
4,ILMN_1651235,AFAP,AFAP
...,...,...,...
24521,ILMN_2415911,COVA1,COVA1
24522,ILMN_2415926,THOC3,THOC3
24523,ILMN_2415949,MRRF,MRRF
24524,ILMN_2415979,KIAA1751,KIAA1751


In [15]:
# Remove the 'Symbol' column from the table.
# errors="ignore" keeps this cell re-runnable: on a second run there is no
# 'Symbol' column left and the drop is a no-op instead of raising KeyError.
merged_data = merged_data.drop(columns="Symbol", errors="ignore")
merged_data

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1651209,SLC35E2
2,ILMN_1651228,RPS28
3,ILMN_1651229,IPO13
4,ILMN_1651235,AFAP
...,...,...
24521,ILMN_2415911,COVA1
24522,ILMN_2415926,THOC3
24523,ILMN_2415949,MRRF
24524,ILMN_2415979,KIAA1751


In [16]:
# Write the mapping table to CSV, leaving out the DataFrame index
merged_data.to_csv(path_or_buf="mapped_gene_list.csv", index=False)

# Report where the output was written
print("Gene name mapping completed! Check 'mapped_gene_list.csv'")

Gene name mapping completed! Check 'mapped_gene_list.csv'
