In [1]:
import pandas as pd
import numpy as np

In [2]:
# gene expression data
gene_expression_path = "GSE74602_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
gene_expression.head()

Unnamed: 0,ID_REF,GSM1923670,GSM1923671,GSM1923672,GSM1923673,GSM1923674,GSM1923675,GSM1923676,GSM1923677,GSM1923678,...,GSM1923720,GSM1923721,GSM1923722,GSM1923723,GSM1923724,GSM1923725,GSM1923726,GSM1923727,GSM1923728,GSM1923729
0,ILMN_1343291,0.034019,0.307699,-0.385204,0.218209,0.196362,0.401151,0.122247,0.364366,0.038851,...,-0.37855,-0.115766,-0.235073,-0.209642,0.080709,0.149072,-0.218767,0.074314,-0.28752,0.146235
1,ILMN_1343292,0.762679,-0.856875,-1.760679,0.088397,0.306509,0.1719,-0.420557,0.892984,-0.477663,...,-0.261872,1.772682,1.029835,0.533648,0.123396,0.965112,-0.136953,0.130172,0.079698,0.036143
2,ILMN_1343293,0.506224,-0.132675,0.965889,-0.018617,0.027961,-0.117388,0.853586,-0.485096,0.73023,...,0.930435,-0.965127,0.588421,0.164329,0.598843,-0.422148,-0.094966,-0.333024,0.39156,-0.388337
3,ILMN_1343294,-0.042696,0.230176,-0.534735,0.035286,0.058753,0.477515,-0.000821,0.15285,0.000821,...,-0.344731,0.041217,-0.230332,-0.125455,0.083511,0.335932,-0.131966,0.262913,-0.262721,0.49902
4,ILMN_1651209,-0.092322,0.122087,-0.748453,0.0411,0.090834,0.275813,-0.071506,0.040806,-0.067123,...,-0.231332,0.254704,-0.10366,-0.073623,0.197407,0.075207,-0.408236,0.075943,-0.201398,0.256068


In [4]:
# Transpose the gene expression data to make geo_accession a column
gene_expression_transposed = gene_expression.set_index("ID_REF").transpose()
gene_expression_transposed.index.name = "geo_accession"  # Rename the index
gene_expression_transposed.reset_index(inplace=True)  # Convert index to a column

In [5]:
gene_expression_transposed.head()

ID_REF,geo_accession,ILMN_1343291,ILMN_1343292,ILMN_1343293,ILMN_1343294,ILMN_1651209,ILMN_1651217,ILMN_1651228,ILMN_1651229,ILMN_1651234,...,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,ILMN_2038774,ILMN_2038777,ILMN_2038778
0,GSM1923670,0.034019,0.762679,0.506224,-0.042696,-0.092322,-0.190058,-0.229721,-0.875623,-0.295582,...,-0.315921,0.472947,-0.253801,-0.606511,-0.244869,-0.565818,0.172183,0.341347,-0.590778,-0.671084
1,GSM1923671,0.307699,-0.856875,-0.132675,0.230176,0.122087,-0.048757,-0.054029,-0.589342,0.075204,...,0.150381,0.093512,-0.171227,-0.163665,-0.116943,0.226491,0.444314,0.414416,-0.380373,-0.696555
2,GSM1923672,-0.385204,-1.760679,0.965889,-0.534735,-0.748453,-0.19769,-0.217416,-0.553076,-0.45575,...,-0.652523,1.648453,0.33969,-1.122893,-0.556817,-0.998321,0.407893,-0.125578,-0.270124,0.735986
3,GSM1923673,0.218209,0.088397,-0.018617,0.035286,0.0411,0.210338,0.235956,0.048933,0.034713,...,0.411864,-0.233437,-0.227423,0.10188,0.021584,0.270329,0.152678,0.127866,0.160852,-0.192957
4,GSM1923674,0.196362,0.306509,0.027961,0.058753,0.090834,-0.086746,-0.129879,-0.266786,-0.002016,...,-0.074866,1.249168,0.28861,-0.460824,0.111645,-0.13334,-0.649105,0.2111,0.007412,0.562918


In [6]:
# gene expression data
metadata_filepath = "GSE74602_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Merge metadata with transposed gene expression data
merged_data = pd.merge(metadata, gene_expression_transposed, on="geo_accession")

# Save the merged data
# remove control probes (AFFX-*)
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith("AFFX-")]
merged_data.to_csv("GSE74602_merged_data.csv", index=False)
print("Merged data saved as GSE74602_merged_data.csv")

Merged data saved as GSE74602_merged_data.csv


In [8]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Columns: 22186 entries, geo_accession to ILMN_2038778
dtypes: float64(22184), int64(1), object(1)
memory usage: 10.2+ MB


In [9]:
merged_data

Unnamed: 0,geo_accession,target,ILMN_1343291,ILMN_1343292,ILMN_1343293,ILMN_1343294,ILMN_1651209,ILMN_1651217,ILMN_1651228,ILMN_1651229,...,ILMN_1815923,ILMN_1815924,ILMN_1815933,ILMN_1815937,ILMN_1815938,ILMN_1815941,ILMN_1815951,ILMN_2038774,ILMN_2038777,ILMN_2038778
0,GSM1923670,1,0.034019,0.762679,0.506224,-0.042696,-0.092322,-0.190058,-0.229721,-0.875623,...,-0.315921,0.472947,-0.253801,-0.606511,-0.244869,-0.565818,0.172183,0.341347,-0.590778,-0.671084
1,GSM1923671,0,0.307699,-0.856875,-0.132675,0.230176,0.122087,-0.048757,-0.054029,-0.589342,...,0.150381,0.093512,-0.171227,-0.163665,-0.116943,0.226491,0.444314,0.414416,-0.380373,-0.696555
2,GSM1923672,1,-0.385204,-1.760679,0.965889,-0.534735,-0.748453,-0.19769,-0.217416,-0.553076,...,-0.652523,1.648453,0.33969,-1.122893,-0.556817,-0.998321,0.407893,-0.125578,-0.270124,0.735986
3,GSM1923673,0,0.218209,0.088397,-0.018617,0.035286,0.0411,0.210338,0.235956,0.048933,...,0.411864,-0.233437,-0.227423,0.10188,0.021584,0.270329,0.152678,0.127866,0.160852,-0.192957
4,GSM1923674,1,0.196362,0.306509,0.027961,0.058753,0.090834,-0.086746,-0.129879,-0.266786,...,-0.074866,1.249168,0.28861,-0.460824,0.111645,-0.13334,-0.649105,0.2111,0.007412,0.562918
5,GSM1923675,0,0.401151,0.1719,-0.117388,0.477515,0.275813,0.346959,0.100334,-0.347659,...,0.272127,-0.478751,-0.208202,0.343201,0.39946,0.134321,0.012172,0.165775,-0.198785,-0.539871
6,GSM1923676,1,0.122247,-0.420557,0.853586,-0.000821,-0.071506,-0.085156,0.18812,0.028178,...,-0.315488,1.379764,0.62348,-0.088825,-0.142336,-0.173133,-0.78095,-0.295197,0.045997,0.393401
7,GSM1923677,0,0.364366,0.892984,-0.485096,0.15285,0.040806,0.275216,-0.119977,-0.058438,...,0.128233,-0.11595,-0.314374,0.08087,0.352083,0.100417,-0.053183,0.327358,0.124743,-0.503925
8,GSM1923678,1,0.038851,-0.477663,0.73023,0.000821,-0.067123,-0.089928,0.188888,-0.794396,...,-0.079261,1.881888,0.853341,-0.470546,-0.021584,-0.562016,0.425327,0.420284,0.002744,0.682898
9,GSM1923679,0,0.147575,0.537063,-0.275589,0.132259,0.111644,0.144237,-0.067813,-0.285512,...,0.140349,-0.546386,-0.14386,0.031456,0.131483,0.267175,-0.197955,0.295454,0.158751,-0.573552


In [10]:
# gene annot data
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)
#df_mapped_gene = df_mapped_gene[~df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)]

In [11]:
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343292,TUBB
2,ILMN_1343293,TXN
3,ILMN_1343294,ACTB
4,ILMN_1651209,SLC35E2
...,...,...
22179,ILMN_1815941,SMAD7
22180,ILMN_1815951,PCYOX1L
22181,ILMN_2038774,EEF1A1
22182,ILMN_2038777,ACTB


In [12]:
df_annot = df_mapped_gene

# group multiple gene symbols per probe ID
df_annot_grouped = df_annot.groupby("ID_REF")["Cleaned_Gene_Symbol"].apply(lambda x: ",".join(x)).reset_index()

In [13]:
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343292,TUBB
2,ILMN_1343293,TXN
3,ILMN_1343294,ACTB
4,ILMN_1651209,SLC35E2
...,...,...
22179,ILMN_1815941,SMAD7
22180,ILMN_1815951,PCYOX1L
22181,ILMN_2038774,EEF1A1
22182,ILMN_2038777,ACTB


In [14]:
# Create a dictionary mapping 'ID_REF' to 'Cleaned_Gene_Symbol'
gene_mapping = dict(zip(df_annot_grouped['ID_REF'], df_annot_grouped['Cleaned_Gene_Symbol']))
gene_mapping

{'ILMN_1343291': 'EEF1A1',
 'ILMN_1343292': 'TUBB',
 'ILMN_1343293': 'TXN',
 'ILMN_1343294': 'ACTB',
 'ILMN_1651209': 'SLC35E2',
 'ILMN_1651217': 'PDCD1LG2',
 'ILMN_1651228': 'RPS28',
 'ILMN_1651229': 'IPO13',
 'ILMN_1651234': 'SYT14',
 'ILMN_1651235': 'AFAP1',
 'ILMN_1651236': 'GGTLA4',
 'ILMN_1651237': 'CDT1',
 'ILMN_1651238': 'TRPV1',
 'ILMN_1651254': 'LPP',
 'ILMN_1651259': 'UGP2',
 'ILMN_1651260': 'CCNE2',
 'ILMN_1651261': 'RSU1',
 'ILMN_1651262': 'HNRPAB',
 'ILMN_1651268': 'LOH12CR1',
 'ILMN_1651278': 'SNIP1',
 'ILMN_1651282': 'COL17A1',
 'ILMN_1651286': 'GRHL1',
 'ILMN_1651296': 'LOC143666',
 'ILMN_1651298': 'RAD17',
 'ILMN_1651303': 'ATP13A4',
 'ILMN_1651316': 'CD69',
 'ILMN_1651330': 'KCNG4',
 'ILMN_1651336': 'MLYCD',
 'ILMN_1651339': 'KIAA0701',
 'ILMN_1651343': 'ITGA11',
 'ILMN_1651346': 'TICAM2',
 'ILMN_1651349': 'CD86',
 'ILMN_1651354': 'SPP1',
 'ILMN_1651358': 'HBE1',
 'ILMN_1651364': 'PCBD2',
 'ILMN_1651370': 'USP21',
 'ILMN_1651373': 'RHD',
 'ILMN_1651375': 'BICD2',
 'I

In [15]:
len(gene_mapping)

22184

In [16]:
# Rename columns in df_expr using the mapping (excluding first two columns)
df_expr_renamed = merged_data.rename(columns=gene_mapping)
df_expr_renamed

Unnamed: 0,geo_accession,target,EEF1A1,TUBB,TXN,ACTB,SLC35E2,PDCD1LG2,RPS28,IPO13,...,SMCR7,NUP107,FTSJ2,MGC9712,TRPM3,SMAD7,PCYOX1L,EEF1A1.1,ACTB.1,GAPDH
0,GSM1923670,1,0.034019,0.762679,0.506224,-0.042696,-0.092322,-0.190058,-0.229721,-0.875623,...,-0.315921,0.472947,-0.253801,-0.606511,-0.244869,-0.565818,0.172183,0.341347,-0.590778,-0.671084
1,GSM1923671,0,0.307699,-0.856875,-0.132675,0.230176,0.122087,-0.048757,-0.054029,-0.589342,...,0.150381,0.093512,-0.171227,-0.163665,-0.116943,0.226491,0.444314,0.414416,-0.380373,-0.696555
2,GSM1923672,1,-0.385204,-1.760679,0.965889,-0.534735,-0.748453,-0.19769,-0.217416,-0.553076,...,-0.652523,1.648453,0.33969,-1.122893,-0.556817,-0.998321,0.407893,-0.125578,-0.270124,0.735986
3,GSM1923673,0,0.218209,0.088397,-0.018617,0.035286,0.0411,0.210338,0.235956,0.048933,...,0.411864,-0.233437,-0.227423,0.10188,0.021584,0.270329,0.152678,0.127866,0.160852,-0.192957
4,GSM1923674,1,0.196362,0.306509,0.027961,0.058753,0.090834,-0.086746,-0.129879,-0.266786,...,-0.074866,1.249168,0.28861,-0.460824,0.111645,-0.13334,-0.649105,0.2111,0.007412,0.562918
5,GSM1923675,0,0.401151,0.1719,-0.117388,0.477515,0.275813,0.346959,0.100334,-0.347659,...,0.272127,-0.478751,-0.208202,0.343201,0.39946,0.134321,0.012172,0.165775,-0.198785,-0.539871
6,GSM1923676,1,0.122247,-0.420557,0.853586,-0.000821,-0.071506,-0.085156,0.18812,0.028178,...,-0.315488,1.379764,0.62348,-0.088825,-0.142336,-0.173133,-0.78095,-0.295197,0.045997,0.393401
7,GSM1923677,0,0.364366,0.892984,-0.485096,0.15285,0.040806,0.275216,-0.119977,-0.058438,...,0.128233,-0.11595,-0.314374,0.08087,0.352083,0.100417,-0.053183,0.327358,0.124743,-0.503925
8,GSM1923678,1,0.038851,-0.477663,0.73023,0.000821,-0.067123,-0.089928,0.188888,-0.794396,...,-0.079261,1.881888,0.853341,-0.470546,-0.021584,-0.562016,0.425327,0.420284,0.002744,0.682898
9,GSM1923679,0,0.147575,0.537063,-0.275589,0.132259,0.111644,0.144237,-0.067813,-0.285512,...,0.140349,-0.546386,-0.14386,0.031456,0.131483,0.267175,-0.197955,0.295454,0.158751,-0.573552


In [17]:
df_expr_renamed.columns.duplicated().sum()

3987

In [21]:
df_expr_renamed.columns[df_expr_renamed.columns.duplicated()]

Index(['TRPV1', 'INPP4A', 'TNFSF14', 'IFT81', 'GCNT2', 'PRDM1', 'TRNT1', 'DMD',
       'CDKN2C', 'TRIM6',
       ...
       'IGLL1', 'HNRPA1', 'IL12RB1', 'GYPE', 'SMCR7', 'FTSJ2', 'TRPM3',
       'EEF1A1', 'ACTB', 'GAPDH'],
      dtype='object', length=3987)

In [24]:
from collections import Counter

df=df_expr_renamed
column_counts = Counter(df.columns)
for col, count in column_counts.items():
    if count>1:
        print(f"{col}: {count} times")


EEF1A1: 3 times
TUBB: 2 times
TXN: 2 times
ACTB: 3 times
AFAP1: 3 times
GGTLA4: 3 times
TRPV1: 5 times
UGP2: 2 times
CCNE2: 2 times
RSU1: 2 times
HNRPAB: 2 times
COL17A1: 2 times
GRHL1: 2 times
RAD17: 3 times
KCNG4: 2 times
KIAA0701: 3 times
CD86: 3 times
USP21: 2 times
BICD2: 2 times
AUP1: 2 times
BRD9: 2 times
DERL3: 2 times
SYTL2: 4 times
CTRB2: 2 times
PAIP1: 2 times
FOXI1: 2 times
LIPT1: 2 times
UTY: 3 times
FBXL10: 3 times
MEST: 3 times
CYP26A1: 3 times
WNT2B: 2 times
HIST1H2BD: 3 times
ERGIC1: 3 times
GEFT: 3 times
FBXO44: 3 times
ENTPD2: 2 times
SPRY1: 3 times
CTNNB1: 2 times
EXOC6: 3 times
ASB11: 3 times
RTN3: 3 times
ADAM33: 2 times
PARP6: 3 times
WDFY3: 3 times
GSG1: 2 times
ADAMTS20: 3 times
MBTPS1: 2 times
ARMCX3: 2 times
RRAGB: 2 times
MSH5: 3 times
VCX: 4 times
RORC: 2 times
GSTM4: 2 times
GRIK1: 3 times
ITSN1: 3 times
EVC: 3 times
ZNF187: 2 times
CDC25B: 2 times
ANK3: 3 times
SCYL3: 3 times
ABCC5: 3 times
FGFR3: 2 times
TP53I3: 2 times
DHX57: 3 times
CSNK1G3: 2 times
GA

In [18]:
df_expr_renamed.shape

(60, 22186)

In [19]:
# Save the updated dataframe
df_expr_renamed.to_csv("GSE74602_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE74602_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE74602_gene_expression_renamed.csv'
