In [1]:
import pandas as pd
import numpy as np

In [2]:
# gene expression data
gene_expression_path = "GSE106582_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
gene_expression.head()

Unnamed: 0,ID_REF,GSM2843099,GSM2843100,GSM2843101,GSM2843102,GSM2843103,GSM2843104,GSM2843105,GSM2843106,GSM2843107,...,GSM2843283,GSM2843284,GSM2843285,GSM2843286,GSM2843287,GSM2843288,GSM2843289,GSM2843290,GSM2843291,GSM2843292
0,ILMN_1343291,14.246636,14.30643,14.348227,14.233364,14.037613,14.30643,14.209482,14.233364,14.137572,...,14.075304,14.209482,14.262302,14.137572,14.185752,14.30643,14.193309,14.348227,14.280982,14.246636
1,ILMN_1343295,12.738228,12.206211,12.700099,12.029697,11.334419,12.816596,12.552413,12.82233,11.944893,...,11.679008,11.843007,12.866065,11.624542,11.237864,11.46531,11.443065,10.883836,11.750486,11.466828
2,ILMN_1651199,6.412016,6.633564,6.57489,6.450712,6.579476,6.439554,6.431542,6.346484,6.518873,...,6.420452,6.674008,6.711727,6.579345,6.38841,6.399378,6.88924,6.394718,6.516751,6.187478
3,ILMN_1651209,6.710966,6.80521,6.622005,6.66824,6.736337,6.703673,6.452449,6.592954,6.50255,...,6.612033,6.555067,6.474639,6.650995,6.624086,6.799771,6.573304,6.542434,6.601863,6.7239
4,ILMN_1651210,6.512163,6.588,6.532367,6.576099,6.343878,6.589865,6.50119,6.571158,6.659456,...,6.470704,6.419202,6.517013,6.552662,6.43394,6.433472,6.387217,6.443103,6.389635,6.528619


In [4]:
# Transpose the gene expression data to make geo_accession a column
gene_expression_transposed = gene_expression.set_index("ID_REF").transpose()
gene_expression_transposed.index.name = "geo_accession"  # Rename the index
gene_expression_transposed.reset_index(inplace=True)  # Convert index to a column

In [5]:
gene_expression_transposed.head()

ID_REF,geo_accession,ILMN_1343291,ILMN_1343295,ILMN_1651199,ILMN_1651209,ILMN_1651210,ILMN_1651221,ILMN_1651228,ILMN_1651229,ILMN_1651230,...,ILMN_3311145,ILMN_3311150,ILMN_3311155,ILMN_3311160,ILMN_3311165,ILMN_3311170,ILMN_3311175,ILMN_3311180,ILMN_3311185,ILMN_3311190
0,GSM2843099,14.246636,12.738228,6.412016,6.710966,6.512163,6.5948,12.402537,8.027912,6.5187,...,6.632421,6.254189,6.560038,6.667806,6.639648,6.449088,6.591616,6.692787,6.453807,6.825473
1,GSM2843100,14.30643,12.206211,6.633564,6.80521,6.588,6.55628,12.488418,8.20969,6.447463,...,6.677799,6.449481,6.539819,6.559442,6.70259,6.505448,6.357162,6.475546,6.56709,6.754554
2,GSM2843101,14.348227,12.700099,6.57489,6.622005,6.532367,6.739953,11.815945,7.613582,6.596743,...,6.736232,6.523931,6.509643,6.593197,6.676041,6.486032,6.516502,6.71743,6.437957,6.435831
3,GSM2843102,14.233364,12.029697,6.450712,6.66824,6.576099,6.62162,11.927947,8.412245,6.365218,...,6.42119,6.524297,6.485654,6.474509,6.748206,6.430602,6.406007,6.656576,6.535869,6.750124
4,GSM2843103,14.037613,11.334419,6.579476,6.736337,6.343878,6.474938,13.118719,7.527998,6.468574,...,6.592613,6.453748,6.447428,6.403681,6.798879,6.251682,6.395073,6.501447,6.59454,6.519387


In [6]:
# gene expression data
metadata_filepath = "GSE106582_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Merge metadata with transposed gene expression data
merged_data = pd.merge(metadata, gene_expression_transposed, on="geo_accession")

# Save the merged data
# remove control probes (AFFX-*)
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith("AFFX-")]
merged_data.to_csv("GSE106582_merged_data.csv", index=False)
print("Merged data saved as GSE106582_merged_data.csv")

Merged data saved as GSE106582_merged_data.csv


In [8]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Columns: 47292 entries, geo_accession to ILMN_3311190
dtypes: float64(47290), int64(1), object(1)
memory usage: 70.0+ MB


In [9]:
merged_data

Unnamed: 0,geo_accession,target,ILMN_1343291,ILMN_1343295,ILMN_1651199,ILMN_1651209,ILMN_1651210,ILMN_1651221,ILMN_1651228,ILMN_1651229,...,ILMN_3311145,ILMN_3311150,ILMN_3311155,ILMN_3311160,ILMN_3311165,ILMN_3311170,ILMN_3311175,ILMN_3311180,ILMN_3311185,ILMN_3311190
0,GSM2843099,1,14.246636,12.738228,6.412016,6.710966,6.512163,6.594800,12.402537,8.027912,...,6.632421,6.254189,6.560038,6.667806,6.639648,6.449088,6.591616,6.692787,6.453807,6.825473
1,GSM2843100,0,14.306430,12.206211,6.633564,6.805210,6.588000,6.556280,12.488418,8.209690,...,6.677799,6.449481,6.539819,6.559442,6.702590,6.505448,6.357162,6.475546,6.567090,6.754554
2,GSM2843101,1,14.348227,12.700099,6.574890,6.622005,6.532367,6.739953,11.815945,7.613582,...,6.736232,6.523931,6.509643,6.593197,6.676041,6.486032,6.516502,6.717430,6.437957,6.435831
3,GSM2843102,0,14.233364,12.029697,6.450712,6.668240,6.576099,6.621620,11.927947,8.412245,...,6.421190,6.524297,6.485654,6.474509,6.748206,6.430602,6.406007,6.656576,6.535869,6.750124
4,GSM2843103,0,14.037613,11.334419,6.579476,6.736337,6.343878,6.474938,13.118719,7.527998,...,6.592613,6.453748,6.447428,6.403681,6.798879,6.251682,6.395073,6.501447,6.594540,6.519387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,GSM2843288,1,14.306430,11.465310,6.399378,6.799771,6.433472,6.568854,13.039005,8.063550,...,6.389744,6.618410,6.457327,6.676147,6.670747,6.465575,6.473099,6.642995,6.547651,6.374058
190,GSM2843289,0,14.193309,11.443065,6.889240,6.573304,6.387217,6.553839,12.913759,7.634689,...,6.712699,6.702368,6.590317,6.477758,6.526935,6.420687,6.459328,6.678250,6.547469,6.475304
191,GSM2843290,0,14.348227,10.883836,6.394718,6.542434,6.443103,6.688859,12.794120,7.844166,...,6.697272,6.524499,6.660827,6.601896,6.544026,6.342178,6.314941,6.624100,6.380172,6.606369
192,GSM2843291,1,14.280982,11.750486,6.516751,6.601863,6.389635,6.424253,12.927110,7.441794,...,6.483706,6.413314,6.587828,6.323006,6.621739,6.470755,6.463463,6.422734,6.563595,6.704950


In [10]:
# gene annot data
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)
#df_mapped_gene = df_mapped_gene[~df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)]

In [11]:
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343295,GAPDH
2,ILMN_1651199,LOC643334
3,ILMN_1651209,SLC35E2
4,ILMN_1651210,DUSP22
...,...,...
47285,ILMN_3311170,SKCG-1
47286,ILMN_3311175,ESP33
47287,ILMN_3311180,SKCG-1
47288,ILMN_3311185,ESP33


In [12]:
df_annot = df_mapped_gene

# group multiple gene symbols per probe ID
df_annot_grouped = df_annot.groupby("ID_REF")["Cleaned_Gene_Symbol"].apply(lambda x: ",".join(x)).reset_index()

In [13]:
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1343295,GAPDH
2,ILMN_1651199,LOC643334
3,ILMN_1651209,SLC35E2
4,ILMN_1651210,DUSP22
...,...,...
47285,ILMN_3311170,SKCG-1
47286,ILMN_3311175,ESP33
47287,ILMN_3311180,SKCG-1
47288,ILMN_3311185,ESP33


In [14]:
# Create a dictionary mapping 'ID_REF' to 'Cleaned_Gene_Symbol'
gene_mapping = dict(zip(df_annot_grouped['ID_REF'], df_annot_grouped['Cleaned_Gene_Symbol']))
gene_mapping

{'ILMN_1343291': 'EEF1A1',
 'ILMN_1343295': 'GAPDH',
 'ILMN_1651199': 'LOC643334',
 'ILMN_1651209': 'SLC35E2',
 'ILMN_1651210': 'DUSP22',
 'ILMN_1651221': 'LOC642820',
 'ILMN_1651228': 'RPS28',
 'ILMN_1651229': 'IPO13',
 'ILMN_1651230': 'TESSP1',
 'ILMN_1651232': 'LOC653113',
 'ILMN_1651235': 'AFAP1',
 'ILMN_1651236': 'GGTLC1',
 'ILMN_1651237': 'CDT1',
 'ILMN_1651238': 'TRPV1',
 'ILMN_1651249': 'LOC652879',
 'ILMN_1651253': 'LOC644150',
 'ILMN_1651254': 'LPP',
 'ILMN_1651259': 'FLJ36848',
 'ILMN_1651260': 'CCNE2',
 'ILMN_1651262': 'HNRNPAB',
 'ILMN_1651268': 'LOH12CR1',
 'ILMN_1651278': 'SNIP1',
 'ILMN_1651279': 'LOC645349',
 'ILMN_1651281': 'LOC389669',
 'ILMN_1651282': 'COL17A1',
 'ILMN_1651285': 'BCL6B',
 'ILMN_1651286': 'GRHL1',
 'ILMN_1651288': 'LOC339760',
 'ILMN_1651292': 'LOC644613',
 'ILMN_1651296': 'LOC143666',
 'ILMN_1651303': 'ATP13A4',
 'ILMN_1651309': 'DIAPH2',
 'ILMN_1651310': 'MLLT4',
 'ILMN_1651315': 'HMG20B',
 'ILMN_1651316': 'CD69',
 'ILMN_1651325': 'LOC653493',
 'IL

In [15]:
len(gene_mapping)

47290

In [16]:
# Rename columns in df_expr using the mapping (excluding first two columns)
df_expr_renamed = merged_data.rename(columns=gene_mapping)
df_expr_renamed

Unnamed: 0,geo_accession,target,EEF1A1,GAPDH,LOC643334,SLC35E2,DUSP22,LOC642820,RPS28,IPO13,...,MGC11082,BEYLA,FOXO6,MGC11082.1,NCRNA00173,SKCG-1,ESP33,SKCG-1.1,ESP33.1,NCRNA00173.1
0,GSM2843099,1,14.246636,12.738228,6.412016,6.710966,6.512163,6.594800,12.402537,8.027912,...,6.632421,6.254189,6.560038,6.667806,6.639648,6.449088,6.591616,6.692787,6.453807,6.825473
1,GSM2843100,0,14.306430,12.206211,6.633564,6.805210,6.588000,6.556280,12.488418,8.209690,...,6.677799,6.449481,6.539819,6.559442,6.702590,6.505448,6.357162,6.475546,6.567090,6.754554
2,GSM2843101,1,14.348227,12.700099,6.574890,6.622005,6.532367,6.739953,11.815945,7.613582,...,6.736232,6.523931,6.509643,6.593197,6.676041,6.486032,6.516502,6.717430,6.437957,6.435831
3,GSM2843102,0,14.233364,12.029697,6.450712,6.668240,6.576099,6.621620,11.927947,8.412245,...,6.421190,6.524297,6.485654,6.474509,6.748206,6.430602,6.406007,6.656576,6.535869,6.750124
4,GSM2843103,0,14.037613,11.334419,6.579476,6.736337,6.343878,6.474938,13.118719,7.527998,...,6.592613,6.453748,6.447428,6.403681,6.798879,6.251682,6.395073,6.501447,6.594540,6.519387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,GSM2843288,1,14.306430,11.465310,6.399378,6.799771,6.433472,6.568854,13.039005,8.063550,...,6.389744,6.618410,6.457327,6.676147,6.670747,6.465575,6.473099,6.642995,6.547651,6.374058
190,GSM2843289,0,14.193309,11.443065,6.889240,6.573304,6.387217,6.553839,12.913759,7.634689,...,6.712699,6.702368,6.590317,6.477758,6.526935,6.420687,6.459328,6.678250,6.547469,6.475304
191,GSM2843290,0,14.348227,10.883836,6.394718,6.542434,6.443103,6.688859,12.794120,7.844166,...,6.697272,6.524499,6.660827,6.601896,6.544026,6.342178,6.314941,6.624100,6.380172,6.606369
192,GSM2843291,1,14.280982,11.750486,6.516751,6.601863,6.389635,6.424253,12.927110,7.441794,...,6.483706,6.413314,6.587828,6.323006,6.621739,6.470755,6.463463,6.422734,6.563595,6.704950


In [17]:
df_expr_renamed.columns.duplicated().sum()

12619

In [18]:
df_expr_renamed.columns[df_expr_renamed.columns.duplicated()]

Index(['LOC650677', 'USP32', 'SDHALP1', 'TTN', 'LOC648716', 'HERC2P2', 'TRPV1',
       'IL18BP', 'SDHALP1', 'ZNF589',
       ...
       'BEYLA', 'MGC11082', 'BEYLA', 'FOXO6', 'MGC11082', 'SKCG-1', 'ESP33',
       'SKCG-1', 'ESP33', 'NCRNA00173'],
      dtype='object', length=12619)

In [19]:
from collections import Counter

df=df_expr_renamed
column_counts = Counter(df.columns)
for col, count in column_counts.items():
    if count>1:
        print(f"{col}: {count} times")

EEF1A1: 4 times
GAPDH: 3 times
DUSP22: 4 times
AFAP1: 3 times
GGTLC1: 4 times
TRPV1: 5 times
LOC644150: 2 times
CCNE2: 2 times
HNRNPAB: 3 times
COL17A1: 2 times
GRHL1: 3 times
DIAPH2: 2 times
MLLT4: 2 times
CD69: 2 times
KCNG4: 2 times
UHRF1BP1L: 3 times
LOC401010: 2 times
ITGA11: 2 times
LOC729444: 3 times
SPP1: 2 times
PCBD2: 2 times
USP21: 3 times
AUP1: 2 times
LILRA6: 4 times
BRD9: 2 times
DERL3: 2 times
SYTL2: 5 times
CTRB2: 2 times
DCK: 2 times
FOXI1: 2 times
UTY: 3 times
ALOX5: 3 times
MMP17: 3 times
COX11P: 2 times
CYP26A1: 4 times
WNT2B: 2 times
HIST1H2BD: 3 times
ERGIC1: 3 times
ABCD4: 4 times
TMEM217: 3 times
GEFT: 3 times
SEMA3E: 2 times
RGS5: 2 times
LOC650677: 3 times
WHSC1: 6 times
XRCC2: 2 times
FBXO44: 3 times
ENTPD2: 2 times
LOC653210: 3 times
SPRED3: 3 times
CTNNB1: 7 times
EXOC6: 3 times
CYP2A13: 2 times
CRSP2: 5 times
ASB11: 3 times
RTN3: 4 times
PSG7: 2 times
ADAM33: 2 times
TMEM191B: 4 times
SDHALP1: 13 times
LOC649150: 2 times
WDFY3: 3 times
PGBD4: 2 times
GSG1:

In [20]:
df_expr_renamed.shape

(194, 47292)

In [21]:
# Save the updated dataframe
df_expr_renamed.to_csv("GSE106582_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE106582_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE106582_gene_expression_renamed.csv'
