In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the probe-level gene expression matrix from CSV.
# Per the preview shown after loading, rows are probes (identified by ID_REF)
# and the remaining columns are GSM sample accessions.
gene_expression_path = "GSE21815_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
# Preview the first rows to check the layout: one row per probe, one column per sample.
gene_expression.head()

Unnamed: 0,ID_REF,GSM543123,GSM543124,GSM543125,GSM543126,GSM543127,GSM543128,GSM543129,GSM543130,GSM543131,...,GSM543254,GSM543255,GSM543256,GSM543257,GSM543258,GSM543259,GSM543260,GSM543261,GSM543262,GSM543263
0,A_23_P100001,318.5345,983.33606,941.8587,753.24054,740.5418,830.9857,820.8664,253.35808,94.853264,...,774.27826,225.84326,933.04553,735.8475,709.0923,740.83606,862.8688,390.95377,222.94467,89.48376
1,A_23_P100011,52.670303,15.14521,12.862661,183.62819,16.069187,13.049668,39.704803,6.657159,22.191977,...,42.903324,24.803036,8.391995,5.274396,8.464316,16.690062,19.789549,15.265101,15.430069,31.463722
2,A_23_P100022,26.2208,10.168971,15.440859,16.379416,29.971552,19.3878,3.145574,3.201703,11.33406,...,10.15618,6.667704,9.977,27.677319,6.897274,20.830055,25.690613,27.207232,18.940218,48.547432
3,A_23_P100056,24.889956,9.815201,14.016779,16.25257,3.773213,5.905363,2.943743,2.944624,7.400354,...,4.637189,2.527414,2.577005,3.776664,3.221462,3.217257,7.025648,5.753796,2.505106,2.561174
4,A_23_P100074,709.74695,3287.3096,2226.5806,470.47818,1645.5344,1534.8606,2713.2273,636.8601,1590.2495,...,1243.6354,419.8316,1115.8875,732.0791,1030.158,1030.1648,1256.1227,744.7937,670.2265,251.7929


In [4]:
# Reshape so that each row is a sample and each column is a probe:
#   1. ID_REF becomes the row index, so after transposing it labels the columns;
#   2. the GSM sample accessions become the row index, named "geo_accession";
#   3. that index is turned back into an ordinary column so it can be used as a merge key.
gene_expression_transposed = (
    gene_expression
    .set_index("ID_REF")
    .T
    .rename_axis(index="geo_accession")
    .reset_index()
)

In [5]:
# Preview the transposed table: one row per sample (geo_accession), one column per probe.
gene_expression_transposed.head()

ID_REF,geo_accession,A_23_P100001,A_23_P100011,A_23_P100022,A_23_P100056,A_23_P100074,A_23_P100092,A_23_P100103,A_23_P100111,A_23_P100127,...,A_32_P99715,A_32_P99744,A_32_P99753,A_32_P99804,A_32_P99825,A_32_P9986,A_32_P99864,A_32_P99902,A_32_P99933,A_32_P99942
0,GSM543123,318.5345,52.670303,26.2208,24.889956,709.74695,177.7533,52.75283,22.65348,497.01306,...,82.33978,59.25271,161.0982,47.78682,25.179932,24.5527,22.688488,139.14238,24.05016,24.666681
1,GSM543124,983.33606,15.14521,10.168971,9.815201,3287.3096,237.88464,67.02363,35.698795,447.66074,...,203.03171,13.112881,36.56534,37.067436,9.749793,9.59075,29.112078,528.0679,9.376528,9.601794
2,GSM543125,941.8587,12.862661,15.440859,14.016779,2226.5806,157.60567,58.016613,53.014282,508.76117,...,108.163795,45.90747,89.98511,34.138325,14.551808,14.535099,31.475683,511.19614,13.628448,14.01407
3,GSM543126,753.24054,183.62819,16.379416,16.25257,470.47818,44.526405,32.031246,14.21207,38.25217,...,40.599606,173.78671,130.58841,56.68932,15.613212,14.870501,124.5584,235.11426,15.859659,15.25008
4,GSM543127,740.5418,16.069187,29.971552,3.773213,1645.5344,160.72177,79.828255,19.031565,247.7323,...,90.5091,25.393394,376.2373,102.92282,3.726412,3.760689,31.599518,568.17303,53.939533,3.638617


In [6]:
# Load the sample metadata table.
# It must provide a "geo_accession" column, which is the key used to join it
# to the transposed expression data in the merge step.
metadata_filepath = "GSE21815_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Merge sample metadata with the transposed expression data on the sample accession.
# pd.merge defaults to an inner join, so a sample present in only one of the two
# tables would be dropped without notice. Report such samples explicitly instead
# of losing them silently (nothing is printed when both tables agree).
unmatched_accessions = set(metadata["geo_accession"]) ^ set(
    gene_expression_transposed["geo_accession"]
)
if unmatched_accessions:
    print(
        f"Warning: {len(unmatched_accessions)} sample(s) appear in only one table "
        f"and are dropped by the merge: {sorted(map(str, unmatched_accessions))}"
    )

merged_data = pd.merge(metadata, gene_expression_transposed, on="geo_accession")

# Remove control-probe columns whose name starts with "AFFX-".
# NOTE(review): the probe IDs visible in this notebook are of the form
# A_23_P* / A_32_P*, so this prefix may match nothing for this dataset —
# confirm which control-probe naming applies to this platform.
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith("AFFX-")]

# Save the merged (and filtered) data without the row index.
merged_data.to_csv("GSE21815_merged_data.csv", index=False)
print("Merged data saved as GSE21815_merged_data.csv")

Merged data saved as GSE21815_merged_data.csv


In [8]:
# Summarise the merged frame: number of rows and columns, dtypes and memory footprint.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Columns: 41002 entries, geo_accession to A_32_P99942
dtypes: float64(41000), int64(1), object(1)
memory usage: 44.1+ MB


In [9]:
# Display the merged frame (the rendering is abbreviated for a table this large).
merged_data

Unnamed: 0,geo_accession,target,A_23_P100001,A_23_P100011,A_23_P100022,A_23_P100056,A_23_P100074,A_23_P100092,A_23_P100103,A_23_P100111,...,A_32_P99715,A_32_P99744,A_32_P99753,A_32_P99804,A_32_P99825,A_32_P9986,A_32_P99864,A_32_P99902,A_32_P99933,A_32_P99942
0,GSM543123,1,318.53450,52.670303,26.220800,24.889956,709.74695,177.753300,52.752830,22.653480,...,82.339780,59.252710,161.098200,47.786820,25.179932,24.552700,22.688488,139.14238,24.050160,24.666681
1,GSM543124,1,983.33606,15.145210,10.168971,9.815201,3287.30960,237.884640,67.023630,35.698795,...,203.031710,13.112881,36.565340,37.067436,9.749793,9.590750,29.112078,528.06790,9.376528,9.601794
2,GSM543125,1,941.85870,12.862661,15.440859,14.016779,2226.58060,157.605670,58.016613,53.014282,...,108.163795,45.907470,89.985110,34.138325,14.551808,14.535099,31.475683,511.19614,13.628448,14.014070
3,GSM543126,1,753.24054,183.628190,16.379416,16.252570,470.47818,44.526405,32.031246,14.212070,...,40.599606,173.786710,130.588410,56.689320,15.613212,14.870501,124.558400,235.11426,15.859659,15.250080
4,GSM543127,1,740.54180,16.069187,29.971552,3.773213,1645.53440,160.721770,79.828255,19.031565,...,90.509100,25.393394,376.237300,102.922820,3.726412,3.760689,31.599518,568.17303,53.939533,3.638617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,GSM543259,0,740.83606,16.690062,20.830055,3.217257,1030.16480,140.747570,64.187650,33.580425,...,156.826130,22.741884,218.892230,25.704508,3.200185,3.157884,20.334978,230.24132,27.837970,3.106764
137,GSM543260,0,862.86880,19.789549,25.690613,7.025648,1256.12270,153.896760,108.851875,55.231750,...,133.964170,32.206482,136.451110,65.048470,7.162023,6.791829,27.911260,325.01752,40.619080,6.902172
138,GSM543261,0,390.95377,15.265101,27.207232,5.753796,744.79370,72.157220,56.194050,43.110985,...,74.642660,17.526539,94.750145,427.419980,6.013173,5.597268,17.423372,239.02641,22.472095,5.767078
139,GSM543262,0,222.94467,15.430069,18.940218,2.505106,670.22650,90.233860,27.943022,36.782480,...,67.121820,8.831112,45.259365,17.908142,2.494004,2.383881,11.025971,198.90659,7.559063,2.419862


In [10]:
# Load the probe annotation table that maps probe IDs (ID_REF) to gene symbols
# (Cleaned_Gene_Symbol).
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)
# Disabled filter: if re-enabled, it drops rows whose symbol matches the regex MIR\d+
# (rows with a missing symbol are kept because of na=False).
#df_mapped_gene = df_mapped_gene[~df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)]

In [11]:
# Display the annotation table (probe ID -> cleaned gene symbol).
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,A_23_P100001,FAM174B
1,A_23_P100011,AP3S2
2,A_23_P100022,SV2B
3,A_23_P100056,RBPMS2
4,A_23_P100074,AVEN
...,...,...
30931,A_32_P99638,LOC339316
30932,A_32_P99648,EIF4E3
30933,A_32_P99690,NLN
30934,A_32_P99753,C3orf25


In [12]:
df_annot = df_mapped_gene

# Collapse the annotation to one row per probe ID, joining multiple gene symbols
# for the same probe into a single comma-separated string.
#   * Rows with a missing probe ID or symbol are dropped first: ",".join raises a
#     TypeError on NaN, and a probe without a symbol should simply stay unmapped.
#   * Identical (probe, symbol) pairs are de-duplicated so a symbol is not repeated
#     inside the joined string; first-seen order is preserved.
# For an annotation table with no missing values and no repeated pairs the result
# is the same as a plain groupby + join.
df_annot_grouped = (
    df_annot
    .dropna(subset=["ID_REF", "Cleaned_Gene_Symbol"])
    .astype({"Cleaned_Gene_Symbol": str})
    .drop_duplicates(subset=["ID_REF", "Cleaned_Gene_Symbol"])
    .groupby("ID_REF")["Cleaned_Gene_Symbol"]
    .agg(",".join)
    .reset_index()
)

In [13]:
# Display the per-probe annotation (one row per ID_REF after grouping).
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,A_23_P100001,FAM174B
1,A_23_P100011,AP3S2
2,A_23_P100022,SV2B
3,A_23_P100056,RBPMS2
4,A_23_P100074,AVEN
...,...,...
30931,A_32_P99638,LOC339316
30932,A_32_P99648,EIF4E3
30933,A_32_P99690,NLN
30934,A_32_P99753,C3orf25


In [14]:
# Build a plain dict lookup from probe ID (ID_REF) to its gene-symbol string
# (Cleaned_Gene_Symbol), then display it.
gene_mapping = df_annot_grouped.set_index("ID_REF")["Cleaned_Gene_Symbol"].to_dict()
gene_mapping

{'A_23_P100001': 'FAM174B',
 'A_23_P100011': 'AP3S2',
 'A_23_P100022': 'SV2B',
 'A_23_P100056': 'RBPMS2',
 'A_23_P100074': 'AVEN',
 'A_23_P100092': 'ZSCAN29',
 'A_23_P100103': 'VPS39',
 'A_23_P100111': 'CHP',
 'A_23_P100127': 'CASC5',
 'A_23_P100133': 'ATMIN',
 'A_23_P100141': 'UNKL',
 'A_23_P100156': 'TMEM127',
 'A_23_P100177': 'MMP15',
 'A_23_P100189': 'PRM1',
 'A_23_P100196': 'USP10',
 'A_23_P100203': 'HSBP1',
 'A_23_P100220': 'ESRP2',
 'A_23_P100240': 'CDH16',
 'A_23_P10025': 'NELL2',
 'A_23_P100263': 'CMIP',
 'A_23_P100278': 'GRIN2A',
 'A_23_P100292': 'ZNF598',
 'A_23_P100315': 'DECR2',
 'A_23_P100326': 'NPRL3',
 'A_23_P100341': 'ORC6',
 'A_23_P100344': 'ORC6',
 'A_23_P100355': 'PPP4C',
 'A_23_P100386': 'IL34',
 'A_23_P100392': 'TCEB2',
 'A_23_P100408': 'NTN3',
 'A_23_P100413': 'EXOSC6',
 'A_23_P100420': 'ZCCHC14',
 'A_23_P100441': 'FAM192A',
 'A_23_P100455': 'MTHFSD',
 'A_23_P100469': 'TXNL4B',
 'A_23_P100478': 'KAT8',
 'A_23_P100486': 'VKORC1',
 'A_23_P100499': 'ROGDI',
 'A_23_P

In [15]:
# Number of probe IDs that have a gene-symbol entry in the mapping.
len(gene_mapping)

30936

In [16]:
# Rename probe-ID columns of merged_data to gene symbols using gene_mapping.
# Only expression columns are renamed: labels that also exist in the metadata
# table (e.g. geo_accession) are excluded from the mapping, so a mapping key can
# never overwrite a metadata column.
# Caveats:
#   * probes without an entry in gene_mapping keep their probe ID as the label;
#   * several probes can map to the same symbol, so the result can contain
#     repeated column labels.
#     NOTE(review): decide how repeated symbols should be resolved (e.g. by
#     aggregating per symbol) before selecting columns by gene name.
metadata_columns = set(metadata.columns)
probe_to_symbol = {
    probe: symbol
    for probe, symbol in gene_mapping.items()
    if probe not in metadata_columns
}
df_expr_renamed = merged_data.rename(columns=probe_to_symbol)
df_expr_renamed

Unnamed: 0,geo_accession,target,FAM174B,AP3S2,SV2B,RBPMS2,AVEN,ZSCAN29,VPS39,CHP,...,A_32_P99715,A_32_P99744,C3orf25,A_32_P99804,A_32_P99825,A_32_P9986,A_32_P99864,C15orf40,A_32_P99933,A_32_P99942
0,GSM543123,1,318.53450,52.670303,26.220800,24.889956,709.74695,177.753300,52.752830,22.653480,...,82.339780,59.252710,161.098200,47.786820,25.179932,24.552700,22.688488,139.14238,24.050160,24.666681
1,GSM543124,1,983.33606,15.145210,10.168971,9.815201,3287.30960,237.884640,67.023630,35.698795,...,203.031710,13.112881,36.565340,37.067436,9.749793,9.590750,29.112078,528.06790,9.376528,9.601794
2,GSM543125,1,941.85870,12.862661,15.440859,14.016779,2226.58060,157.605670,58.016613,53.014282,...,108.163795,45.907470,89.985110,34.138325,14.551808,14.535099,31.475683,511.19614,13.628448,14.014070
3,GSM543126,1,753.24054,183.628190,16.379416,16.252570,470.47818,44.526405,32.031246,14.212070,...,40.599606,173.786710,130.588410,56.689320,15.613212,14.870501,124.558400,235.11426,15.859659,15.250080
4,GSM543127,1,740.54180,16.069187,29.971552,3.773213,1645.53440,160.721770,79.828255,19.031565,...,90.509100,25.393394,376.237300,102.922820,3.726412,3.760689,31.599518,568.17303,53.939533,3.638617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,GSM543259,0,740.83606,16.690062,20.830055,3.217257,1030.16480,140.747570,64.187650,33.580425,...,156.826130,22.741884,218.892230,25.704508,3.200185,3.157884,20.334978,230.24132,27.837970,3.106764
137,GSM543260,0,862.86880,19.789549,25.690613,7.025648,1256.12270,153.896760,108.851875,55.231750,...,133.964170,32.206482,136.451110,65.048470,7.162023,6.791829,27.911260,325.01752,40.619080,6.902172
138,GSM543261,0,390.95377,15.265101,27.207232,5.753796,744.79370,72.157220,56.194050,43.110985,...,74.642660,17.526539,94.750145,427.419980,6.013173,5.597268,17.423372,239.02641,22.472095,5.767078
139,GSM543262,0,222.94467,15.430069,18.940218,2.505106,670.22650,90.233860,27.943022,36.782480,...,67.121820,8.831112,45.259365,17.908142,2.494004,2.383881,11.025971,198.90659,7.559063,2.419862


In [17]:
# Count column labels that repeat an earlier label (the first occurrence of each
# label is not counted as a duplicate).
df_expr_renamed.columns.duplicated().sum()

11341

In [18]:
# List the repeated labels themselves (second and later occurrences only).
df_expr_renamed.columns[df_expr_renamed.columns.duplicated()]

Index(['ORC6', 'AIPL1', 'KRT38', 'VSIG10L', 'ZNF614', 'SFRP1', 'PPP1R37',
       'APC2', 'WNT10A', 'SLC2A4RG',
       ...
       'TNPO1', 'CHST11', 'YTHDF2', 'WIF1', 'TRAPPC5', 'HNRNPM', 'HSF2',
       'EIF4E3', 'NLN', 'C15orf40'],
      dtype='object', length=11341)

In [19]:
from collections import Counter

# Tally how often each column label occurs in the renamed frame and print every
# label that occurs more than once, in order of first appearance.
df = df_expr_renamed
column_counts = Counter(df.columns)
repeated_labels = (
    (label, occurrences)
    for label, occurrences in column_counts.items()
    if occurrences > 1
)
for col, count in repeated_labels:
    print(f"{col}: {count} times")

AP3S2: 3 times
ZSCAN29: 2 times
VPS39: 2 times
CHP: 2 times
CASC5: 3 times
ATMIN: 3 times
UNKL: 3 times
TMEM127: 2 times
MMP15: 2 times
USP10: 2 times
HSBP1: 2 times
CMIP: 2 times
GRIN2A: 4 times
ZNF598: 2 times
NPRL3: 2 times
ORC6: 2 times
TCEB2: 2 times
EXOSC6: 4 times
ZCCHC14: 2 times
FAM192A: 2 times
MTHFSD: 2 times
TXNL4B: 3 times
HMOX2: 2 times
TMEM170A: 2 times
ABCC6: 3 times
EXOC7: 3 times
TBCD: 3 times
HN1: 2 times
SMG6: 4 times
SMURF2: 2 times
UBTF: 2 times
STAT5B: 2 times
STAT3: 4 times
CCDC56: 2 times
ARL13B: 2 times
MYO19: 2 times
HOXB3: 3 times
SUZ12: 4 times
GIT1: 2 times
AIPL1: 2 times
KRT38: 2 times
TMC6: 2 times
LGALS9C: 2 times
CTDP1: 2 times
RNF125: 2 times
SFRP1: 2 times
VSIG10L: 2 times
ZNF614: 3 times
ZNF446: 2 times
ZNF587: 4 times
PPP1R37: 3 times
ARHGEF1: 2 times
NKIRAS2: 2 times
TMEM38A: 2 times
NLRP12: 2 times
BCAT2: 2 times
ACTN4: 2 times
ATCAY: 3 times
DOT1L: 3 times
IL12RB1: 2 times
HPN: 2 times
FBXO17: 3 times
APC2: 4 times
SEZ6L2: 2 times
ARL5A: 2 times

In [20]:
# (rows, columns) of the renamed frame; renaming changes labels only, so this
# should match the shape of merged_data.
df_expr_renamed.shape

(141, 41002)

In [21]:
# Persist the renamed expression table to CSV without the row index.
# NOTE(review): the duplicate-label checks earlier in this notebook report repeated
# column names in df_expr_renamed. pandas.read_csv renames repeated header names
# by default when the file is loaded again (e.g. "ORC6", "ORC6.1"), so the saved
# labels will not round-trip unchanged — confirm downstream code accounts for this.
df_expr_renamed.to_csv("GSE21815_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE21815_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE21815_gene_expression_renamed.csv'
