In [1]:
import pandas as pd
import numpy as np

In [2]:
# gene expression data
gene_expression_path = "GSE113513_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
gene_expression.head()

Unnamed: 0,ID_REF,GSM3108231,GSM3108232,GSM3108233,GSM3108234,GSM3108235,GSM3108236,GSM3108237,GSM3108238,GSM3108239,...,GSM3108249,GSM3108250,GSM3108251,GSM3108252,GSM3108253,GSM3108254,GSM3108255,GSM3108256,GSM3108257,GSM3108258
0,11715100_at,20.55044,27.800337,16.569454,16.01692,18.516483,19.59903,14.035622,18.630518,15.249186,...,113.98192,22.238367,24.411734,26.851763,15.015553,15.305097,17.580502,31.714455,20.180305,23.165363
1,11715101_s_at,38.911007,53.513405,36.890644,43.05269,47.595444,48.847095,40.725403,47.139206,41.86981,...,129.33466,44.873833,53.24131,56.720863,46.001324,32.304096,41.219917,63.58804,51.425312,53.734013
2,11715102_x_at,25.82976,33.444916,19.229105,22.094769,26.349796,28.435322,20.063507,26.51047,18.727713,...,138.10585,31.332518,30.89234,36.963127,23.635046,21.627863,25.500513,45.3729,28.139097,32.395504
3,11715103_x_at,35.28268,73.262596,48.691338,45.083103,53.57329,54.26887,44.54233,50.405674,99.52152,...,41.392715,54.75756,49.036583,81.83238,43.837696,55.213245,85.29773,61.27778,74.37124,71.067116
4,11715104_s_at,20.890873,40.296627,47.897854,74.16083,67.53836,70.46409,46.969917,20.381641,30.333733,...,19.147799,16.765242,17.671259,15.452515,18.031252,19.746616,15.258782,16.930073,15.564357,16.993692


In [4]:
# Transpose the gene expression data to make geo_accession a column
gene_expression_transposed = gene_expression.set_index("ID_REF").transpose()
gene_expression_transposed.index.name = "geo_accession"  # Rename the index
gene_expression_transposed.reset_index(inplace=True)  # Convert index to a column

In [5]:
gene_expression_transposed.head()

ID_REF,geo_accession,11715100_at,11715101_s_at,11715102_x_at,11715103_x_at,11715104_s_at,11715105_at,11715106_x_at,11715107_s_at,11715108_x_at,...,AFFX-r2-TagO-3_at,AFFX-r2-TagO-5_at,AFFX-r2-TagQ-3_at,AFFX-r2-TagQ-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,GSM3108231,20.55044,38.911007,25.82976,35.28268,20.890873,584.40533,16.210958,19.788046,8.002856,...,9.011358,8.167891,8.400082,10.856209,341.78528,14.535798,68.16242,6.568994,9.048326,7.072084
1,GSM3108232,27.800337,53.513405,33.444916,73.262596,40.296627,8.146114,25.810207,17.198637,10.963716,...,7.333499,7.32496,7.844143,13.119226,930.1801,24.197203,183.31595,6.111884,10.366261,7.779812
2,GSM3108233,16.569454,36.890644,19.229105,48.691338,47.897854,7.184215,15.303843,17.251259,8.699706,...,6.46152,5.668957,7.923203,9.848433,371.97983,12.700565,80.22721,5.579404,8.286402,7.756152
3,GSM3108234,16.01692,43.05269,22.094769,45.083103,74.16083,7.416252,18.476812,16.654469,7.955965,...,7.992805,7.702706,7.602839,9.054254,267.1281,13.136146,52.165592,5.868696,9.331419,7.682987
4,GSM3108235,18.516483,47.595444,26.349796,53.57329,67.53836,9.876798,16.090324,20.565605,10.133324,...,6.740306,6.485854,7.391753,10.520305,340.5431,11.294785,61.768856,6.452526,8.89755,7.286209


In [6]:
# gene expression data
metadata_filepath = "GSE113513_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Merge metadata with transposed gene expression data
merged_data = pd.merge(metadata, gene_expression_transposed, on="geo_accession")

# Save the merged data
# remove control probes (AFFX-*)
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith("AFFX-")]
merged_data.to_csv("GSE113513_merged_data.csv", index=False)
print("Merged data saved as GSE113513_merged_data.csv")

Merged data saved as GSE113513_merged_data.csv


In [8]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Columns: 49295 entries, geo_accession to 200099_PM_s_at
dtypes: float64(49293), int64(1), object(1)
memory usage: 10.5+ MB


In [9]:
merged_data

Unnamed: 0,geo_accession,target,11715100_at,11715101_s_at,11715102_x_at,11715103_x_at,11715104_s_at,11715105_at,11715106_x_at,11715107_s_at,...,200090_PM_at,200091_PM_s_at,200092_PM_s_at,200093_PM_s_at,200094_PM_s_at,200095_PM_x_at,200096_PM_s_at,200097_PM_s_at,200098_PM_s_at,200099_PM_s_at
0,GSM3108231,0,20.55044,38.911007,25.82976,35.28268,20.890873,584.40533,16.210958,19.788046,...,1053.6359,5611.5312,7948.273,2010.2947,3449.94,10550.366,1314.9065,2117.7603,744.8316,10222.577
1,GSM3108232,0,27.800337,53.513405,33.444916,73.262596,40.296627,8.146114,25.810207,17.198637,...,973.078,3750.1787,7417.5923,2234.6343,3228.264,9650.367,912.74695,1555.5916,857.97455,7218.9033
2,GSM3108233,0,16.569454,36.890644,19.229105,48.691338,47.897854,7.184215,15.303843,17.251259,...,1120.9429,5739.152,8566.28,2121.5154,4327.257,10512.676,1091.4336,1830.5729,942.2897,10200.012
3,GSM3108234,0,16.01692,43.05269,22.094769,45.083103,74.16083,7.416252,18.476812,16.654469,...,1057.1653,4998.4395,7835.917,2142.4268,3594.5344,9816.909,1161.4584,2035.7389,951.96826,9022.114
4,GSM3108235,0,18.516483,47.595444,26.349796,53.57329,67.53836,9.876798,16.090324,20.565605,...,1052.3474,4789.2974,7861.5054,2454.4285,4148.504,10575.629,1156.9618,1682.7778,1038.5845,8917.155
5,GSM3108236,0,19.59903,48.847095,28.435322,54.26887,70.46409,8.973177,15.787043,16.811987,...,873.82996,4150.6133,7050.492,2315.388,3775.5898,10755.357,1064.0691,1876.6393,1071.4733,8265.083
6,GSM3108237,0,14.035622,40.725403,20.063507,44.54233,46.969917,8.31357,17.847345,19.606396,...,806.514,4879.9775,7970.924,2388.8083,3473.2642,10655.845,1220.9924,1768.9928,918.18427,8187.1655
7,GSM3108238,0,18.630518,47.139206,26.51047,50.405674,20.381641,7.860251,14.737239,16.854115,...,1088.4867,5686.4453,8225.697,2060.805,4483.5312,10798.929,1372.7394,2156.717,984.75476,9670.291
8,GSM3108239,0,15.249186,41.86981,18.727713,99.52152,30.333733,7.554156,15.826531,16.735561,...,1006.83374,6008.4585,8335.335,2267.5164,4734.6953,10546.971,1280.0126,1882.3619,955.69086,9785.635
9,GSM3108240,0,16.836504,49.68746,23.065104,65.25256,71.3619,8.200011,13.474164,15.745662,...,1041.2979,5295.339,7506.746,2542.2188,4283.7686,10775.027,1253.4601,2210.405,1008.72906,9475.596


In [10]:
# gene annot data
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)
#df_mapped_gene = df_mapped_gene[~df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)]

In [11]:
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,11715100_at,HIST1H3G
1,11715101_s_at,HIST1H3G
2,11715102_x_at,HIST1H3G
3,11715103_x_at,TNFAIP8L1
4,11715104_s_at,OTOP2
...,...,...
52821,200096_PM_s_at,ATP6V0E1
52822,200097_PM_s_at,HNRNPK
52823,200098_PM_s_at,ANAPC5
52824,200099_PM_s_at,"RPS3A,RPS3AP5"


In [12]:
df_annot = df_mapped_gene

# group multiple gene symbols per probe ID
df_annot_grouped = df_annot.groupby("ID_REF")["Cleaned_Gene_Symbol"].apply(lambda x: ",".join(x)).reset_index()

In [13]:
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,11715100_at,HIST1H3G
1,11715101_s_at,HIST1H3G
2,11715102_x_at,HIST1H3G
3,11715103_x_at,TNFAIP8L1
4,11715104_s_at,OTOP2
...,...,...
49288,200095_PM_x_at,RPS10
49289,200096_PM_s_at,ATP6V0E1
49290,200097_PM_s_at,HNRNPK
49291,200098_PM_s_at,ANAPC5


In [14]:
# Create a dictionary mapping 'ID_REF' to 'Cleaned_Gene_Symbol'
gene_mapping = dict(zip(df_annot_grouped['ID_REF'], df_annot_grouped['Cleaned_Gene_Symbol']))
gene_mapping

{'11715100_at': 'HIST1H3G',
 '11715101_s_at': 'HIST1H3G',
 '11715102_x_at': 'HIST1H3G',
 '11715103_x_at': 'TNFAIP8L1',
 '11715104_s_at': 'OTOP2',
 '11715105_at': 'C17orf78',
 '11715106_x_at': 'CTAGE15,CTAGE6,CTAGE15,CTAGE6',
 '11715107_s_at': 'F8A1,F8A2,F8A3,F8A1,F8A2,F8A3,F8A1,F8A2,F8A3',
 '11715108_x_at': 'LINC01098',
 '11715109_at': 'SAMD7',
 '11715110_at': 'ARRDC5',
 '11715111_s_at': 'CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8',
 '11715112_at': 'ERICH3',
 '11715113_x_at': 'FAM86C1',
 '11715114_x_at': 'FAM86C1',
 '11715115_s_at': 'HIST1H2BI',
 '11715116_s_at': 'HIST1H4E',
 '11715117_x_at': 'HIST1H2AJ',
 '11715118_s_at': 'HIST1H2BF',
 '11715119_s_at': 'C2CD4B',
 '11715120_s_at': 'HIST1H2BE',
 '11715121_s_at': 'HIST1H3A',
 '11715122_at': 'RAB3D',
 '11715123_s_at': 'PDZD9',
 '11715124_s_at': 'CACNG8',
 '11715125_at': 'MORC2',
 '11715126_s_at': 'HIST1H2BN',


In [15]:
len(gene_mapping)

49293

In [16]:
# Rename columns in df_expr using the mapping (excluding first two columns)
df_expr_renamed = merged_data.rename(columns=gene_mapping)
df_expr_renamed

Unnamed: 0,geo_accession,target,HIST1H3G,HIST1H3G.1,HIST1H3G.2,TNFAIP8L1,OTOP2,C17orf78,"CTAGE15,CTAGE6,CTAGE15,CTAGE6","F8A1,F8A2,F8A3,F8A1,F8A2,F8A3,F8A1,F8A2,F8A3",...,FNTA,RPS25,RPL37,HINT1,EEF2,RPS10,ATP6V0E1,HNRNPK,ANAPC5,"RPS3A,RPS3AP5,RPS3A,RPS3AP5"
0,GSM3108231,0,20.55044,38.911007,25.82976,35.28268,20.890873,584.40533,16.210958,19.788046,...,1053.6359,5611.5312,7948.273,2010.2947,3449.94,10550.366,1314.9065,2117.7603,744.8316,10222.577
1,GSM3108232,0,27.800337,53.513405,33.444916,73.262596,40.296627,8.146114,25.810207,17.198637,...,973.078,3750.1787,7417.5923,2234.6343,3228.264,9650.367,912.74695,1555.5916,857.97455,7218.9033
2,GSM3108233,0,16.569454,36.890644,19.229105,48.691338,47.897854,7.184215,15.303843,17.251259,...,1120.9429,5739.152,8566.28,2121.5154,4327.257,10512.676,1091.4336,1830.5729,942.2897,10200.012
3,GSM3108234,0,16.01692,43.05269,22.094769,45.083103,74.16083,7.416252,18.476812,16.654469,...,1057.1653,4998.4395,7835.917,2142.4268,3594.5344,9816.909,1161.4584,2035.7389,951.96826,9022.114
4,GSM3108235,0,18.516483,47.595444,26.349796,53.57329,67.53836,9.876798,16.090324,20.565605,...,1052.3474,4789.2974,7861.5054,2454.4285,4148.504,10575.629,1156.9618,1682.7778,1038.5845,8917.155
5,GSM3108236,0,19.59903,48.847095,28.435322,54.26887,70.46409,8.973177,15.787043,16.811987,...,873.82996,4150.6133,7050.492,2315.388,3775.5898,10755.357,1064.0691,1876.6393,1071.4733,8265.083
6,GSM3108237,0,14.035622,40.725403,20.063507,44.54233,46.969917,8.31357,17.847345,19.606396,...,806.514,4879.9775,7970.924,2388.8083,3473.2642,10655.845,1220.9924,1768.9928,918.18427,8187.1655
7,GSM3108238,0,18.630518,47.139206,26.51047,50.405674,20.381641,7.860251,14.737239,16.854115,...,1088.4867,5686.4453,8225.697,2060.805,4483.5312,10798.929,1372.7394,2156.717,984.75476,9670.291
8,GSM3108239,0,15.249186,41.86981,18.727713,99.52152,30.333733,7.554156,15.826531,16.735561,...,1006.83374,6008.4585,8335.335,2267.5164,4734.6953,10546.971,1280.0126,1882.3619,955.69086,9785.635
9,GSM3108240,0,16.836504,49.68746,23.065104,65.25256,71.3619,8.200011,13.474164,15.745662,...,1041.2979,5295.339,7506.746,2542.2188,4283.7686,10775.027,1253.4601,2210.405,1008.72906,9475.596


In [17]:
df_expr_renamed.columns.duplicated().sum()

29208

In [18]:
df_expr_renamed.columns[df_expr_renamed.columns.duplicated()]

Index(['HIST1H3G', 'HIST1H3G', 'FAM86C1',
       'CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8',
       'SSX2B,SSX7,SSX2B,SSX7', 'HIST2H2AB', 'PCDHGB1', 'DNAJC12', 'DEPDC5',
       'TDRKH',
       ...
       'RPL12', 'RPL4', 'FNTA', 'RPS25', 'RPL37', 'HINT1', 'EEF2', 'ATP6V0E1',
       'HNRNPK', 'ANAPC5'],
      dtype='object', length=29208)

In [19]:
from collections import Counter

df=df_expr_renamed
column_counts = Counter(df.columns)
for col, count in column_counts.items():
    if count>1:
        print(f"{col}: {count} times")

HIST1H3G: 4 times
TNFAIP8L1: 2 times
OTOP2: 2 times
CTAGE15,CTAGE6,CTAGE15,CTAGE6: 2 times
F8A1,F8A2,F8A3,F8A1,F8A2,F8A3,F8A1,F8A2,F8A3: 2 times
LINC01098: 2 times
CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8,CGB,CGB1,CGB2,CGB5,CGB7,CGB8: 4 times
FAM86C1: 3 times
HIST1H2BI: 3 times
HIST1H4E: 2 times
HIST1H2BF: 3 times
HIST1H2BE: 3 times
HIST1H3A: 2 times
PDZD9: 2 times
GPR32: 3 times
PCDHGB5: 2 times
KRTAP6-2: 2 times
KRTAP20-1: 2 times
HIST1H2BH: 3 times
PCDHGA1: 3 times
SNAPC5: 3 times
SSX2B,SSX7,SSX2B,SSX7: 2 times
GSC2: 2 times
OR9Q2: 2 times
KRTAP19-7: 2 times
KRTAP20-2: 2 times
HIST2H2AB: 3 times
PCDHA4: 2 times
PCDHGB1: 2 times
TUBB7P,TUBB8,TUBB8P7,TUBBP5,TUBB7P,TUBB8,TUBB8P7,TUBBP5,TUBB7P,TUBB8,TUBB8P7,TUBBP5,TUBB7P,TUBB8,TUBB8P7,TUBBP5: 2 times
KRTAP6-1: 2 times
KRTAP19-2: 2 times
KRTAP19-4: 2 times
KRTAP21-2: 2 times
HIST1H2AK: 3 times
DEFB128: 2 times
DNAJC12: 2 times
POU3F3: 

In [20]:
df_expr_renamed.shape

(28, 49295)

In [21]:
# Save the updated dataframe
df_expr_renamed.to_csv("GSE113513_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE113513_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE113513_gene_expression_renamed.csv'
