In [1]:
import pandas as pd
import numpy as np

In [2]:
# gene expression data
gene_expression_path = "GSE44861_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
gene_expression.head()

Unnamed: 0,ID_REF,GSM1092909,GSM1092910,GSM1092911,GSM1092912,GSM1092913,GSM1092914,GSM1092915,GSM1092916,GSM1092917,...,GSM1093010,GSM1093011,GSM1093012,GSM1093013,GSM1093014,GSM1093015,GSM1093016,GSM1093017,GSM1093018,GSM1093019
0,1007_s_at,11.7622,10.5785,10.8828,11.233,10.0557,8.71981,10.1032,10.3361,10.4022,...,10.8628,11.0071,10.2954,11.6029,11.256,8.48345,11.8884,10.0129,10.9572,11.4878
1,1053_at,6.29542,6.77451,6.30281,8.18885,6.27496,7.23963,6.32391,6.68181,5.8371,...,6.43236,7.9771,7.29078,6.65951,7.40641,7.51192,6.73369,6.51293,5.97877,7.6279
2,117_at,5.5566,5.58586,5.61578,6.0957,5.99864,6.77403,5.62292,5.62754,5.61482,...,5.9922,6.09308,5.68518,5.86398,6.06084,5.76128,5.53477,6.32736,5.79387,6.07889
3,121_at,7.8109,7.66729,8.43118,7.81232,7.52846,7.62669,8.14157,8.25435,8.44242,...,7.65229,7.55289,7.9283,7.9515,8.29067,7.30795,7.93035,7.96173,8.03345,7.52337
4,1255_g_at,5.24249,5.21854,5.33201,5.30253,5.30274,5.19317,5.3992,5.37089,5.60335,...,5.31678,5.23348,5.31793,5.15591,5.31438,5.52718,5.24126,5.07803,5.41318,5.22522


In [4]:
# Transpose the gene expression data to make geo_accession a column
gene_expression_transposed = gene_expression.set_index("ID_REF").transpose()
gene_expression_transposed.index.name = "geo_accession"  # Rename the index
gene_expression_transposed.reset_index(inplace=True)  # Convert index to a column

In [5]:
gene_expression_transposed.head()

ID_REF,geo_accession,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
0,GSM1092909,11.7622,6.29542,5.5566,7.8109,5.24249,9.1622,5.74971,5.90899,7.6454,...,13.0756,12.9699,14.6771,14.1862,6.77264,5.3874,5.50055,5.01456,5.04927,5.05146
1,GSM1092910,10.5785,6.77451,5.58586,7.66729,5.21854,7.73865,5.97187,5.52396,6.87646,...,13.3565,13.4528,14.837,14.487,6.68949,5.3736,5.52497,5.11313,5.05536,5.01421
2,GSM1092911,10.8828,6.30281,5.61578,8.43118,5.33201,8.72661,6.56693,5.67954,7.69565,...,14.8603,15.0778,15.3538,15.2983,6.81476,5.32175,5.59045,5.14005,5.19297,5.28986
3,GSM1092912,11.233,8.18885,6.0957,7.81232,5.30253,8.26605,5.69351,5.79842,7.64248,...,13.0209,12.946,14.5148,14.2798,6.59447,5.46719,5.45065,5.06556,5.04587,4.99616
4,GSM1092913,10.0557,6.27496,5.99864,7.52846,5.30274,8.74045,6.33641,5.81155,8.51429,...,14.0453,14.2421,15.138,14.9297,6.68579,5.44464,5.60029,5.07187,5.20692,5.05043


In [6]:
# gene expression data
metadata_filepath = "GSE44861_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Merge metadata with transposed gene expression data
merged_data = pd.merge(metadata, gene_expression_transposed, on="geo_accession")

# Save the merged data
# remove control probes (AFFX-*)
merged_data = merged_data.loc[:, ~merged_data.columns.str.startswith("AFFX-")]
merged_data.to_csv("GSE44861_merged_data.csv", index=False)
print("Merged data saved as GSE44861_merged_data.csv")

Merged data saved as GSE44861_merged_data.csv


In [8]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Columns: 22217 entries, geo_accession to 91952_at
dtypes: float64(22215), int64(1), object(1)
memory usage: 18.8+ MB


In [9]:
merged_data

Unnamed: 0,geo_accession,target,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,...,90610_at,91580_at,91617_at,91682_at,91684_g_at,91703_at,91816_f_at,91826_at,91920_at,91952_at
0,GSM1092909,0,11.76220,6.29542,5.55660,7.81090,5.24249,9.16220,5.74971,5.90899,...,7.86943,4.98448,7.18908,6.43754,6.41134,6.66857,6.01868,9.06129,5.89208,6.47189
1,GSM1092910,1,10.57850,6.77451,5.58586,7.66729,5.21854,7.73865,5.97187,5.52396,...,7.39007,5.03993,7.93754,6.61120,6.65593,6.01227,5.71611,9.28618,5.89694,6.26747
2,GSM1092911,0,10.88280,6.30281,5.61578,8.43118,5.33201,8.72661,6.56693,5.67954,...,10.00920,5.17202,8.47201,6.86043,6.45193,6.36926,5.39938,10.24930,6.24988,6.05145
3,GSM1092912,1,11.23300,8.18885,6.09570,7.81232,5.30253,8.26605,5.69351,5.79842,...,7.96354,4.93830,7.18008,6.69417,7.18770,6.14987,7.98777,8.50138,6.00022,6.56582
4,GSM1092913,0,10.05570,6.27496,5.99864,7.52846,5.30274,8.74045,6.33641,5.81155,...,9.09736,5.01600,7.95282,6.75630,6.34023,6.49607,5.50074,8.46019,5.87400,6.52917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,GSM1093015,1,8.48345,7.51192,5.76128,7.30795,5.52718,8.28275,6.31178,6.18549,...,8.32879,5.19599,7.67471,7.02798,6.84810,7.66751,6.83510,7.23230,6.53084,6.14957
107,GSM1093016,0,11.88840,6.73369,5.53477,7.93035,5.24126,9.51544,5.89620,5.86204,...,7.98083,4.99354,7.70769,6.59630,6.44176,7.09996,5.49886,8.98605,5.89060,7.28787
108,GSM1093017,1,10.01290,6.51293,6.32736,7.96173,5.07803,8.64392,6.12557,5.40914,...,8.57924,4.98191,7.04425,6.56639,6.60979,7.07794,5.42337,7.52787,6.03156,6.51158
109,GSM1093018,0,10.95720,5.97877,5.79387,8.03345,5.41318,9.59146,6.94554,5.81386,...,9.23162,5.17778,8.74249,6.78005,6.13279,6.48669,5.09639,9.50645,5.96558,5.90859


In [10]:
# gene annot data
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)
#df_mapped_gene = df_mapped_gene[~df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)]

In [11]:
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,1007_s_at,DDR1
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
23081,91703_at,"MGC15523,EHBP1L1"
23082,91816_f_at,RKHD1
23083,91826_at,EPS8L1
23084,91920_at,BCAN


In [12]:
df_annot = df_mapped_gene

# group multiple gene symbols per probe ID
df_annot_grouped = df_annot.groupby("ID_REF")["Cleaned_Gene_Symbol"].apply(lambda x: ",".join(x)).reset_index()

In [13]:
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,1007_s_at,DDR1
1,1053_at,RFC2
2,117_at,HSPA6
3,121_at,PAX8
4,1255_g_at,GUCA1A
...,...,...
21233,91703_at,"MGC15523,EHBP1L1,MGC15523,EHBP1L1"
21234,91816_f_at,RKHD1
21235,91826_at,EPS8L1
21236,91920_at,BCAN


In [14]:
# Create a dictionary mapping 'ID_REF' to 'Cleaned_Gene_Symbol'
gene_mapping = dict(zip(df_annot_grouped['ID_REF'], df_annot_grouped['Cleaned_Gene_Symbol']))
gene_mapping

{'1007_s_at': 'DDR1',
 '1053_at': 'RFC2',
 '117_at': 'HSPA6',
 '121_at': 'PAX8',
 '1255_g_at': 'GUCA1A',
 '1294_at': 'UBE1L',
 '1316_at': 'THRA',
 '1320_at': 'PTPN21',
 '1405_i_at': 'CCL5',
 '1431_at': 'CYP2E1',
 '1438_at': 'EPHB3',
 '1487_at': 'ESRRA',
 '1494_f_at': 'CYP2A6',
 '1598_g_at': 'GAS6',
 '160020_at': 'MMP14',
 '1729_at': 'TRADD',
 '1773_at': 'FNTB',
 '177_at': 'PLD1',
 '179_at': 'PMS2L11',
 '1861_at': 'BAD',
 '200000_s_at': 'PRPF8',
 '200001_at': 'CAPNS1',
 '200002_at': 'RPL35',
 '200003_s_at': 'RPL28',
 '200004_at': 'EIF4G2',
 '200005_at': 'EIF3S7',
 '200006_at': 'PARK7',
 '200007_at': 'SRP14',
 '200008_s_at': 'GDI2',
 '200009_at': 'GDI2',
 '200010_at': 'RPL11',
 '200011_s_at': 'ARF3',
 '200012_x_at': 'RPL21,LOC653737,LOC728501,LOC729402,LOC730700,LOC731567,RPL21,LOC653737,LOC728501,LOC729402,LOC730700,LOC731567,RPL21,LOC653737,LOC728501,LOC729402,LOC730700,LOC731567,RPL21,LOC653737,LOC728501,LOC729402,LOC730700,LOC731567,RPL21,LOC653737,LOC728501,LOC729402,LOC730700,LOC73

In [15]:
len(gene_mapping)

21238

In [16]:
# Rename columns in df_expr using the mapping (excluding first two columns)
df_expr_renamed = merged_data.rename(columns=gene_mapping)
df_expr_renamed

Unnamed: 0,geo_accession,target,DDR1,RFC2,HSPA6,PAX8,GUCA1A,UBE1L,THRA,PTPN21,...,LRCH4,LRTM1,DGCR8,91682_at,EXOSC4,"MGC15523,EHBP1L1,MGC15523,EHBP1L1",RKHD1,EPS8L1,BCAN,LOC90379
0,GSM1092909,0,11.76220,6.29542,5.55660,7.81090,5.24249,9.16220,5.74971,5.90899,...,7.86943,4.98448,7.18908,6.43754,6.41134,6.66857,6.01868,9.06129,5.89208,6.47189
1,GSM1092910,1,10.57850,6.77451,5.58586,7.66729,5.21854,7.73865,5.97187,5.52396,...,7.39007,5.03993,7.93754,6.61120,6.65593,6.01227,5.71611,9.28618,5.89694,6.26747
2,GSM1092911,0,10.88280,6.30281,5.61578,8.43118,5.33201,8.72661,6.56693,5.67954,...,10.00920,5.17202,8.47201,6.86043,6.45193,6.36926,5.39938,10.24930,6.24988,6.05145
3,GSM1092912,1,11.23300,8.18885,6.09570,7.81232,5.30253,8.26605,5.69351,5.79842,...,7.96354,4.93830,7.18008,6.69417,7.18770,6.14987,7.98777,8.50138,6.00022,6.56582
4,GSM1092913,0,10.05570,6.27496,5.99864,7.52846,5.30274,8.74045,6.33641,5.81155,...,9.09736,5.01600,7.95282,6.75630,6.34023,6.49607,5.50074,8.46019,5.87400,6.52917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,GSM1093015,1,8.48345,7.51192,5.76128,7.30795,5.52718,8.28275,6.31178,6.18549,...,8.32879,5.19599,7.67471,7.02798,6.84810,7.66751,6.83510,7.23230,6.53084,6.14957
107,GSM1093016,0,11.88840,6.73369,5.53477,7.93035,5.24126,9.51544,5.89620,5.86204,...,7.98083,4.99354,7.70769,6.59630,6.44176,7.09996,5.49886,8.98605,5.89060,7.28787
108,GSM1093017,1,10.01290,6.51293,6.32736,7.96173,5.07803,8.64392,6.12557,5.40914,...,8.57924,4.98191,7.04425,6.56639,6.60979,7.07794,5.42337,7.52787,6.03156,6.51158
109,GSM1093018,0,10.95720,5.97877,5.79387,8.03345,5.41318,9.59146,6.94554,5.81386,...,9.23162,5.17778,8.74249,6.78005,6.13279,6.48669,5.09639,9.50645,5.96558,5.90859


In [17]:
df_expr_renamed.columns.duplicated().sum()

7815

In [18]:
df_expr_renamed.columns[df_expr_renamed.columns.duplicated()]

Index(['GDI2', 'HNRPU', 'EIF3S10', 'EIF3S10', 'HSP90B1', 'PRKAR1A', 'PRKAR1A',
       'RAD21', 'WDR1', 'AP2B1',
       ...
       'FLJ20581', 'CENTA1', 'LRCH4', 'LRTM1', 'DGCR8', 'EXOSC4', 'RKHD1',
       'EPS8L1', 'BCAN', 'LOC90379'],
      dtype='object', length=7815)

In [19]:
from collections import Counter

df=df_expr_renamed
column_counts = Counter(df.columns)
for col, count in column_counts.items():
    if count>1:
        print(f"{col}: {count} times")

DDR1: 4 times
RFC2: 2 times
HSPA6: 2 times
PAX8: 8 times
GUCA1A: 2 times
UBE1L: 2 times
THRA: 4 times
PTPN21: 4 times
CCL5: 2 times
CYP2E1: 4 times
EPHB3: 2 times
ESRRA: 2 times
CYP2A6: 4 times
GAS6: 2 times
MMP14: 4 times
TRADD: 3 times
FNTB: 3 times
PLD1: 4 times
PMS2L11: 2 times
BAD: 2 times
RPL28: 2 times
EIF4G2: 2 times
GDI2: 2 times
ARF3: 3 times
HNRPC: 4 times
SEPT2: 2 times
HNRPA1: 2 times
TARDBP: 2 times
RPL18: 2 times
STARD7: 3 times
RPS11: 2 times
CBX3,LOC653972,CBX3,LOC653972: 2 times
RPL17: 3 times
PSMB2: 2 times
KHDRBS1: 3 times
BAT1: 2 times
SFRS9: 2 times
YY1: 4 times
JTB: 3 times
ZNF259,LOC442240,ZNF259,LOC442240: 2 times
NONO: 3 times
RNPS1: 2 times
NPM1: 3 times
HSP90AB1: 2 times
ARF1: 2 times
SNX3: 4 times
CANX: 3 times
SART3: 3 times
C2orf24: 2 times
HNRPM: 2 times
HNRPD: 6 times
GUK1: 2 times
OAZ1: 3 times
KARS: 2 times
H3F3A,LOC440926,LOC644914,LOC730740,H3F3A,LOC440926,LOC644914,LOC730740,H3F3A,LOC440926,LOC644914,LOC730740,H3F3A,LOC440926,LOC644914,LOC730740: 3

In [20]:
df_expr_renamed.shape

(111, 22217)

In [21]:
# Save the updated dataframe
df_expr_renamed.to_csv("GSE44861_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE44861_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE44861_gene_expression_renamed.csv'
