In [1]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np

In [2]:
# Load the GSE25070 gene-expression table from CSV.
# The file is looked up relative to the notebook's working directory.
gene_expression_path = "GSE25070_gene_expression.csv"
gene_expression = pd.read_csv(gene_expression_path)

In [3]:
# Preview the first rows: an ID_REF column followed by one column per GSM sample.
gene_expression.head()

Unnamed: 0,ID_REF,GSM615865,GSM615866,GSM615867,GSM615868,GSM615869,GSM615870,GSM615871,GSM615872,GSM615873,...,GSM615907,GSM615908,GSM615909,GSM615910,GSM615911,GSM615912,GSM615913,GSM615914,GSM615915,GSM615916
0,ILMN_1343291,16.310673,16.382521,16.553639,16.386483,16.780133,16.293642,16.799681,16.33427,16.683105,...,15.923677,15.722273,15.722958,15.903641,15.866211,16.015222,16.025085,15.793667,15.877138,16.200818
1,ILMN_1651209,7.472773,7.525614,7.226642,7.584939,7.391997,7.32677,7.326228,7.431076,7.53881,...,7.52666,7.464878,7.517103,7.524157,7.055126,7.483782,7.23907,7.058486,7.284159,7.39582
2,ILMN_1651228,12.605355,13.407539,13.024218,13.794024,12.90724,12.648493,12.824318,13.576602,12.874897,...,13.126329,12.996417,12.977386,12.481035,13.68442,12.821349,13.278809,13.748332,13.627788,13.613182
3,ILMN_1651229,8.388606,8.300935,8.476675,8.507332,8.381397,8.561448,8.367523,8.318598,8.241591,...,8.412907,8.624894,8.302382,8.470141,8.744655,8.54185,8.210321,8.429287,8.016001,7.903962
4,ILMN_1651235,7.323597,7.090995,7.199945,7.444461,7.142943,7.318811,7.273114,7.222337,7.515402,...,7.075781,6.943973,7.105941,7.174483,7.301426,7.0872,7.193387,7.235719,7.050396,7.010118


In [4]:
# Pivot the table so that every sample becomes a row and every ID_REF value a
# column, then expose the sample label as an ordinary "geo_accession" column.
gene_expression_transposed = (
    gene_expression
    .set_index("ID_REF")                  # ID_REF values become the column labels after .T
    .T
    .rename_axis(index="geo_accession")   # name the row index before turning it into a column
    .reset_index()
)

In [5]:
# Preview the transposed table: geo_accession first, then one column per ID_REF.
gene_expression_transposed.head()

ID_REF,geo_accession,ILMN_1343291,ILMN_1651209,ILMN_1651228,ILMN_1651229,ILMN_1651235,ILMN_1651236,ILMN_1651237,ILMN_1651238,ILMN_1651254,...,ILMN_2415748,ILMN_2415776,ILMN_2415786,ILMN_2415826,ILMN_2415898,ILMN_2415911,ILMN_2415926,ILMN_2415949,ILMN_2415979,ILMN_2416019
0,GSM615865,16.310673,7.472773,12.605355,8.388606,7.323597,7.161546,9.478377,6.876402,11.729496,...,9.646388,7.423532,7.65744,7.026665,9.224129,9.562152,11.075949,10.226991,10.788766,7.402863
1,GSM615866,16.382521,7.525614,13.407539,8.300935,7.090995,7.287824,9.588871,7.06887,12.466325,...,9.900085,7.292195,8.944919,7.301822,9.190759,8.765645,10.056183,10.511536,11.536755,9.097101
2,GSM615867,16.553639,7.226642,13.024218,8.476675,7.199945,7.336743,9.323239,7.135369,12.476884,...,10.323218,7.486351,8.463814,7.302144,9.408262,8.867894,10.661818,10.199168,12.144687,8.930406
3,GSM615868,16.386483,7.584939,13.794024,8.507332,7.444461,7.277562,9.711417,6.934031,13.33542,...,9.592024,7.620468,8.188683,7.065924,9.133406,10.003919,10.707401,10.316893,12.204338,6.96291
4,GSM615869,16.780133,7.391997,12.90724,8.381397,7.142943,7.255946,10.879047,7.209161,12.787669,...,9.143178,7.950969,7.674844,7.229192,8.785894,9.715709,10.129632,10.000958,10.147589,8.567663


In [6]:
# Load the sample metadata table from CSV.
# It is merged with the expression data on "geo_accession" in the next step.
metadata_filepath = "GSE25070_target_metadata.csv"
metadata = pd.read_csv(metadata_filepath)

In [7]:
# Attach the sample metadata to the per-sample expression rows using the shared
# "geo_accession" key (no `how` is given, so pandas uses its default join type).
merged_data = metadata.merge(gene_expression_transposed, on="geo_accession")

# Drop control-probe columns, i.e. every column whose label starts with "AFFX-".
is_control_probe = merged_data.columns.str.startswith("AFFX-")
merged_data = merged_data.loc[:, ~is_control_probe]

# Persist the filtered, merged table without the row index.
merged_data.to_csv("GSE25070_merged_data.csv", index=False)
print("Merged data saved as GSE25070_merged_data.csv")

Merged data saved as GSE25070_merged_data.csv


In [8]:
# Summarise the merged table: row count, column count, dtypes and memory usage.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Columns: 24528 entries, geo_accession to ILMN_2416019
dtypes: float64(24526), int64(1), object(1)
memory usage: 9.7+ MB


In [9]:
# Display the merged table: geo_accession, target, then the probe columns.
merged_data

Unnamed: 0,geo_accession,target,ILMN_1343291,ILMN_1651209,ILMN_1651228,ILMN_1651229,ILMN_1651235,ILMN_1651236,ILMN_1651237,ILMN_1651238,...,ILMN_2415748,ILMN_2415776,ILMN_2415786,ILMN_2415826,ILMN_2415898,ILMN_2415911,ILMN_2415926,ILMN_2415949,ILMN_2415979,ILMN_2416019
0,GSM615865,1,16.310673,7.472773,12.605355,8.388606,7.323597,7.161546,9.478377,6.876402,...,9.646388,7.423532,7.65744,7.026665,9.224129,9.562152,11.075949,10.226991,10.788766,7.402863
1,GSM615866,1,16.382521,7.525614,13.407539,8.300935,7.090995,7.287824,9.588871,7.06887,...,9.900085,7.292195,8.944919,7.301822,9.190759,8.765645,10.056183,10.511536,11.536755,9.097101
2,GSM615867,1,16.553639,7.226642,13.024218,8.476675,7.199945,7.336743,9.323239,7.135369,...,10.323218,7.486351,8.463814,7.302144,9.408262,8.867894,10.661818,10.199168,12.144687,8.930406
3,GSM615868,1,16.386483,7.584939,13.794024,8.507332,7.444461,7.277562,9.711417,6.934031,...,9.592024,7.620468,8.188683,7.065924,9.133406,10.003919,10.707401,10.316893,12.204338,6.96291
4,GSM615869,1,16.780133,7.391997,12.90724,8.381397,7.142943,7.255946,10.879047,7.209161,...,9.143178,7.950969,7.674844,7.229192,8.785894,9.715709,10.129632,10.000958,10.147589,8.567663
5,GSM615870,1,16.293642,7.32677,12.648493,8.561448,7.318811,7.264386,9.720857,6.867115,...,9.245214,7.721418,8.013836,7.077708,8.971089,9.861532,11.209459,10.349404,10.921249,7.673327
6,GSM615871,1,16.799681,7.326228,12.824318,8.367523,7.273114,7.287287,8.260231,7.221032,...,9.579923,7.411567,7.980781,7.190713,9.077886,9.629277,11.365492,10.354496,11.622807,7.118221
7,GSM615872,1,16.33427,7.431076,13.576602,8.318598,7.222337,7.141597,9.448613,7.183768,...,9.702729,7.549229,8.32207,7.190149,9.487914,9.43868,10.56068,10.925172,10.835279,8.387435
8,GSM615873,1,16.683105,7.53881,12.874897,8.241591,7.515402,7.251015,9.786867,6.914817,...,10.11166,7.533991,7.887591,7.180163,9.419027,9.155193,11.085483,10.66529,10.713731,8.58315
9,GSM615874,1,17.038012,7.39566,12.661084,8.306763,7.423273,7.302419,10.374075,7.021226,...,8.730506,7.14143,7.566688,7.169577,9.646652,9.629248,10.858957,10.48251,11.151167,8.121389


In [10]:
# Load the ID_REF -> gene-symbol annotation table.
mapped_gene = "mapped_gene_list.csv"
df_mapped_gene = pd.read_csv(mapped_gene)

# Discard rows whose symbol contains "MIR" followed by digits. With na=False a
# missing symbol counts as "no match", so such rows are kept.
is_mir_symbol = df_mapped_gene["Cleaned_Gene_Symbol"].str.contains(r"MIR\d+", na=False)
df_mapped_gene = df_mapped_gene.loc[~is_mir_symbol]

In [11]:
# Display the filtered annotation table (ID_REF, Cleaned_Gene_Symbol).
df_mapped_gene

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1651209,SLC35E2
2,ILMN_1651228,RPS28
3,ILMN_1651229,IPO13
4,ILMN_1651235,AFAP
...,...,...
24521,ILMN_2415911,COVA1
24522,ILMN_2415926,THOC3
24523,ILMN_2415949,MRRF
24524,ILMN_2415979,KIAA1751


In [12]:
# Alias for the filtered annotation frame (same object, not a copy).
df_annot = df_mapped_gene

# Collapse the annotation to one row per ID_REF, joining multiple gene symbols
# with commas.
# Rows with a missing symbol are dropped first: the MIR filter applied to
# df_mapped_gene keeps them (na=False), and str.join raises TypeError on NaN.
# Probes left without any symbol simply get no entry in the grouped table.
df_annot_grouped = (
    df_annot
    .dropna(subset=["Cleaned_Gene_Symbol"])
    .groupby("ID_REF")["Cleaned_Gene_Symbol"]
    .agg(",".join)
    .reset_index()
)

In [13]:
# Display the per-ID_REF annotation table produced by the grouping step.
df_annot_grouped

Unnamed: 0,ID_REF,Cleaned_Gene_Symbol
0,ILMN_1343291,EEF1A1
1,ILMN_1651209,SLC35E2
2,ILMN_1651228,RPS28
3,ILMN_1651229,IPO13
4,ILMN_1651235,AFAP
...,...,...
24520,ILMN_2415911,COVA1
24521,ILMN_2415926,THOC3
24522,ILMN_2415949,MRRF
24523,ILMN_2415979,KIAA1751


In [14]:
# Build the lookup {ID_REF: Cleaned_Gene_Symbol} from the grouped annotation
# table, in row order.
gene_mapping = df_annot_grouped.set_index("ID_REF")["Cleaned_Gene_Symbol"].to_dict()
gene_mapping

{'ILMN_1343291': 'EEF1A1',
 'ILMN_1651209': 'SLC35E2',
 'ILMN_1651228': 'RPS28',
 'ILMN_1651229': 'IPO13',
 'ILMN_1651235': 'AFAP',
 'ILMN_1651236': 'GGTLA4',
 'ILMN_1651237': 'CDT1',
 'ILMN_1651238': 'TRPV1',
 'ILMN_1651254': 'LPP',
 'ILMN_1651260': 'CCNE2',
 'ILMN_1651262': 'HNRPAB',
 'ILMN_1651268': 'LOH12CR1',
 'ILMN_1651278': 'SNIP1',
 'ILMN_1651282': 'COL17A1',
 'ILMN_1651285': 'BCL6B',
 'ILMN_1651286': 'GRHL1',
 'ILMN_1651292': 'LOC644613',
 'ILMN_1651303': 'ATP13A4',
 'ILMN_1651309': 'EPAG',
 'ILMN_1651315': 'HMG20B',
 'ILMN_1651330': 'KCNG4',
 'ILMN_1651336': 'MLYCD',
 'ILMN_1651343': 'ITGA11',
 'ILMN_1651346': 'TICAM2',
 'ILMN_1651347': 'SERTAD2',
 'ILMN_1651354': 'SPP1',
 'ILMN_1651358': 'HBE1',
 'ILMN_1651364': 'PCBD2',
 'ILMN_1651365': 'ZBED2',
 'ILMN_1651370': 'USP21',
 'ILMN_1651373': 'RHD',
 'ILMN_1651378': 'AUP1',
 'ILMN_1651385': 'MFN2',
 'ILMN_1651405': 'BRD9',
 'ILMN_1651415': 'DERL3',
 'ILMN_1651428': 'SYTL2',
 'ILMN_1651429': 'SELM',
 'ILMN_1651430': 'CTRB1',
 'IL

In [15]:
# Relabel the columns of merged_data with gene symbols via gene_mapping.
# rename() is applied to every column; labels that are not keys of the mapping
# are left unchanged (geo_accession and target appear unchanged in the output).
# NOTE(review): several probe IDs can share one gene symbol, so the renamed
# frame can contain repeated column labels.
df_expr_renamed = merged_data.rename(columns=gene_mapping)
df_expr_renamed

Unnamed: 0,geo_accession,target,EEF1A1,SLC35E2,RPS28,IPO13,AFAP,GGTLA4,CDT1,TRPV1,...,WSB1,WWOX,CD96,SPECC1,DNAJB14,COVA1,THOC3,MRRF,KIAA1751,ENTPD8
0,GSM615865,1,16.310673,7.472773,12.605355,8.388606,7.323597,7.161546,9.478377,6.876402,...,9.646388,7.423532,7.65744,7.026665,9.224129,9.562152,11.075949,10.226991,10.788766,7.402863
1,GSM615866,1,16.382521,7.525614,13.407539,8.300935,7.090995,7.287824,9.588871,7.06887,...,9.900085,7.292195,8.944919,7.301822,9.190759,8.765645,10.056183,10.511536,11.536755,9.097101
2,GSM615867,1,16.553639,7.226642,13.024218,8.476675,7.199945,7.336743,9.323239,7.135369,...,10.323218,7.486351,8.463814,7.302144,9.408262,8.867894,10.661818,10.199168,12.144687,8.930406
3,GSM615868,1,16.386483,7.584939,13.794024,8.507332,7.444461,7.277562,9.711417,6.934031,...,9.592024,7.620468,8.188683,7.065924,9.133406,10.003919,10.707401,10.316893,12.204338,6.96291
4,GSM615869,1,16.780133,7.391997,12.90724,8.381397,7.142943,7.255946,10.879047,7.209161,...,9.143178,7.950969,7.674844,7.229192,8.785894,9.715709,10.129632,10.000958,10.147589,8.567663
5,GSM615870,1,16.293642,7.32677,12.648493,8.561448,7.318811,7.264386,9.720857,6.867115,...,9.245214,7.721418,8.013836,7.077708,8.971089,9.861532,11.209459,10.349404,10.921249,7.673327
6,GSM615871,1,16.799681,7.326228,12.824318,8.367523,7.273114,7.287287,8.260231,7.221032,...,9.579923,7.411567,7.980781,7.190713,9.077886,9.629277,11.365492,10.354496,11.622807,7.118221
7,GSM615872,1,16.33427,7.431076,13.576602,8.318598,7.222337,7.141597,9.448613,7.183768,...,9.702729,7.549229,8.32207,7.190149,9.487914,9.43868,10.56068,10.925172,10.835279,8.387435
8,GSM615873,1,16.683105,7.53881,12.874897,8.241591,7.515402,7.251015,9.786867,6.914817,...,10.11166,7.533991,7.887591,7.180163,9.419027,9.155193,11.085483,10.66529,10.713731,8.58315
9,GSM615874,1,17.038012,7.39566,12.661084,8.306763,7.423273,7.302419,10.374075,7.021226,...,8.730506,7.14143,7.566688,7.169577,9.646652,9.629248,10.858957,10.48251,11.151167,8.121389


In [16]:
# Count the column labels that repeat an earlier label after the rename.
df_expr_renamed.columns.duplicated().sum()

5895

In [17]:
# List the column labels flagged as duplicates by columns.duplicated().
df_expr_renamed.columns[df_expr_renamed.columns.duplicated()]

Index(['TRPV1', 'INPP4A', 'TRNT1', 'DMD', 'CDKN2C', 'FLJ20186', 'TRIM6', 'ABR',
       'LOC647649', 'FBXL10',
       ...
       'RPS29', 'WSB1', 'WWOX', 'CD96', 'SPECC1', 'DNAJB14', 'COVA1', 'THOC3',
       'MRRF', 'ENTPD8'],
      dtype='object', length=5895)

In [18]:
from collections import Counter

# Short alias for the renamed frame (same object, not a copy).
df = df_expr_renamed

# Tally how many times each column label occurs, then report every label that
# occurs more than once, in order of first appearance.
column_counts = Counter(df.columns)
repeated_labels = (
    (label, occurrences)
    for label, occurrences in column_counts.items()
    if occurrences > 1
)
for col, count in repeated_labels:
    print(f"{col}: {count} times")


EEF1A1: 2 times
AFAP: 3 times
GGTLA4: 3 times
TRPV1: 5 times
CCNE2: 2 times
HNRPAB: 3 times
COL17A1: 2 times
GRHL1: 3 times
KCNG4: 2 times
ITGA11: 2 times
SPP1: 2 times
USP21: 3 times
BRD9: 2 times
DERL3: 2 times
SYTL2: 4 times
CTRB1: 2 times
FOXI1: 2 times
UTY: 3 times
FBXL10: 3 times
CYP26A1: 3 times
WNT2B: 2 times
HIST1H2BD: 3 times
ERGIC1: 3 times
ABCD4: 2 times
GEFT: 3 times
WHSC1: 5 times
FBXO44: 3 times
ENTPD2: 2 times
SPRED3: 3 times
CTNNB1: 2 times
EXOC6: 3 times
ASB11: 3 times
RTN3: 4 times
ADAM33: 2 times
WDFY3: 3 times
GSG1: 3 times
ADAMTS20: 2 times
MBTPS1: 2 times
ARMCX3: 2 times
RRAGB: 3 times
MSH5: 5 times
RORC: 4 times
GSTM4: 2 times
GRIK1: 3 times
CCT3: 2 times
ITSN1: 3 times
EVC: 2 times
ZNF187: 2 times
ANK3: 3 times
SCYL3: 3 times
ABCC5: 3 times
FGFR3: 2 times
TP53I3: 2 times
PPEF1: 2 times
DHX57: 2 times
CSNK1G3: 3 times
TTN: 4 times
KRAS: 2 times
MRPL43: 3 times
PNKD: 3 times
LOC649159: 2 times
CDKN2C: 3 times
IL4R: 2 times
CCM2: 3 times
PPP2R4: 4 times
FBXO32: 3 

In [19]:
# Write the gene-symbol-labelled table to CSV without the row index.
# NOTE(review): the frame still contains repeated column labels at this point
# and they are written as-is; pandas.read_csv renames repeated header names
# (X, X.1, ...) by default when the file is loaded again. Confirm that
# downstream code expects this, or collapse the repeated columns first.
df_expr_renamed.to_csv("GSE25070_gene_expression_renamed.csv", index=False)

print("Column renaming completed! File saved as 'GSE25070_gene_expression_renamed.csv'")

Column renaming completed! File saved as 'GSE25070_gene_expression_renamed.csv'
