In [1]:
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
from statsmodels.stats.multitest import multipletests

In [2]:
# Load your data
df = pd.read_csv("GSE25070_gene_expression_renamed.csv")
df.head()

Unnamed: 0,geo_accession,target,EEF1A1,SLC35E2,RPS28,IPO13,AFAP,GGTLA4,CDT1,TRPV1,...,WSB1.1,WWOX.3,CD96.1,SPECC1.1,DNAJB14.2,COVA1.2,THOC3.1,MRRF.2,KIAA1751,ENTPD8.2
0,GSM615865,1,16.310673,7.472773,12.605355,8.388606,7.323597,7.161546,9.478377,6.876402,...,9.646388,7.423532,7.65744,7.026665,9.224129,9.562152,11.075949,10.226991,10.788766,7.402863
1,GSM615866,1,16.382521,7.525614,13.407539,8.300935,7.090995,7.287824,9.588871,7.06887,...,9.900085,7.292195,8.944919,7.301822,9.190759,8.765645,10.056183,10.511536,11.536755,9.097101
2,GSM615867,1,16.553639,7.226642,13.024218,8.476675,7.199945,7.336743,9.323239,7.135369,...,10.323218,7.486351,8.463814,7.302144,9.408262,8.867894,10.661818,10.199168,12.144687,8.930406
3,GSM615868,1,16.386483,7.584939,13.794024,8.507332,7.444461,7.277562,9.711417,6.934031,...,9.592024,7.620468,8.188683,7.065924,9.133406,10.003919,10.707401,10.316893,12.204338,6.96291
4,GSM615869,1,16.780133,7.391997,12.90724,8.381397,7.142943,7.255946,10.879047,7.209161,...,9.143178,7.950969,7.674844,7.229192,8.785894,9.715709,10.129632,10.000958,10.147589,8.567663


In [3]:
# Separate groups based on 'target' (0 = control, 1 = case)
group_0 = df[df['target'] == 0].drop(columns=['geo_accession', 'target'])
group_1 = df[df['target'] == 1].drop(columns=['geo_accession', 'target'])

In [5]:
# Perform t-test for each gene
deg_results = []
for gene in group_0.columns:
    stat, p_value = ttest_ind(group_0[gene], group_1[gene], equal_var=False)
    deg_results.append({'Gene': gene, 'p_value': p_value})

# Convert results to DataFrame
df_deg = pd.DataFrame(deg_results)

In [6]:
df_deg

Unnamed: 0,Gene,p_value
0,EEF1A1,8.914664e-01
1,SLC35E2,6.916640e-01
2,RPS28,1.372692e-01
3,IPO13,2.152958e-01
4,AFAP,5.277607e-01
...,...,...
24521,COVA1.2,1.498457e-07
24522,THOC3.1,1.725907e-04
24523,MRRF.2,1.907287e-02
24524,KIAA1751,1.441799e-03


In [7]:
# Filter DEG based on raw p-value threshold (e.g., 0.05)
deg_threshold = 0.05
deg_significant = df_deg[df_deg['p_value'] < deg_threshold]

# Output the significant DEGs
print(f"Significant DEGs (p-value < {deg_threshold}):")
print(deg_significant)

Significant DEGs (p-value < 0.05):
           Gene       p_value
6          CDT1  3.427553e-04
8           LPP  5.748730e-05
13      COL17A1  2.886960e-05
14        BCL6B  2.119026e-02
17      ATP13A4  6.869764e-03
...         ...           ...
24521   COVA1.2  1.498457e-07
24522   THOC3.1  1.725907e-04
24523    MRRF.2  1.907287e-02
24524  KIAA1751  1.441799e-03
24525  ENTPD8.2  1.215043e-08

[10067 rows x 2 columns]


In [8]:
# Save the final DEG results (with raw p-values) to a CSV file
deg_significant.to_csv('deg_raw_GSE25070.csv', index=False)

print("DEG results saved to 'deg_raw_GSE25070.csv'")

DEG results saved to 'deg_raw_GSE25070.csv'
