In [None]:
import pandas as pd
import numpy as np
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

In [None]:
# Load data
stage_1_samples = pd.read_csv('../data/cancer/stage_1_prostate_cancer_samples.csv')
stage_2_samples = pd.read_csv('../data/cancer/stage_2_prostate_cancer_samples.csv')

# Clean and combine datasets
stage_1_samples.columns = stage_1_samples.columns.str.strip()
stage_2_samples.columns = stage_2_samples.columns.str.strip()
combined_dataset = pd.concat([stage_1_samples, stage_2_samples], ignore_index=True)
combined_dataset['Stage'] = combined_dataset['Stage'].str.strip()
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: 1', 0, 1)
combined_dataset['ID_REF'] = np.where(combined_dataset['Stage'] == 'Stage: 2', 1, combined_dataset['ID_REF'])


# Drop irrelevant columns
columns_to_drop = ['Sample_ID', 'Sex', 'Age', 'Stage', 'Disease']
combined_dataset = combined_dataset.drop(columns=columns_to_drop, axis=1)

# Separate the ID_REF column
id_ref = combined_dataset["ID_REF"]
combined_dataset = combined_dataset.drop(["ID_REF"], axis=1)

# Convert all values to numeric, fill NaNs, and ensure non-negative values
combined_dataset = combined_dataset.apply(pd.to_numeric, errors='coerce').fillna(0)
combined_dataset = combined_dataset.applymap(lambda x: max(0, int(x)))

# Add ID_REF back to the dataset
combined_dataset["ID_REF"] = id_ref

# Create counts and metadata DataFrames
counts = combined_dataset.drop(["ID_REF"], axis=1)
metadata = pd.DataFrame({"condition": combined_dataset["ID_REF"]})
print(metadata['condition'].value_counts())

# Print data shapes and sample content for verification
print(counts.head())
print(counts.shape)
print(metadata.head())
print(metadata.shape)
print(metadata['condition'].unique())

In [60]:
# Initialize DeseqDataSet object
dds = DeseqDataSet(counts=counts, metadata=metadata, design_factors="condition")

# Perform DESeq2 workflow steps
dds.deseq2()
dds.fit_genewise_dispersions()
dds.fit_dispersion_trend()
dds.fit_dispersion_prior()
dds.fit_MAP_dispersions()
dds.fit_LFC()

# Perform statistical analysis
stat_res = DeseqStats(dds, alpha=0.05, cooks_filter=True, independent_filter=True)
stat_res.run_wald_test()

Fitting size factors...
... done in 0.02 seconds.

Fitting dispersions...
... done in 0.68 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend()
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.75 seconds.

Fitting LFCs...
... done in 0.67 seconds.

Replacing 0 outlier genes.

Fitting dispersions...
... done in 0.57 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend()
... done in 0.03 seconds.

Fitting MAP dispersions...
... done in 0.65 seconds.

Fitting LFCs...
... done in 0.60 seconds.

Running Wald tests...
... done in 0.49 seconds.



In [61]:
stat_res.summary()

# Extract the results DataFrame
results_df = stat_res.results_df

# Check if results_df is not empty
if results_df is not None:
    # Rank features by adjusted p-value
    ranked_features = results_df.sort_values(by='padj').index.tolist()  # Rank features by adjusted p-value

    # Save and display results
    results_df.to_csv('deseq2_feature_ranking.csv')
    print("Top features ranked by significance:")
    print(results_df.head(10))
else:
    print("Error: The results DataFrame is empty.")

Log2 fold change & Wald test p-value: condition 1 vs 0
                 baseMean  log2FoldChange     lfcSE      stat    pvalue  \
hsa-miR-28-3p    1.127601       -0.146624  0.256102 -0.572522  0.566969   
hsa-miR-27a-5p   1.133180        0.106815  0.264943  0.403162  0.686829   
hsa-miR-518b     2.108806       -0.119555  0.154693 -0.772851  0.439611   
hsa-miR-520b     1.569103       -0.005341  0.220386 -0.024236  0.980664   
hsa-miR-498      5.616244        0.009254  0.056172  0.164746  0.869144   
...                   ...             ...       ...       ...       ...   
hsa-miR-6880-3p  5.738318        0.014968  0.055585  0.269276  0.787717   
hsa-miR-6873-5p  2.043812        0.235521  0.155355  1.516016  0.129515   
hsa-miR-6872-3p  5.641957       -0.008523  0.055904 -0.152448  0.878833   
hsa-miR-6865-5p  5.299951       -0.016683  0.057699 -0.289131  0.772481   
hsa-miR-6864-3p  2.836202        0.012822  0.110486  0.116052  0.907611   

                     padj  
hsa-miR-28-3p   

In [62]:
# Rank features by adjusted p-value
ranked_features = results_df.sort_values(by='padj').index.tolist()  # Rank features by adjusted p-value

# Save and display results
results_df.to_csv('deseq2_feature_ranking.csv')
print("Top features ranked by significance:")
print(results_df.head(10))

Top features ranked by significance:
                baseMean  log2FoldChange     lfcSE      stat    pvalue  \
hsa-miR-28-3p   1.127601       -0.146624  0.256102 -0.572522  0.566969   
hsa-miR-27a-5p  1.133180        0.106815  0.264943  0.403162  0.686829   
hsa-miR-518b    2.108806       -0.119555  0.154693 -0.772851  0.439611   
hsa-miR-520b    1.569103       -0.005341  0.220386 -0.024236  0.980664   
hsa-miR-498     5.616244        0.009254  0.056172  0.164746  0.869144   
hsa-miR-512-3p  1.477547       -0.005450  0.214504 -0.025409  0.979729   
hsa-miR-491-5p  4.696004       -0.093782  0.060927 -1.539246  0.123744   
hsa-miR-490-3p  2.204691       -0.112726  0.156792 -0.718956  0.472168   
hsa-miR-452-5p  1.772331       -0.005137  0.195230 -0.026314  0.979007   
hsa-miR-451a    7.642452        0.088104  0.048872  1.802770  0.071424   

                    padj  
hsa-miR-28-3p   0.933528  
hsa-miR-27a-5p  0.976772  
hsa-miR-518b    0.871094  
hsa-miR-520b    0.998161  
hsa-miR-498  

In [66]:
# Save the first column containing miRNA to another CSV
miRNA_df = pd.DataFrame(ranked_features, columns=['miRNA'])
miRNA_df.head()
miRNA_df.to_csv('deseq2_miRNA_ranking.csv', index=False)