In [1]:
from pathlib import Path
from paths.pathsval import (project_root, runends_directory)
from download_data.get_downloads_info import (run_end, save_run_ends)
import pandas as pd


In [5]:
# ---------------------------------------------------------------------------
# Split the runs of a bioproject into single-end and paired-end lists and
# save each list to disk.
#
# Two modes, selected by `project`:
#   "SRA" - classify every run listed in the SRA runinfo table.
#   "GSE" - for each GSE accession in `gse_test`, classify only the runs whose
#           SampleName matches one of that series' GSM ids.
# ---------------------------------------------------------------------------

# --- Configuration: edit these values to target another project ------------
base_project_path = project_root()
bioproject = "482464"
project = "SRA"  # Options: "GSE" or "SRA"
gse_test = "GSE25348, GSE23229, GSE18508"  # comma-separated accessions; only read in "GSE" mode
bpdir = base_project_path / "data" / bioproject

print(f"Bioproject directory: {bpdir}")

sra_files_dir = bpdir / "sra_files"
gds_files_dir = bpdir / "gds_files"

sra_info_path = sra_files_dir / "sra_runinfo.tsv"
gds_info_path = gds_files_dir / "gse_gsm.tsv"

# Fail fast on an unsupported mode; otherwise neither branch below would run
# and the cell would finish without producing anything or reporting why.
if project not in ("GSE", "SRA"):
    raise ValueError(f"Unknown project type {project!r}; expected 'GSE' or 'SRA'.")

# Both modes need the SRA runinfo table, so check for it up front.
if not sra_info_path.is_file():
    raise FileNotFoundError(f"SRA runinfo file not found at {sra_info_path}")

print(f"Reading SRA runinfo from {sra_info_path}")


def _read_required_table(path, sep, required_columns, label):
    """Read a delimited text table and verify that it is usable.

    Args:
        path: Location of the file, passed straight to ``pd.read_csv``.
        sep: Field delimiter, passed straight to ``pd.read_csv``.
        required_columns: Column names that must all be present.
        label: Human-readable file description used in error messages.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the file cannot be read, has no rows, or lacks any of
            ``required_columns``. The underlying exception is chained.
    """
    try:
        table = pd.read_csv(path, sep=sep)
        if table.empty:
            raise ValueError(f"{label} file is empty.")
        missing = [col for col in required_columns if col not in table.columns]
        if missing:
            raise ValueError(
                f"{label} file is missing required column(s): {', '.join(missing)}."
            )
    except Exception as e:
        # Keep a single exception type for every failure, but chain the cause
        # explicitly so the original traceback stays visible.
        raise ValueError(f"Error reading {label} file: {e}") from e
    return table


if project == "GSE":
    # 'SampleName' is required here as well: it is the column used below to
    # match runs against the GSM ids of each series.
    sra_df = _read_required_table(
        sra_info_path, "\t", ("Run", "LibraryLayout", "SampleName"), "SRA runinfo"
    )

    # NOTE(review): the file is named *.tsv but is parsed as comma-separated.
    # If it is really tab-delimited, the column check fails - confirm the format.
    df_gse_gsm = _read_required_table(gds_info_path, ",", ("GSE", "GSM"), "GDS info")

    # Drop empty tokens so a trailing or doubled comma does not become a bogus accession.
    gse_ids = [token.strip() for token in gse_test.split(",") if token.strip()]

    for gse in gse_ids:
        gsm_list = df_gse_gsm.loc[df_gse_gsm["GSE"] == gse, "GSM"].tolist()
        sra_subset = sra_df[sra_df["SampleName"].isin(gsm_list)]
        if sra_subset.empty:
            print(f"No matching GSM entries found in SRA runinfo for {gse}. Skipping.")
            continue

        # run_end returns (single, paired) while save_run_ends takes and returns
        # the pair in (paired, single) order - mind the swap.
        single, paired = run_end(sra_subset)
        output_path = runends_directory(base_project_path, bioproject)
        paired_path, single_path = save_run_ends(paired, single, output_path, gse)

        print(f"Processed {gse}:")
        print(f"  Paired-end runs saved to: {paired_path}")
        print(f"  Single-end runs saved to: {single_path}")

elif project == "SRA":
    sra_df = _read_required_table(
        sra_info_path, "\t", ("Run", "LibraryLayout"), "SRA runinfo"
    )

    # Same (single, paired) -> (paired, single) ordering as in the GSE branch.
    single, paired = run_end(sra_df)
    output_path = runends_directory(base_project_path, bioproject)
    paired_path, single_path = save_run_ends(paired, single, output_path, bioproject)

    print(f"Processed SRA project {bioproject}:")
    print(f"  Paired-end runs saved to: {paired_path}")
    print(f"  Single-end runs saved to: {single_path}")


Bioproject directory: C:\Users\pdmpe\OneDrive\Documentos\GitHub\RbohB-DE\data\482464
Reading SRA runinfo from C:\Users\pdmpe\OneDrive\Documentos\GitHub\RbohB-DE\data\482464\sra_files\sra_runinfo.tsv
Single-end reads:            SRR
15  SRR7696209
16  SRR7696210
17  SRR7696208
Paired-end reads:            SRR
0   SRR7693916
1   SRR7693915
2   SRR7693917
3   SRR7696200
4   SRR7696201
5   SRR7696202
6   SRR7696193
7   SRR7696194
8   SRR7696192
9   SRR7696204
10  SRR7696205
11  SRR7696206
12  SRR7696589
13  SRR7696590
14  SRR7696591
Total Single-end: 3
Total Paired-end: 15
Processed SRA project 482464:
  Paired-end runs saved to: C:\Users\pdmpe\OneDrive\Documentos\GitHub\RbohB-DE\results\482464\runends\482464_paired_end_runs.tsv
  Single-end runs saved to: C:\Users\pdmpe\OneDrive\Documentos\GitHub\RbohB-DE\results\482464\runends\482464_single_end_runs.tsv
