In [None]:
# Testing Area

In [34]:

# imports
import pandas as pd
from typing import *
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
import anndata as ad
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# variables
adata_path = "../data/pp_data-24-09-02-01/data.h5ad"

# functions



def get_test_split(obs: pd.DataFrame,n_splits=5) -> List[str]:
    """Get Test Split
    We will perform a split for those diseases which have more than one dataset.

    Ther MUST not be any data-leakage between the train and test set - no shared datasets between the two sets.

    Strategy:
        1. Check diseases w/ 5+ datasets
        2. Divide dataset into train and test w/ 4:1 ratio
        3. Assign train and test to the respective datasets

    """

    # pre-process data
    obs["_combination"] = obs["disease"].astype(str) + "_" + obs["dataset"].astype(str)

    diseases_f1 = set() # diseases filter 1


    # 1. Check diseases w/ 5+ datasets
    all_diseases = obs["disease"].unique()
    for diseases in all_diseases:
        QUERY = f"disease == \"{diseases}\""
        _df_query = obs.query(QUERY)
        if len(_df_query["dataset"].unique()) >= 5:
            diseases_f1.add(diseases)

    logging.info(f"Number of diseases with 5+ datasets: {len(diseases_f1)}")

    # 2. Divide dataset into train and test w/ 4:1 ratio
    QUERY = "disease in @diseases_f1"
    df_diseases_f1 = obs.query(QUERY)
    
    sgkf = StratifiedGroupKFold(n_splits=n_splits)
    for i, (train_idx, test_idx) in enumerate(sgkf.split(X=df_diseases_f1["dsaid"], y=df_diseases_f1["disease"], groups=df_diseases_f1["dataset"])):
        
        # get which disease & datasets are in the test
        df_diseases_f1_test = df_diseases_f1.iloc[test_idx]
        
        logging.info(f"Nº of diseases in test split {i+1}: {len(df_diseases_f1_test['disease'].unique())}")        
        logging.info(f"Nº of datasets in test split {i+1}: {len(df_diseases_f1_test['dataset'].unique())}")        
        
        # 3. Assign train and test labels
        obs[f'test_split_{i+1}'] = obs['_combination'].isin(df_diseases_f1_test['_combination']).astype(int)
        
        logging.info(f"Nº of samples in test split {i+1}: {obs[f'test_split_{i+1}'].sum()}")
    
    obs.drop(columns=["_combination"], inplace=True)

    return obs


# load data
adata = ad.read(adata_path, backed="r")

obs = adata.obs.copy()



In [37]:
obs_2["test_split_1"]

0         0
1         0
2         0
3         0
4         0
         ..
202005    1
202006    1
202007    1
202008    1
202009    1
Name: test_split_1, Length: 202010, dtype: int64

In [31]:
import pickle
import os

data_path = os.path.join("../data/pp_data-24-09-02-01/","data.h5ad")


def filter_samples_n_genes(data_folder:str, n_genes:int)->np.array:
    """Filter Samples by Nº of Genes
    Args:
        - data_folder (str): Data Folder
        - n_genes (int): Nº of Genes
    Returns:
        - np.array: Filter
    """
    with open(os.path.join(data_folder,"metadata.pkl"),"rb") as f:
        metadata = pickle.load(f)
    
    n_genes_per_gex = metadata["n_non_nan_gex"]
    return n_genes_per_gex >= n_genes


def get_nan_filter(X:np.array, thr_non_nan=10000)->np.array:
    """Get NaN Filter
    Args:
        - X (np.array): Data
        - n_non_nan (int, optional): Number of non-NaN values. Defaults to 10000.
    Returns:
        - np.array: Filter
    """
    # count non-nan per row
    n_non_nan = (~np.isnan(X)).sum(axis=1)
    
    # filter
    filter = n_non_nan >= thr_non_nan
    
    return filter


mask = filter_samples_n_genes(os.path.dirname(data_path), 10000)
print(np.sum(mask))

192605


In [24]:
import numpy as np
a = np.ones((150,100))

filter = get_nan_filter(a, 250)

a[filter].shape

(0, 100)

In [19]:
n_non_nans = (~np.isnan(a)).sum(axis=1)

filter_n_non_nans = n_non_nans > 0.5 * a.shape[1]
filter_n_non_nans

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [11]:
obs_2 = get_test_split(obs)

2024-09-02 16:31:32,038 - root - INFO - Number of diseases with 5+ datasets: 112
2024-09-02 16:31:32,528 - root - INFO - Nº of diseases in test split 1: 112
2024-09-02 16:31:32,529 - root - INFO - Nº of datasets in test split 1: 413
2024-09-02 16:31:32,533 - root - INFO - Nº of samples in test split 1: 20191
2024-09-02 16:31:32,619 - root - INFO - Nº of diseases in test split 2: 112
2024-09-02 16:31:32,619 - root - INFO - Nº of datasets in test split 2: 446
2024-09-02 16:31:32,624 - root - INFO - Nº of samples in test split 2: 20191
2024-09-02 16:31:32,708 - root - INFO - Nº of diseases in test split 3: 112
2024-09-02 16:31:32,709 - root - INFO - Nº of datasets in test split 3: 448
2024-09-02 16:31:32,713 - root - INFO - Nº of samples in test split 3: 20583
2024-09-02 16:31:32,797 - root - INFO - Nº of diseases in test split 4: 112
2024-09-02 16:31:32,798 - root - INFO - Nº of datasets in test split 4: 457
2024-09-02 16:31:32,802 - root - INFO - Nº of samples in test split 4: 21361
202

In [12]:
obs["disease_study"].value_counts()

disease_study
Asthma                                   4104
Schizophrenia                            3779
Chronic Obstructive Pulmonary Disease    3152
Hepatocellular Carcinoma                 3074
Systemic Lupus Erythematosus             2845
                                         ... 
Peroxisome Biogenesis Disorders             3
Rhabdoid Tumor of The Kidney                3
Renal Lipoma                                3
Cholangiocellular Carcinoma                 3
Angina                                      3
Name: count, Length: 1046, dtype: int64

In [13]:
obs_2

Unnamed: 0,ids,dataset,batch,batch_id,dsaid,tissue,n_genes,disease,celltype,disease_study,test_split_1,test_split_2,test_split_3,test_split_4,test_split_5
0,DSA00009;GSM6943825;Control,GSE223245,1267,1267,DSA00009,Whole blood,18564,Control,Control,Brain Damage,0,1,0,0,0
1,DSA00009;GSM6943826;Control,GSE223245,1267,1267,DSA00009,Whole blood,18564,Control,Control,Brain Damage,0,1,0,0,0
2,DSA00009;GSM6943827;Control,GSE223245,1267,1267,DSA00009,Whole blood,18564,Control,Control,Brain Damage,0,1,0,0,0
3,DSA00009;GSM6943828;Control,GSE223245,1267,1267,DSA00009,Whole blood,18564,Control,Control,Brain Damage,0,1,0,0,0
4,DSA00009;GSM6943813;Case,GSE223245,1267,1267,DSA00009,Whole blood,18564,Brain Damage,Brain Damage,Brain Damage,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129408,DSA10290;GSM799514;Case,GSE32269,1187,1187,DSA10290,Prostate,10668,Prostate Cancer,Prostate Cancer,Prostate Cancer,1,0,0,0,0
129409,DSA10290;GSM799515;Case,GSE32269,1187,1187,DSA10290,Prostate,10668,Prostate Cancer,Prostate Cancer,Prostate Cancer,1,0,0,0,0
129410,DSA10290;GSM799516;Case,GSE32269,1187,1187,DSA10290,Prostate,10668,Prostate Cancer,Prostate Cancer,Prostate Cancer,1,0,0,0,0
129411,DSA10290;GSM799517;Case,GSE32269,1187,1187,DSA10290,Prostate,10668,Prostate Cancer,Prostate Cancer,Prostate Cancer,1,0,0,0,0


In [15]:
import os
df_info_path = os.path.join(
    "/aloy",
    "home",
    "ddalton",
    "projects",
    "disease_signatures",
    "data",
    "DiSignAtlas",
    "Disease_information_Datasets_extended.csv",
)


df_info = pd.read_csv(df_info_path)

In [27]:

library_strategies_of_interest_set = {"RNA-seq", "Microarray"}

QUERY = "library_strategy in @library_strategies_of_interest_set & organism == 'Homo sapiens'"


logging.info(f"df_info shape: {df_info.shape}")


df_info_query = df_info.query(QUERY)

logging.info(f"df_info_query shape: {df_info_query.shape}")

# Nº of diseases
diseases_dsaid = set()
diseases_dt = set()
for disease in df_info_query["disease"].unique():
    QUERY = "disease == @disease"
    _df_query = df_info_query.query(QUERY)
    if _df_query["dsaid"].nunique() >= 5:
        diseases_dsaid.add(disease)
    if _df_query["accession"].nunique() >= 5:
        diseases_dt.add(disease)

logging.info(f"Nº of diseases w/ 5+ dsaids {len(diseases_dsaid)}")
logging.info(f"Nº of diseases w/ 5+ datasets {len(diseases_dt)}")

2024-09-02 17:08:08,483 - root - INFO - df_info shape: (10306, 14)
2024-09-02 17:08:08,486 - root - INFO - df_info_query shape: (4468, 14)
2024-09-02 17:08:09,543 - root - INFO - Nº of diseases w/ 5+ dsaids 198
2024-09-02 17:08:09,543 - root - INFO - Nº of diseases w/ 5+ datasets 112


In [25]:
df_info_query.query(QUERY)

Unnamed: 0,dsaid,accession,platform,deg_count,disease,diseaseid,tissue,data_source,library_strategy,organism,control_case_sample_count,definition,Control,Case


In [32]:
df_info[
    (df_info["organism"] == "Homo sapiens")
    & (
        (df_info["library_strategy"] == "RNA-Seq")
        | (df_info["library_strategy"] == "Microarray")
    )
].shape[0]

7004

In [36]:
library_strategies_of_interest_set = {"RNA-Seq", "Microarray"}

QUERY = "library_strategy in @library_strategies_of_interest_set & organism == 'Homo sapiens'"
df_info.query(QUERY).shape[0]

7004

In [30]:

df_info["library_strategy"].value_counts()

library_strategy
Microarray    5994
RNA-Seq       3984
scRNA-Seq      318
snRNA-Seq       10
Name: count, dtype: int64

In [19]:
df_info

Unnamed: 0,dsaid,accession,platform,deg_count,disease,diseaseid,tissue,data_source,library_strategy,organism,control_case_sample_count,definition,Control,Case
0,DSA00001,GSE224398,GPL21103,1000,Alzheimer's Disease,C0002395,Hippocampus,GEO,scRNA-Seq,Mus musculus,1|1,DO:An Alzheimer's disease that has_material_ba...,GSM7021712,GSM7021715
1,DSA00002,GSE224398,GPL21103,1000,Alzheimer's Disease,C0002395,Hippocampus,GEO,scRNA-Seq,Mus musculus,1|1,DO:An Alzheimer's disease that has_material_ba...,GSM7021713,GSM7021716
2,DSA00003,GSE224398,GPL21103,1000,Alzheimer's Disease,C0002395,Hippocampus,GEO,scRNA-Seq,Mus musculus,1|1,DO:An Alzheimer's disease that has_material_ba...,GSM7021714,GSM7021717
3,DSA00004,GSE224022,GPL16791,1000,Retinoblastoma,C0035335,Retina,GEO,RNA-Seq,Homo sapiens,4|5,DO:A retinal cell cancer and malignant neoplas...,GSM7009973;GSM7009974;GSM7009976;GSM7009977,GSM7009978;GSM7009979;GSM7009980;GSM7009981;GS...
4,DSA00005,GSE126342,GPL11154,1000,Congenital Myotonic Dystrophy,C0410226,Skeletal muscle,GEO,RNA-Seq,Homo sapiens,9|11,MONDO:An inherited progressive disorder affect...,GSM3596881;GSM3596882;GSM3596883;GSM3596884;GS...,GSM3596870;GSM3596871;GSM3596872;GSM3596873;GS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10301,DSA10302,GSE6008,GPL96,1000,Ovarian Tumor,C1140680,Ovary,GEO,Microarray,Homo sapiens,4|41,DO:A female reproductive organ cancer that is ...,Click to openGSM139476;GSM139477;GSM139478;GSM...,Click to openGSM139435;GSM139436;GSM139437;GSM...
10302,DSA10303,GSE6280,GPL96,758,Kidney Tumor,C0022665,Kidney,GEO,Microarray,Homo sapiens,6|14,DO:A urinary system cancer that is located_in ...,GSM144461;GSM144462;GSM144463;GSM144472;GSM144...,GSM144464;GSM144465;GSM144466;GSM144467;GSM144...
10303,DSA10304,GSE6280,GPL97,337,Kidney Tumor,C0022665,Kidney,GEO,Microarray,Homo sapiens,6|14,DO:A urinary system cancer that is located_in ...,GSM144481;GSM144482;GSM144483;GSM144492;GSM144...,GSM144484;GSM144485;GSM144486;GSM144487;GSM144...
10304,DSA10305,GSE6344,GPL96,1000,Clear Cell Ependymoma,C1384403,Kidney,GEO,Microarray,Homo sapiens,5|5,"EFO:A WHO grade II, slow growing tumor of chil...",GSM146778;GSM146780;GSM146782;GSM146784;GSM146786,GSM146779;GSM146781;GSM146783;GSM146785;GSM146787


In [43]:
df_info_path = os.path.join(
    "/aloy",
    "home",
    "ddalton",
    "projects",
    "disease_signatures",
    "data",
    "DiSignAtlas",
    "Disease_information_Datasets_extended.csv",
)


df_info = pd.read_csv(df_info_path)

print(df_info["disease"].value_counts()[:30].index)

Index(['Huntington's Disease', 'Alzheimer's Disease', 'Asthma', 'COVID-19',
       'Influenza', 'Parkinson's Disease', 'Systemic Lupus Erythematosus',
       'Obesity', 'Hepatocellular Carcinoma', 'Crohn's Disease',
       'Ulcerative Colitis', 'Sepsis', 'Breast Cancer', 'Psoriasis',
       'Schizophrenia', 'Multiple Sclerosis', 'Amyotrophic Lateral Sclerosis',
       'Tuberculosis', 'Chronic Obstructive Pulmonary Disease',
       'Rheumatoid Arthritis', 'Idiopathic Pulmonary Fibrosis',
       'Colorectal Carcinoma', 'Type 1 Diabetes',
       'Non-Alcoholic Steatohepatitis', 'Melanoma', 'Diabetes',
       'Myocardial Infarction', 'Acute Myeloid Leukemia (Aml-M2)', 'Colitis',
       'Prostate Cancer'],
      dtype='object', name='disease')


In [44]:
from time import datetime

def get_folder_name():
    # Step 1: Generate today's date string
    today = datetime.now().strftime("%y-%m-%d")

    # Step 2: Define the base output directory
    base_output_dir = os.path.join("..", "outputs")

    # Step 3: Find the highest existing run number for today
    existing_runs = [
        d for d in os.listdir(base_output_dir)
        if os.path.isdir(os.path.join(base_output_dir, d)) and d.startswith(f"run-{today}")
    ]

    # Extract numbers from existing runs and find the max
    existing_numbers = [
        int(d.split("-")[-1]) for d in existing_runs if d.split("-")[-1].isdigit()
    ]

    # Calculate the next run number
    next_run_number = max(existing_numbers, default=0) + 1

    # Step 4: Create the directory name with zero-padded run number
    output_dir = os.path.join(base_output_dir, f"run-{today}-{next_run_number:02d}")

    # Step 5: Create the directory
    os.makedirs(output_dir, exist_ok=True)

    print(f"Output directory created: {output_dir}")
    return output_dir

outpud_dir = get_folder_name()

NameError: name 'datetime' is not defined