# DiSignAtlas Download

## Download

In [1]:
"""We have found examples where in the bulk download of 
differential expression in the DiSignAtlas, the profile does
not match that of the individual download. This script is 
meant to download the individual files.

Structure:
    1. Imports, Variables, and Functions
    2. Load Identifiers
    3. Download Data
"""

# 1. Imports, Variables, and Functions
# imports
import os, sys, pandas as pd, numpy as np
import requests
import logging
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

# Root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# variables
# Input: dataset metadata table, read later for its "dsaid" column.
data_path = os.path.join(
    "..",
    "data",
    "DiSignAtlas",
    "Disease_information_Datasets.csv",
)
# Output: directory holding one downloaded file per dataset identifier.
output_path = os.path.join(
    "..",
    "data",
    "DiSignAtlas",
    "dsa_diff_download.individual_downloads",
)


# functions
def download_file(dsaid):
    """Download the individual differential-expression file for one dataset.

    The response body is written, unmodified, to the individual-downloads
    directory under a file named after the last segment of the download URL
    (i.e. ``dsaid``). The request is retried up to 10 times with a 10 second
    timeout each; only an HTTP 200 response is written to disk. If every
    attempt fails, a message is logged and no file is written.

    Args:
        dsaid: Dataset identifier appended to the download URL.

    Returns:
        None. Defined at top level so it can be dispatched by ``process_map``.
    """
    output_path = os.path.join(
        "..", "data", "DiSignAtlas", "dsa_diff_download.individual_downloads/"
    )
    # The target directory may not exist on a fresh checkout. Without this the
    # open() below raises FileNotFoundError, which is not a RequestException
    # and would therefore escape the retry loop and crash the worker.
    # exist_ok=True keeps concurrent workers from racing on creation.
    os.makedirs(output_path, exist_ok=True)

    download_url = (
        f"http://www.inbirg.com/disignatlas/download/diff_results_download/{dsaid}"
    )
    filename = download_url.split("/")[-1]
    file_path = os.path.join(output_path, filename)

    max_retries = 10
    timeout = 10
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(download_url, timeout=timeout)
        except requests.RequestException as e:
            # Network-level failure (timeout, connection error, ...): retry.
            logging.debug(
                f"Attempt {attempt} of {max_retries} for {filename}: "
                f"failed with error {e}"
            )
            continue

        if response.status_code == 200:
            with open(file_path, "wb") as file:
                file.write(response.content)
            return

        # Non-200 response: nothing is written, try again.
        logging.debug(
            f"Attempt {attempt} of {max_retries} for {filename}: "
            f"failed with status code {response.status_code}"
        )

    logging.info(f"Failed to download {filename} after {max_retries} attempts.")


# 2. Load Identifiers
# load df
df = pd.read_csv(data_path)
dsaids = df["dsaid"].to_list()

# 3. Download Data
# multiprocessing download
# download_file returns None for every identifier, so the value of process_map
# is a list of None as long as dsaids. The trailing semicolon stops the
# notebook from echoing that list as the cell output.
process_map(download_file, dsaids, max_workers=8, chunksize=1);

  from .autonotebook import tqdm as notebook_tqdm
 84%|████████▎ | 8621/10306 [33:47<40:10,  1.43s/it]   2024-02-05 12:21:19,863 - INFO - Failed to download DSA08622 after 10 attempts.
2024-02-05 12:21:19,863 - INFO - Failed to download DSA08623 after 10 attempts.
 84%|████████▎ | 8622/10306 [35:16<6:07:04, 13.08s/it]2024-02-05 12:21:20,071 - INFO - Failed to download DSA08625 after 10 attempts.
 84%|████████▎ | 8625/10306 [35:17<3:55:01,  8.39s/it]2024-02-05 12:21:20,080 - INFO - Failed to download DSA08626 after 10 attempts.
2024-02-05 12:21:20,444 - INFO - Failed to download DSA08627 after 10 attempts.
100%|██████████| 10306/10306 [41:52<00:00,  4.10it/s] 


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
## Filter out failed downloads

In [9]:
""" Here we want to seek out which files downloaded as succesful
csv files and which failed to download.

Structure:
    1. Imports, Variables, and Functions
    2. Load data info
    3. Seek out failed csv files
"""

# 1. Imports, Variables, and Functions
# imports
import os, sys, pandas as pd, numpy as np
import requests
import logging
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import csv

# Root logger: INFO level, each record prefixed with timestamp and level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# variables
# Input: dataset metadata table, read later for its "dsaid" column.
data_path = os.path.join(
    "..",
    "data",
    "DiSignAtlas",
    "Disease_information_Datasets.csv",
)
# Directory scanned below for the individually downloaded files.
output_path = os.path.join(
    "..",
    "data",
    "DiSignAtlas",
    "dsa_diff_download.individual_downloads",
)


# functions
def check_files(directory):
    """Classify every entry of ``directory`` as a CSV file or an HTML page.

    Each entry is opened as UTF-8 text. If the first 1024 characters contain
    an HTML doctype or an opening ``<html`` tag (matched case-insensitively,
    with or without attributes), the entry is recorded as HTML. Otherwise the
    first row is parsed with ``csv.reader`` and the entry is recorded as CSV
    when that row is non-empty.

    Entries that cannot be opened, decoded or parsed, and entries whose first
    row is empty, are logged and appear in neither list.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A ``(csv_files, html_files)`` tuple of lists of file names (not full
        paths), each in sorted name order.
    """
    csv_files = []
    html_files = []

    # os.listdir order is arbitrary; sort so the returned lists are reproducible.
    for filename in tqdm(sorted(os.listdir(directory))):
        file_path = os.path.join(directory, filename)

        try:
            with open(file_path, "r", newline="", encoding="utf-8") as f:
                # Text mode: this reads up to 1024 characters, not bytes.
                # Lower-cased so "<!doctype html>" and "<HTML>" are caught too;
                # "<html" without ">" also matches tags carrying attributes.
                start = f.read(1024).lower()
                if "<!doctype html" in start or "<html" in start:
                    html_files.append(filename)
                    continue

                # Parse the first row to make sure the content reads as CSV.
                f.seek(0)
                # Default of None avoids a StopIteration on an empty file.
                headers = next(csv.reader(f), None)
        except (OSError, UnicodeError, csv.Error) as e:
            # Unreadable entry (e.g. a sub-directory), non-UTF-8 content, or a
            # row the csv module rejects.
            logging.info(f"Error processing file {filename}: {e}")
            continue

        if headers:
            csv_files.append(filename)
        else:
            logging.info(
                f"Error processing file {filename}: empty file or blank first row"
            )

    return csv_files, html_files


# 2. Load data info
df = pd.read_csv(data_path)
dsaids = df["dsaid"].to_list()

# 3. Seek out failed csv files
# Identifiers for which no file of the same name exists in the download directory.
not_downloaded_dsaids = [
    dsaid
    for dsaid in dsaids
    if not os.path.exists(os.path.join(output_path, dsaid))
]
logging.info(f"Number of datasets not downloaded: {len(not_downloaded_dsaids)}")

# Split the files that are present into CSV content and HTML content.
csv_files, html_files = check_files(output_path)
logging.info(f"Number of csv datasets: {len(csv_files)}")
logging.info(f"Number of html datasets: {len(html_files)}")

2024-02-05 12:50:47,082 - INFO - Number of datasets not downloaded: 3
100%|██████████| 10303/10303 [01:22<00:00, 124.21it/s] 
2024-02-05 12:52:10,032 - INFO - Number of csv datasets: 9978
2024-02-05 12:52:10,032 - INFO - Number of html datasets: 325


In [7]:
# FIXME(review): this cell was left unfinished. The comparison had no
# right-hand operand, which is a SyntaxError and breaks "Restart & Run All".
# Fill in the intended value before re-enabling the lookup:
# df[df["library_id"] == <value>]["dsaid"]

 correlation.h5                         DSA01882_alldiff.test.iLINCS.csv
 diff_results_download.zip              [0m[01;34mdsa_diff_download[0m/
 diff_source_code                       [01;34mdsa_diff_download.individual_downloads[0m/
'Disease_information_Datasets(2).csv'   enrich_source_code
 Disease_information_Datasets.csv       ess.h5
 dis_info_datasets                      gene_info
 dis_info_degs


In [39]:
# Identifiers of the datasets whose library strategy is Microarray or RNA-Seq.
dsaids_interest = df.loc[
    df["library_strategy"].isin(["Microarray", "RNA-Seq"]), "dsaid"
].to_list()

In [40]:
# Number of datasets whose library strategy is Microarray or RNA-Seq.
len(dsaids_interest)

9978

In [41]:
# Downloaded files are named after their dataset identifier, so this counts the
# datasets of interest whose download was classified as CSV.
len(set(csv_files).intersection(dsaids_interest))

9978