# iLINCS Freeze

## iLINCS Freeze: CSV

In [1]:
"""
iLINCS Freeze: CSV

Here we freeze (snapshot) the iLINCS database by downloading its metadata through the iLINCS API. The results are stored as CSV files.

Structure:
    1. Imports, Variables, Functions
    2. Retrieve Data
    3. Parse Data
    4. Store Data

"""

# 1. Imports, Variables, Functions
# imports
import requests, os
import pandas as pd
import logging

# Detach every handler currently on the root logger. basicConfig() is a no-op
# when the root logger already has handlers, so this lets the configuration
# below take effect even if logging was set up earlier in the session.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)

# Configure logging: INFO level, timestamped messages
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logging.info("iLINCS Freeze: CSV")

# variables
# Directory that receives every CSV file written below.
OUTPUT_PATH = "../data/iLINCS"


# functions
def get_signatures():
    """
    get_signatures
    Retrieves signature metadata from the iLINCS API (SignatureMeta endpoint).

    Parameters:
    - None

    Returns:
    - The decoded JSON body of the response when the server answers with
      HTTP 200 (expected to be a list of dictionaries, one per signature).
      Returns None for any other status code, after printing a message.
    """
    url = "http://www.ilincs.org/api/SignatureMeta?"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print("Failed to retrieve data")
        return None


def get_datasets():
    """
    get_datasets
    Fetches the public datasets from the iLINCS API (PublicDatasets endpoint).

    Parameters:
    - None

    Returns:
    - The decoded JSON body when the server answers with HTTP 200
      (expected to be a list of dictionaries, one per dataset).
      Returns None for any other status code, after printing a message.
    """
    # An earlier variant of this request capped the result size with
    # ?filter={"limit":1000}; the unfiltered endpoint is used here.
    reply = requests.get("http://www.ilincs.org/api/PublicDatasets?")
    if reply.status_code != 200:
        print("Failed to retrieve datasets")
        return None
    return reply.json()


def get_genes():
    """
    get_genes
    Fetches gene information from the iLINCS API (GeneInfos endpoint).

    Parameters:
    - None

    Returns:
    - The decoded JSON body when the server answers with HTTP 200
      (expected to be a list of dictionaries, one per gene).
      Returns None for any other status code, after printing a message.
    """
    reply = requests.get("http://www.ilincs.org/api/GeneInfos?")
    if reply.status_code != 200:
        print("Failed to retrieve genes")
        return None
    return reply.json()


def get_compounds():
    """
    get_compounds
    Fetches the compounds from the iLINCS API (Compounds endpoint).

    Parameters:
    - None

    Returns:
    - The decoded JSON body when the server answers with HTTP 200
      (expected to be a list of dictionaries, one per compound).
      Returns None for any other status code, after printing a message.
    """
    reply = requests.get("http://www.ilincs.org/api/Compounds?")
    if reply.status_code != 200:
        print("Failed to retrieve compounds")
        return None
    return reply.json()


def save_to_csv(data, filename):
    """
    save_to_csv
    Saves given data to a CSV file, creating the target directory if needed.

    Parameters:
    - data: List[Dict]
        The data to be saved into a CSV file.
    - filename: str
        Path of the file to save the data into.

    Returns:
    - df: pd.DataFrame
        The DataFrame built from `data`, i.e. the content that was written
        (without the index column).
    """
    # DataFrame.to_csv does not create missing directories, so make sure the
    # parent exists first. Only paths are handled; anything else (e.g. an
    # already open file object) is passed straight through to pandas.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

    return df


def get_signature_data(signature_id):
    """
    get_signature_data
    Downloads the data of a single signature from the iLINCS API
    (ilincsR/downloadSignature endpoint).

    Parameters:
    - signature_id: str
        Signature ID of interest; sent as the `sigID` form field.

    Returns:
    - The decoded JSON body when the server answers with HTTP 200.
      Returns None for any other status code, after printing a message
      that names the signature.
    """
    url = "http://www.ilincs.org/api/ilincsR/downloadSignature"
    payload = {"sigID": signature_id}
    response = requests.post(url, data=payload)
    if response.status_code == 200:
        return response.json()  # Or process the response as needed
    else:
        print(f"Failed to retrieve data for signature {signature_id}")
        return None


def download_signature_data(signature_ids, no_of_top_genes, display):
    """
    Download iLINCS Signature Data

    Sends all requested signature IDs in one POST request to the
    ilincsR/downloadSignature endpoint.

    Arguments:
    - signature_ids: list of str
        Signature IDs; joined with commas into the `sigID` form field
    - no_of_top_genes: int
        Nº of top DE genes; sent as the `noOfTopGenes` form field
    - display: bool
        Sent as the `display` form field

    Returns:
    - The decoded JSON body when the server answers with HTTP 200.
      Returns None otherwise, after printing the status code and body text.
    """
    form_fields = {
        "sigID": ",".join(signature_ids),
        "noOfTopGenes": no_of_top_genes,
        "display": display,
    }
    reply = requests.post(
        "http://www.ilincs.org/api/ilincsR/downloadSignature", data=form_fields
    )
    if reply.status_code != 200:
        print("Error:", reply.status_code, reply.text)
        return None
    return reply.json()


# 2. Retrieve Data
# Every getter returns None when the API call fails. Stop right away with an
# explicit error in that case instead of failing later with a TypeError from
# len(None) or writing an empty CSV.

# get signatures
signatures = get_signatures()
if signatures is None:
    raise RuntimeError("Could not retrieve signatures from the iLINCS API")
logging.info(f"Nº Retrieved Signatures {len(signatures)}")

# get datasets
datasets = get_datasets()
if datasets is None:
    raise RuntimeError("Could not retrieve datasets from the iLINCS API")
logging.info(f"Nº Retrieved Datasets {len(datasets)}")

# get genes
genes = get_genes()
if genes is None:
    raise RuntimeError("Could not retrieve genes from the iLINCS API")
logging.info(f"Nº Retrieved Genes {len(genes)}")

# get compounds
compounds = get_compounds()
if compounds is None:
    raise RuntimeError("Could not retrieve compounds from the iLINCS API")
logging.info(f"Nº Retrieved Compounds {len(compounds)}")

# get signature vectors
# signature_vectors = get_signature_data()

# 3. Parse Data

# 4. Store Data
# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# parse & store signatures
df_signatures = save_to_csv(signatures, os.path.join(OUTPUT_PATH, "signatures.csv"))

# parse & store datasets
df_datasets = save_to_csv(datasets, os.path.join(OUTPUT_PATH, "datasets.csv"))

# parse & store genes
df_genes = save_to_csv(genes, os.path.join(OUTPUT_PATH, "genes.csv"))

# parse & store compounds
df_compounds = save_to_csv(compounds, os.path.join(OUTPUT_PATH, "compounds.csv"))

2023-12-06 16:50:56,212 - INFO - iLINCS Freeze: CSV
2023-12-06 16:51:20,710 - INFO - Nº Retrieved Signatures 227578
2023-12-06 16:51:28,807 - INFO - Nº Retrieved Datasets 41272
2023-12-06 16:51:42,580 - INFO - Nº Retrieved Genes 235982
2023-12-06 16:51:44,158 - INFO - Nº Retrieved Compounds 21299


In [5]:
def download_batch_signature_data(
    signature_ids, no_of_top_genes, display, batch_size=10
):
    """
    Download iLINCS Signature Data in batches

    The IDs are split into consecutive slices of `batch_size` and one POST
    request is sent per slice. A slice whose request does not return HTTP 200
    is reported on stdout and skipped; the remaining slices are still fetched.

    Arguments:
    - signature_ids: list of str
        List of signature IDs
    - no_of_top_genes: int
        Number of top differentially expressed genes
        (sent as the `noOfTopGenes` form field)
    - display: bool
        Sent as the `display` form field
    - batch_size: int
        Number of signatures to download in each batch

    Returns:
    - dict
        Maps each signatureID found in the responses to the list of
        response items carrying that ID.
    """
    url = "http://www.ilincs.org/api/ilincsR/downloadSignature"
    by_signature = {}

    for start in range(0, len(signature_ids), batch_size):
        # Progress indicator: shows the offset of the current slice and is
        # overwritten in place thanks to the carriage return.
        print(f"Batch {start}", end="\r")

        chunk = signature_ids[start : start + batch_size]
        form_fields = {
            "sigID": ",".join(chunk),
            "noOfTopGenes": no_of_top_genes,
            "display": display,
        }
        reply = requests.post(url, data=form_fields)

        if reply.status_code != 200:
            print(
                f"Error in batch {start // batch_size + 1}: {reply.status_code}, {reply.text}"
            )
            continue

        # Group the returned items by the signature they belong to.
        for record in reply.json()["data"]["signature"]:
            by_signature.setdefault(record["signatureID"], []).append(record)

    # Finish the progress line.
    print()
    return by_signature

In [6]:
# Unique signature IDs that belong to library "LIB_1".
# NOTE(review): presumably LIB_1 is the disease library, going by the
# variable name - confirm against the iLINCS library list.
disease_signatureIDs = df_signatures.loc[
    df_signatures["libraryid"] == "LIB_1", "signatureid"
].unique()

In [7]:
# Report how many unique signature IDs were selected above.
logging.info(f"Nº Disease Signatures {len(disease_signatureIDs)}")

2023-12-06 16:52:03,242 - INFO - Nº Disease Signatures 9097


In [8]:
# Trial run: fetch only the first 20 disease signature IDs, sent as a single
# batch (batch_size equals the slice length).
# NOTE(review): no_of_top_genes=100000 is presumably meant to be large enough
# to return every gene of a signature - confirm against the API.
b = download_batch_signature_data(
    signature_ids=disease_signatureIDs[:20],
    no_of_top_genes=100000,
    display=True,
    batch_size=20,
)

Batch 0


In [10]:
import os




def save_signature_data(data, filename):
    """
    save_signature_data
    Saves the records of one signature to a CSV file, creating the target
    directory if needed.

    Parameters:
    - data: List[Dict]
        The data to be saved into a CSV file.
    - filename: str
        Path of the file to save the data into.

    Returns:
    - df: pd.DataFrame
        The DataFrame built from `data`, i.e. the content that was written
        (without the index column).
    """
    # DataFrame.to_csv does not create missing directories, so make sure the
    # parent exists first. Only paths are handled; anything else (e.g. an
    # already open file object) is passed straight through to pandas.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

    return df



# Write one CSV per signature: `b` maps each signature ID to its list of
# records, and the ID is used as the file name inside OUTPUT_PATH.
for key, value in b.items():
    save_signature_data(value, os.path.join(OUTPUT_PATH, f"{key}.csv"))
    