# SQL

In [1]:
"""
Here we will be creating an SQL database to store disease signatures from iLINCS!

We will filter those signatures which belong to diseases - which are >9,000 signatures
from the iLINCS database.

Resources: 
    * http://www.ilincs.org/ilincs/APIinfo
"""

'\nHere we will be creating an SQL database to store disease signatures from iLINCS!\n\nWe will filter those signatures which belong to diseases - which are >9,000 signatures\nfrom the iLINCS database.\n\nResources: \n    * http://www.ilincs.org/ilincs/APIinfo\n'

## Create Table Dataset

In [23]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging

import os  # needed below to read the database password from the environment

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not live in the notebook: the password is taken from the
# PGPASSWORD environment variable. When it is unset the value is None, which
# psycopg2 omits from the connection string so that libpq applies its own
# lookup (e.g. a ~/.pgpass file).
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/datasets.csv"
path_signature = "../../data/iLINCS/signatures.csv"
table_name = "datasets"
primary_key = "experiment"  # iLINCS column name; mapped to its SQL name below
int_columns = ["nsamples"]  # INT columns - rest TEXT
drop_table = True  # drop and recreate the table instead of only emptying it
columns_of_interest = ["geolink", "publink", "experiment", "organism", "description"]
# iLINCS column name -> SQL column name
ilincs_2_sql_columns = {
    "experiment": "dataset_id",
    "geolink": "geo_link",
    "publink": "pub_link",
    "organism": "organism",
    "description": "description",
}


# functions
def get_disease_datasets(
    path_data="../../data/iLINCS/signatures.csv", library_id="LIB_1"
):
    """
    Get Disease Datasets
    Retrieve the unique dataset ids referenced by the filtered signatures.

    Arguments:
    path_data: str
        Path to the signatures CSV; the columns ``libraryid`` and
        ``datasetid`` are read from it.
    library_id: str
        Value of ``libraryid`` that selects the signatures to keep
        (the default "LIB_1" is the disease-signature filter used
        throughout this notebook).

    Return:
    datasetid: list()
        List of unique dataset id's

    Raises:
    FileNotFoundError
        When ``path_data`` does not exist; the problem is logged first.
    """
    # Load Data
    try:
        df_signatures = pd.read_csv(path_data)
        logging.info("Data loaded successfully.")
    except FileNotFoundError:
        logging.error("Data file not found. Please check the file path.")
        # re-raise instead of calling exit(): a helper should not terminate
        # the interpreter/kernel on behalf of its caller
        raise

    # filter disease signatures
    disease_mask = df_signatures["libraryid"] == library_id

    return list(df_signatures.loc[disease_mask, "datasetid"].unique())


# 2. Load Data
try:
    df_data = pd.read_csv(path_data)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("Data file not found. Please check the file path.")
    exit()

# get unique disease dataset id's
unique_datasetid = get_disease_datasets()

# filter disease signatures
# dataset id's refer to experiment
df_data = df_data[df_data["experiment"].isin(unique_datasetid)]

# filter columns of interest; .copy() makes the result an independent frame so
# the column assignments further down never write into a slice of the frame
# that was loaded from disk
df_data = df_data[columns_of_interest].copy()

# assert we find all dataset id's
assert len(unique_datasetid) == len(
    df_data["experiment"].unique()
), "Error, Dataset Table does not conain all unique datasetid"

# get max length for each column (length of the str() form of every value);
# these lengths size the VARCHAR columns when the table is created
max_lengths = [
    max(len(str(n)) for n in df_data[c].to_list()) for c in df_data.columns
]

# Convert specified integer columns and handle NaN by replacing with 0
# (only the int columns that are actually present are touched)
for col in set(int_columns) & set(df_data.columns):
    df_data[col] = pd.to_numeric(df_data[col], errors="coerce").fillna(0).astype(int)

# For other columns, replace NaN with None (which will become NULL in SQL)
for col in df_data.columns:
    if col not in int_columns:
        df_data[col] = df_data[col].where(pd.notnull(df_data[col]), None)

# Drop Duplicate for experiment column (it becomes the primary key)
df_data = df_data.drop_duplicates(subset="experiment", keep="first")

logging.info(f"Shape of filtered DataFrame: {df_data.shape}")

2023-12-13 15:51:58,318 - INFO - Data loaded successfully.
  df_data = pd.read_csv(path_data)
2023-12-13 15:51:58,782 - INFO - Data loaded successfully.
2023-12-13 15:51:58,833 - INFO - Shape of filtered DataFrame: (1087, 5)


In [20]:
"""for c in df_data.columns:
    print(f"######{c}######\n{df_data[c].value_counts()}")"""

'for c in df_data.columns:\n    print(f"######{c}######\n{df_data[c].value_counts()}")'

In [25]:
# 3. Connect with Database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    logging.info("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    logging.error(f"Unable to connect to the database: {e}")
    exit()

# 4. Create Cursor Object
cursor = conn.cursor()

# 5. Check if Table Exists and Delete Data if It Does
# Check if the table exists and drop it if it does.
# table_name is interpolated into the SQL text; it comes from the
# configuration cell of this notebook, not from external input.
if drop_table:
    try:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name} CASCADE;")
        conn.commit()
        # report through the logger (not print) so the message carries a
        # timestamp and level like every other message of this cell
        logging.info(f"Table {table_name} dropped successfully if it existed.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred: {e}")
        conn.rollback()


cursor.execute(
    "SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name=%s)",
    (table_name,),
)
table_exists = cursor.fetchone()[0]

if table_exists:
    # the table was kept (drop_table is False or the drop failed): empty it
    try:
        cursor.execute(f"DELETE FROM {table_name};")
        conn.commit()
        logging.info(f"Existing data in table {table_name} deleted successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while deleting data from the table: {e}")
        cursor.close()
        conn.close()
        exit()
else:
    # Create table if it does not exist: columns named in int_columns become
    # INT, every other column a VARCHAR sized to its longest observed value
    # plus 10 characters of headroom
    column_text = ", ".join(
        f"{ilincs_2_sql_columns.get(c)} VARCHAR({n + 10})"
        if c not in int_columns
        else f"{ilincs_2_sql_columns.get(c)} INT"
        for c, n in zip(df_data.columns, max_lengths)
    )
    create_table_query = f"CREATE TABLE {table_name} ({column_text}, PRIMARY KEY({ilincs_2_sql_columns.get(primary_key)}));"
    try:
        cursor.execute(create_table_query)
        conn.commit()
        logging.info(f"Table {table_name} created successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while creating the table: {e}")
        cursor.close()
        conn.close()
        exit()

2023-12-13 16:15:28,206 - INFO - Connected to the database successfully.
2023-12-13 16:15:28,215 - INFO - Table datasets created successfully.


Table datasets dropped successfully if it existed.


In [31]:
# 6. Dump Data into Table
data_tuples = list(df_data.itertuples(index=False, name=None))
# column list and one %s placeholder per column, in DataFrame column order
sql_columns = ", ".join(ilincs_2_sql_columns.get(c) for c in df_data.columns)
placeholders = ", ".join(["%s"] * len(df_data.columns))
insert_query = f"INSERT INTO {table_name} ({sql_columns}) VALUES ({placeholders})"

try:
    # `with conn` commits when the block ends normally and rolls back when an
    # exception leaves it, so the load is all-or-nothing
    with conn:
        with conn.cursor() as curs:
            for record in data_tuples:
                try:
                    curs.execute(insert_query, record)
                except psycopg2.Error as e:
                    logging.error(f"Error inserting record {record}: {e}")
                    # propagate the failure: merely breaking out of the loop
                    # would fall through to the success message below even
                    # though nothing usable was stored
                    raise
    logging.info(f"Data dumped into {table_name} successfully.")
except psycopg2.Error as e:
    logging.error(f"An error occurred while inserting data into the table: {e}")
    conn.rollback()
    exit()

2023-12-13 16:19:14,230 - INFO - Data dumped into datasets successfully.


## Create Signature Table

In [1]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging

import os  # needed below to read the database password from the environment

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not live in the notebook: the password is taken from the
# PGPASSWORD environment variable. When it is unset the value is None, which
# psycopg2 omits from the connection string so that libpq applies its own
# lookup (e.g. a ~/.pgpass file).
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/signatures.csv"
table_name = "signatures"
primary_key = "signatureid"  # iLINCS column name; mapped to its SQL name below
int_columns = ["nCtrSamples", "nTrtSamples", "pubChemID"]  # INT columns - rest TEXT
drop_table = True  # drop and recreate the table instead of only emptying it
# boolean row mask selecting the disease signatures (libraryid LIB_1)
filter_df = lambda df: df["libraryid"] == "LIB_1"
# foreign key: signatures.dataset_id -> datasets.dataset_id
reference_table = "datasets"
reference_key = "dataset_id"
foreign_key = "dataset_id"

columns_of_interest = [
    "signatureid",
    "datasetid",
    "level1",
    "level2",
    "tissue",
    "cellline",
]

# iLINCS column name -> SQL column name
ilincs_2_sql_columns = {
    "signatureid": "signature_id",
    "datasetid": "dataset_id",
    "level1": "condition_1",
    "level2": "condition_2",
    "tissue": "tissue",
    "cellline": "cell_line",
}

# 2. Load Data
try:
    df_data = pd.read_csv(path_data)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("Data file not found. Please check the file path.")
    exit()

# filter disease signatures
# which are libraryid LIB_1
df_data = df_data[filter_df]

# filter columns of interest; .copy() makes the result an independent frame so
# the column assignments further down never write into a slice of the frame
# that was loaded from disk
df_data = df_data[columns_of_interest].copy()

# get max length for each column (length of the str() form of every value);
# these lengths size the VARCHAR columns when the table is created
max_lengths = [
    max(len(str(n)) for n in df_data[c].to_list()) for c in df_data.columns
]

# Convert specified integer columns and handle NaN by replacing with 0
# (only the int columns that are actually present are touched)
for col in set(int_columns) & set(df_data.columns):
    df_data[col] = pd.to_numeric(df_data[col], errors="coerce").fillna(0).astype(int)

# For other columns, replace NaN with None (which will become NULL in SQL)
for col in df_data.columns:
    if col not in int_columns:
        df_data[col] = df_data[col].where(pd.notnull(df_data[col]), None)

logging.info(f"Shape of filtered DataFrame: {df_data.shape}")

  df_data = pd.read_csv(path_data)
2023-12-13 16:38:04,516 - INFO - Data loaded successfully.
2023-12-13 16:38:04,557 - INFO - Shape of filtered DataFrame: (9097, 6)


In [3]:
# 3. Connect with Database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    logging.info("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    logging.error(f"Unable to connect to the database: {e}")
    exit()

# 4. Create Cursor Object
cursor = conn.cursor()

# 5. Check if Table Exists and Delete Data if It Does

# Check if the table exists and drop it if it does.
# table_name is interpolated into the SQL text; it comes from the
# configuration cell of this notebook, not from external input.
if drop_table:
    try:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name} CASCADE;")
        conn.commit()
        # report through the logger (not print) so the message carries a
        # timestamp and level like every other message of this cell
        logging.info(f"Table {table_name} dropped successfully if it existed.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred: {e}")
        conn.rollback()


cursor.execute(
    "SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name=%s)",
    (table_name,),
)
table_exists = cursor.fetchone()[0]

if table_exists:
    # the table was kept (drop_table is False or the drop failed): empty it
    try:
        cursor.execute(f"DELETE FROM {table_name};")
        conn.commit()
        logging.info(f"Existing data in table {table_name} deleted successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while deleting data from the table: {e}")
        cursor.close()
        conn.close()
        exit()
else:
    # Create table if it does not exist: columns named in int_columns become
    # INT, every other column a VARCHAR sized to its longest observed value
    # plus 10 characters of headroom; the foreign key ties each signature to
    # a row of the reference table
    column_text = ", ".join(
        f"{ilincs_2_sql_columns.get(c)} VARCHAR({n + 10})"
        if c not in int_columns
        else f"{ilincs_2_sql_columns.get(c)} INT"
        for c, n in zip(df_data.columns, max_lengths)
    )
    create_table_query = f"CREATE TABLE {table_name} ({column_text}, PRIMARY KEY({ilincs_2_sql_columns.get(primary_key)}),FOREIGN KEY ({foreign_key}) REFERENCES {reference_table}({reference_key}));"
    try:
        cursor.execute(create_table_query)
        conn.commit()
        logging.info(f"Table {table_name} created successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while creating the table: {e}")
        cursor.close()
        conn.close()
        exit()

2023-12-13 16:38:56,268 - INFO - Connected to the database successfully.
2023-12-13 16:38:56,275 - INFO - Table signatures created successfully.


Table signatures dropped successfully if it existed.


In [5]:
# 6. Dump Data into Table
data_tuples = list(df_data.itertuples(index=False, name=None))
# column list and one %s placeholder per column, in DataFrame column order
sql_columns = ", ".join(ilincs_2_sql_columns.get(c) for c in df_data.columns)
placeholders = ", ".join(["%s"] * len(df_data.columns))
insert_query = f"INSERT INTO {table_name} ({sql_columns}) VALUES ({placeholders})"

try:
    # `with conn` commits when the block ends normally and rolls back when an
    # exception leaves it, so the load is all-or-nothing
    with conn:
        with conn.cursor() as curs:
            for record in data_tuples:
                try:
                    curs.execute(insert_query, record)
                except psycopg2.Error as e:
                    logging.error(f"Error inserting record {record}: {e}")
                    # propagate the failure: merely breaking out of the loop
                    # would fall through to the success message below even
                    # though nothing usable was stored
                    raise
    logging.info(f"Data dumped into {table_name} successfully.")
except psycopg2.Error as e:
    logging.error(f"An error occurred while inserting data into the table: {e}")
    conn.rollback()
    exit()

2023-12-13 16:39:36,545 - INFO - Data dumped into signatures successfully.


## Create Table Signature Values

In [2]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging, os
from tqdm import tqdm

# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not live in the notebook: the password is taken from the
# PGPASSWORD environment variable. When it is unset the value is None, which
# psycopg2 omits from the connection string so that libpq applies its own
# lookup (e.g. a ~/.pgpass file).
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/signature_vectors"  # directory with one CSV per signature
table_name = "signature_values"
primary_key = "signature_id_gene_id"
# NOTE: in this section int_columns / float_columns hold SQL column names
# (the values of ilincs_2_sql_columns), not iLINCS column names
int_columns = ["gene_id"]
float_columns = ["log_diff_exp", "p_value"]
drop_table = True  # drop and recreate the table instead of only emptying it
# foreign key: signature_values.signature_id -> signatures.signature_id
foreign_key = "signature_id"
reference_table = "signatures"
reference_key = "signature_id"

columns_of_interest = [
    "signatureID",
    "ID_geneid",
    "Name_GeneSymbol",
    "Value_LogDiffExp",
    "Significance_pvalue",
    "signature_id_gene_id",
]

# iLINCS column name -> SQL column name
ilincs_2_sql_columns = {
    "signatureID": "signature_id",
    "ID_geneid": "gene_id",
    "Name_GeneSymbol": "gene_name",
    "Value_LogDiffExp": "log_diff_exp",
    "Significance_pvalue": "p_value",
    "signature_id_gene_id": "signature_id_gene_id",
}


# functions
def get_disease_signatureids(
    path_data="../../data/iLINCS/signatures.csv", library_id="LIB_1"
):
    """
    Get Disease Signature Ids
    Retrieve the unique signature ids of the filtered signatures.

    Arguments:
    path_data: str
        Path to the signatures CSV; the columns ``libraryid`` and
        ``signatureid`` are read from it.
    library_id: str
        Value of ``libraryid`` that selects the signatures to keep
        (the default "LIB_1" is the disease-signature filter used
        throughout this notebook).

    Return:
    signatureid: list()
        List of unique signature id's

    Raises:
    FileNotFoundError
        When ``path_data`` does not exist; the problem is logged first.
    """
    # Load Data
    try:
        df_signatures = pd.read_csv(path_data)
        logging.info("Data loaded successfully.")
    except FileNotFoundError:
        logging.error("Data file not found. Please check the file path.")
        # re-raise instead of calling exit(): a helper should not terminate
        # the interpreter/kernel on behalf of its caller
        raise

    # filter disease signatures
    disease_mask = df_signatures["libraryid"] == library_id

    return list(df_signatures.loc[disease_mask, "signatureid"].unique())

In [3]:
# 2. Load Data
disease_vectors_path = "../../data/iLINCS/disease_signature_vectors.csv"

if os.path.exists(disease_vectors_path):
    # reuse the combined table written by an earlier run of this cell
    df_data = pd.read_csv(disease_vectors_path)
else:
    # get signature ids
    signature_ids = get_disease_signatureids()

    # get data: one CSV per signature. The frames are collected in a list and
    # concatenated once, because calling pd.concat inside the loop copies all
    # rows accumulated so far on every iteration.
    signature_frames = []
    for signature_id in tqdm(signature_ids):
        try:
            signature_frames.append(
                pd.read_csv(os.path.join(path_data, signature_id + ".csv"))
            )
        except FileNotFoundError:
            logging.error("Data file not found. Please check the file path.")
            exit()

    if not signature_frames:
        # pd.concat raises on an empty list; stop with a clear message instead
        logging.error("No signature ids found; nothing to load.")
        exit()

    # last-read frame first: the same row order as prepending every newly
    # read frame to the accumulated table
    df_data = pd.concat(signature_frames[::-1])

    # save dataframe
    try:
        df_data.to_csv(disease_vectors_path, index=False)
    except Exception as e:
        logging.error(f"Error saving data to csv: {e}")
        exit()

In [4]:
# composite primary key: "<signatureID>_<ID_geneid>", with the gene id
# rendered as text
df_data["signature_id_gene_id"] = (
    df_data["signatureID"].add("_").add(df_data["ID_geneid"].astype(str))
)

# keep only the columns of interest, in the order they are listed
df_data = df_data.loc[:, columns_of_interest]

In [5]:
# report the dimensions and the column labels of the frame that is about to
# be loaded into the database
logging.info(f"Shape of filtered DataFrame: {df_data.shape}")
logging.info(f"Columnns of filtered DataFrame: {df_data.columns}")

2023-12-18 14:37:20,116 - INFO - Shape of filtered DataFrame: (144545011, 6)
2023-12-18 14:37:20,116 - INFO - Columnns of filtered DataFrame: Index(['signatureID', 'ID_geneid', 'Name_GeneSymbol', 'Value_LogDiffExp',
       'Significance_pvalue', 'signature_id_gene_id'],
      dtype='object')


In [6]:
# longest str() representation per column, keyed by column label
d_max_lengths = {
    column_label: max(map(len, map(str, df_data[column_label].to_list())))
    for column_label in df_data.columns
}

In [7]:
# log the per-column maximum string lengths computed in the previous cell
logging.info(f"Max Lengths: {d_max_lengths}")

2023-12-18 14:39:09,901 - INFO - Max Lengths: {'signatureID': 8, 'ID_geneid': 9, 'Name_GeneSymbol': 22, 'Value_LogDiffExp': 23, 'Significance_pvalue': 23, 'signature_id_gene_id': 18}


In [8]:
# 3. Connect with Database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    logging.info("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    logging.error(f"Unable to connect to the database: {e}")
    exit()

# 4. Create Cursor Object
cursor = conn.cursor()

# 5. Check if Table Exists and Delete Data if It Does

# Check if the table exists and drop it if it does.
# table_name is interpolated into the SQL text; it comes from the
# configuration cell of this notebook, not from external input.
if drop_table:
    try:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name} CASCADE;")
        conn.commit()
        logging.info(f"Table {table_name} dropped successfully if it existed.")
    except psycopg2.Error as e:
        # a failed DROP is an error, so log it at ERROR level (not INFO)
        logging.error(f"An error occurred: {e}")
        conn.rollback()


cursor.execute(
    "SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name=%s)",
    (table_name,),
)
table_exists = cursor.fetchone()[0]

if table_exists:
    # the table was kept (drop_table is False or the drop failed): empty it
    try:
        cursor.execute(f"DELETE FROM {table_name};")
        conn.commit()
        logging.info(f"Existing data in table {table_name} deleted successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while deleting data from the table: {e}")
        cursor.close()
        conn.close()
        exit()
else:
    # Create table if it does not exist. The column type is chosen from the
    # SQL column name: INT for int_columns, FLOAT for float_columns, otherwise
    # a VARCHAR sized to the longest observed value plus 10 characters.
    column_text_list = list()
    for c in df_data.columns:
        sql_name = ilincs_2_sql_columns.get(c)
        if sql_name in int_columns:
            column_text_list.append(f"{sql_name} INT")
        elif sql_name in float_columns:
            column_text_list.append(f"{sql_name} FLOAT")
        else:
            column_text_list.append(f"{sql_name} VARCHAR({d_max_lengths[c] + 10})")

    column_text = ", ".join(column_text_list)

    create_table_query = f"CREATE TABLE {table_name} ({column_text}, PRIMARY KEY({ilincs_2_sql_columns.get(primary_key)}),FOREIGN KEY ({foreign_key}) REFERENCES {reference_table}({reference_key}));"
    try:
        cursor.execute(create_table_query)
        conn.commit()
        logging.info(f"Table {table_name} created successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while creating the table: {e}")
        cursor.close()
        conn.close()
        exit()

2023-12-18 14:39:09,921 - INFO - Connected to the database successfully.
2023-12-18 14:39:09,936 - INFO - Table signature_values dropped successfully if it existed.
2023-12-18 14:39:09,951 - INFO - Table signature_values created successfully.


In [9]:
# show the CREATE TABLE statement that was executed.
# NOTE(review): create_table_query is only assigned in the table-creation
# branch of the previous cell, so this line presumably fails with NameError
# when the table already existed — verify.
logging.info(f"Table query created with:\n{create_table_query}")

2023-12-18 14:39:09,955 - INFO - Table query created with:
CREATE TABLE signature_values (signature_id VARCHAR(18), gene_id INT, gene_name VARCHAR(32), log_diff_exp FLOAT, p_value FLOAT, signature_id_gene_id VARCHAR(28), PRIMARY KEY(signature_id_gene_id),FOREIGN KEY (signature_id) REFERENCES signatures(signature_id));


In [10]:
# 6. Dump Data into Table
data_tuples = list(df_data.itertuples(index=False, name=None))
# column list and one %s placeholder per column, in DataFrame column order
sql_columns = ", ".join(ilincs_2_sql_columns.get(c) for c in df_data.columns)
placeholders = ", ".join(["%s"] * len(df_data.columns))
insert_query = f"INSERT INTO {table_name} ({sql_columns}) VALUES ({placeholders})"

try:
    # `with conn` commits when the block ends normally and rolls back when an
    # exception leaves it, so the load is all-or-nothing
    with conn:
        with conn.cursor() as curs:
            for record in data_tuples:
                try:
                    curs.execute(insert_query, record)
                except psycopg2.Error as e:
                    logging.error(f"Error inserting record {record}: {e}")
                    # propagate the failure: merely breaking out of the loop
                    # would fall through to the success message below even
                    # though nothing usable was stored
                    raise
    logging.info(f"Data dumped into {table_name} successfully.")
except psycopg2.Error as e:
    logging.error(f"An error occurred while inserting data into the table: {e}")
    conn.rollback()
    exit()

2023-12-18 16:43:32,793 - INFO - Data dumped into signature_values successfully.


## Create Table MeSH Terms

In [9]:
# 1. Imports, Variables, Functions
# imports
import requests
import xml.etree.ElementTree as ET
import time, re
from Bio import Entrez
import logging
import pandas as pd
import json

# timestamped INFO-level logging for this section
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# variables
# NOTE(review): Entrez.email is not assigned in this cell; the recorded output
# of the retrieval loop further down starts with the "Email address is not
# specified." notice about NCBI's E-utilities.
# root of the iLINCS REST API; fetch_dataset_metadata appends the endpoint path
base_url = "http://www.ilincs.org/api"


# functions
def extract_pmid_from_publink(publink):
    """Extract the PubMed ID from the provided publink.

    Arguments:
    publink: str
        Publication link; the id is read from a ``term=<digits>[UID]``
        fragment.

    Return:
    pmid: str or None
        The digits of the PubMed ID, or None when ``publink`` is not a string
        (e.g. None or NaN for a dataset without a link) or does not contain
        such a fragment.
    """
    # a missing link must yield "no PMID" instead of a TypeError from re.search
    if not isinstance(publink, str):
        return None

    pmid_match = re.search(r"term=(\d+)\[UID\]", publink)
    return pmid_match.group(1) if pmid_match else None


def get_disease_datasets(
    path_data="../../data/iLINCS/signatures.csv", library_id="LIB_1"
):
    """
    Get Disease Datasets
    Retrieve the unique dataset ids referenced by the filtered signatures.

    Arguments:
    path_data: str
        Path to the signatures CSV; the columns ``libraryid`` and
        ``datasetid`` are read from it.
    library_id: str
        Value of ``libraryid`` that selects the signatures to keep.

    Return:
    datasetid: list()
        List of unique dataset id's

    Raises:
    FileNotFoundError
        When ``path_data`` does not exist; the problem is logged first.
    """
    # Load Data
    try:
        df_signatures = pd.read_csv(path_data)
        logging.info("Data loaded successfully.")
    except FileNotFoundError:
        logging.error("Data file not found. Please check the file path.")
        # re-raise instead of calling exit(): a helper should not terminate
        # the interpreter/kernel on behalf of its caller
        raise

    # filter disease signatures
    disease_mask = df_signatures["libraryid"] == library_id

    return list(df_signatures.loc[disease_mask, "datasetid"].unique())


def fetch_dataset_metadata(dataset_id, timeout=30):
    """Fetch dataset metadata/description for a given dataset.

    Arguments:
    dataset_id: str
        Dataset identifier appended to the ``PublicDatasets`` endpoint.
    timeout: float
        Seconds to wait for the server before the request is abandoned;
        without it a stalled connection would block the notebook forever.

    Return:
    metadata: parsed JSON body on HTTP 200, otherwise None (the status code
        and response text are printed).
    """
    endpoint = f"{base_url}/PublicDatasets/{dataset_id}"
    response = requests.get(endpoint, timeout=timeout)
    if response.status_code == 200:
        return response.json()  # assuming the response is in JSON format

    print("Error:", response.status_code, response.text)
    return None


def get_pmid_from_geo_via_eutils(geo_id):
    """Look up the PubMed ID linked to a GEO record through NCBI elink.

    Arguments:
    geo_id: str
        GEO identifier; its first three characters are dropped before the
        lookup (presumably a prefix such as "GSE"/"GDS" — verify against the
        values of ``SourceID``).

    Return:
    pmid: the ``Id`` of the first link of the first link set, or None when
        ``geo_id`` is empty or the response carries no link.
    """
    if not geo_id:
        return None

    # Use elink to establish links between GEO and PubMed databases
    handle = Entrez.elink(dbfrom="gds", db="pubmed", id=geo_id[3:])
    try:
        record = Entrez.read(handle)
    finally:
        # release the handle even when parsing the response fails
        handle.close()

    # Extract the PMID from the linked records; every level may be empty
    link_sets = record[0]["LinkSetDb"] if record else []
    if len(link_sets) > 0 and len(link_sets[0]["Link"]) > 0:
        return link_sets[0]["Link"][0]["Id"]
    return None


def fetch_mesh_terms_from_pubmed(pmid, max_retries=10, retry_delay=5, timeout=30):
    """Fetch MeSH terms for a given PubMed ID, with retries.

    Arguments:
    pmid: str
        PubMed ID to look up; an empty/None value short-circuits.
    max_retries: int
        Maximum number of request attempts.
    retry_delay: float
        Seconds to sleep after a failed attempt.
    timeout: float
        Seconds to wait for the server on each attempt.

    Return:
    ``(mesh_terms, mesh_tree_numbers)`` on success, where ``mesh_terms`` holds
    the DescriptorName text of every MeshHeading and ``mesh_tree_numbers`` the
    TreeNumber texts found for those descriptors in the same document.
    An empty list ``[]`` when ``pmid`` is empty or every attempt failed.
    """
    if not pmid:
        return []

    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": pmid, "retmode": "xml"}

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(efetch_url, params=params, timeout=timeout)
        except requests.RequestException as err:
            # connection problems and timeouts are retried like HTTP errors
            print(
                f"Attempt {attempt} failed: Error fetching MeSH terms for PMID {pmid}: {err}"
            )
            time.sleep(retry_delay)
            continue

        if response.status_code != 200:
            print(
                f"Attempt {attempt} failed: Error fetching MeSH terms for PMID {pmid}: {response.text}"
            )
            time.sleep(retry_delay)
            continue

        # Parse the XML response to extract MeSH terms and tree numbers in a
        # single pass over the headings
        root = ET.fromstring(response.text)
        mesh_terms = []
        mesh_tree_numbers = []
        for heading in root.findall(".//MeshHeading"):
            descriptor = heading.find("DescriptorName")
            if descriptor is None:
                # heading without a descriptor: nothing to record
                continue
            mesh_terms.append(heading.findtext("DescriptorName"))

            descriptor_ui = descriptor.get("UI")
            tree_numbers = root.findall(
                f".//DescriptorRecord[DescriptorUI='{descriptor_ui}']/TreeNumberList/TreeNumber"
            )
            mesh_tree_numbers.extend(tree_number.text for tree_number in tree_numbers)

        return mesh_terms, mesh_tree_numbers

    print("Max retries reached. Failed to fetch MeSH terms.")
    return []

2023-12-19 18:53:31,930 - INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-19 18:53:31,930 - INFO - NumExpr defaulting to 8 threads.


In [25]:
# unique dataset ids of the signatures whose libraryid is "LIB_1"
datasets = get_disease_datasets()

  df_data = pd.read_csv(path_data)


In [10]:
# report how many dataset ids will be looked up
logging.info(f"Nº of datasets: {len(datasets)}")

2023-12-19 18:53:43,637 - INFO - Nº of datasets: 117


In [28]:
from tqdm import tqdm

# define dictionary: dataset id -> result of fetch_mesh_terms_from_pubmed
d_dataset_2_mesh = dict()

start_time = time.time()

example_logged = False
for dataset_id in tqdm(datasets):
    # retrieve metadata from iLINCS for specific datasetid
    metadata = fetch_dataset_metadata(dataset_id)
    if metadata is None:
        # the request failed (already reported by fetch_dataset_metadata);
        # skip this dataset instead of crashing on metadata["publink"]
        logging.warning(f"No metadata retrieved for dataset {dataset_id}; skipping.")
        continue

    # retrieve the pmid from the publication link; when the link carries none,
    # fall back to the GEO -> PubMed link lookup (NCBI elink) on the SourceID
    pmid = extract_pmid_from_publink(metadata["publink"])
    if not pmid:
        pmid = get_pmid_from_geo_via_eutils(metadata["SourceID"])

    if pmid:
        # retrieve the MeSH terms associated with the pmid
        mesh_terms = fetch_mesh_terms_from_pubmed(pmid)
        d_dataset_2_mesh[dataset_id] = mesh_terms
    # datasets without any valid PMID are left out of the dictionary

    if not example_logged:
        # show the dictionary once, after the first processed dataset
        logging.info(f"Example of metadata: {d_dataset_2_mesh}")
        example_logged = True
end_time = time.time()
logging.info(
    "Finished Retrieving MeSH terms for Datasets. Total time taken: %.4f seconds",
    end_time - start_time,
)


# save dictionary
try:
    with open("../../data/iLINCS/dataset_2_mesh.json", "w") as f:
        json.dump(d_dataset_2_mesh, f)
        print("Saved!")
except Exception as e:
    logging.error(f"Error saving data to json: {e}")
    exit()

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.
  1%|          | 10/1087 [00:09<16:27,  1.09it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 20197764: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



  2%|▏         | 26/1087 [00:30<16:05,  1.10it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 20105310: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



  6%|▌         | 62/1087 [01:13<16:20,  1.04it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 21029402: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



  8%|▊         | 89/1087 [01:47<16:07,  1.03it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 21266183: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



  9%|▉         | 101/1087 [02:04<15:31,  1.06it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 21705112: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 10%|▉         | 108/1087 [02:17<19:47,  1.21s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 22108827: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 11%|█▏        | 124/1087 [02:38<15:15,  1.05it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 21862633: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 11%|█▏        | 125/1087 [02:44<41:23,  2.58s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 22034635: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 13%|█▎        | 142/1087 [03:07<16:44,  1.06s/it]  

Attempt 1 failed: Error fetching MeSH terms for PMID 21625507: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 15%|█▌        | 168/1087 [03:40<16:52,  1.10s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 20678967: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 16%|█▋        | 177/1087 [03:55<16:15,  1.07s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 22073175: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 17%|█▋        | 184/1087 [04:07<16:50,  1.12s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 22021740: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 17%|█▋        | 185/1087 [04:13<40:24,  2.69s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 21408152: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 18%|█▊        | 200/1087 [04:33<13:40,  1.08it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 21346816: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 19%|█▉        | 208/1087 [04:46<16:09,  1.10s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 17595242: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 25%|██▌       | 276/1087 [05:57<11:41,  1.16it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 17390049: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 31%|███       | 337/1087 [07:04<11:16,  1.11it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 16958858: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 34%|███▍      | 367/1087 [07:37<10:37,  1.13it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 16682498: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 35%|███▌      | 383/1087 [07:58<10:28,  1.12it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 16795038: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 36%|███▌      | 389/1087 [08:09<13:36,  1.17s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 15548687: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 38%|███▊      | 408/1087 [08:34<10:50,  1.04it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 15958562: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 42%|████▏     | 454/1087 [09:23<09:27,  1.12it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 15558013: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 44%|████▎     | 474/1087 [09:46<09:29,  1.08it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 15592430: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 48%|████▊     | 517/1087 [10:38<09:31,  1.00s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 15579294: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 49%|████▊     | 529/1087 [10:54<08:47,  1.06it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 15985639: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 51%|█████     | 549/1087 [11:20<08:35,  1.04it/s]

Attempt 1 failed: Error fetching MeSH terms for PMID 15033914: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



 51%|█████     | 550/1087 [11:26<23:03,  2.58s/it]

Attempt 1 failed: Error fetching MeSH terms for PMID 14973112: {"error":"API rate limit exceeded","api-key":"84.88.74.211","count":"4","limit":"3"}



100%|██████████| 1087/1087 [20:21<00:00,  1.12s/it]

Saved!





In [8]:
"""iLINCS

The exercise here is to quantify HOW many diseases are there for which we have "disease" signatures

Structure:
    1. Imports, Variables, Functions
    2. Retrieve MeSH terms
    3. Retrieve Signature Datasets
    4. Maps MeSH terms to Signatures
    5. Plot Results
"""

# 1. Imports, Variables, Functions
# imports
import requests, json, re
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
from Bio import Entrez
import logging
import time

# Emit INFO-and-above records as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


# variables
# Contact e-mail set on Biopython's Entrez module (used by the elink call below).
Entrez.email = "dylandaltonsub@gmail.com"
# Root of the iLINCS REST API; the fetch_* helpers append their endpoint to it.
base_url = "http://www.ilincs.org/api"
# Local reference files (Disease Ontology OBO dump and MeSH descriptor XML).
doi_data_path = "../../data/DiseaseOntology/doid.obo"
mesh_file_path = "../../data/MeSH/desc2023.xml"
# Lookup tables: d_dataset_2_mesh and d_mesh_symbol_2_term are filled further
# down in this cell; d_signature_2_mesh is only initialised here.
d_dataset_2_mesh = {}
d_signature_2_mesh = {}
d_mesh_symbol_2_term = {}
def filter_criteria(s):
    """Tell whether a signature record is a disease-vs-baseline comparison.

    Parameters:
        s: mapping with at least a "factor" key; "level2" is only read when
           the factor is "disease.state".

    Return:
        bool: True when the factor is "disease.state" and level2 contains
        "normal", "control" or "healthy"; False otherwise.
    """
    if s["factor"] != "disease.state":
        return False
    return any(
        baseline in s["level2"] for baseline in ("normal", "control", "healthy")
    )


# functions
def fetch_disease_signatures(factor=None):
    """Fetch all signature metadata records from the iLINCS SignatureMeta endpoint.

    NOTE: a second, argument-less ``fetch_disease_signatures`` is defined later
    in this cell and overrides this one at run time. ``factor`` therefore has a
    default so that both definitions accept the same ``fetch_disease_signatures()``
    call used by the script below.

    Args:
    - factor: Currently unused. It was meant to feed a server-side
      ``{"where": {"factor": factor}}`` filter, but that request is not sent;
      filtering is done client-side instead.

    Returns:
    - The decoded JSON body when the server answers with HTTP 200; otherwise the
      status code and body are printed and an empty list is returned.
    """
    endpoint = f"{base_url}/SignatureMeta"
    response = requests.get(endpoint)
    if response.status_code == 200:
        return response.json()  # assuming the response is in JSON format

    print("Error:", response.status_code, response.text)
    return []


def extract_disease_names_from_obo(file_path):
    """
    Extracts disease names from an OBO formatted file.

    Only ``[Term]`` stanzas are considered: the file header that precedes the
    first ``[Term]`` and any other stanza type (e.g. ``[Typedef]``) are ignored.
    A stanza contributes the value of its first line that starts with
    ``name: ``; stanzas without such a line are skipped.

    Args:
    - file_path (str): Path to the OBO file (read as UTF-8).

    Returns:
    - List[str]: A list of disease names, in file order.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (disease names may contain non-ASCII characters).
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Splitting on '[Term]' yields the file header first; drop it so header
    # text is never mistaken for a term.
    stanzas = content.split("[Term]")[1:]

    # Anchor to the start of a line so tags or values that merely contain
    # "name: " somewhere in the middle are not matched.
    name_line = re.compile(r"^name: (.+)$", re.MULTILINE)
    # A line starting with '[' opens the next (non-Term) stanza.
    next_stanza = re.compile(r"^\[", re.MULTILINE)

    disease_names = []
    for stanza in stanzas:
        term_body = next_stanza.split(stanza, maxsplit=1)[0]
        match = name_line.search(term_body)
        if match:
            disease_names.append(match.group(1))

    return disease_names


def parse_mesh_data(file_path):
    """Collect the name of every descriptor in a MeSH XML file.

    NOTE: a later ``parse_mesh_data`` in this cell redefines this name, so this
    version is not the one the script below ends up calling.

    Parameters:
        file_path: str()
            Path to the MeSH descriptor XML file.

    Return:
        disease_terms: list()
            ``DescriptorName/String`` text of each top-level DescriptorRecord.
        None
            Placeholder second element so the result unpacks as a pair.
    """
    root = ET.parse(file_path).getroot()

    # One entry per DescriptorRecord, in document order
    disease_terms = [
        record.find("DescriptorName/String").text
        for record in root.findall("DescriptorRecord")
    ]

    return disease_terms, None


def parse_mesh_data(file_path):
    """Parse MeSH XML data and extract the descriptors filed under a "C" tree.

    A descriptor is kept when at least one of its tree numbers starts with
    "C" (the check used here for disease terms). Only the first such tree
    number is recorded, so each kept descriptor contributes exactly one entry
    to both returned lists and the two lists stay index-aligned.

    Parameters:
        file_path: str()
            Path to the MeSH descriptor XML file.

    Return:
        disease_terms: list()
            Descriptor names, in document order.
        list_tree_numbers: list()
            For each kept descriptor, its first tree number starting with "C".
    """
    root = ET.parse(file_path).getroot()

    disease_terms = []
    list_tree_numbers = []
    for record in root.findall("DescriptorRecord"):
        # First tree number in the "C" branch, or None if the record has none
        first_c_number = next(
            (
                node.text
                for node in record.findall("TreeNumberList/TreeNumber")
                if node.text.startswith("C")
            ),
            None,
        )
        if first_c_number is None:
            continue

        list_tree_numbers.append(first_c_number)
        disease_terms.append(record.find("DescriptorName/String").text)

    return disease_terms, list_tree_numbers


def extract_pmid_from_publink(publink):
    """Extract the PubMed ID from the provided publink.

    The ID is the run of digits in a ``term=<digits>[UID]`` fragment.

    Args:
    - publink: Link text to search. ``None`` or any non-string value is
      treated as "no link" instead of raising inside ``re.search``.

    Returns:
    - str | None: The PubMed ID as a string, or None when no ID is found.
    """
    if not isinstance(publink, str):
        return None

    pmid_match = re.search(r"term=(\d+)\[UID\]", publink)
    if pmid_match:
        return pmid_match.group(1)
    return None


def get_pmid_from_geo_via_eutils(geo_id):
    """Look up the PubMed ID linked to a GEO dataset through Entrez elink.

    Args:
    - geo_id: GEO identifier string. Its first three characters are dropped
      before the lookup (presumably a "GDS" prefix -- confirm against the
      iLINCS ``SourceID`` values). ``None`` or an empty string yields None.

    Returns:
    - The ``Id`` of the first linked PubMed record, or None when the lookup
      returns no record, no ``LinkSetDb`` entry, or an empty ``Link`` list.
    """
    if not geo_id:
        return None

    # Use elink to establish links between GEO and PubMed databases
    handle = Entrez.elink(dbfrom="gds", db="pubmed", id=geo_id[3:])
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails
        handle.close()

    if not record:
        return None

    # Extract the PMID from the linked records, if there are any
    link_sets = record[0]["LinkSetDb"]
    if len(link_sets) > 0 and len(link_sets[0]["Link"]) > 0:
        return link_sets[0]["Link"][0]["Id"]
    return None


def fetch_disease_signatures():
    """Download the full signature metadata list from iLINCS.

    Issues an unfiltered GET against ``<base_url>/SignatureMeta``; any
    filtering is left to the caller.

    Return:
        The decoded JSON body on HTTP 200. On any other status the status
        code and response text are printed and an empty list is returned.
    """
    response = requests.get(f"{base_url}/SignatureMeta")

    # Anything but 200 is reported and mapped to "no signatures"
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return []

    return response.json()


def fetch_dataset_metadata(dataset_id):
    """Retrieve the iLINCS metadata record of a single dataset.

    Parameters:
        dataset_id: identifier appended to ``<base_url>/PublicDatasets/``.

    Return:
        The decoded JSON body on HTTP 200. On any other status the status
        code and response text are printed and None is returned.
    """
    response = requests.get(f"{base_url}/PublicDatasets/{dataset_id}")

    # Report failures and signal them with None
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return None

    return response.json()


def fetch_mesh_terms_from_pubmed(pmid):
    """Fetch MeSH terms for a given PubMed ID.

    Args:
    - pmid: PubMed ID to look up. A falsy value (None, "") short-circuits
      without any network request.

    Returns:
    - tuple(list, list): ``(mesh_terms, mesh_tree_numbers)``. The same
      two-element shape is returned on every path -- ``([], [])`` when no
      PMID is given or the request fails -- so callers can always unpack or
      index the result.
    """
    if not pmid:
        return [], []

    # Local name chosen so the module-level iLINCS ``base_url`` is not shadowed.
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {"db": "pubmed", "id": pmid, "retmode": "xml"}
    response = requests.get(efetch_url, params=params)
    if response.status_code != 200:
        print(f"Error fetching MeSH terms for PMID {pmid}: {response.text}")
        return [], []

    # Parse the XML response to extract MeSH terms
    root = ET.fromstring(response.text)
    headings = root.findall(".//MeshHeading")
    mesh_terms = [heading.findtext("DescriptorName") for heading in headings]

    # Look for tree numbers inside the same response, keyed by descriptor UI.
    # NOTE(review): this only finds something if the response itself contains
    # DescriptorRecord elements; the results shown in this notebook all have
    # an empty tree-number list, so the lookup appears to never match.
    mesh_tree_numbers = list()
    for heading in headings:
        descriptor = heading.find("DescriptorName")
        if descriptor is None:
            # Heading without a descriptor: nothing to look up
            continue
        descriptor_ui = descriptor.get("UI")
        tree_numbers = root.findall(
            f".//DescriptorRecord[DescriptorUI='{descriptor_ui}']/TreeNumberList/TreeNumber"
        )
        mesh_tree_numbers.extend([tree_number.text for tree_number in tree_numbers])

    return mesh_terms, mesh_tree_numbers


def extract_pmid_from_publink(publink):
    """Extract the PubMed ID from the provided publink.

    Looks for a ``term=<digits>[UID]`` fragment and returns the digits.

    Args:
    - publink: Link text to search. A missing link (``None``) or any other
      non-string value returns None rather than raising a TypeError.

    Returns:
    - str | None: The PubMed ID as a string, or None when there is no match.
    """
    if not isinstance(publink, str):
        return None

    pmid_match = re.search(r"term=(\d+)\[UID\]", publink)
    return pmid_match.group(1) if pmid_match else None


def build_mesh_term_tree_number_mapping(mesh_xml_file_path: str) -> tuple:
    """
    Build both lookup directions between MeSH terms and tree numbers from the
    MeSH XML file.

    Parameters:
    - mesh_xml_file_path (str): The file path to the MeSH XML file.

    Returns:
    - tuple: ``(mesh_term_2_symbol, mesh_symbol_2_term)`` where
        * mesh_term_2_symbol (dict): MeSH term -> list of its tree numbers
          (an empty list when the descriptor has none).
        * mesh_symbol_2_term (dict): tree number -> MeSH term. If a tree
          number occurs under several descriptors, the last one read wins.
    """
    tree = ET.parse(mesh_xml_file_path)
    root = tree.getroot()

    mesh_term_2_symbol = dict()
    mesh_symbol_2_term = dict()
    for descriptor in root.findall("DescriptorRecord"):
        term = descriptor.find("DescriptorName/String").text
        tree_numbers = [
            tree_number.text
            for tree_number in descriptor.findall("TreeNumberList/TreeNumber")
        ]
        # Reverse direction: every tree number points back at its term
        for tree_number in tree_numbers:
            mesh_symbol_2_term[tree_number] = term
        mesh_term_2_symbol[term] = tree_numbers

    return mesh_term_2_symbol, mesh_symbol_2_term


# 2. Retrieve MeSH terms
# retrieve disease terms and store in dictionary (tree number -> term name)
disease_names_mesh, symbol_mesh = parse_mesh_data(file_path=mesh_file_path)
d_mesh_symbol_2_term = dict(zip(symbol_mesh, disease_names_mesh))

# 3. Retrieve Signature Datasets
start_time = time.time()
logging.info("Starting to Get All Signatures: ")

# get all signatures
signatures = fetch_disease_signatures()

end_time = time.time()
logging.info(
    "Finished Getting All Signatures. Total time taken: %.4f seconds",
    end_time - start_time,
)

# filter signatures to only those which have as level2 "normal" & factor "disease state"
# get unique datasets for this filtering; dict.fromkeys de-duplicates while
# keeping first-seen order, so the slice below picks the same datasets on
# every run (a plain set has no stable order for strings across kernels)
datasets = list(
    dict.fromkeys(s["datasetid"] for s in signatures if filter_criteria(s))
)


start_time = time.time()
logging.info("Starting to Retrieve MeSH terms for Datasets: ")

# NOTE: only the first 10 datasets are processed here
for dataset_id in datasets[:10]:
    # retrieve metadata from iLINCS for specific datasetid
    metadata = fetch_dataset_metadata(dataset_id)
    if metadata is None:
        # fetch_dataset_metadata already printed the HTTP error
        logging.warning("No metadata for dataset %s; skipping.", dataset_id)
        continue

    # retrieve from metadata pmid (assumes metadata is a dict-like JSON object)
    publink = metadata.get("publink")
    pmid = extract_pmid_from_publink(publink) if publink else None

    if not pmid:
        # if pmid NOT listed try to resolve it from the GEO identifier through
        # the Entrez elink lookup
        source_id = metadata.get("SourceID")
        pmid = get_pmid_from_geo_via_eutils(source_id) if source_id else None

    if pmid:
        # retrieve the MeSH terms associated with the pmid
        mesh_terms = fetch_mesh_terms_from_pubmed(pmid)
        d_dataset_2_mesh[dataset_id] = mesh_terms
    # datasets without any resolvable PMID are left out of d_dataset_2_mesh

end_time = time.time()
logging.info(
    "Finished Retrieving MeSH terms for Datasets. Total time taken: %.4f seconds",
    end_time - start_time,
)

2023-12-19 18:40:58,611 - INFO - Starting to Get All Signatures: 
2023-12-19 18:41:22,901 - INFO - Finished Getting All Signatures. Total time taken: 24.2891 seconds
2023-12-19 18:41:22,967 - INFO - Starting to Retrieve MeSH terms for Datasets: 
2023-12-19 18:41:31,575 - INFO - Finished Retrieving MeSH terms for Datasets. Total time taken: 8.6083 seconds


In [4]:
# Build both lookup directions from the MeSH descriptor file:
#   mesh_term_2_symbol: term -> list of tree numbers
#   mesh_symbol_2_term: tree number -> term
mesh_term_2_symbol, mesh_symbol_2_term = build_mesh_term_tree_number_mapping(
    mesh_file_path
)

In [5]:
# Translate every dataset's MeSH terms into MeSH tree numbers.
#   d_dataset_2_tree_symbols: dataset id -> list of tree-number lists (one per
#                             translated term)
#   failed_translation_all:   every term, across datasets, with no tree number
failed_translation_all = list()
d_dataset_2_tree_symbols = dict()
for dataset_id, MeSH_terms in d_dataset_2_mesh.items():
    tree_symbols = list()
    failed_translation = list()

    # Entries are (terms, tree_numbers) pairs; tolerate an empty entry
    # (e.g. left behind by a failed fetch) instead of raising IndexError.
    terms = MeSH_terms[0] if MeSH_terms else []

    for MeSH_term in terms:
        # .get: a term missing from this MeSH release counts as a failed
        # translation rather than raising KeyError
        symbols = mesh_term_2_symbol.get(MeSH_term, [])
        if len(symbols) > 0:
            tree_symbols.append(symbols)
        else:
            failed_translation.append(MeSH_term)
            failed_translation_all.append(MeSH_term)

    # Keep the per-dataset result instead of discarding it on the next pass
    d_dataset_2_tree_symbols[dataset_id] = tree_symbols

In [3]:
# Inspect the collected mapping: dataset id -> result of the MeSH lookup
d_dataset_2_mesh

{'gdsGDS3903': (['Cluster Analysis',
   'Databases, Genetic',
   'Extracellular Matrix',
   'Gene Expression Profiling',
   'Humans',
   'Intracranial Aneurysm',
   'Oligonucleotide Array Sequence Analysis',
   'Reverse Transcriptase Polymerase Chain Reaction'],
  []),
 'gdsGDS3902': (['Chromatin',
   'Cluster Analysis',
   'Gene Expression Profiling',
   'Gene Expression Regulation, Leukemic',
   'HeLa Cells',
   'Humans',
   'Leukemia, Lymphocytic, Chronic, B-Cell',
   'MicroRNAs',
   'Microarray Analysis',
   'Oncogene Proteins v-myb',
   'Promoter Regions, Genetic',
   'Protein Binding',
   'Transcription, Genetic',
   'Transfection',
   'Tumor Cells, Cultured'],
  []),
 'gdsGDS2855': (['Biopsy',
   'Child',
   'DNA Fingerprinting',
   'Gene Expression Profiling',
   'Humans',
   'Lamin Type A',
   'Membrane Proteins',
   'Models, Statistical',
   'Muscle, Skeletal',
   'Muscular Dystrophies',
   'Muscular Dystrophy, Emery-Dreifuss',
   'Mutation',
   'MyoD Protein',
   'Nuclear En

## Query

In [11]:
import os

# SQL variables
dbname = "ilincs"
user = "ddalton"
# The password is deliberately NOT stored in the notebook. Export
# ILINCS_DB_PASSWORD before starting the kernel; when it is unset, None is
# passed and psycopg2/libpq fall back to their own sources (PGPASSWORD,
# ~/.pgpass).
password = os.environ.get("ILINCS_DB_PASSWORD")
host = "localhost"
table_name = "signature_values"

# Connect to the database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    print("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    print(f"Unable to connect to the database: {e}")
    # Re-raise so the cell stops here with a traceback; calling exit() inside
    # a notebook would take the whole kernel down instead.
    raise

# Query the table
try:
    # Reads the whole table; append e.g. " LIMIT 10" to the query for a quick look.
    query = f"SELECT * FROM {table_name}"
    # NOTE: pandas warns when handed a raw DBAPI connection other than sqlite3
    df = pd.read_sql(query, conn)
    print(df)
except Exception as e:
    print(f"An error occurred while querying the table: {e}")
finally:
    conn.close()

# If the query succeeded, the DataFrame 'df' now contains every row of the table.

Connected to the database successfully.


  df = pd.read_sql(query, conn)


# Signature CSV files

In [None]:
"""Signature CSV files

Structure:
    1. Imports, Variables, Functions
    2. Load Data
    3. Save to CSV
"""