# SQL

## Create Signature Table

In [2]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging

import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not be stored in the notebook: the password comes from the
# standard libpq environment variable. When PGPASSWORD is unset this is None,
# which psycopg2.connect() leaves out of the connection string, so libpq's own
# lookup (e.g. ~/.pgpass) still applies.
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/signatures.csv"  # CSV loaded into the table
table_name = "signatures"
primary_key = "signatureid"
int_columns = ["nCtrSamples", "nTrtSamples", "pubChemID"]  # INT columns - every other column is created as VARCHAR
drop_table = False  # True: drop and re-create the table; False: keep it and only clear its rows

# 2. Load Data
try:
    df_data = pd.read_csv(path_data)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("Data file not found. Please check the file path.")
    # Re-raise rather than call exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down instead of simply aborting this cell, and the
    # statements below would then fail on the undefined df_data.
    raise

# Longest string representation per column, measured on the raw values (a
# missing value counts as "nan"); used later to size the VARCHAR columns.
# default=0 keeps this working for a CSV that has a header but no rows.
max_lengths = [max((len(str(value)) for value in df_data[c]), default=0) for c in df_data.columns]

# Convert specified integer columns and handle NaN by replacing with 0
for col in int_columns:
    df_data[col] = pd.to_numeric(df_data[col], errors='coerce').fillna(0).astype(int)

# For other columns, replace NaN with None (which will become NULL in SQL).
# The cast to object comes first: on a float column Series.where() may coerce
# the None fill value back to NaN (pandas-version dependent), which would be
# sent to the database as a value instead of NULL.
for col in df_data.columns:
    if col not in int_columns:
        df_data[col] = df_data[col].astype(object).where(pd.notnull(df_data[col]), None)
# 3. Connect with Database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    logging.info("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    logging.error(f"Unable to connect to the database: {e}")
    # Re-raise rather than call exit(): in a Jupyter kernel exit() requests a
    # kernel shutdown instead of stopping the cell, so conn.cursor() below
    # would still run against an undefined conn.
    raise

# 4. Create Cursor Object
cursor = conn.cursor()

# 5. Check if Table Exists and Delete Data if It Does

# Optionally drop the table first so that it is re-created below with a schema
# derived from the freshly loaded data.
if drop_table:
    try:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name};")
        conn.commit()
        print(f"Table {table_name} dropped successfully if it existed.")
    except psycopg2.Error as e:
        print(f"An error occurred: {e}")
        conn.rollback()
    # The cursor and connection are deliberately left open here: the existence
    # check and the CREATE TABLE below reuse them, as does the later insert
    # step. Closing them at this point made every following statement fail.


cursor.execute("SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name=%s)", (table_name,))
table_exists = cursor.fetchone()[0]

if table_exists:
    # Table kept from a previous run: clear its rows so the load starts empty.
    try:
        cursor.execute(f"DELETE FROM {table_name};")
        conn.commit()
        logging.info(f"Existing data in table {table_name} deleted successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while deleting data from the table: {e}")
        cursor.close()
        conn.close()
        # Re-raise instead of exit() so the cell stops here with a traceback.
        raise
else:
    # Create table if it does not exist. Integer columns become INT; every
    # other column becomes VARCHAR sized to the longest loaded value plus 10.
    column_text = ", ".join(f"{c} VARCHAR({n + 10})" if c not in int_columns else f"{c} INT" for c, n in zip(df_data.columns, max_lengths))
    create_table_query = f"CREATE TABLE {table_name} ({column_text}, PRIMARY KEY({primary_key}));"
    try:
        cursor.execute(create_table_query)
        conn.commit()
        logging.info(f"Table {table_name} created successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while creating the table: {e}")
        cursor.close()
        conn.close()
        raise

  df_data = pd.read_csv(path_data)
2023-11-27 16:01:13,716 - INFO - Data loaded successfully.
2023-11-27 16:01:14,359 - INFO - Connected to the database successfully.
2023-11-27 16:01:14,362 - INFO - Existing data in table signatures deleted successfully.


In [6]:
# List every column with the SQL type family it is loaded as: "int" for the
# configured integer columns, "str" for everything else.
for column_name in df_data.columns:
    type_label = "int" if column_name in int_columns else "str"
    print(f"{column_name} {type_label}")

antibodytarget str
cellline str
tissue str
cid str
compound str
concentration str
concordancetable str
datasetid str
factor str
level1 str
level2 str
libraryid str
lincspertid str
nCtrSamples int
nTrtSamples int
peaktype str
platform str
signatureid str
lincsSigID str
organism str
clueIoCompound str
integratedMoas str
GeneTargets str
time str
treatment str
perturbagenID str
stitchID str
pubChemID int
is_exemplar str
pert_type str


In [66]:
# 6. Dump Data into Table
data_tuples = list(df_data.itertuples(index=False, name=None))
# One %s placeholder per column; values are bound by psycopg2, not formatted in.
placeholders = ", ".join(["%s"] * len(df_data.columns))
insert_query = f"INSERT INTO {table_name} ({', '.join(df_data.columns)}) VALUES ({placeholders})"

try:
    # `with conn` commits when the body finishes and rolls back when it raises,
    # so the load is all-or-nothing.
    with conn:
        with conn.cursor() as curs:
            for record in data_tuples:
                try:
                    curs.execute(insert_query, record)
                except psycopg2.Error as e:
                    logging.error(f"Error inserting record {record}: {e}")
                    # Propagate the failure. Breaking out of the loop let the
                    # block end normally, so a failed load was still reported
                    # as successful.
                    raise
except psycopg2.Error as e:
    logging.error(f"An error occurred while inserting data into the table: {e}")
    # Re-raise instead of exit() so the cell stops with a traceback.
    raise
else:
    logging.info(f"Data dumped into {table_name} successfully.")


2023-11-27 14:02:10,881 - INFO - Data dumped into signatures successfully.


## Create Table Dataset

In [149]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging

import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not be stored in the notebook: the password comes from the
# standard libpq environment variable. When PGPASSWORD is unset this is None,
# which psycopg2.connect() leaves out of the connection string, so libpq's own
# lookup (e.g. ~/.pgpass) still applies.
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/datasets.csv"  # CSV loaded into the table
table_name = "datasets"
primary_keys = ["experiment","portal"]  # composite primary key columns
int_columns = ["nsamples"]  # INT columns - every other column is created as VARCHAR
drop_table = True  # drop and re-create the table before loading
# Foreign key: column of this table -> parent_table(parent_id).
# NOTE(review): the recorded CREATE TABLE run failed with 'there is no unique
# constraint matching given keys for referenced table "signatures"', so the
# referenced column is not unique and this constraint cannot be created as
# configured.
foreign_key = "SourceID"
parent_table = "signatures"
parent_id = "datasetid"
# 2. Load Data
try:
    df_data = pd.read_csv(path_data)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("Data file not found. Please check the file path.")
    # Re-raise rather than call exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down instead of simply aborting this cell, and the
    # statements below would then fail on the undefined df_data.
    raise

# Longest string representation per column, measured on the raw values (a
# missing value counts as "nan"); used later to size the VARCHAR columns.
# default=0 keeps this working for a CSV that has a header but no rows.
max_lengths = [max((len(str(value)) for value in df_data[c]), default=0) for c in df_data.columns]

# Convert specified integer columns and handle NaN by replacing with 0
for col in int_columns:
    df_data[col] = pd.to_numeric(df_data[col], errors='coerce').fillna(0).astype(int)

# For other columns, replace NaN with None (which will become NULL in SQL).
# The cast to object comes first: on a float column Series.where() may coerce
# the None fill value back to NaN (pandas-version dependent), which would be
# sent to the database as a value instead of NULL.
for col in df_data.columns:
    if col not in int_columns:
        df_data[col] = df_data[col].astype(object).where(pd.notnull(df_data[col]), None)

2023-11-28 16:44:17,622 - INFO - Data loaded successfully.


In [124]:
# Work on a copy of the datasets frame without the columns listed below.
columns_to_drop = ["portal", "experiment", "description", "nsamples", "platform", "pubmeddescription"]
df_data_2 = df_data.drop(columns=columns_to_drop)

In [125]:
# Remove rows that are exact duplicates across all remaining columns.
df_data_2= df_data_2.drop_duplicates()

In [165]:
# Compare the reduced, de-duplicated frame with the full datasets frame. The
# statement printed df_data.shape twice, while the recorded output shows two
# different shapes (10 vs 16 columns), i.e. the reduced frame was intended first.
print(df_data_2.shape, df_data.shape)
# Number of distinct experiment identifiers in the full frame.
print(len(df_data["experiment"].unique()))

(31437, 10) (41272, 16)
31426


In [162]:
# Column names of the signatures frame.
# NOTE(review): df_1 is only assigned further down, in the "Create Table
# Compounds" section (read from signatures.csv), so this cell fails with a
# NameError on a fresh top-to-bottom run.
df_1.columns

Index(['antibodytarget', 'cellline', 'tissue', 'cid', 'compound',
       'concentration', 'concordancetable', 'datasetid', 'factor', 'level1',
       'level2', 'libraryid', 'lincspertid', 'nCtrSamples', 'nTrtSamples',
       'peaktype', 'platform', 'signatureid', 'lincsSigID', 'organism',
       'clueIoCompound', 'integratedMoas', 'GeneTargets', 'time', 'treatment',
       'perturbagenID', 'stitchID', 'pubChemID', 'is_exemplar', 'pert_type'],
      dtype='object')

In [158]:
# Entries from position 8000 onwards of the dataset ids that had no SourceID match.
# NOTE(review): non_success is built by the success/non_success loop in a later
# cell of this section, so this cell depends on out-of-order execution.
non_success[8000:]

['gdsGDS2737',
 'gdsGDS2728',
 'gdsGDS2724',
 'gdsGDS2697',
 'gdsGDS2657',
 'gdsGDS2635',
 'gdsGDS2628',
 'gdsGDS2615',
 'gdsGDS2611',
 'gdsGDS2609',
 'gdsGDS2604',
 'gdsGDS2565',
 'gdsGDS2548',
 'gdsGDS2534',
 'gdsGDS2526',
 'gdsGDS2499',
 'gdsGDS2495',
 'gdsGDS2491',
 'gdsGDS2486',
 'gdsGDS2484',
 'gdsGDS2470',
 'gdsGDS2453',
 'gdsGDS2432',
 'gdsGDS2431',
 'gdsGDS2426',
 'gdsGDS2418',
 'gdsGDS2414',
 'gdsGDS2374',
 'gdsGDS2307',
 'gdsGDS2250',
 'gdsGDS2221',
 'gdsGDS2215',
 'gdsGDS2213',
 'gdsGDS2164',
 'gdsGDS2154',
 'gdsGDS2125',
 'gdsGDS2118',
 'gdsGDS2089',
 'gdsGDS2088',
 'gdsGDS2083',
 'gdsGDS2052',
 'gdsGDS2046',
 'gdsGDS2023',
 'gdsGDS1989',
 'gdsGDS1973',
 'gdsGDS1962',
 'gdsGDS1917',
 'gdsGDS1869',
 'gdsGDS1779',
 'gdsGDS1732',
 'gdsGDS1673',
 'gdsGDS1667',
 'gdsGDS1665',
 'gdsGDS1617',
 'gdsGDS1604',
 'gdsGDS1580',
 'gdsGDS1579',
 'gdsGDS1439',
 'gdsGDS1413',
 'gdsGDS1369',
 'gdsGDS1344',
 'gdsGDS1249',
 'gdsGDS4401',
 'gdsGDS1237',
 'gdsGDS651',
 'gdsGDS2855',
 'gdsGDS440

In [150]:
# Display the datasets frame loaded from path_data.
df_data

Unnamed: 0,platform,description,summary,geolink,lincsDsgc,publink,pubmeddescription,experiment,assay,dataFormat,sampleType,dataType,organism,portal,SourceID,nsamples
0,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
1,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
2,GPL16791_humanS1500,Run 3 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run3_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
3,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",apap_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
4,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",apap_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41267,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",trovafloxacin_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41268,GPL16791_humanS1500,Run 3 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",trovafloxacin_run3_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41269,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",vpa_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41270,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",vpa_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42


In [134]:
# Keep only the first row for each SourceID. Rows that share a SourceID but
# differ in other columns (the LDG-1284 lookup in this section shows three
# such rows with different platform/geolink values) are discarded.
df_unique = df_data_2.drop_duplicates(subset='SourceID', keep='first')


In [None]:
# Display the datasets frame loaded from path_data.
df_data

In [138]:
# Rows per SourceID in the de-duplicated frame; after
# drop_duplicates(subset='SourceID') every count should be 1.
df_unique["SourceID"].value_counts()

SourceID
TCGA_UVM_RPPA_2019         1
SRP166108                  1
TCGA_PCPG_RNASeqV2_2019    1
TCGA_PCPG_RPPA_2019        1
TCGA_PRAD_Gistic_2020      1
                          ..
EDS-1014                   1
GSE8096                    1
E-TABM-276                 1
GDS1363                    1
GDS1439                    1
Name: count, Length: 31426, dtype: int64

In [131]:
# Second geolink value among the rows whose SourceID is "GDS1363".
gds1363_mask = df_data_2["SourceID"] == "GDS1363"
df_data_2.loc[gds1363_mask, "geolink"].to_list()[1]

'http://www.ncbi.nlm.nih.gov/projects/geo/gds/gds_browse.cgi?gds=1363'

In [117]:
# All rows of the reduced frame whose SourceID is "LDG-1284".
ldg1284_mask = df_data_2["SourceID"] == "LDG-1284"
df_data_2[ldg1284_mask]

Unnamed: 0,platform,summary,geolink,lincsDsgc,publink,pubmeddescription,assay,dataFormat,sampleType,dataType,organism,SourceID
40957,gcpLDS1236Platform,,https://panoramaweb.org/labkey/project/LINCS/G...,broad_proteomics,,,GCP,MaxD,cell line,Proteomics,human,LDG-1284
40961,gcpLDS1265Platform,,https://panoramaweb.org/labkey/project/LINCS/G...,broad_proteomics,,,GCP,MaxD,cell line,Proteomics,human,LDG-1284
40963,GCP_2016_08_16,,http://lincsportal.ccs.miami.edu/datasets/#/vi...,broad_proteomics,,,GCP,MaxD,cell line,Proteomics,human,LDG-1284


In [57]:
# Drop fully duplicated rows and show the shape before and after.
shape_before = df_data.shape
df_data = df_data.drop_duplicates()
print(shape_before, df_data.shape, sep="\n")

(41272, 16)
(41272, 16)


In [144]:
# Split the distinct signature dataset ids by whether they also occur as a
# SourceID in the de-duplicated datasets frame.
success = []
non_success = []

# Build the lookup once: the loop used to recompute .unique() and scan the
# whole array for every id. Missing values are left out of the set so that a
# missing datasetid is still counted as a non-match, as before.
known_source_ids = set(df_unique["SourceID"].dropna().unique())

for d in df_1["datasetid"].unique():
    if d in known_source_ids:
        success.append(d)
    else:
        non_success.append(d)

print(f"Success {len(success)} Non-Success {len(non_success)}")

Success 11 Non-Success 8491


In [61]:
# Number of distinct experiment+SourceID combinations, compared as the
# concatenation of their string forms.
experiment_source_keys = {
    str(experiment) + str(source_id)
    for experiment, source_id in zip(df_data["experiment"], df_data["SourceID"])
}
print(len(experiment_source_keys))

31524


In [85]:
# Distinct sampleType values; None stands for entries that were missing in the
# CSV (the load step replaces NaN with None in non-integer columns).
df_data["sampleType"].unique()


array(['cell line', 'tissue', None, 'single cell', 'tissue and cell line',
       'xenograft'], dtype=object)

In [84]:
# Display the datasets frame loaded from path_data.
df_data

Unnamed: 0,platform,description,summary,geolink,lincsDsgc,publink,pubmeddescription,experiment,assay,dataFormat,sampleType,dataType,organism,portal,SourceID,nsamples
0,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
1,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
2,GPL16791_humanS1500,Run 3 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run3_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
3,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",apap_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
4,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",apap_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41267,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",trovafloxacin_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41268,GPL16791_humanS1500,Run 3 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",trovafloxacin_run3_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41269,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",vpa_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41270,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",vpa_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42


In [80]:
# Dataset id to look for in the datasets frame (used by the next cell).
# NOTE(review): the first assignment is immediately overwritten by the literal
# below, so only "GSE116436" is ever used; it also needs df_1, which is
# assigned in a later section of the notebook.
a = list(df_1["datasetid"].unique())[100]
a = "GSE116436"

In [81]:
# Print the name of every column that contains the id stored in `a`.
for column_name in df_data.columns:
    column_values = df_data[column_name].to_list()
    if a in column_values:
        print(column_name)

experiment
SourceID


In [66]:
# Number of distinct values in the experiment column.
len(df_data["experiment"].unique())

31524

In [58]:
# Number of distinct values per column of the datasets frame.
for column_name in df_data.columns:
    distinct_values = df_data[column_name].unique()
    print(column_name, len(distinct_values))

platform 356
description 31510
summary 26366
geolink 31328
lincsDsgc 6
publink 2951
pubmeddescription 2965
experiment 31524
assay 9
dataFormat 1
sampleType 6
dataType 3
organism 4
portal 10
SourceID 31426
nsamples 466


In [9]:
# List every column with the SQL type family it is loaded as: "int" for the
# configured integer columns, "str" for everything else.
for column_name in df_data.columns:
    type_label = "int" if column_name in int_columns else "str"
    print(f"{column_name} {type_label}")

platform str
description str
summary str
geolink str
lincsDsgc str
publink str
pubmeddescription str
experiment str
assay str
dataFormat str
sampleType str
dataType str
organism str
portal str
SourceID str
nsamples int


In [38]:
# datasetid column of the signatures frame.
# NOTE(review): df_1 is only assigned in a later section of the notebook (read
# from signatures.csv), so this cell depends on out-of-order execution.
df_1["datasetid"]

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
227573    GSE116436
227574    GSE116436
227575    GSE116436
227576    GSE116436
227577    GSE116436
Name: datasetid, Length: 227578, dtype: object

In [52]:
# Print the first two rows of the frame, one Series per row.
for rows_printed, (_, row) in enumerate(df_data.iterrows(), start=1):
    print(row)
    if rows_printed == 2:
        break

platform                                           GPL16791_humanS1500
description          Run 1 of 42 Human S1500+ TempO-Seq samples pro...
summary                                                           None
geolink              https://manticore.niehs.nih.gov/cebssearch/pap...
lincsDsgc                                                         None
publink                    http://www.ncbi.nlm.nih.gov/pubmed/30850835
pubmeddescription    Ramaiahgari SC, Auerbach SS, Saddler TO, et al...
experiment                                       aflatoxin_b1_run1_bmd
assay                                Expression profiling by TempO-Seq
dataFormat                                                        MaxD
sampleType                                                   cell line
dataType                                               Gene Expression
organism                                                         human
portal                                                  Toxicogenomics
Source

In [55]:
# Full text of the first description value (the frame display truncates it).
df_data["description"].to_list()[0]

'Run 1 of 42 Human S1500+ TempO-Seq samples profiling gene expression in differentiated (2D-Diff) HepaRG cells exposed to Aflatoxin B1 at various concentrations and DMSO control for 96 hours.'

In [32]:
#print(len(set([str(i)+str(n) for i,n in zip(df_data["experiment"],df_data["portal"])])))

In [33]:
# 3. Connect with Database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    logging.info("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    logging.error(f"Unable to connect to the database: {e}")
    # Re-raise rather than call exit(): in a Jupyter kernel exit() requests a
    # kernel shutdown instead of stopping the cell, so conn.cursor() below
    # would still run against an undefined conn.
    raise

# 4. Create Cursor Object
cursor = conn.cursor()

# 5. Check if Table Exists and Delete Data if It Does
# Optionally drop the table first so that it is re-created below with a schema
# derived from the freshly loaded data.
if drop_table:
    try:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name};")
        conn.commit()
        print(f"Table {table_name} dropped successfully if it existed.")
    except psycopg2.Error as e:
        print(f"An error occurred: {e}")
        conn.rollback()


cursor.execute("SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name=%s)", (table_name,))
table_exists = cursor.fetchone()[0]

if table_exists:
    # Table kept from a previous run: clear its rows so the load starts empty.
    try:
        cursor.execute(f"DELETE FROM {table_name};")
        conn.commit()
        logging.info(f"Existing data in table {table_name} deleted successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while deleting data from the table: {e}")
        cursor.close()
        conn.close()
        # Re-raise instead of exit() so the cell stops here with a traceback.
        raise
else:
    # Create table if it does not exist. Integer columns become INT; every
    # other column becomes VARCHAR sized to the longest loaded value plus 10.
    column_text = ", ".join(f"{c} VARCHAR({n + 10})" if c not in int_columns else f"{c} INT" for c, n in zip(df_data.columns, max_lengths))

    # Table constraints are emitted only when configured, so the same cell also
    # works for tables declared with primary_keys = [] and foreign_key = ""
    # and for primary keys with any number of columns.
    constraint_clauses = []
    if primary_keys:
        constraint_clauses.append(f"PRIMARY KEY({','.join(primary_keys)})")
    if foreign_key and parent_table and parent_id:
        # NOTE(review): PostgreSQL requires a unique constraint on the
        # referenced column; the recorded run of this cell failed for
        # signatures(datasetid) for exactly that reason.
        constraint_clauses.append(f"FOREIGN KEY ({foreign_key}) REFERENCES {parent_table}({parent_id})")

    create_table_query = f"CREATE TABLE {table_name} ({', '.join([column_text] + constraint_clauses)});"
    try:
        cursor.execute(create_table_query)
        conn.commit()
        logging.info(f"Table {table_name} created successfully.")
    except psycopg2.Error as e:
        logging.error(f"An error occurred while creating the table: {e}")
        cursor.close()
        conn.close()
        raise

2023-11-27 15:42:05,380 - INFO - Connected to the database successfully.
2023-11-27 15:42:05,387 - ERROR - An error occurred while creating the table: there is no unique constraint matching given keys for referenced table "signatures"



Table datasets dropped successfully if it existed.


In [5]:
# 6. Dump Data into Table
data_tuples = list(df_data.itertuples(index=False, name=None))
# One %s placeholder per column; values are bound by psycopg2, not formatted in.
placeholders = ", ".join(["%s"] * len(df_data.columns))
insert_query = f"INSERT INTO {table_name} ({', '.join(df_data.columns)}) VALUES ({placeholders})"

try:
    # `with conn` commits when the body finishes and rolls back when it raises,
    # so the load is all-or-nothing.
    with conn:
        with conn.cursor() as curs:
            for record in data_tuples:
                try:
                    curs.execute(insert_query, record)
                except psycopg2.Error as e:
                    logging.error(f"Error inserting record {record}: {e}")
                    # Propagate the failure. Breaking out of the loop let the
                    # block end normally, so a failed load was still reported
                    # as successful.
                    raise
except psycopg2.Error as e:
    logging.error(f"An error occurred while inserting data into the table: {e}")
    # Re-raise instead of exit() so the cell stops with a traceback.
    raise
else:
    logging.info(f"Data dumped into {table_name} successfully.")


2023-11-27 14:25:48,774 - INFO - Data dumped into datasets successfully.


In [10]:
# Display the datasets frame that was loaded into the table.
df_data

Unnamed: 0,platform,description,summary,geolink,lincsDsgc,publink,pubmeddescription,experiment,assay,dataFormat,sampleType,dataType,organism,portal,SourceID,nsamples
0,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
1,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
2,GPL16791_humanS1500,Run 3 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",aflatoxin_b1_run3_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
3,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",apap_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
4,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",apap_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41267,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",trovafloxacin_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41268,GPL16791_humanS1500,Run 3 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",trovafloxacin_run3_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41269,GPL16791_humanS1500,Run 1 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",vpa_run1_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42
41270,GPL16791_humanS1500,Run 2 of 42 Human S1500+ TempO-Seq samples pro...,,https://manticore.niehs.nih.gov/cebssearch/pap...,,http://www.ncbi.nlm.nih.gov/pubmed/30850835,"Ramaiahgari SC, Auerbach SS, Saddler TO, et al...",vpa_run2_bmd,Expression profiling by TempO-Seq,MaxD,cell line,Gene Expression,human,Toxicogenomics,SRP166108,42


## Create Table Compounds

In [34]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging

import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not be stored in the notebook: the password comes from the
# standard libpq environment variable. When PGPASSWORD is unset this is None,
# which psycopg2.connect() leaves out of the connection string, so libpq's own
# lookup (e.g. ~/.pgpass) still applies.
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/compounds.csv"  # CSV to load
table_name = "compounds"
primary_keys = []  # no primary key configured
int_columns = ["pubchemcid"]  # INT columns - rest TEXT
drop_table = True
# No foreign key configured for this table.
foreign_key = ""
parent_table = ""
parent_id = ""
# 2. Load Data
try:
    df_data = pd.read_csv(path_data)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("Data file not found. Please check the file path.")
    # Re-raise rather than call exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down instead of simply aborting this cell.
    raise

2023-11-27 16:20:58,366 - INFO - Data loaded successfully.


In [22]:
# Frame shape, followed by the number of rows that have a stitchid.
print(df_data.shape)
has_stitchid = df_data["stitchid"].notnull()
int(has_stitchid.sum())

(21299, 5)


20752

In [25]:
# Frame shape, then the number of distinct values in each column.
print(df_data.shape)
for column_name in df_data.columns:
    distinct_values = df_data[column_name].unique()
    print(column_name, len(distinct_values))

(21299, 5)
perturbagenid 21299
pubchemcid 21026
stitchid 20543
compound 21122
lincspertid 21174


In [35]:
# List every column with the SQL type family it is loaded as: "int" for the
# configured integer columns, "str" for everything else.
for column_name in df_data.columns:
    type_label = "int" if column_name in int_columns else "str"
    print(f"{column_name} {type_label}")

perturbagenid str
pubchemcid int
stitchid str
compound str
lincspertid str


In [27]:
# Inspect the perturbagenid column of the compounds frame.
df_data["perturbagenid"]

0               BRD-A00100033
1               BRD-A00147595
2               BRD-A00150179
3               BRD-A00218260
4               BRD-A00267231
                 ...         
21294           CMAP-ERGKD328
21295        CMAP-PRISM-1B-NR
21296    CMAP-PRISM-1B-PARENT
21297          CMAP-PRISM-TP7
21298     CMAP-T2DTUNICAMYCIN
Name: perturbagenid, Length: 21299, dtype: object

In [30]:
# Load the signatures CSV into df_1 and list its columns.
# NOTE(review): several cells in the "Create Table Dataset" section above
# already use df_1, so on a top-to-bottom run this assignment comes too late
# for them.
df_1 = pd.read_csv("../../data/iLINCS/signatures.csv")
df_1.columns

  df_1 = pd.read_csv("../../data/iLINCS/signatures.csv")


Index(['antibodytarget', 'cellline', 'tissue', 'cid', 'compound',
       'concentration', 'concordancetable', 'datasetid', 'factor', 'level1',
       'level2', 'libraryid', 'lincspertid', 'nCtrSamples', 'nTrtSamples',
       'peaktype', 'platform', 'signatureid', 'lincsSigID', 'organism',
       'clueIoCompound', 'integratedMoas', 'GeneTargets', 'time', 'treatment',
       'perturbagenID', 'stitchID', 'pubChemID', 'is_exemplar', 'pert_type'],
      dtype='object')

## Create Table Genes

In [10]:
# 1. Imports, Variables, Functions
import psycopg2
import pandas as pd
import logging

import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not be stored in the notebook: the password comes from the
# standard libpq environment variable. When PGPASSWORD is unset this is None,
# which psycopg2.connect() leaves out of the connection string, so libpq's own
# lookup (e.g. ~/.pgpass) still applies.
password = os.environ.get("PGPASSWORD")
host = "localhost"
path_data = "../../data/iLINCS/genes.csv"  # CSV to load
table_name = "genes"
primary_keys = []  # no primary key configured
int_columns = []  # INT columns - rest TEXT
drop_table = True
# No foreign key configured for this table.
foreign_key = ""
parent_table = ""
parent_id = ""
# 2. Load Data
try:
    df_data = pd.read_csv(path_data)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error("Data file not found. Please check the file path.")
    # Re-raise rather than call exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down instead of simply aborting this cell.
    raise

2023-11-27 16:06:40,379 - INFO - Data loaded successfully.


In [11]:
# Display the genes frame loaded from path_data.
df_data

Unnamed: 0,chromosome,dbxrefs,description,geneid,homologeneid,locustag,maplocation,namenomauth,nomstatus,symbol,symbonomauth,synonyms,taxid,typeofgene
0,19,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410...,alpha-1-B glycoprotein,1,11167.0,-,19q13.4,alpha-1-B glycoprotein,O,A1BG,A1BG,A1B|ABG|GAB|HYST2477,9606,protein-coding
1,12,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899...,alpha-2-macroglobulin,2,37248.0,-,12p13.31,alpha-2-macroglobulin,O,A2M,A2M,A2MD|CPAMD5|FWP007|S863-7,9606,protein-coding
2,12,HGNC:HGNC:8|Ensembl:ENSG00000256069,alpha-2-macroglobulin pseudogene 1,3,,-,12p13.31,alpha-2-macroglobulin pseudogene 1,O,A2MP1,A2MP1,A2MP,9606,pseudo
3,11,HGNC:11|MIM:108985,"atrophia areata, peripapillary chorioretinal d...",8,,-,11p15,"atrophia areata, peripapillary chorioretinal d...",0,AA,AA,-,9606,unknown
4,8,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171...,N-acetyltransferase 1 (arylamine N-acetyltrans...,9,37329.0,-,8p22,N-acetyltransferase 1 (arylamine N-acetyltrans...,O,NAT1,NAT1,AAC1|MNAT|NAT-1|NATI,9606,protein-coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235977,3,-,ribosomal protein S27a pseudogene,105650979,,-,-,-,-,LOC105650979,-,-,10116,pseudo
235978,7,-,ribosomal protein S27a pseudogene,105650980,,-,-,-,-,LOC105650980,-,-,10116,pseudo
235979,12,-,"NLR family, pyrin domain containing 5 pseudogene",105661729,,-,-,-,-,LOC105661729,-,-,10090,pseudo
235980,1,-,long intergenic non-coding RNA muscle differen...,105661730,,-,-,-,-,Linc-md1,-,-,10090,ncRNA


## Query

In [20]:


import os

# SQL variables
dbname = "ilincs"
user = "ddalton"
# Credentials must not be stored in the notebook: the password comes from the
# standard libpq environment variable. When PGPASSWORD is unset this is None,
# which psycopg2.connect() leaves out of the connection string, so libpq's own
# lookup (e.g. ~/.pgpass) still applies.
password = os.environ.get("PGPASSWORD")
host = "localhost"
table_name = "datasets"

# Connect to the database
try:
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, host=host)
    print("Connected to the database successfully.")
except psycopg2.OperationalError as e:
    print(f"Unable to connect to the database: {e}")
    # Re-raise rather than call exit(): in a Jupyter kernel exit() requests a
    # kernel shutdown instead of stopping the cell, and the query block below
    # would still run against an undefined conn.
    raise

# Query the table
try:
    # Only the last of the four stacked `query` assignments ever ran; the
    # overwritten ones (including a filter on antibodytarget, which is not a
    # column of the datasets frame) were dead code and have been removed.
    query = f"SELECT * FROM {table_name}"  # Adjust the query as needed
    # NOTE(review): pandas documents SQLAlchemy connectables and sqlite3
    # connections for read_sql; a raw psycopg2 connection works but emits a
    # UserWarning (presumably the warning recorded in this cell's output).
    df = pd.read_sql(query, conn)
    print(df)
except Exception as e:
    # Best-effort query: report the problem and still close the connection.
    print(f"An error occurred while querying the table: {e}")
finally:
    conn.close()

# The DataFrame 'df' now contains every row of the table.


Connected to the database successfully.
                  platform                                        description  \
0      GPL16791_humanS1500  Run 1 of 42 Human S1500+ TempO-Seq samples pro...   
1      GPL16791_humanS1500  Run 2 of 42 Human S1500+ TempO-Seq samples pro...   
2      GPL16791_humanS1500  Run 3 of 42 Human S1500+ TempO-Seq samples pro...   
3      GPL16791_humanS1500  Run 1 of 42 Human S1500+ TempO-Seq samples pro...   
4      GPL16791_humanS1500  Run 2 of 42 Human S1500+ TempO-Seq samples pro...   
...                    ...                                                ...   
41267  GPL16791_humanS1500  Run 2 of 42 Human S1500+ TempO-Seq samples pro...   
41268  GPL16791_humanS1500  Run 3 of 42 Human S1500+ TempO-Seq samples pro...   
41269  GPL16791_humanS1500  Run 1 of 42 Human S1500+ TempO-Seq samples pro...   
41270  GPL16791_humanS1500  Run 2 of 42 Human S1500+ TempO-Seq samples pro...   
41271  GPL16791_humanS1500  Run 3 of 42 Human S1500+ TempO-Seq sample

  df = pd.read_sql(query, conn)
