In [1]:
import pandas , numpy
import psycopg2
from dotenv import load_dotenv
import os
import time

In [2]:
# Load variables from a .env file into the process environment so the
# os.getenv() lookups for the database credentials can find them.
load_dotenv()

True

In [3]:
# Connection settings for the company database.
# The database name is fixed; the credentials are read from environment
# variables so that no secret is hard-coded in the notebook.
db_name = "OpenAlex"
user, password, host, port = (
    os.getenv(env_name) for env_name in ("user", "password", "host", "port")
)

In [4]:
# Open the PostgreSQL connection used by the query cells below.
# Keyword arguments are passed instead of a hand-built DSN string so that
# credentials containing spaces, quotes or backslashes are quoted correctly
# by psycopg2 rather than corrupting the connection string.
conn = psycopg2.connect(
    dbname=db_name,
    user=user,
    password=password,
    host=host,
    port=port,
)

In [5]:
# Count distinct works per open-access status for works that have a MeSH
# descriptor matching '%bronchitis%' (case-insensitive substring match).
# NOTE: the CTE is named sarcoma_related_mesh even though this query filters
# on bronchitis; the name is only an internal alias and does not affect results.
query = """
WITH sarcoma_related_mesh AS (
    SELECT DISTINCT work_id
    FROM works_mesh wm
    WHERE wm.descriptor_name ILIKE '%bronchitis%'
)
SELECT
    w.open_access_oa_status,
    COUNT(DISTINCT w.id) AS paper_count
FROM works w
JOIN sarcoma_related_mesh mrw ON w.id = mrw.work_id
GROUP BY
    w.open_access_oa_status;
"""

In [6]:
# Disabled scratch code: executes `query` on the open connection and prints
# the fetched rows. Left commented out so this cell does nothing when run.
# cursor = conn.cursor()
# cursor.execute(query)
# #cursor.execute(query , (["%cancer%"],))
# rows = cursor.fetchall()
# print(rows)

In [7]:
# Disease group -> list of ILIKE patterns used to select MeSH descriptors.
# A group with a single pattern is queried with a plain ILIKE; groups with
# several patterns are queried with ILIKE ANY(ARRAY[...]).
# NOTE(review): these are plain substring patterns, so '%liver%' also matches
# any descriptor containing "delivery" — confirm that is intended.
dict_diseases = {
    'Bronchitis': ['%bronchitis%'],
    'Covid': ['%covid%', '%coronavirus%', '%sars-cov-2%', '%2019-ncov%'],
    "Alzheimer's": ['%alzheimer%', '%dementia%', '%neurodegenerative%'],
    'Diabetes': ['%diabetes%', '%hyperglycemia%'],
    'Lymphoma': ['%lymphoma%', '%Hodgkin%', '%lymphatic%'],
    'Sarcoma': ['%sarcoma%'],
    'Liver Disease': [
        '%liver%',
        '%hepatic%',
        '%cirrhosis%',
        '%hepatitis%',
        '%cholestasis%',
    ],
    'Heart disease': [
        '%heart%',
        '%cardiovascular%',
        '%cardiac%',
        '%coronary%',
        '%myocardial%',
        '%angina%',
        '%cardiomyopathy%',
        '%arteriosclerosis%',
    ],
    'Cancer': ['%cancer%', '%carcinoma%', '%neoplasm%', '%tumor%', '%malignant%'],
}

In [8]:
def number_openaccess_query(keyword = None, lst_keywords = None):
    """Build the SQL that counts papers per open-access status for a disease.

    A work is counted when at least one of its rows in ``works_mesh`` has a
    ``descriptor_name`` matching the given ILIKE pattern(s).

    Args:
        keyword: A single ILIKE pattern, e.g. ``'%sarcoma%'``. Takes
            precedence over ``lst_keywords`` when truthy.
        lst_keywords: An iterable of ILIKE patterns; a descriptor matches if
            it matches ANY of them. Only used when ``keyword`` is falsy.

    Returns:
        The SQL query as a string, ready for ``cursor.execute(query)``.

    Raises:
        ValueError: If neither a keyword nor a non-empty ``lst_keywords`` is
            supplied (an empty ``ARRAY[]`` would be invalid SQL).
    """
    def _quote(pattern):
        # Double embedded single quotes so a pattern such as "%alzheimer's%"
        # stays inside its SQL string literal instead of terminating it.
        # Patterns here come from a notebook-local dict; for untrusted input
        # prefer driver-side parameter binding over string building.
        return "'" + str(pattern).replace("'", "''") + "'"

    if keyword:
        predicate = f"ILIKE {_quote(keyword)}"
    elif lst_keywords:
        formatted_keywords = ", ".join(_quote(kw) for kw in lst_keywords)
        predicate = f"ILIKE ANY(ARRAY[{formatted_keywords}])"
    else:
        raise ValueError(
            "number_openaccess_query needs `keyword` or a non-empty `lst_keywords`"
        )

    # Single template shared by both cases; only the WHERE predicate differs.
    return f"""
            WITH matching_mesh AS (
                SELECT DISTINCT work_id
                FROM works_mesh wm
                WHERE wm.descriptor_name {predicate}
            )
            SELECT
                w.open_access_oa_status,
                COUNT(DISTINCT w.id) AS paper_count
            FROM works w
            JOIN matching_mesh mrw ON w.id = mrw.work_id
            GROUP BY
                w.open_access_oa_status;
            """

In [None]:
# Run the open-access breakdown once per disease group and time each query.
# Each query takes minutes, so the loop must appear only once in this cell.
# Results are also collected in `oa_status_counts` (disease -> fetched rows)
# so they stay available after the loop instead of only being printed.
oa_status_counts = {}
for key , value in dict_diseases.items():
    print(f"running {key}")
    before_running = time.perf_counter()
    if len(value) == 1:
        query = number_openaccess_query(keyword = value[0])
    else:
        query = number_openaccess_query(lst_keywords = value)

    # The context manager closes the cursor even when the query fails.
    with conn.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
    after_running = time.perf_counter()
    oa_status_counts[key] = rows
    print(rows)
    print(f"Ran in {after_running - before_running} seconds")


# Output recorded from a previous run (no result was recorded from
# "Liver Disease" onward):
# running Bronchitis
# [('bronze', 2318), ('closed', 19377), ('diamond', 73), ('gold', 799), ('green', 823), ('hybrid', 307)]
# Ran in 171.20797276496887 seconds
# running Covid
# [('bronze', 36200), ('closed', 48336), ('diamond', 8367), ('gold', 89188), ('green', 47462), ('hybrid', 41241)]
# Ran in 468.01255202293396 seconds
# running Alzheimer's
# [('bronze', 22026), ('closed', 126549), ('diamond', 778), ('gold', 28870), ('green', 25735), ('hybrid', 17069)]
# Ran in 420.55153822898865 seconds
# running Diabetes
# [('bronze', 70844), ('closed', 299767), ('diamond', 4496), ('gold', 65482), ('green', 29647), ('hybrid', 24049)]
# Ran in 233.79336857795715 seconds
# running Lymphoma
# [('bronze', 45750), ('closed', 254411), ('diamond', 1491), ('gold', 25836), ('green', 15958), ('hybrid', 11436)]
# Ran in 306.9700753688812 seconds
# running Sarcoma
# [('bronze', 16899), ('closed', 116311), ('diamond', 687), ('gold', 10469), ('green', 7668), ('hybrid', 3864)]
# Ran in 111.91938781738281 seconds
# running Liver Disease

running Bronchitis
[('bronze', 2318), ('closed', 19377), ('diamond', 73), ('gold', 799), ('green', 823), ('hybrid', 307)]
Ran in 171.20797276496887 seconds
running Covid
[('bronze', 36200), ('closed', 48336), ('diamond', 8367), ('gold', 89188), ('green', 47462), ('hybrid', 41241)]
Ran in 468.01255202293396 seconds
running Alzheimer's
[('bronze', 22026), ('closed', 126549), ('diamond', 778), ('gold', 28870), ('green', 25735), ('hybrid', 17069)]
Ran in 420.55153822898865 seconds
running Diabetes
[('bronze', 70844), ('closed', 299767), ('diamond', 4496), ('gold', 65482), ('green', 29647), ('hybrid', 24049)]
Ran in 233.79336857795715 seconds
running Lymphoma
[('bronze', 45750), ('closed', 254411), ('diamond', 1491), ('gold', 25836), ('green', 15958), ('hybrid', 11436)]
Ran in 306.9700753688812 seconds
running Sarcoma
[('bronze', 16899), ('closed', 116311), ('diamond', 687), ('gold', 10469), ('green', 7668), ('hybrid', 3864)]
Ran in 111.91938781738281 seconds
running Liver Disease
