In [1]:
import pandas , numpy
import psycopg2
from dotenv import load_dotenv
import os
import time

In [2]:
# Load environment variables via python-dotenv (presumably from a local .env
# file - confirm its location) so the os.getenv(...) lookups in the next cell
# can read the database credentials.
load_dotenv()

True

In [3]:
# Connection settings for the company database. Only the database name is
# fixed here; the credentials and endpoint are read from the process
# environment so they never appear in the notebook.
db_name = "OpenAlex"
user, password, host, port = (
    os.getenv(env_key) for env_key in ("user", "password", "host", "port")
)

In [4]:
# Open the database connection. The settings are passed as keyword arguments
# rather than interpolated into a DSN string: a value containing spaces,
# quotes or backslashes (common in passwords) would otherwise corrupt the
# connection string, and an unset variable would be sent as the text "None".
conn = psycopg2.connect(
    dbname=db_name,
    user=user,
    password=password,
    host=host,
    port=port,
)

In [5]:
# Top 5 journals by summed citation count for works that have a works_mesh
# descriptor matching '%bronchitis%' (case-insensitive).
# The CTE collects the distinct matching work ids; the outer query joins them
# to works_locations (restricted to source_type = 'journal') and to works for
# cited_by_count, then groups by the journal display name.
# NOTE(review): a work with several journal location rows sharing the same
# source_display_name would contribute its cited_by_count once per row -
# confirm whether that can happen in this schema.
query = """
WITH bronchitis_works AS (
    SELECT DISTINCT w.id
    FROM works w
    JOIN works_mesh wm ON w.id = wm.work_id
    WHERE wm.descriptor_name ILIKE '%bronchitis%'
)
SELECT
    pl.source_display_name AS journal_name,
    SUM(w.cited_by_count) AS total_citations
FROM bronchitis_works bw
JOIN works_locations pl ON bw.id = pl.work_id
JOIN works w ON bw.id = w.id
WHERE pl.source_type = 'journal'
GROUP BY
    pl.source_display_name
ORDER BY
    total_citations DESC
LIMIT 5;
"""

In [None]:
# Execute the journal-citation query and print the result rows.
# The cursor is used as a context manager so it is closed even when the query
# fails, and a failed statement is rolled back so the shared connection is not
# left in an aborted transaction for the cells that follow.
try:
    with conn.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
except psycopg2.Error:
    conn.rollback()
    raise
print(rows)

In [None]:
# Disease label -> list of ILIKE patterns matched against the works_mesh
# descriptor name. A single-pattern entry is queried with a plain ILIKE,
# a multi-pattern entry with ILIKE ANY(ARRAY[...]).
dict_diseases = {
    "Bronchitis": ["%bronchitis%"],
    "Covid": ["%covid%", "%coronavirus%", "%sars-cov-2%", "%2019-ncov%"],
    "Alzheimer's": ["%alzheimer%", "%dementia%", "%neurodegenerative%"],
    "Diabetes": ["%diabetes%", "%hyperglycemia%"],
    "Lymphoma": ["%lymphoma%", "%Hodgkin%", "%lymphatic%"],
    "Sarcoma": ["%sarcoma%"],
}

In [None]:
def _sql_string_literal(value):
    """Return ``value`` as a single-quoted SQL string literal with embedded quotes doubled."""
    # Doubling the apostrophe is the standard SQL escape; without it a pattern
    # such as "%alzheimer's%" would terminate the literal early.
    return "'" + str(value).replace("'", "''") + "'"


def number_authors_query(keyword = None, lst_keywords = None):
    """Build the SQL that ranks institutions by number of matching publications.

    Despite the function name, the generated statement counts distinct works
    per corresponding institution (``num_publications``) and returns the top
    10 institutions. A work matches when its ``works_mesh.descriptor_name``
    satisfies the ILIKE pattern(s) supplied.

    Parameters
    ----------
    keyword : str, optional
        A single ILIKE pattern (e.g. ``'%bronchitis%'``). Takes precedence
        over ``lst_keywords`` when both are given.
    lst_keywords : iterable of str, optional
        Several ILIKE patterns; a work matches if any of them matches.

    Returns
    -------
    str
        A complete SQL statement, ready for ``cursor.execute(query)`` with no
        bound parameters.

    Raises
    ------
    ValueError
        If neither a keyword nor a non-empty list of keywords is supplied.
    """
    if keyword:
        predicate = f"ILIKE {_sql_string_literal(keyword)}"
    elif lst_keywords:
        formatted_keywords = ", ".join(_sql_string_literal(kw) for kw in lst_keywords)
        predicate = f"ILIKE ANY(ARRAY[{formatted_keywords}])"
    else:
        # An empty ARRAY[] is not valid SQL and a None list cannot be iterated,
        # so fail early with a clear message instead.
        raise ValueError("provide either `keyword` or a non-empty `lst_keywords`")

    # Only the descriptor predicate differs between the single- and
    # multi-keyword cases, so the statement is written once.
    return f"""
        WITH cardiovascular_related_works AS (
            SELECT DISTINCT id
            FROM works
            JOIN works_mesh wm ON works.id = wm.work_id
            WHERE wm.descriptor_name {predicate}
        ),
        work_institutions AS (
            SELECT
                crw.id AS work_id,
                unnest(w.corresponding_institution_ids) AS institution_id
            FROM cardiovascular_related_works crw
            JOIN works w ON crw.id = w.id
            WHERE w.corresponding_institution_ids IS NOT NULL
        )
        SELECT
            i.id AS institution_id,
            i.display_name AS institution_name,
            COUNT(DISTINCT wi.work_id) AS num_publications
        FROM work_institutions wi
        JOIN institutions i ON wi.institution_id = i.id
        GROUP BY
            i.id,
            i.display_name
        ORDER BY
            num_publications DESC
        LIMIT 10;
        """

In [None]:
# Run the institution ranking for every disease and report how long each
# query took.
for key , value in dict_diseases.items():
    print(f"running {key}")
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # skewed by system clock adjustments the way time.time() can.
    before_running = time.perf_counter()
    if len(value) == 1:
        query = number_authors_query(keyword = value[0])
    else:
        query = number_authors_query(lst_keywords = value)

    # Close the cursor after each query instead of leaking one per iteration,
    # and roll back on failure so the shared connection stays usable.
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
    except psycopg2.Error:
        conn.rollback()
        raise
    after_running = time.perf_counter()
    print(rows)
    print(f"Ran in {after_running - before_running} seconds")
    