In [1]:
import pandas , numpy
import psycopg2
from dotenv import load_dotenv
import os
import time

In [2]:
# Read key/value pairs from a local .env file into the process environment so
# the os.getenv(...) lookups for the database settings can find them.
load_dotenv()

True

In [3]:
# Connection settings for the company database.
# The database name is fixed; the remaining values are read from the
# environment (each one is None when the corresponding variable is not set).
db_name = "OpenAlex"
user, password, host, port = (
    os.environ.get(setting) for setting in ("user", "password", "host", "port")
)

In [4]:
# Open the database connection used by every query in this notebook.
# The settings are passed as keyword arguments instead of being pasted into a
# DSN string: psycopg2 then takes care of quoting, so a password (or any other
# value) containing spaces, quotes or backslashes can no longer corrupt the
# connection string.
conn = psycopg2.connect(
    dbname=db_name,
    user=user,
    password=password,
    host=host,
    port=port,
)

In [5]:
# Example query (bronchitis): number of distinct works per publication year,
# 1990-2025, restricted to works that have a works_mesh row whose
# descriptor_name contains "bronchitis" (case-insensitive ILIKE match).
# Result columns: publication_year, publication_count, ordered by year.
query = """
WITH bronchitis_related_works AS (
    SELECT DISTINCT work_id
    FROM works_mesh wm
    WHERE wm.descriptor_name ILIKE '%bronchitis%'
)
SELECT
    w.publication_year,
    COUNT(DISTINCT w.id) AS publication_count
FROM works w
JOIN bronchitis_related_works brw ON w.id = brw.work_id
WHERE w.publication_year BETWEEN 1990 AND 2025
GROUP BY
    w.publication_year
ORDER BY
    w.publication_year;
"""

In [6]:
# Disabled one-off check: executes `query` on a new cursor from `conn`,
# fetches every row and prints them. Kept commented out so it does not run.
# cursor = conn.cursor()
# cursor.execute(query)
# #cursor.execute(query , (["%cancer%"],))
# rows = cursor.fetchall()
# print(rows)

In [7]:
# Disease label -> list of ILIKE patterns matched against the MeSH descriptor
# name. Insertion order is the order in which the diseases are queried.
dict_diseases = {
    "Covid": ["%covid%", "%coronavirus%", "%sars-cov-2%", "%2019-ncov%"],
    "Bronchitis": ["%bronchitis%"],
    "Alzheimer's": ["%alzheimer%", "%dementia%", "%neurodegenerative%"],
    "Diabetes": ["%diabetes%", "%hyperglycemia%"],
    "Lymphoma": ["%lymphoma%", "%Hodgkin%", "%lymphatic%"],
    "Sarcoma": ["%sarcoma%"],
    "Liver Disease": [
        "%liver%",
        "%hepatic%",
        "%cirrhosis%",
        "%hepatitis%",
        "%cholestasis%",
    ],
    "Heart disease": [
        "%heart%",
        "%cardiovascular%",
        "%cardiac%",
        "%coronary%",
        "%myocardial%",
        "%angina%",
        "%cardiomyopathy%",
        "%arteriosclerosis%",
    ],
    "Cancer": ["%cancer%", "%carcinoma%", "%neoplasm%", "%tumor%", "%malignant%"],
}

In [8]:
def number_publications_byyear(keyword = None, lst_keywords = None, start_year = 1990, end_year = 2025):
    """Build the SQL that counts distinct works per publication year.

    Only works that have a ``works_mesh`` row whose ``descriptor_name`` matches
    the given ILIKE pattern(s) are counted, and only publication years within
    ``start_year``..``end_year`` (inclusive) are reported.

    Parameters
    ----------
    keyword : str, optional
        A single ILIKE pattern such as ``'%bronchitis%'``. When truthy it takes
        precedence over ``lst_keywords``.
    lst_keywords : iterable of str, optional
        Several ILIKE patterns; a work matches if any one of them matches.
    start_year, end_year : int
        Inclusive publication-year range (defaults: 1990 and 2025).

    Returns
    -------
    str
        A complete SQL statement yielding ``(publication_year,
        publication_count)`` rows ordered by year. It is meant to be executed
        without bound parameters, so the ``%`` wildcards are embedded verbatim.

    Raises
    ------
    ValueError
        If neither a keyword nor a non-empty list of keywords is supplied.
        (Previously this surfaced as a ``TypeError`` or as invalid
        ``ARRAY[]`` SQL.)

    Notes
    -----
    The patterns are embedded as SQL string literals. Embedded single quotes
    are doubled so that a pattern such as ``"%alzheimer's%"`` neither breaks
    nor alters the statement; this relies on PostgreSQL's standard-conforming
    string literals. For untrusted input, driver-side parameter binding is
    still the safer choice.
    """
    if keyword:
        patterns = [keyword]
    elif lst_keywords:
        patterns = list(lst_keywords)
    else:
        raise ValueError("provide a non-empty keyword or a non-empty lst_keywords")

    # Escape each pattern as a SQL string literal ('' is a literal single quote).
    literals = ["'" + str(pattern).replace("'", "''") + "'" for pattern in patterns]

    # Keep the two historical query shapes: a plain ILIKE for a single keyword,
    # ILIKE ANY(ARRAY[...]) for a list of keywords.
    if keyword:
        match_condition = f"ILIKE {literals[0]}"
    else:
        match_condition = f"ILIKE ANY(ARRAY[{', '.join(literals)}])"

    # int() guarantees that only integers end up in the year filter.
    first_year = int(start_year)
    last_year = int(end_year)

    query = f"""
            WITH disease_related_works AS (
                SELECT DISTINCT work_id
                FROM works_mesh wm
                WHERE wm.descriptor_name {match_condition}
            )
            SELECT
                w.publication_year,
                COUNT(DISTINCT w.id) AS publication_count
            FROM works w
            JOIN disease_related_works drw ON w.id = drw.work_id
            WHERE w.publication_year BETWEEN {first_year} AND {last_year}
            GROUP BY
                w.publication_year
            ORDER BY
                w.publication_year;
            """
    return query

In [None]:
# Run the publications-per-year query for every disease in dict_diseases.
# Each result set is printed and also stored in `publication_counts`
# (disease label -> list of (publication_year, publication_count) rows), so the
# slow queries do not have to be re-run just to look at the numbers again.
publication_counts = {}
for key, value in dict_diseases.items():
    print(f"running {key}")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during these multi-minute queries.
    before_running = time.perf_counter()
    if len(value) == 1:
        query = number_publications_byyear(keyword=value[0])
    else:
        query = number_publications_byyear(lst_keywords=value)

    try:
        # The context manager closes the cursor even when the query fails,
        # instead of leaving one open cursor behind per disease.
        with conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
    except psycopg2.Error:
        # A failed statement leaves the connection in an aborted transaction;
        # roll back so the connection stays usable, then surface the error.
        conn.rollback()
        raise
    after_running = time.perf_counter()

    publication_counts[key] = rows
    print(rows)
    print(f"Ran in {after_running - before_running} seconds")
#complete

running Covid
[(Decimal('1990'), 14), (Decimal('1991'), 9), (Decimal('1992'), 25), (Decimal('1993'), 48), (Decimal('1994'), 138), (Decimal('1995'), 149), (Decimal('1996'), 89), (Decimal('1997'), 106), (Decimal('1998'), 138), (Decimal('1999'), 106), (Decimal('2000'), 108), (Decimal('2001'), 146), (Decimal('2002'), 97), (Decimal('2003'), 231), (Decimal('2004'), 329), (Decimal('2005'), 336), (Decimal('2006'), 382), (Decimal('2007'), 260), (Decimal('2008'), 255), (Decimal('2009'), 248), (Decimal('2010'), 215), (Decimal('2011'), 191), (Decimal('2012'), 252), (Decimal('2013'), 386), (Decimal('2014'), 509), (Decimal('2015'), 598), (Decimal('2016'), 550), (Decimal('2017'), 487), (Decimal('2018'), 444), (Decimal('2019'), 548), (Decimal('2020'), 49256), (Decimal('2021'), 69529), (Decimal('2022'), 69634), (Decimal('2023'), 44799), (Decimal('2024'), 27073), (Decimal('2025'), 3060)]
Ran in 413.11091017723083 seconds
running Bronchitis
[(Decimal('1990'), 460), (Decimal('1991'), 394), (Decimal('1992'), 335), (Decimal('1993'), 297), (Decimal('1994'), 283), (Decimal('1995'), 273), (Decimal('1996'), 276), (Decimal('1997'), 279), (Decimal('1998'), 316), (Decimal('1999'), 278), (Decimal('2000'), 341), (Decimal('2001'), 347), (Decimal('2002'), 265), (Decimal('2003'), 270), (Decimal('2004'), 298), (Decimal('2005'), 269), (Decimal('2006'), 288), (Decimal('2007'), 267), (Decimal('2008'), 259), (Decimal('2009'), 222), (Decimal('2010'), 230), (Decimal('2011'), 232), (Decimal('2012'), 245), (Decimal('2013'), 229), (Decimal('2014'), 225), (Decimal('2015'), 250), (Decimal('2016'), 215), (Decimal('2017'), 203), (Decimal('2018'), 185), (Decimal('2019'), 195), (Decimal('2020'), 162), (Decimal('2021'), 149), (Decimal('2022'), 224), (Decimal('2023'), 170), (Decimal('2024'), 142), (Decimal('2025'), 18)]
Ran in 122.43095064163208 seconds
running Alzheimer's
[(Decimal('1990'), 1794), (Decimal('1991'), 1972), (Decimal('1992'), 1911), (Decimal('1993'), 1963), (Decimal('1994'), 2401), (Decimal('1995'), 2457), (Decimal('1996'), 2641), (Decimal('1997'), 2993), (Decimal('1998'), 3094), (Decimal('1999'), 3189), (Decimal('2000'), 3568), (Decimal('2001'), 3717), (Decimal('2002'), 4160), (Decimal('2003'), 4326), (Decimal('2004'), 4764), (Decimal('2005'), 4747), (Decimal('2006'), 5074), (Decimal('2007'), 5039), (Decimal('2008'), 5490), (Decimal('2009'), 5893), (Decimal('2010'), 6218), (Decimal('2011'), 6609), (Decimal('2012'), 6913), (Decimal('2013'), 7386), (Decimal('2014'), 7668), (Decimal('2015'), 7929), (Decimal('2016'), 8509), (Decimal('2017'), 8715), (Decimal('2018'), 9209), (Decimal('2019'), 9828), (Decimal('2020'), 9527), (Decimal('2021'), 10436), (Decimal('2022'), 12406), (Decimal('2023'), 12646), (Decimal('2024'), 11396), (Decimal('2025'), 1265)]
Ran in 319.98187232017517 seconds
running Diabetes
[(Decimal('1990'), 4638), (Decimal('1991'), 4614), (Decimal('1992'), 4639), (Decimal('1993'), 4825), (Decimal('1994'), 4899), (Decimal('1995'), 4917), (Decimal('1996'), 5293), (Decimal('1997'), 5801), (Decimal('1998'), 5851), (Decimal('1999'), 6185), (Decimal('2000'), 6559), (Decimal('2001'), 7244), (Decimal('2002'), 8480), (Decimal('2003'), 8950), (Decimal('2004'), 9825), (Decimal('2005'), 10519), (Decimal('2006'), 11028), (Decimal('2007'), 11270), (Decimal('2008'), 12573), (Decimal('2009'), 13314), (Decimal('2010'), 13662), (Decimal('2011'), 14351), (Decimal('2012'), 15146), (Decimal('2013'), 16105), (Decimal('2014'), 16770), (Decimal('2015'), 17047), (Decimal('2016'), 17393), (Decimal('2017'), 16692), (Decimal('2018'), 17158), (Decimal('2019'), 17348), (Decimal('2020'), 16967), (Decimal('2021'), 18752), (Decimal('2022'), 22828), (Decimal('2023'), 21980), (Decimal('2024'), 15908), (Decimal('2025'), 1872)]
Ran in 206.98371267318726 seconds
running Lymphoma
[(Decimal('1990'), 5211), (Decimal('1991'), 5015), (Decimal('1992'), 5212), (Decimal('1993'), 5185), (Decimal('1994'), 5427), (Decimal('1995'), 5544), (Decimal('1996'), 5657), (Decimal('1997'), 5765), (Decimal('1998'), 5831), (Decimal('1999'), 5898), (Decimal('2000'), 6433), (Decimal('2001'), 6364), (Decimal('2002'), 6778), (Decimal('2003'), 7314), (Decimal('2004'), 7157), (Decimal('2005'), 7739), (Decimal('2006'), 7842), (Decimal('2007'), 8204), (Decimal('2008'), 8735), (Decimal('2009'), 8438), (Decimal('2010'), 8934), (Decimal('2011'), 9499), (Decimal('2012'), 9543), (Decimal('2013'), 10051), (Decimal('2014'), 10195), (Decimal('2015'), 10565), (Decimal('2016'), 9581), (Decimal('2017'), 9140), (Decimal('2018'), 9243), (Decimal('2019'), 9067), (Decimal('2020'), 7519), (Decimal('2021'), 7860), (Decimal('2022'), 8085), (Decimal('2023'), 7627), (Decimal('2024'), 6447), (Decimal('2025'), 690)]
Ran in 288.29337525367737 seconds
running Sarcoma
[(Decimal('1990'), 2369), (Decimal('1991'), 2327), (Decimal('1992'), 2223), (Decimal('1993'), 2334), (Decimal('1994'), 2299), (Decimal('1995'), 2489), (Decimal('1996'), 2499), (Decimal('1997'), 2523), (Decimal('1998'), 2508), (Decimal('1999'), 2469), (Decimal('2000'), 2499), (Decimal('2001'), 2494), (Decimal('2002'), 2509), (Decimal('2003'), 2738), (Decimal('2004'), 2702), (Decimal('2005'), 2859), (Decimal('2006'), 2769), (Decimal('2007'), 2878), (Decimal('2008'), 2927), (Decimal('2009'), 2908), (Decimal('2010'), 2979), (Decimal('2011'), 3119), (Decimal('2012'), 3024), (Decimal('2013'), 3262), (Decimal('2014'), 3439), (Decimal('2015'), 3493), (Decimal('2016'), 3150), (Decimal('2017'), 3220), (Decimal('2018'), 3270), (Decimal('2019'), 3300), (Decimal('2020'), 2894), (Decimal('2021'), 3062), (Decimal('2022'), 3497), (Decimal('2023'), 3072), (Decimal('2024'), 2479), (Decimal('2025'), 302)]
Ran in 96.31538367271423 seconds
running Liver Disease

running Covid
[(Decimal('1990'), 14), (Decimal('1991'), 9), (Decimal('1992'), 25), (Decimal('1993'), 48), (Decimal('1994'), 138), (Decimal('1995'), 149), (Decimal('1996'), 89), (Decimal('1997'), 106), (Decimal('1998'), 138), (Decimal('1999'), 106), (Decimal('2000'), 108), (Decimal('2001'), 146), (Decimal('2002'), 97), (Decimal('2003'), 231), (Decimal('2004'), 329), (Decimal('2005'), 336), (Decimal('2006'), 382), (Decimal('2007'), 260), (Decimal('2008'), 255), (Decimal('2009'), 248), (Decimal('2010'), 215), (Decimal('2011'), 191), (Decimal('2012'), 252), (Decimal('2013'), 386), (Decimal('2014'), 509), (Decimal('2015'), 598), (Decimal('2016'), 550), (Decimal('2017'), 487), (Decimal('2018'), 444), (Decimal('2019'), 548), (Decimal('2020'), 49256), (Decimal('2021'), 69529), (Decimal('2022'), 69634), (Decimal('2023'), 44799), (Decimal('2024'), 27073), (Decimal('2025'), 3060)]
Ran in 413.11091017723083 seconds
running Bronchitis
[(Decimal('1990'), 460), (Decimal('1991'), 394), (Decimal('1992'