In [None]:
import pandas , numpy
import psycopg2
from dotenv import load_dotenv
import os
import time

In [None]:
# Load the variables defined in a local .env file into the process environment,
# so the os.getenv() lookups for the database credentials can find them.
load_dotenv()

In [None]:
# Connection settings for the company database: the database name is fixed,
# while credentials and endpoint come from environment variables (None if unset).
db_name = "OpenAlex"
user, password, host, port = (
    os.getenv(env_key) for env_key in ("user", "password", "host", "port")
)

In [None]:
# Pass the settings as keyword arguments instead of interpolating them into a
# DSN string: psycopg2 then quotes each value itself, so a password containing
# spaces, quotes or backslashes cannot corrupt the connection string. Current
# psycopg2 releases also drop settings whose value is None, rather than sending
# the literal text "None" as the original f-string did.
conn = psycopg2.connect(
    dbname=db_name,
    user=user,
    password=password,
    host=host,
    port=port,
)

In [None]:
# # Earlier exploratory query, kept for reference: top 5 journals by number of
# # publications for a single MeSH descriptor keyword (here: bronchitis)
# query = """
# WITH bronchitis_works AS (
#     SELECT DISTINCT w.id
#     FROM works w
#     JOIN works_mesh wm ON w.id = wm.work_id
#     WHERE wm.descriptor_name ILIKE '%bronchitis%'
# )
# SELECT
#     pl.source_display_name AS journal_name,
#     COUNT(DISTINCT bw.id) AS num_publications
# FROM bronchitis_works bw
# JOIN works_locations pl ON bw.id = pl.work_id
# WHERE pl.source_type = 'journal'
# GROUP BY
#     pl.source_display_name
# ORDER BY
#     num_publications DESC
# LIMIT 5;
# """

In [None]:
# cursor = conn.cursor()
# cursor.execute(query)
# #cursor.execute(query , (["%cancer%"],))
# rows = cursor.fetchall()
# print(rows)

In [None]:
# Disease label -> list of ILIKE patterns ('%' is the SQL wildcard) that are
# matched against MeSH descriptor names when the queries are built.
dict_diseases = {
    "Lymphoma": ["%lymphoma%", "%Hodgkin%", "%lymphatic%"],
    "Sarcoma": ["%sarcoma%"],
}

In [None]:
def number_authors_query(keyword = None, lst_keywords = None):
    """Build the SQL text that ranks journals by publications matching MeSH keywords.

    Despite the function name, the generated query counts distinct works per
    journal (``num_publications``), not authors. It selects works whose
    ``works_mesh.descriptor_name`` matches the given ILIKE pattern(s), joins
    them to ``works_locations`` rows of type 'journal', and returns the ten
    journals with the most matching works.

    Parameters
    ----------
    keyword : str, optional
        A single ILIKE pattern, e.g. ``'%sarcoma%'``. Takes precedence over
        ``lst_keywords`` when truthy.
    lst_keywords : iterable of str, optional
        Several ILIKE patterns; a work matches if any one of them matches.

    Returns
    -------
    str
        A complete SQL statement, ready for ``cursor.execute(query)``.

    Raises
    ------
    ValueError
        If neither a keyword nor a non-empty list of keywords is supplied.
    """
    def _sql_literal(pattern):
        # Doubling embedded single quotes is the SQL-standard way to keep them
        # inside a string literal. Without it a pattern such as
        # "%alzheimer's%" terminates the literal early and breaks the query.
        # NOTE: for untrusted input, prefer psycopg2 query parameters instead.
        return "'" + str(pattern).replace("'", "''") + "'"

    if keyword:
        match_condition = f"ILIKE {_sql_literal(keyword)}"
    elif lst_keywords:
        formatted_keywords = ", ".join(_sql_literal(kw) for kw in lst_keywords)
        match_condition = f"ILIKE ANY(ARRAY[{formatted_keywords}])"
    else:
        # An empty ARRAY[] is not valid SQL here, so fail early with a clear message.
        raise ValueError("Provide either `keyword` or a non-empty `lst_keywords`.")

    # Single shared template: the two call styles differ only in the match condition.
    query = f"""
            WITH bronchitis_works AS (
                SELECT DISTINCT w.id
                FROM works w
                JOIN works_mesh wm ON w.id = wm.work_id
                WHERE wm.descriptor_name {match_condition}
            )
            SELECT
                pl.source_display_name AS journal_name,
                COUNT(DISTINCT bw.id) AS num_publications
            FROM bronchitis_works bw
            JOIN works_locations pl ON bw.id = pl.work_id
            WHERE pl.source_type = 'journal'
            GROUP BY
                pl.source_display_name
            ORDER BY
                num_publications DESC
            LIMIT 10;
            """
    return query

In [None]:
# Run the journal-ranking query for every disease and report how long each took.
for key, value in dict_diseases.items():
    print(f"running {key}")
    # perf_counter() is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments during these long-running queries.
    before_running = time.perf_counter()
    if len(value) == 1:
        query = number_authors_query(keyword=value[0])
    else:
        query = number_authors_query(lst_keywords=value)

    try:
        # The context manager closes the cursor even when the query fails,
        # instead of leaving one open cursor behind per iteration.
        with conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
    except psycopg2.Error:
        # A failed statement leaves the transaction aborted; roll back so the
        # connection stays usable for later cells, then surface the error.
        conn.rollback()
        raise
    after_running = time.perf_counter()
    print(rows)
    print(f"Ran in {after_running - before_running} seconds")
    

"""
True
running Bronchitis
[('CHEST Journal', 378), ('Thorax', 375), ('Avian Diseases', 316), ('BMJ', 279), ('American Journal of Respiratory and Critical Care Medicine', 203)]
Ran in 686.7531394958496 seconds
running Covid
[('PLoS ONE', 7300), ('International Journal of Environmental Research and Public Health', 6989), ('Scientific Reports', 4840), ('Frontiers in Public Health', 4512), ('Frontiers in Immunology', 2847)]
Ran in 748.5714433193207 seconds
running Alzheimer's
[('Journal of Alzheimer s Disease', 8106), ('Alzheimer s & Dementia', 4015), ('Neurology', 3992), ('Neurobiology of Aging', 3556), ('Journal of the American Geriatrics Society', 2569)]
Ran in 1170.3090727329254 seconds
"""