In [1]:
import pandas , numpy
import psycopg2
from dotenv import load_dotenv
import os
import time

In [2]:
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in the following cell can pick up the DB credentials.
load_dotenv()

True

In [3]:
# Connection settings for the company database.
# The database name is fixed; user, password, host and port are read from the
# environment variables of the same (lower-case) names and are None when unset.
db_name = "OpenAlex"
user, password, host, port = (
    os.environ.get(setting) for setting in ("user", "password", "host", "port")
)

In [4]:
# Open the PostgreSQL connection with keyword arguments instead of a
# hand-built DSN string. psycopg2 then does the quoting itself, so a password
# containing spaces, quotes or backslashes no longer corrupts the connection
# string, and settings that are None are left out rather than being sent as
# the literal text "None".
conn = psycopg2.connect(
    dbname=db_name,
    user=user,
    password=password,
    host=host,
    port=port,
)

In [5]:
def Execute_query(query , keyword = None , lst_array_keywords = None):
    """Run ``query`` on the module-level ``conn`` and return every result row.

    Parameters
    ----------
    query : str
        SQL text, optionally containing psycopg2 ``%s`` placeholders.
    keyword : optional
        Value(s) for the placeholders. A tuple, list or dict is handed to
        psycopg2 unchanged; anything else (typically a single string such as
        ``'%cancer%'``) is wrapped in a one-element tuple, because psycopg2
        requires the parameters to be a sequence or mapping even when there
        is only one value.
    lst_array_keywords : list, optional
        A list bound as ONE parameter (e.g. for ``ILIKE ANY(%s)``). When it
        is non-empty it takes precedence over ``keyword``.

    Returns
    -------
    list
        The result of ``cursor.fetchall()``.
    """
    if lst_array_keywords:
        # The whole list is a single bound value, hence the 1-tuple.
        params = (lst_array_keywords,)
    elif keyword:
        if isinstance(keyword, (tuple, list, dict)):
            params = keyword
        else:
            params = (keyword,)
    else:
        params = None

    # The context manager closes the cursor even if execute()/fetchall() raises.
    with conn.cursor() as cursor:
        if params is None:
            # No parameters: psycopg2 performs no %-interpolation, so literal
            # '%' characters in the SQL (e.g. ILIKE '%cancer%') stay intact.
            cursor.execute(query)
        else:
            cursor.execute(query, params)
        return cursor.fetchall()

In [None]:
# Disease label -> list of SQL ILIKE patterns for MeSH descriptor names.
# Only the bare stems are spelled out; the comprehension wraps each stem in
# '%' wildcards so every pattern is a substring match.
dict_disease_names = {
    disease: [f"%{stem}%" for stem in stems]
    for disease, stems in (
        ("Cancer", ("cancer", "carcinoma", "neoplasm", "tumor", "malignant")),
        (
            "Heart disease",
            (
                "heart",
                "cardiovascular",
                "cardiac",
                "coronary",
                "myocardial",
                "angina",
                "cardiomyopathy",
                "arteriosclerosis",
            ),
        ),
        ("Covid", ("covid", "coronavirus", "sars-cov-2", "2019-ncov")),
        ("Alzheimer's", ("alzheimer", "dementia", "neurodegenerative")),
        ("Diabetes", ("diabetes", "hyperglycemia")),
        ("Lymphoma", ("lymphoma", "Hodgkin", "lymphatic")),
        ("Sarcoma", ("sarcoma",)),
        ("Bronchitis", ("bronchitis",)),
        ("Liver Disease", ("liver", "hepatic", "cirrhosis", "hepatitis", "cholestasis")),
    )
}


In [7]:
#getting the top 5 authors
# NOTE: the bare triple-quoted string below is not assigned to anything. It only
# parks additional ILIKE conditions that could be appended to the WHERE clause
# of the cancer_related_mesh CTE in the query further down.
""" 
OR wm.descriptor_name ILIKE '%malignancy%'
        OR wm.descriptor_name ILIKE '%neoplasm%'
        OR wm.descriptor_name ILIKE '%tumor%'
        OR wm.descriptor_name ILIKE '%tumour%'
        OR wm.descriptor_name ILIKE '%malignant%'
        OR wm.descriptor_name ILIKE '%cancerous%'
"""

#cancer query
# Selects works that have a MeSH descriptor matching '%cancer%', groups them by
# author, and returns the five authors with the highest summed cited_by_count.
# works_count is the number of distinct matching works per author.
# The pattern is embedded literally, so run this with cursor.execute(query) and
# no parameter argument.
# NOTE(review): `query` is reassigned by the next cell before any execute()
# call, so as the cells stand this statement is never sent to the database.
# NOTE(review): if works_authorships can hold more than one row per
# (work, author) pair - e.g. one per affiliation - SUM(w.cited_by_count) counts
# that work's citations once per row while COUNT(DISTINCT w.id) does not;
# confirm against the table definition.
query = """
WITH cancer_related_mesh AS (
    SELECT DISTINCT work_id
    FROM works_mesh wm
    WHERE wm.descriptor_name ILIKE '%cancer%'
)
SELECT
    wa.author_id,
    wa.author_display_name,
    COUNT(DISTINCT w.id) AS works_count,
    COALESCE(SUM(w.cited_by_count), 0) AS total_citations
FROM works w
JOIN cancer_related_mesh crm ON w.id = crm.work_id
JOIN works_authorships wa ON w.id = wa.work_id
GROUP BY
    wa.author_id,
    wa.author_display_name
ORDER BY
    total_citations DESC
LIMIT 5;
"""


In [8]:
# Frequency table of MeSH descriptors: one row per distinct descriptor_name in
# works_mesh together with its row count, most frequent first.
# This assignment replaces whatever `query` held before.
query = """
        SELECT
                descriptor_name,
                COUNT(*) AS value_count
        FROM
                works_mesh
        GROUP BY
                descriptor_name
        ORDER BY
                value_count DESC;
        """

In [9]:
# Run the statement currently held in `query` and keep every result row.
# The cursor is opened in a `with` block so it is closed as soon as the rows
# have been fetched, even if execute() or fetchall() raises.
with conn.cursor() as cursor:
    cursor.execute(query)
    rows = cursor.fetchall()

# Print only the row count and a short preview; dumping the complete result
# list would flood the notebook output.
print(f"{len(rows)} rows fetched; first 10: {rows[:10]}")



In [10]:
# Number of rows returned by the most recently executed query.
len(rows)

30614

In [11]:
# Rebuild the result as a plain list of 2-tuples holding the first two columns
# of every fetched record, then preview the first 100 entries.
rows = [(record[0], record[1]) for record in list(rows)]
rows[:100]

[('Humans', 21871049),
 ('Female', 9829850),
 ('Male', 9539535),
 ('Animals', 7395060),
 ('Adult', 5569723),
 ('Middle Aged', 4831214),
 ('Aged', 3503840),
 ('Adolescent', 2251708),
 ('Child', 1952657),
 ('Mice', 1785997),
 ('Rats', 1661658),
 ('Brain', 1258560),
 ('Time Factors', 1223568),
 ('Retrospective Studies', 1194805),
 ('Treatment Outcome', 1194548),
 ('Neoplasms', 1159178),
 ('United States', 1114105),
 ('Liver', 1069867),
 ('Young Adult', 1058132),
 ('Aged, 80 and over', 1031844),
 ('Pregnancy', 1030116),
 ('Child, Preschool', 1001911),
 ('Risk Factors', 989919),
 ('Breast Neoplasms', 908916),
 ('Infant', 876989),
 ('Anti-Bacterial Agents', 873291),
 ('RNA, Messenger', 866384),
 ('Postoperative Complications', 776602),
 ('Antineoplastic Agents', 768816),
 ('Signal Transduction', 754541),
 ('Lung Neoplasms', 711540),
 ('Follow-Up Studies', 692791),
 ('Prospective Studies', 689264),
 ('Infant, Newborn', 688142),
 ('Neurons', 684407),
 ('Kidney', 642500),
 ('Hypertension', 6386

In [12]:
# A keyword is kept only when the descriptors containing it add up to more
# than this many rows (the counts come from the second column of `rows`).
MIN_MESH_HITS = 10

# Lower-case every descriptor once up front instead of once per keyword.
# A missing (None) descriptor name becomes "" so it cannot crash .lower().
descriptor_counts = [((name or "").lower(), count) for name, count in rows]

new_dict_disease_names = {}
for key, values in dict_disease_names.items():
    new_keywords = []
    print(f"\n--- {key} ---")
    for value in values:
        # Strip the SQL '%' wildcards to get the plain substring to look for.
        keyword_to_check = value.replace("%", "").lower()
        hits = [count for name, count in descriptor_counts if keyword_to_check in name]
        found_match = bool(hits)
        match_count = sum(hits)
        print(f"'{keyword_to_check}' found in mesh headings? {found_match}, Total occurrences: {match_count}")
        if found_match and match_count > MIN_MESH_HITS:
            new_keywords.append(value)
    new_dict_disease_names[key] = new_keywords

# From here on the keyword dictionary holds only the patterns that passed.
dict_disease_names = new_dict_disease_names

print(f"\n--- Final Keywords (with over {MIN_MESH_HITS} hits) ---")
for disease, keywords in dict_disease_names.items():
    print(f"{disease}: {keywords}")


--- Cancer ---
'cancer' found in mesh headings? True, Total occurrences: 225593
'carcinoma' found in mesh headings? True, Total occurrences: 2570463
'malignancy' found in mesh headings? False, Total occurrences: 0
'neoplasm' found in mesh headings? True, Total occurrences: 11388581
'tumor' found in mesh headings? True, Total occurrences: 2132309
'tumour' found in mesh headings? False, Total occurrences: 0
'malignant' found in mesh headings? True, Total occurrences: 50784

--- Heart disease ---
'heart' found in mesh headings? True, Total occurrences: 2395908
'cardiovascular' found in mesh headings? True, Total occurrences: 637731
'cardiac' found in mesh headings? True, Total occurrences: 707176
'coronary' found in mesh headings? True, Total occurrences: 1259672
'myocardial' found in mesh headings? True, Total occurrences: 805375
'angina' found in mesh headings? True, Total occurrences: 119060
'cardiomyopathy' found in mesh headings? True, Total occurrences: 121283
'arteriosclerosis' fo

In [13]:
# Display the keyword lists that remain after the hit-count filter above.
dict_disease_names

{'Cancer': ['%cancer%', '%carcinoma%', '%neoplasm%', '%tumor%', '%malignant%'],
 'Heart disease': ['%heart%',
  '%cardiovascular%',
  '%cardiac%',
  '%coronary%',
  '%myocardial%',
  '%angina%',
  '%cardiomyopathy%',
  '%arteriosclerosis%'],
 'Covid': ['%covid%', '%coronavirus%', '%sars-cov-2%', '%2019-ncov%'],
 "Alzheimer's": ['%alzheimer%', '%dementia%', '%neurodegenerative%'],
 'Diabetes': ['%diabetes%', '%hyperglycemia%'],
 'Lymphoma': ['%lymphoma%', '%Hodgkin%', '%lymphatic%'],
 'Sarcoma': ['%sarcoma%'],
 'Bronchitis': ['%bronchitis%'],
 'Liver Disease': ['%liver%',
  '%hepatic%',
  '%cirrhosis%',
  '%hepatitis%',
  '%cholestasis%']}

In [14]:
# [('https://openalex.org/A5000036388', 'Raymond K. H. Chan', 1, None), ('https://openalex.org/A5000014365', 'Ha Huu Phuoc Dang', 1, None), ('https://openalex.org/A5000022739', 'Marta Gazzaneo', 1, None), ('https://openalex.org/A5000001644', 'Z Pilawski', 1, None), ('https://openalex.org/A5000036901', 'Nicola Creighton', 1, None)]
# [('https://openalex.org/A5005301563', 'Alfonso Martin Bernabé', 1, None), ('https://openalex.org/A5007848067', 'Anne-Marie Waser', 1, None), ('https://openalex.org/A5011961771', 'Mathieu Balaguer', 1, None), ('https://openalex.org/A5013101884', 'Jérôme Farinas', 1, None), ('https://openalex.org/A5004327919', 'Christine Le Clainche', 1, None)]
# [('https://openalex.org/A5088652438', 'Saman Warnakulasuriya', 80, Decimal('8816')), ('https://openalex.org/A5112869712', 'Pelayo Correa', 45, Decimal('7000')), ('https://openalex.org/A5017681840', 'B C Morson', 21, Decimal('6439')), ('https://openalex.org/A5016235730', 'Vassilis G. Gorgoulis', 10, Decimal('6393')), ('https://openalex.org/A5055130061', 'Christopher P. Crum', 46, Decimal('6211'))]

# [('https://openalex.org/A5009198168', 'Ahmedin Jemal', 141, Decimal('182743')), ('https://openalex.org/A5047300895', 'Rebecca L. Siegel', 45, Decimal('149779')), ('https://openalex.org/A5015181760', 'Kimberly D. Miller', 15, Decimal('126160')), ('https://openalex.org/A5028882183', 'Markus Wallwiener', 4, Decimal('30571')), ('https://openalex.org/A5049945196', 'Sara Y. Brucker', 4, Decimal('30428'))]

In [15]:
#authors Heart disease

In [16]:
#authors covid 

In [17]:
#authors Alzheimer's

In [18]:
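#authors "Leukemia"  (TODO: dict_disease_names has no "Leukemia" entry, and its "Sarcoma" entry has no section here - reconcile the two lists)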

In [19]:
#authors diabetes

In [20]:
#authors lymphoma

In [21]:
#authors Bronchitis

In [22]:
#authors liver disease

In [None]:
--- Cancer ---
'cancer' found in mesh headings? True, Total occurrences: 225593
'carcinoma' found in mesh headings? True, Total occurrences: 2570463
'neoplasm' found in mesh headings? True, Total occurrences: 11388581
'tumor' found in mesh headings? True, Total occurrences: 2132309
'malignant' found in mesh headings? True, Total occurrences: 50784

--- Heart disease ---
'heart' found in mesh headings? True, Total occurrences: 2395908
'cardiovascular' found in mesh headings? True, Total occurrences: 637731
'cardiac' found in mesh headings? True, Total occurrences: 707176
'coronary' found in mesh headings? True, Total occurrences: 1259672
'myocardial' found in mesh headings? True, Total occurrences: 805375
'angina' found in mesh headings? True, Total occurrences: 119060
'cardiomyopathy' found in mesh headings? True, Total occurrences: 121283
'arteriosclerosis' found in mesh headings? True, Total occurrences: 159215

--- Covid ---
'covid' found in mesh headings? True, Total occurrences: 446109
'coronavirus' found in mesh headings? True, Total occurrences: 148074
'sars-cov-2' found in mesh headings? True, Total occurrences: 186928
'2019-ncov' found in mesh headings? True, Total occurrences: 1164

--- Alzheimer's ---
'alzheimer' found in mesh headings? True, Total occurrences: 317816
'dementia' found in mesh headings? True, Total occurrences: 196629
'neurodegenerative' found in mesh headings? True, Total occurrences: 65466


--- Diabetes ---
'diabetes' found in mesh headings? True, Total occurrences: 1240804
'hyperglycemia' found in mesh headings? True, Total occurrences: 77064

--- Lymphoma ---
'lymphoma' found in mesh headings? True, Total occurrences: 601345
'hodgkin' found in mesh headings? True, Total occurrences: 185079
'lymphatic' found in mesh headings? True, Total occurrences: 211781

--- Sarcoma ---
'sarcoma' found in mesh headings? True, Total occurrences: 454889

--- Bronchitis ---
'bronchitis' found in mesh headings? True, Total occurrences: 55699

--- Liver Disease ---
'liver' found in mesh headings? True, Total occurrences: 3093111
'hepatic' found in mesh headings? True, Total occurrences: 176796
'cirrhosis' found in mesh headings? True, Total occurrences: 268024
'hepatitis' found in mesh headings? True, Total occurrences: 840776
'cholestasis' found in mesh headings? True, Total occurrences: 74333