In [325]:
import json
from collections import Counter, defaultdict 

import numpy as np
import pandas as pd
import ast




from sklearn.feature_extraction.text import TfidfVectorizer


import numpy as np
from sklearn.metrics import mean_squared_error
import sys
import os
import openai
import time
from tqdm import tqdm
from kneed import KneeLocator




In [326]:
import warnings
warnings.filterwarnings("ignore")

In [327]:
openai_api_key = '' #enter your API key

In [328]:

def load_mesh_trees(meshtree_file):
    """
    Load a MeSH tree file and build ID-to-name and name-to-ID mappings.

    Parameters
    ----------
    meshtree_file : str
        Path to the MeSH tree file (e.g., "mtrees2024.bin").
        Each line in the file is expected to be formatted as:
            <term>;<tree_id>

    Returns
    -------
    mesh_id2name : dict
        Dictionary mapping MeSH tree IDs (e.g., "A01.111") to term names.
    
    mesh_name2id : dict of lists
        Dictionary mapping MeSH term names to a list of associated MeSH IDs.

    Notes
    -----
    The function also injects two additional MeSH-like IDs:
        NEWID1 → Female  
        NEWID2 → Male
    """
    mesh_id2name = {}
    mesh_name2id = defaultdict(list)


    with open(meshtree_file, "r") as ftree:
        for line in ftree:
            term, tree_id = line.strip().split(";")

            mesh_id2name[tree_id] = term
            mesh_name2id[term].append(tree_id)

    # Add the two extra synthetic entries
    extra_entries = {
        'NEWID1': 'Female',
        'NEWID2': 'Male'
    }

    for mid, term in extra_entries.items():
        mesh_id2name[mid] = term
        mesh_name2id[term] = [mid]

    return mesh_id2name, mesh_name2id


In [329]:
def process_mesh_terms(csv_file, mesh_id2name, mesh_name2id):
    """
    Process researcher MeSH terms into hierarchical MeSH ancestor codes.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file containing researcher MeSH terms.
        Expected columns:
            - First Name
            - Last Name
            - PMID
            - Person ID
            - mesh_term   (format: "Term / Subheading" or just "Term")

    mesh_id2name : dict
        Mapping from MeSH ID → MeSH term name.
        Example: {"A01.111": "Heart"}.

    mesh_name2id : dict or defaultdict(list)
        Mapping from MeSH term name → list of associated MeSH IDs.
        Example: {"Heart": ["A01.111", "A01.112"]}.

    Returns
    -------
    df_mesh_term : pandas.DataFrame
        A cleaned dataframe with one row per:
            (author × PMID × ancestor MeSH term)
        Columns:
            First Name, Last Name, PMID,
            Person ID, ancestor_mesh_term

    Notes
    -----
    • Handles MeSH terms with and without subheadings.
    • Automatically assigns:
          Female → NEWID1
          Male   → NEWID2
    • Builds full hierarchical ancestor chains for every MeSH ID.
    """

    # --------------------------------------------------
    # Helper function: split mesh term into main + subheadings
    # --------------------------------------------------
    def split_mesh_term(mesh_term):
        parts = mesh_term.split(" / ")
        if len(parts) > 1:
            return parts[0], parts[1:]
        return parts[0], []

    # --------------------------------------------------
    # Helper function: recursively find ancestors
    # --------------------------------------------------
    def find_mesh_term_ancestors(tid, ancestor_ids):
        ancestors = []
        mesh_name = mesh_id2name[tid]

        # Add current term
        ancestors.append(mesh_name)

        # Get all IDs associated with the same MeSH term
        ids_for_name = mesh_name2id[mesh_name]
        ancestor_ids += ids_for_name

        # Parent IDs = chop off last part of MeSH tree number
        parent_ids = [".".join(mid.split(".")[:-1]) for mid in ids_for_name]

        for pid in parent_ids:
            if pid and pid not in ancestor_ids:
                new_ancestors, ancestor_ids = find_mesh_term_ancestors(pid, ancestor_ids)
                ancestors.extend(new_ancestors)
                ancestors = list(set(ancestors))  # unique
        return ancestors, ancestor_ids

    # --------------------------------------------------
    # Get only ancestor names
    # --------------------------------------------------
    def create_mesh_term_hierarchical_codes(tid):
        ancestors, _ = find_mesh_term_ancestors(tid, [])
        return ancestors

    # --------------------------------------------------
    # Load CSV
    # --------------------------------------------------
    df_mesh = pd.read_csv(csv_file)

    # Split "Term / Subheading"
    df_mesh['mesh_term_only'], df_mesh['mesh_subheading'] = zip(
        *df_mesh['MeSH Term'].apply(split_mesh_term)
    )

    # Map main term → ID(s)
    df_mesh['mesh_term_only_id'] = df_mesh['mesh_term_only'].apply(lambda x: mesh_name2id[x])

    # Assign new synthetic IDs for Female / Male
    df_mesh.loc[df_mesh['mesh_term_only'] == 'Female', 'mesh_term_only_id'] = 'NEWID1'
    df_mesh.loc[df_mesh['mesh_term_only'] == 'Male',   'mesh_term_only_id'] = 'NEWID2'

    # Expand list of IDs to one per row
    df_mesh = df_mesh.explode("mesh_term_only_id")

    # Increase recursion depth (important)
    sys.setrecursionlimit(5000)

    # Compute full MeSH ancestor hierarchy
    df_mesh['ancestor_mesh_term'] = df_mesh['mesh_term_only_id'].apply(
        create_mesh_term_hierarchical_codes
    )

    # Expand ancestors to one per row
    df_mesh = df_mesh.explode('ancestor_mesh_term')

    # Select final output columns
    df_mesh = df_mesh[
        ['First Name', 'Last Name', 'PMID',
         'Person ID', 'ancestor_mesh_term']
    ]

    # Remove duplicates
    df_mesh.drop_duplicates(inplace=True)

    return df_mesh


In [330]:

def filter_low_frequency_mesh_terms(df_mesh_term, min_frequency=2):
    """
    Filter out MeSH ancestor terms that occur less than a specified frequency
    for each researcher.

    Parameters
    ----------
    df_mesh_term : pandas.DataFrame
        DataFrame containing at least:
            - Person ID
            - ancestor_mesh_term

    min_frequency : int, optional (default = 2)
        Minimum number of occurrences required for a researcher to keep a term.
        Terms occurring fewer times than this per Person ID are removed.

    Returns
    -------
    df_filtered : pandas.DataFrame
        DataFrame with columns:
            Person ID, ancestor_mesh_term, count
        containing only the MeSH terms that meet the frequency threshold.
    """

    # Compute frequency of each ancestor term per researcher
    df_freq = (
        df_mesh_term
        .groupby('Person ID')['ancestor_mesh_term']
        .value_counts()
        .rename('count')
        .reset_index()
    )

    # Keep terms at or above frequency threshold
    df_filtered = df_freq[df_freq['count'] >= min_frequency]

    return df_filtered


In [331]:

def categorize_mesh_terms(
    df_mesh_term_freq,
    mesh_id2name,
    mesh_name2id,
    class_file="level01_mesh_selected_class_0702_2100.xlsx"
):
    """
    Categorize MeSH ancestor terms into 'Health Domain' (H) and 'Method' (M)
    classes using a manually tagged level-01 class file, then propagate
    categories to child terms based on MeSH parent–child relationships.

    Parameters
    ----------
    df_mesh_term_freq : pandas.DataFrame
        Output of filter_low_frequency_mesh_terms(), must contain:
            - ancestor_mesh_term

    mesh_id2name : dict
        Mapping MeSH ID -> MeSH term name

    mesh_name2id : dict or defaultdict(list)
        Mapping MeSH term name -> list of MeSH IDs

    class_file : str
        Excel file mapping MeSH terms to top-level class labels.
        Must contain:
            - 'name'  (MeSH term)
            - 'Class' ('H' or 'M')

    Returns
    -------
    mesh_names_health_domain : list
        All MeSH term names categorized under Health Domain (H)

    mesh_names_method : list
        All MeSH term names categorized under Methods (M)
    """

    # ------------------------------
    # Load class definitions
    # ------------------------------
    df_class = pd.read_excel(class_file, index_col=0)

    mesh_names_health_domain_manual = list(df_class[df_class['Class'] == 'H']['name'])
    mesh_names_method_manual = list(df_class[df_class['Class'] == 'M']['name'])

    # -------------------------------------------
    # Get all unique ancestor terms from freq df
    # -------------------------------------------
    df_temp = df_mesh_term_freq[['ancestor_mesh_term']].drop_duplicates()

    # Map term → IDs
    df_temp['term_id'] = df_temp['ancestor_mesh_term'].apply(lambda x: mesh_name2id[x])
    df_temp = df_temp.explode('term_id')

    # Sort IDs by length so parents come first
    df_temp['length'] = df_temp['term_id'].apply(len)
    df_temp = df_temp.sort_values(by='length').drop_duplicates()

    term_ids = list(df_temp['term_id'])

    # -------------------------------------------
    # Initialize category lists
    # -------------------------------------------
    mesh_names_health_domain = mesh_names_health_domain_manual.copy()
    mesh_names_method = mesh_names_method_manual.copy()

    # -------------------------------------------
    # Categorize by tracing parent IDs
    # -------------------------------------------
    for tid in term_ids:
        term_name = mesh_id2name[tid]

        # Skip terms already categorized manually
        if (term_name in mesh_names_health_domain_manual) or (term_name in mesh_names_method):
            continue

        parts = tid.split(".")
        if len(parts) >= 2:
            parent_id = ".".join(parts[:-1])
            parent_name = mesh_id2name[parent_id]

            # Inherit health domain from parent
            if parent_name in mesh_names_health_domain:
                mesh_names_health_domain.append(term_name)

            # Inherit method class from parent
            if parent_name in mesh_names_method:
                mesh_names_method.append(term_name)

    # Remove duplicates
    mesh_names_health_domain = list(set(mesh_names_health_domain))
    mesh_names_method = list(set(mesh_names_method))

    return mesh_names_health_domain, mesh_names_method


In [332]:
def remove_meaningless_mesh_terms(df_mesh_term_freq, mesh_id2name):
    """
    Remove meaningless or overly broad MeSH ancestor terms from the
    df_mesh_term_freq DataFrame.

    This function filters out:
        • a predefined list of non-informative MeSH terms
        • any MeSH terms whose MeSH ID begins with 'Z01' (automatically added)

    Parameters
    ----------
    df_mesh_term_freq : pandas.DataFrame
        DataFrame containing at least:
            - ancestor_mesh_term

    mesh_id2name : dict
        Mapping from MeSH ID → MeSH term name

    Returns
    -------
    df_filtered : pandas.DataFrame
        DataFrame with meaningless ancestor terms removed.
    """

    # Base list of meaningless or overly broad MeSH terms
    meaningless_mesh = [
        'Eukaryota', 'Animals', 'Chordata', 'Vertebrates', 'Mammals', 'Eutheria',
        'Primates', 'Haplorhini', 'Catarrhini', 'Hominidae', 'Humans',
        'Natural Science Disciplines', 'Science', 'Research', 'Methods',
        'Investigative Techniques', 'Persons', 'Health Occupations',
        'Equipment and Supplies', 'Electrical Equipment and Supplies',
        'Biomedical Research', 'Household Products', 'Photography',
        'Financial Management', 'life', 'Medicine', 'Diseases'
    ]

    # Add all terms whose MeSH ID starts with Z01
    for mid, name in mesh_id2name.items():
        if mid.startswith('Z01'):
            meaningless_mesh.append(name)

    # Filter out all meaningless terms
    df_filtered = df_mesh_term_freq[
        ~df_mesh_term_freq['ancestor_mesh_term'].isin(meaningless_mesh)
    ].copy()

    return df_filtered


In [333]:
import os
import pandas as pd

def export_mesh_term_frequency_by_category(
        df_mesh,
        df_mesh_term_freq,
        mesh_names_health_domain,
        mesh_names_method,
        directory
    ):
    """
    Export MeSH term frequency tables for Health Domain and Method categories,
    merging researcher information and saving results to Excel files.

    Parameters
    ----------
    df_mesh : pandas.DataFrame
        Must contain:
            - First Name
            - Last Name
            - Person ID

    df_mesh_term_freq : pandas.DataFrame
        Must contain:
            - Person ID
            - ancestor_mesh_term
            - count

    mesh_names_health_domain : list
        List of MeSH terms classified as Health Domain.

    mesh_names_method : list
        List of MeSH terms classified as Methods.

    directory : str
        Folder where Excel files will be saved.

    Returns
    -------
    mesh_term_freq_list_health_domain : list of lists
        Health domain term frequencies with researcher names included.

    mesh_term_freq_list_method : list of lists
        Method term frequencies with researcher names included.
    """

    # --------------------------
    # Unique researcher info
    # --------------------------
    df_person = df_mesh[['First Name', 'Last Name', 'Person ID']].drop_duplicates()

    # --------------------------
    # Filter categories
    # --------------------------
    df_health = df_mesh_term_freq[
        df_mesh_term_freq['ancestor_mesh_term'].isin(mesh_names_health_domain)
    ].merge(df_person, on='Person ID')

    df_method = df_mesh_term_freq[
        df_mesh_term_freq['ancestor_mesh_term'].isin(mesh_names_method)
    ].merge(df_person, on='Person ID')

    # --------------------------
    # Save Excel files
    # --------------------------
    os.makedirs(directory, exist_ok=True)

    df_health.to_excel(
        os.path.join(directory, "mesh_term_freq_per_faculty_HealthDomain.xlsx"),
        index=False
    )
    df_method.to_excel(
        os.path.join(directory, "mesh_term_freq_per_faculty_Method.xlsx"),
        index=False
    )

    # --------------------------
    # Convert to list format
    # --------------------------
    mesh_term_freq_list_health_domain = df_health.values.tolist()
    mesh_term_freq_list_method = df_method.values.tolist()

    return mesh_term_freq_list_health_domain, mesh_term_freq_list_method


In [334]:
from collections import defaultdict

def build_researcher_mesh_string(mesh_term_freq_list):
    """
    Build a dictionary mapping Person ID → semicolon-separated MeSH terms,
    where each term is repeated according to its frequency.

    Accepts rows that contain more than three columns, such as:
        [Person ID, term, freq, First Name, Last Name]

    Only the first three fields are used:
        Person ID, term, freq

    Parameters
    ----------
    mesh_term_freq_list : list of lists
        Each row must start with:
            [Person ID, term, freq, ...]

    Returns
    -------
    dict_researcher_mesh : dict
        Mapping:
            Person ID → "term;term;term2;term2;..."
    """

    dict_researcher_mesh = defaultdict(str)

    for row in mesh_term_freq_list:
        # Safely unpack only the first three values
        person_id, term, freq = row[:3]

        # Repeat term freq times and append
        dict_researcher_mesh[person_id] += (term + ";") * int(freq)

    # Remove trailing semicolon from each entry
    dict_researcher_mesh = {
        pid: terms.rstrip(";")
        for pid, terms in dict_researcher_mesh.items()
    }

    return dict_researcher_mesh


In [335]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict


def run_mesh_tfidf(
        dict_researcher_mesh,
        df_person,
        directory,
        postfix="HealthDomain"
    ):
    """
    Compute TF-IDF for researcher MeSH term profiles.

    Parameters
    ----------
    dict_researcher_mesh : dict
        Mapping: Person ID -> "term1;term1;term2;term3;..."
        (Repeated terms represent frequencies)

    df_person : pandas.DataFrame
        Contains researcher identity columns:
            - Person ID
            - First Name
            - Last Name

    directory : str
        Output folder where CSV/Excel files will be saved.

    postfix : str
        Labels output files, e.g., "HealthDomain" or "Method".



    Outputs (saved to disk)
    -----------------------
    1. term_per_researcher_tfidf_{postfix}.csv


    Returns
    -------
    df_terms_tfidf : pandas.DataFrame
        TF-IDF table (long format, term × faculty)


    """

    # ---------------------------------------------------------
    # Tokenizer (split text by semicolon)
    # ---------------------------------------------------------
    def custom_tokenizer(text):
        return [token.strip() for token in text.split(';') if token.strip()]

    # ---------------------------------------------------------
    # Build corpus and faculty IDs
    # ---------------------------------------------------------
    corpus = list(dict_researcher_mesh.values())
    faculty_ids = list(dict_researcher_mesh.keys())

    # ---------------------------------------------------------
    # TF-IDF vectorizer
    # ---------------------------------------------------------
    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
    X = vectorizer.fit_transform(corpus)

    # ---------------------------------------------------------
    # Build TF-IDF dataframe: long format (term, faculty, score)
    # ---------------------------------------------------------
    df_terms_tfidf = pd.DataFrame(X.toarray())
    df_terms_tfidf["Person ID"] = faculty_ids
    df_terms_tfidf.set_index("Person ID", inplace=True)

    df_terms_tfidf = df_terms_tfidf.T
    df_terms_tfidf["mesh_term"] = vectorizer.get_feature_names_out()
    df_terms_tfidf.set_index("mesh_term", inplace=True)

    df_terms_tfidf = pd.DataFrame(df_terms_tfidf.unstack())
    df_terms_tfidf.reset_index(inplace=True)

    df_terms_tfidf.rename(columns={
        "level_0": "Person ID",
        0: "tfidf_score"
    }, inplace=True)

    # Add researcher names
    df_terms_tfidf = df_terms_tfidf.merge(df_person, on="Person ID")
    df_terms_tfidf["name"] = df_terms_tfidf["First Name"] + " " + df_terms_tfidf["Last Name"]
    df_terms_tfidf.drop(["Person ID", "First Name", "Last Name"], axis=1, inplace=True)

    # Save TF-IDF table
    os.makedirs(directory, exist_ok=True)
    df_terms_tfidf.to_csv(os.path.join(directory, f"term_per_researcher_tfidf_{postfix}.csv"), index=False)

    
    return df_terms_tfidf


In [336]:
from openai import OpenAI
import os
client = OpenAI(api_key = openai_api_key)
def generate_gpt4_response(content, print_output=False):
    try:
        completions = client.chat.completions.create( #a method that allows you to generate text-based chatbot responses using a pre-trained GPT language model.
            model="gpt-4o", 
            top_p = 0.1,
            temperature = 0, #controls the level of randomness or creativity in the generated text; . A higher temperature value will result in a more diverse and creative output, as it increases the probability of sampling lower probability tokens. 
    #         max_tokens = 2000, #controls the maximum number of tokens (words or subwords) in the generated text.
    #         stop = ['###'], #specifies a sequence of tokens that the GPT model should stop generating text when it encounters
            n = 1, #the number of possible chat completions or responses that the GPT model should generate in response to a given prompt
            messages=[
                {'role':'system', 'content': 'You are a dean of a college.'},
              {'role':'user', 'content': content},
              ])

        # Displaying the output can be helpful if things go wrong
        if print_output:
            print(completions)

        # Return the first choice's text
        return completions.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [337]:
def generate_research_focus_summaries(
        postfix,
        directory,
        generate_gpt4_response,
        df_person,
        elbow_S=5
    ):
    """
    Generate research focus summaries for each researcher based on TF-IDF MeSH terms.

    Parameters
    ----------
    postfix : str
        "Method" or "HealthDomain"
        Determines:
            - which TF-IDF file to load
            - how the summary sentence is phrased

    directory : str
        Folder where input files are located and output pickles will be saved.

    generate_gpt4_response : function
        A function that sends a prompt to GPT-4 and returns the response text.
        Must return None on failure, so the retry loop can handle errors.

    df_person : pandas.DataFrame
        Must contain:
            - Person ID
            - First Name
            - Last Name

    elbow_S : int
        Sensitivity parameter for KneeLocator.

    Returns
    -------
    dict_terms_for_a_researcher_for_focus : dict
        researcher_name -> semicolon-separated selected MeSH terms

    dict_research_focus_for_a_researcher : dict
        researcher_name -> GPT summary sentence
    """

    # -----------------------------------------
    # Load TF-IDF file
    # -----------------------------------------
    file_path = os.path.join(directory, f"term_per_researcher_tfidf_{postfix}.csv")
    df_terms_tfidf = pd.read_csv(file_path)

    # Keep only positive-scored terms
    df_terms_tfidf = df_terms_tfidf[df_terms_tfidf["tfidf_score"] > 0]
    print(df_terms_tfidf)
    # Build dict: researcher_name -> {term: score}
    dict_mesh_term_tfidf = (
        df_terms_tfidf.groupby("name")
        .apply(lambda x: dict(zip(x["mesh_term"], x["tfidf_score"])))
        .to_dict()
    )

    # List of researcher names
    name_list = list(dict_mesh_term_tfidf.keys())

    dict_terms_for_a_researcher_for_focus = {}
    dict_research_focus_for_a_researcher = {}

    # -----------------------------------------
    # Process each researcher
    # -----------------------------------------
    for m in tqdm(name_list):
        df_temp = df_terms_tfidf[df_terms_tfidf["name"] == m].copy()
        df_temp.sort_values("tfidf_score", inplace=True, ascending=False)

        df_temp["id_for_elbow_point"] = np.arange(len(df_temp))

        # Use KneeLocator to find elbow
        kneedle = KneeLocator(
            df_temp["id_for_elbow_point"],
            df_temp["tfidf_score"],
            S=elbow_S,
            curve="convex",
            direction="decreasing"
        )

        if kneedle.knee is None:
            # If elbow fails, fallback to top 5% or at least 3 terms
            knee_point = max(3, int(0.05 * len(df_temp)))
        else:
            knee_point = kneedle.knee

        # Select the top-knee terms
        df_select = df_temp.iloc[:knee_point]
        term_list = list(df_select["mesh_term"])

        # Store term list
        dict_terms_for_a_researcher_for_focus[m] = "; ".join(term_list)

        # -----------------------------------------
        # GPT-4 summarization
        # -----------------------------------------
        prompt = (
            "Help me summarize this group of phrases into 1 sentence as a research focus:\n"
            + dict_terms_for_a_researcher_for_focus[m]
            + "\nPlease start with: The research focus is on"
        )

        summary = None
        attempt = 0

        while summary is None:
            summary = generate_gpt4_response(prompt)
            attempt += 1
            if summary is None:
                print(f"[Retry {attempt}] GPT failed, sleeping...")
                time.sleep(3)

        # Rewrite sentence prefix
        if postfix == "Method":
            summary = summary.replace(
                "The research focus is on", 
                "This researcher has mainly contributed to"
            )
        else:
            summary = summary.replace(
                "The research focus is on",
                "This researcher mainly focused on"
            )

        dict_research_focus_for_a_researcher[m] = summary

    # -----------------------------------------
    # Save pickle files
    # -----------------------------------------
    with open(os.path.join(directory, f"dict_terms_for_a_researcher_for_focus{postfix}.pickle"), "wb") as f:
        pickle.dump(dict_terms_for_a_researcher_for_focus, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join(directory, f"dict_research_focus_for_a_researcher_{postfix}.pickle"), "wb") as f:
        pickle.dump(dict_research_focus_for_a_researcher, f, protocol=pickle.HIGHEST_PROTOCOL)

    return dict_terms_for_a_researcher_for_focus, dict_research_focus_for_a_researcher


In [338]:
import os
import pickle
import pandas as pd


def combine_research_focus_summaries(
        directory,
        postfix_hd="HealthDomain",
        postfix_m="Method",
        output_excel="Research_summary_byMesh.xlsx"
    ):
    """
    Combine research focus summaries from HealthDomain and Method categories
    into a single summary per researcher.

    Parameters
    ----------
    directory : str
        Folder where the pickle files are located.

    postfix_hd : str (default="HealthDomain")
        Postfix for the HealthDomain summary pickle file:
        dict_research_focus_for_a_researcher_{postfix_hd}.pickle

    postfix_m : str (default="Method")
        Postfix for the Method summary pickle file:
        dict_research_focus_for_a_researcher_{postfix_m}.pickle

    output_excel : str (default="Research_summary_byMesh.xlsx")
        Name of the final Excel file to save.

    Returns
    -------
    df_research_summary : pandas.DataFrame
        DataFrame with columns:
            - Researcher_name
            - Research_direction
    """

    # ----------------------------
    # Load HealthDomain summary
    # ----------------------------
    hd_path = os.path.join(
        directory, f"dict_research_focus_for_a_researcher_{postfix_hd}.pickle"
    )
    with open(hd_path, "rb") as handle:
        dict_hd = pickle.load(handle)

    df_hd = (
        pd.DataFrame.from_dict(dict_hd, orient="index")
        .reset_index()
        .rename(columns={"index": "Researcher_name", 0: "Research_summary_hd"})
    )

    # ----------------------------
    # Load Method summary
    # ----------------------------
    m_path = os.path.join(
        directory, f"dict_research_focus_for_a_researcher_{postfix_m}.pickle"
    )
    with open(m_path, "rb") as handle:
        dict_m = pickle.load(handle)

    df_m = (
        pd.DataFrame.from_dict(dict_m, orient="index")
        .reset_index()
        .rename(columns={"index": "Researcher_name", 0: "Research_summary_m"})
    )

    # ----------------------------
    # Merge both summaries
    # ----------------------------
    df = df_hd.merge(df_m, on="Researcher_name", how="outer")
    df.fillna("", inplace=True)

    # ----------------------------
    # Combine both text summaries
    # ----------------------------
    df["Research_direction"] = (
        df["Research_summary_hd"].astype(str)
        + "\n"
        + df["Research_summary_m"].astype(str)
    ).str.strip("\n")

    df_final = df[["Researcher_name", "Research_direction"]]

    # ----------------------------
    # Save Excel
    # ----------------------------
    output_path = os.path.join(directory, output_excel)
    df_final.to_excel(output_path, index=False)

    return df_final


In [339]:
directory = "results/intermediate_result"

input_file = "data/output_sample/preprocess/filtered_publications.csv"

In [340]:
meshtree_file = "mtrees2024.bin" 
mesh_id2name, mesh_name2id = load_mesh_trees(meshtree_file)


In [341]:
df_mesh_term = process_mesh_terms(
    input_file,
    mesh_id2name,
    mesh_name2id
)

df_mesh_term.head()

Unnamed: 0,First Name,Last Name,PMID,Person ID,ancestor_mesh_term
0,Chunhua,Weng,38838949,1,Mammals
0,Chunhua,Weng,38838949,1,Catarrhini
0,Chunhua,Weng,38838949,1,Hominidae
0,Chunhua,Weng,38838949,1,Eukaryota
0,Chunhua,Weng,38838949,1,Eutheria


In [342]:
df_mesh_term_freq = filter_low_frequency_mesh_terms(df_mesh_term, min_frequency=2)


In [343]:
mesh_names_health_domain, mesh_names_method = categorize_mesh_terms(
    df_mesh_term_freq,
    mesh_id2name,
    mesh_name2id,
    class_file="level01_mesh_selected_class_0702_2100.xlsx"
)

print(len(mesh_names_health_domain), len(mesh_names_method))


371 1254


In [344]:
df_mesh_term_freq = remove_meaningless_mesh_terms(
    df_mesh_term_freq,
    mesh_id2name
)


In [345]:
mesh_term_freq_list_health_domain, mesh_term_freq_list_method = export_mesh_term_frequency_by_category(
    df_mesh_term,
    df_mesh_term_freq,
    mesh_names_health_domain,
    mesh_names_method,
    directory=directory
)


In [346]:
dict_researcher_mesh_health_domain = build_researcher_mesh_string(mesh_term_freq_list_health_domain)
dict_researcher_mesh_method = build_researcher_mesh_string(mesh_term_freq_list_method)


In [347]:
df_person = df_mesh_term[['First Name', 'Last Name', 'Person ID']]
df_person.drop_duplicates(inplace=True)

run_mesh_tfidf(
    dict_researcher_mesh=dict_researcher_mesh_health_domain,
    df_person=df_person,
    directory=directory,
    postfix="HealthDomain"
)
run_mesh_tfidf(
    dict_researcher_mesh=dict_researcher_mesh_method,
    df_person=df_person,
    directory=directory,
    postfix="Method"
)


Unnamed: 0,mesh_term,tfidf_score,name
0,3' untranslated regions,0.003219,Chunhua Weng
1,abstracting and indexing,0.003219,Chunhua Weng
2,academic medical centers,0.008047,Chunhua Weng
3,access to information,0.003219,Chunhua Weng
4,adolescent,0.014484,Chunhua Weng
...,...,...,...
419,wearable electronic devices,0.003219,Chunhua Weng
420,web browser,0.003219,Chunhua Weng
421,workflow,0.004828,Chunhua Weng
422,writing,0.009656,Chunhua Weng


In [348]:
dict_terms_method, dict_focus_method = generate_research_focus_summaries(
    postfix="Method",
    directory=directory,
    generate_gpt4_response=generate_gpt4_response,
    df_person=df_person
)
dict_terms_hd, dict_focus_hd = generate_research_focus_summaries(
    postfix="HealthDomain",
    directory=directory,
    generate_gpt4_response=generate_gpt4_response,
    df_person=df_person
)


                       mesh_term  tfidf_score          name
0        3' untranslated regions     0.003219  Chunhua Weng
1       abstracting and indexing     0.003219  Chunhua Weng
2       academic medical centers     0.008047  Chunhua Weng
3          access to information     0.003219  Chunhua Weng
4                     adolescent     0.014484  Chunhua Weng
..                           ...          ...           ...
419  wearable electronic devices     0.003219  Chunhua Weng
420                  web browser     0.003219  Chunhua Weng
421                     workflow     0.004828  Chunhua Weng
422                      writing     0.009656  Chunhua Weng
423                  young adult     0.017703  Chunhua Weng

[424 rows x 3 columns]


100%|██████████| 1/1 [00:03<00:00,  3.07s/it]


                mesh_term  tfidf_score          name
0       alzheimer disease     0.031428  Chunhua Weng
1                attitude     0.054999  Chunhua Weng
2      attitude to health     0.047142  Chunhua Weng
3     autoimmune diseases     0.015714  Chunhua Weng
4    bacterial infections     0.015714  Chunhua Weng
..                    ...          ...           ...
107     urogenital system     0.031428  Chunhua Weng
108     urologic diseases     0.031428  Chunhua Weng
109     vascular diseases     0.031428  Chunhua Weng
110        virus diseases     0.133569  Chunhua Weng
111              vomiting     0.015714  Chunhua Weng

[112 rows x 3 columns]


100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


In [349]:
df_summary = combine_research_focus_summaries(
    directory=directory,
    postfix_hd="HealthDomain",
    postfix_m="Method",
    output_excel="Research_summary_byMesh.xlsx"
)

df_summary.head()


Unnamed: 0,Researcher_name,Research_direction
0,Chunhua Weng,This researcher mainly focused on understandin...


In [350]:
'''This researcher mainly focused on understanding the mechanisms of behavior and behavioral sciences in relation to pathological conditions, including infections, neoplasms, and various diseases such as virus, lung, respiratory tract, nutritional, and metabolic diseases, with an emphasis on psychology and RNA virus infections.\nThis researcher has mainly contributed to integrating information science, health services administration, and medical informatics to enhance health care quality, access, and evaluation through advanced computing methodologies, epidemiologic methods, and artificial intelligence, while considering population characteristics, public health, and social sciences to improve patient care management, clinical studies, and health care delivery systems.'''

'This researcher mainly focused on understanding the mechanisms of behavior and behavioral sciences in relation to pathological conditions, including infections, neoplasms, and various diseases such as respiratory, nutritional, metabolic, and RNA virus infections, with an emphasis on psychology and virus-related lung diseases.\nThis researcher has mainly contributed to integrating information science, health services administration, and informatics to enhance health care quality, access, and evaluation through advanced methodologies such as artificial intelligence, data mining, and computational biology, while considering population characteristics, epidemiologic methods, and social sciences to improve public health outcomes and health care delivery systems.'