In [1]:
# Standard library imports
import os
import sys
import pickle
import time
import warnings

# Data processing
import numpy as np
import pandas as pd

# Machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from kneed import KneeLocator

# Utilities
from collections import defaultdict
from tqdm import tqdm

# OpenAI
from openai import OpenAI

warnings.filterwarnings("ignore")

# Research Profile Generation Pipeline

This notebook processes MeSH terms from researcher publications and generates natural language research profiles using GPT-4.

**Pipeline Steps:**
1. Load MeSH tree hierarchy
2. Process and expand MeSH terms to ancestors
3. Filter low-frequency terms
4. Categorize terms (Health Domain vs Methods)
5. Remove meaningless terms
6. Compute TF-IDF scores
7. Select top terms using elbow detection
8. Generate summaries with GPT-4
9. Combine and export final profiles

In [None]:
# =============================================================================
# CONFIGURATION - Set your API key here
# =============================================================================
openai_api_key = ''  # Enter your OpenAI API key

## 1. Function Definitions

The following cells define all the helper functions used in the pipeline.

In [None]:

def load_mesh_trees(meshtree_file):
    """
    Load a MeSH tree file and build ID-to-name and name-to-ID mappings.

    Parameters
    ----------
    meshtree_file : str
        Path to the MeSH tree file (e.g., "data/reference_files/mesh_tree_hierarchy.bin").
        Each line in the file is expected to be formatted as:
            <term>;<tree_id>

    Returns
    -------
    mesh_id2name : dict
        Dictionary mapping MeSH tree IDs (e.g., "A01.111") to term names.
    
    mesh_name2id : dict of lists
        Dictionary mapping MeSH term names to a list of associated MeSH IDs.

    Notes
    -----
    The function also injects two additional MeSH-like IDs:
        NEWID1 → Female  
        NEWID2 → Male
    """
    mesh_id2name = {}
    mesh_name2id = defaultdict(list)


    with open(meshtree_file, "r") as ftree:
        for line in ftree:
            term, tree_id = line.strip().split(";")

            mesh_id2name[tree_id] = term
            mesh_name2id[term].append(tree_id)

    # Add the two extra synthetic entries
    extra_entries = {
        'NEWID1': 'Female',
        'NEWID2': 'Male'
    }

    for mid, term in extra_entries.items():
        mesh_id2name[mid] = term
        mesh_name2id[term] = [mid]

    return mesh_id2name, mesh_name2id

In [4]:
def process_mesh_terms(csv_file, mesh_id2name, mesh_name2id):
    """
    Process researcher MeSH terms into hierarchical MeSH ancestor codes.

    Parameters
    ----------
    csv_file : str
        Path to the CSV file containing researcher MeSH terms.
        Expected columns:
            - First Name
            - Last Name
            - PMID
            - Person ID
            - mesh_term   (format: "Term / Subheading" or just "Term")

    mesh_id2name : dict
        Mapping from MeSH ID → MeSH term name.
        Example: {"A01.111": "Heart"}.

    mesh_name2id : dict or defaultdict(list)
        Mapping from MeSH term name → list of associated MeSH IDs.
        Example: {"Heart": ["A01.111", "A01.112"]}.

    Returns
    -------
    df_mesh_term : pandas.DataFrame
        A cleaned dataframe with one row per:
            (author × PMID × ancestor MeSH term)
        Columns:
            First Name, Last Name, PMID,
            Person ID, ancestor_mesh_term

    Notes
    -----
    • Handles MeSH terms with and without subheadings.
    • Automatically assigns:
          Female → NEWID1
          Male   → NEWID2
    • Builds full hierarchical ancestor chains for every MeSH ID.
    """

    # --------------------------------------------------
    # Helper function: split mesh term into main + subheadings
    # --------------------------------------------------
    def split_mesh_term(mesh_term):
        parts = mesh_term.split(" / ")
        if len(parts) > 1:
            return parts[0], parts[1:]
        return parts[0], []

    # --------------------------------------------------
    # Helper function: recursively find ancestors
    # --------------------------------------------------
    def find_mesh_term_ancestors(tid, ancestor_ids):
        ancestors = []
        mesh_name = mesh_id2name[tid]

        # Add current term
        ancestors.append(mesh_name)

        # Get all IDs associated with the same MeSH term
        ids_for_name = mesh_name2id[mesh_name]
        ancestor_ids += ids_for_name

        # Parent IDs = chop off last part of MeSH tree number
        parent_ids = [".".join(mid.split(".")[:-1]) for mid in ids_for_name]

        for pid in parent_ids:
            if pid and pid not in ancestor_ids:
                new_ancestors, ancestor_ids = find_mesh_term_ancestors(pid, ancestor_ids)
                ancestors.extend(new_ancestors)
                ancestors = list(set(ancestors))  # unique
        return ancestors, ancestor_ids

    # --------------------------------------------------
    # Get only ancestor names
    # --------------------------------------------------
    def create_mesh_term_hierarchical_codes(tid):
        ancestors, _ = find_mesh_term_ancestors(tid, [])
        return ancestors

    # --------------------------------------------------
    # Load CSV
    # --------------------------------------------------
    df_mesh = pd.read_csv(csv_file)

    # Split "Term / Subheading"
    df_mesh['mesh_term_only'], df_mesh['mesh_subheading'] = zip(
        *df_mesh['MeSH Term'].apply(split_mesh_term)
    )

    # Map main term → ID(s)
    df_mesh['mesh_term_only_id'] = df_mesh['mesh_term_only'].apply(lambda x: mesh_name2id[x])

    # Assign new synthetic IDs for Female / Male
    df_mesh.loc[df_mesh['mesh_term_only'] == 'Female', 'mesh_term_only_id'] = 'NEWID1'
    df_mesh.loc[df_mesh['mesh_term_only'] == 'Male',   'mesh_term_only_id'] = 'NEWID2'

    # Expand list of IDs to one per row
    df_mesh = df_mesh.explode("mesh_term_only_id")

    # Increase recursion depth (important)
    sys.setrecursionlimit(5000)

    # Compute full MeSH ancestor hierarchy
    df_mesh['ancestor_mesh_term'] = df_mesh['mesh_term_only_id'].apply(
        create_mesh_term_hierarchical_codes
    )

    # Expand ancestors to one per row
    df_mesh = df_mesh.explode('ancestor_mesh_term')

    # Select final output columns
    df_mesh = df_mesh[
        ['First Name', 'Last Name', 'PMID',
         'Person ID', 'ancestor_mesh_term']
    ]

    # Remove duplicates
    df_mesh.drop_duplicates(inplace=True)

    return df_mesh


In [5]:

def filter_low_frequency_mesh_terms(df_mesh_term, min_frequency=2):
    """
    Filter out MeSH ancestor terms that occur less than a specified frequency
    for each researcher.

    Parameters
    ----------
    df_mesh_term : pandas.DataFrame
        DataFrame containing at least:
            - Person ID
            - ancestor_mesh_term

    min_frequency : int, optional (default = 2)
        Minimum number of occurrences required for a researcher to keep a term.
        Terms occurring fewer times than this per Person ID are removed.

    Returns
    -------
    df_filtered : pandas.DataFrame
        DataFrame with columns:
            Person ID, ancestor_mesh_term, count
        containing only the MeSH terms that meet the frequency threshold.
    """

    # Compute frequency of each ancestor term per researcher
    df_freq = (
        df_mesh_term
        .groupby('Person ID')['ancestor_mesh_term']
        .value_counts()
        .rename('count')
        .reset_index()
    )

    # Keep terms at or above frequency threshold
    df_filtered = df_freq[df_freq['count'] >= min_frequency]

    return df_filtered


In [None]:

def categorize_mesh_terms(
    df_mesh_term_freq,
    mesh_id2name,
    mesh_name2id,
    class_file="data/reference_files/mesh_category_classification.xlsx"
):
    """
    Categorize MeSH ancestor terms into 'Health Domain' (H) and 'Method' (M)
    classes using a manually tagged level-01 class file, then propagate
    categories to child terms based on MeSH parent–child relationships.

    Parameters
    ----------
    df_mesh_term_freq : pandas.DataFrame
        Output of filter_low_frequency_mesh_terms(), must contain:
            - ancestor_mesh_term

    mesh_id2name : dict
        Mapping MeSH ID -> MeSH term name

    mesh_name2id : dict or defaultdict(list)
        Mapping MeSH term name -> list of MeSH IDs

    class_file : str
        Excel file mapping MeSH terms to top-level class labels.
        Must contain:
            - 'name'  (MeSH term)
            - 'Class' ('H' or 'M')

    Returns
    -------
    mesh_names_health_domain : list
        All MeSH term names categorized under Health Domain (H)

    mesh_names_method : list
        All MeSH term names categorized under Methods (M)
    """

    # ------------------------------
    # Load class definitions
    # ------------------------------
    df_class = pd.read_excel(class_file, index_col=0)

    mesh_names_health_domain_manual = list(df_class[df_class['Class'] == 'H']['name'])
    mesh_names_method_manual = list(df_class[df_class['Class'] == 'M']['name'])

    # -------------------------------------------
    # Get all unique ancestor terms from freq df
    # -------------------------------------------
    df_temp = df_mesh_term_freq[['ancestor_mesh_term']].drop_duplicates()

    # Map term → IDs
    df_temp['term_id'] = df_temp['ancestor_mesh_term'].apply(lambda x: mesh_name2id[x])
    df_temp = df_temp.explode('term_id')

    # Sort IDs by length so parents come first
    df_temp['length'] = df_temp['term_id'].apply(len)
    df_temp = df_temp.sort_values(by='length').drop_duplicates()

    term_ids = list(df_temp['term_id'])

    # -------------------------------------------
    # Initialize category lists
    # -------------------------------------------
    mesh_names_health_domain = mesh_names_health_domain_manual.copy()
    mesh_names_method = mesh_names_method_manual.copy()

    # -------------------------------------------
    # Categorize by tracing parent IDs
    # -------------------------------------------
    for tid in term_ids:
        term_name = mesh_id2name[tid]

        # Skip terms already categorized manually
        if (term_name in mesh_names_health_domain_manual) or (term_name in mesh_names_method):
            continue

        parts = tid.split(".")
        if len(parts) >= 2:
            parent_id = ".".join(parts[:-1])
            parent_name = mesh_id2name[parent_id]

            # Inherit health domain from parent
            if parent_name in mesh_names_health_domain:
                mesh_names_health_domain.append(term_name)

            # Inherit method class from parent
            if parent_name in mesh_names_method:
                mesh_names_method.append(term_name)

    # Remove duplicates
    mesh_names_health_domain = list(set(mesh_names_health_domain))
    mesh_names_method = list(set(mesh_names_method))

    return mesh_names_health_domain, mesh_names_method

In [7]:
# Terms that are too broad or non-informative to be useful
MEANINGLESS_MESH_TERMS = [
    'Eukaryota', 'Animals', 'Chordata', 'Vertebrates', 'Mammals', 'Eutheria',
    'Primates', 'Haplorhini', 'Catarrhini', 'Hominidae', 'Humans',
    'Natural Science Disciplines', 'Science', 'Research', 'Methods',
    'Investigative Techniques', 'Persons', 'Health Occupations',
    'Equipment and Supplies', 'Electrical Equipment and Supplies',
    'Biomedical Research', 'Household Products', 'Photography',
    'Financial Management', 'life', 'Medicine', 'Diseases'
]


def remove_meaningless_mesh_terms(df_mesh_term_freq, mesh_id2name):
    """
    Remove overly broad or non-informative MeSH terms.

    Filters out:
    - Predefined list of generic terms (e.g., "Humans", "Animals", "Science")
    - Geographic terms (MeSH IDs starting with 'Z01')

    Parameters
    ----------
    df_mesh_term_freq : pandas.DataFrame
        DataFrame with ancestor_mesh_term column.
    mesh_id2name : dict
        Mapping from MeSH ID to term name.

    Returns
    -------
    pandas.DataFrame
        Filtered DataFrame with meaningless terms removed.
    """
    # Start with predefined list
    terms_to_remove = MEANINGLESS_MESH_TERMS.copy()
    
    # Add geographic terms (Z01.*)
    for mesh_id, name in mesh_id2name.items():
        if mesh_id.startswith('Z01'):
            terms_to_remove.append(name)

    return df_mesh_term_freq[
        ~df_mesh_term_freq['ancestor_mesh_term'].isin(terms_to_remove)
    ].copy()

In [8]:
def export_mesh_term_frequency_by_category(
        df_mesh,
        df_mesh_term_freq,
        mesh_names_health_domain,
        mesh_names_method,
        directory
    ):
    """
    Export MeSH term frequency tables for Health Domain and Method categories.

    Parameters
    ----------
    df_mesh : pandas.DataFrame
        Must contain: First Name, Last Name, Person ID
    df_mesh_term_freq : pandas.DataFrame
        Must contain: Person ID, ancestor_mesh_term, count
    mesh_names_health_domain : list
        List of MeSH terms classified as Health Domain.
    mesh_names_method : list
        List of MeSH terms classified as Methods.
    directory : str
        Folder where Excel files will be saved.

    Returns
    -------
    tuple
        (health_domain_list, method_list) - Term frequencies as lists.
    """
    df_person = df_mesh[['First Name', 'Last Name', 'Person ID']].drop_duplicates()

    # Filter by category
    df_health = df_mesh_term_freq[
        df_mesh_term_freq['ancestor_mesh_term'].isin(mesh_names_health_domain)
    ].merge(df_person, on='Person ID')

    df_method = df_mesh_term_freq[
        df_mesh_term_freq['ancestor_mesh_term'].isin(mesh_names_method)
    ].merge(df_person, on='Person ID')

    # Save to Excel
    os.makedirs(directory, exist_ok=True)
    df_health.to_excel(os.path.join(directory, "mesh_term_freq_per_faculty_HealthDomain.xlsx"), index=False)
    df_method.to_excel(os.path.join(directory, "mesh_term_freq_per_faculty_Method.xlsx"), index=False)

    return df_health.values.tolist(), df_method.values.tolist()

In [9]:
def build_researcher_mesh_string(mesh_term_freq_list):
    """
    Build frequency-weighted term strings for TF-IDF input.
    
    Each term is repeated according to its frequency count.
    Example: If "Heart" appears 3 times -> "Heart;Heart;Heart"

    Parameters
    ----------
    mesh_term_freq_list : list of lists
        Each row: [Person ID, term, freq, ...]

    Returns
    -------
    dict
        Mapping: Person ID -> "term;term;term2;term2;..."
    """
    dict_researcher_mesh = defaultdict(str)

    for row in mesh_term_freq_list:
        person_id, term, freq = row[:3]
        dict_researcher_mesh[person_id] += (term + ";") * int(freq)

    # Remove trailing semicolons
    return {pid: terms.rstrip(";") for pid, terms in dict_researcher_mesh.items()}

In [10]:
def run_mesh_tfidf(dict_researcher_mesh, df_person, directory, postfix="HealthDomain"):
    """
    Compute TF-IDF scores for researcher MeSH term profiles.

    Parameters
    ----------
    dict_researcher_mesh : dict
        Mapping: Person ID -> "term1;term1;term2;..." (frequency-weighted)
    df_person : pandas.DataFrame
        Contains: Person ID, First Name, Last Name
    directory : str
        Output folder for CSV files.
    postfix : str
        Label for output files ("HealthDomain" or "Method").

    Returns
    -------
    pandas.DataFrame
        TF-IDF scores in long format (term, score, name).
    """
    def custom_tokenizer(text):
        return [token.strip() for token in text.split(';') if token.strip()]

    corpus = list(dict_researcher_mesh.values())
    faculty_ids = list(dict_researcher_mesh.keys())

    # Compute TF-IDF
    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
    X = vectorizer.fit_transform(corpus)

    # Convert to long-format DataFrame
    df_tfidf = pd.DataFrame(X.toarray())
    df_tfidf["Person ID"] = faculty_ids
    df_tfidf.set_index("Person ID", inplace=True)

    df_tfidf = df_tfidf.T
    df_tfidf["mesh_term"] = vectorizer.get_feature_names_out()
    df_tfidf.set_index("mesh_term", inplace=True)

    df_tfidf = pd.DataFrame(df_tfidf.unstack()).reset_index()
    df_tfidf.rename(columns={"level_0": "Person ID", 0: "tfidf_score"}, inplace=True)

    # Add researcher names
    df_tfidf = df_tfidf.merge(df_person, on="Person ID")
    df_tfidf["name"] = df_tfidf["First Name"] + " " + df_tfidf["Last Name"]
    df_tfidf.drop(["Person ID", "First Name", "Last Name"], axis=1, inplace=True)

    # Save output
    os.makedirs(directory, exist_ok=True)
    df_tfidf.to_csv(os.path.join(directory, f"term_per_researcher_tfidf_{postfix}.csv"), index=False)

    return df_tfidf

In [11]:
def generate_gpt4_response(content, print_output=False):
    """
    Send a prompt to GPT-4 and return the generated response.
    
    Parameters
    ----------
    content : str
        The prompt to send to GPT-4.
    print_output : bool
        If True, print the full API response for debugging.
    
    Returns
    -------
    str or None
        The generated text, or None if an error occurred.
    """
    client = OpenAI(api_key=openai_api_key)
    
    try:
        completions = client.chat.completions.create(
            model="gpt-4o",
            temperature=0,  # Deterministic output
            top_p=0.1,      # Focused sampling
            n=1,
            messages=[
                {'role': 'system', 'content': 'You are a dean of a college.'},
                {'role': 'user', 'content': content},
            ]
        )

        if print_output:
            print(completions)

        return completions.choices[0].message.content
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [12]:
def generate_research_focus_summaries(postfix, directory, generate_gpt4_response, df_person, elbow_S=5):
    """
    Generate research focus summaries for each researcher using GPT-4.

    Uses TF-IDF scores to select top terms via elbow-point detection,
    then sends them to GPT-4 for natural language summarization.

    Parameters
    ----------
    postfix : str
        Category label ("Method" or "HealthDomain").
    directory : str
        Folder for input/output files.
    generate_gpt4_response : callable
        Function to call GPT-4 API.
    df_person : pandas.DataFrame
        Researcher info (Person ID, First Name, Last Name).
    elbow_S : int
        Sensitivity parameter for KneeLocator (default: 5).

    Returns
    -------
    tuple
        (dict_terms, dict_summaries) - Selected terms and generated summaries.
    """
    # Load TF-IDF scores
    file_path = os.path.join(directory, f"term_per_researcher_tfidf_{postfix}.csv")
    df_tfidf = pd.read_csv(file_path)
    df_tfidf = df_tfidf[df_tfidf["tfidf_score"] > 0]

    # Build researcher -> {term: score} mapping
    dict_mesh_tfidf = (
        df_tfidf.groupby("name")
        .apply(lambda x: dict(zip(x["mesh_term"], x["tfidf_score"])))
        .to_dict()
    )

    dict_terms = {}
    dict_summaries = {}

    for researcher_name in tqdm(dict_mesh_tfidf.keys()):
        # Sort terms by TF-IDF score
        df_temp = df_tfidf[df_tfidf["name"] == researcher_name].copy()
        df_temp.sort_values("tfidf_score", ascending=False, inplace=True)
        df_temp["rank"] = np.arange(len(df_temp))

        # Find elbow point for term selection
        kneedle = KneeLocator(
            df_temp["rank"], df_temp["tfidf_score"],
            S=elbow_S, curve="convex", direction="decreasing"
        )
        
        # Fallback: top 5% or at least 3 terms
        knee_point = kneedle.knee if kneedle.knee else max(3, int(0.05 * len(df_temp)))
        
        # Select top terms
        selected_terms = list(df_temp.iloc[:knee_point]["mesh_term"])
        dict_terms[researcher_name] = "; ".join(selected_terms)

        # Generate GPT-4 summary
        prompt = (
            f"Help me summarize this group of phrases into 1 sentence as a research focus:\n"
            f"{dict_terms[researcher_name]}\n"
            f"Please start with: The research focus is on"
        )

        summary = None
        while summary is None:
            summary = generate_gpt4_response(prompt)
            if summary is None:
                print(f"GPT-4 failed, retrying...")
                time.sleep(3)

        # Adjust prefix based on category
        if postfix == "Method":
            summary = summary.replace("The research focus is on", "This researcher has mainly contributed to")
        else:
            summary = summary.replace("The research focus is on", "This researcher mainly focused on")

        dict_summaries[researcher_name] = summary

    # Save results
    with open(os.path.join(directory, f"dict_terms_for_a_researcher_for_focus{postfix}.pickle"), "wb") as f:
        pickle.dump(dict_terms, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(directory, f"dict_research_focus_for_a_researcher_{postfix}.pickle"), "wb") as f:
        pickle.dump(dict_summaries, f, protocol=pickle.HIGHEST_PROTOCOL)

    return dict_terms, dict_summaries

In [13]:
def combine_research_focus_summaries(
        directory,
        postfix_hd="HealthDomain",
        postfix_m="Method",
        output_excel="Research_summary_byMesh.xlsx"
    ):
    """
    Combine Health Domain and Method summaries into a single research profile.

    Parameters
    ----------
    directory : str
        Folder containing the pickle files.
    postfix_hd : str
        Postfix for Health Domain pickle file.
    postfix_m : str
        Postfix for Method pickle file.
    output_excel : str
        Output Excel filename.

    Returns
    -------
    pandas.DataFrame
        Combined profiles with columns: Researcher_name, Research_direction
    """
    # Load Health Domain summaries
    with open(os.path.join(directory, f"dict_research_focus_for_a_researcher_{postfix_hd}.pickle"), "rb") as f:
        dict_hd = pickle.load(f)
    df_hd = pd.DataFrame.from_dict(dict_hd, orient="index").reset_index()
    df_hd.columns = ["Researcher_name", "Research_summary_hd"]

    # Load Method summaries
    with open(os.path.join(directory, f"dict_research_focus_for_a_researcher_{postfix_m}.pickle"), "rb") as f:
        dict_m = pickle.load(f)
    df_m = pd.DataFrame.from_dict(dict_m, orient="index").reset_index()
    df_m.columns = ["Researcher_name", "Research_summary_m"]

    # Merge and combine
    df = df_hd.merge(df_m, on="Researcher_name", how="outer").fillna("")
    df["Research_direction"] = (df["Research_summary_hd"] + "\n" + df["Research_summary_m"]).str.strip("\n")

    df_final = df[["Researcher_name", "Research_direction"]]
    df_final.to_excel(os.path.join(directory, output_excel), index=False)

    return df_final

## 2. Pipeline Execution

The following cells execute the pipeline step by step.

In [None]:
# =============================================================================
# FILE PATHS
# =============================================================================
directory = "results/final_result"
input_file = "data/output_sample/preprocess/filtered_publications.csv"

In [None]:
# Step 1: Load MeSH tree hierarchy
meshtree_file = "data/reference_files/mesh_tree_hierarchy.bin"
mesh_id2name, mesh_name2id = load_mesh_trees(meshtree_file)
print(f"Loaded {len(mesh_id2name)} MeSH terms")

In [16]:
# Step 2: Process MeSH terms and expand to ancestors
df_mesh_term = process_mesh_terms(input_file, mesh_id2name, mesh_name2id)
print(f"Processed {len(df_mesh_term)} term-publication pairs")
df_mesh_term.head()

Processed 9652 term-publication pairs


Unnamed: 0,First Name,Last Name,PMID,Person ID,ancestor_mesh_term
0,Chunhua,Weng,38838949,1,Eutheria
0,Chunhua,Weng,38838949,1,Mammals
0,Chunhua,Weng,38838949,1,Chordata
0,Chunhua,Weng,38838949,1,Animals
0,Chunhua,Weng,38838949,1,Hominidae


In [17]:
# Step 3: Filter low-frequency terms (keep terms appearing 2+ times)
df_mesh_term_freq = filter_low_frequency_mesh_terms(df_mesh_term, min_frequency=2)
print(f"Kept {len(df_mesh_term_freq)} term-researcher pairs after frequency filtering")

Kept 559 term-researcher pairs after frequency filtering


In [None]:
# Step 4: Categorize terms into Health Domain vs Methods
mesh_names_health_domain, mesh_names_method = categorize_mesh_terms(
    df_mesh_term_freq,
    mesh_id2name,
    mesh_name2id,
    class_file="data/reference_files/mesh_category_classification.xlsx"
)
print(f"Health Domain terms: {len(mesh_names_health_domain)}")
print(f"Method terms: {len(mesh_names_method)}")

In [19]:
# Step 5: Remove meaningless/overly broad terms
df_mesh_term_freq = remove_meaningless_mesh_terms(df_mesh_term_freq, mesh_id2name)
print(f"Remaining terms after removing meaningless: {len(df_mesh_term_freq)}")

Remaining terms after removing meaningless: 527


In [20]:
# Step 6: Export term frequencies by category
mesh_term_freq_list_health_domain, mesh_term_freq_list_method = export_mesh_term_frequency_by_category(
    df_mesh_term,
    df_mesh_term_freq,
    mesh_names_health_domain,
    mesh_names_method,
    directory=directory
)
print(f"Exported {len(mesh_term_freq_list_health_domain)} health domain entries")
print(f"Exported {len(mesh_term_freq_list_method)} method entries")

Exported 109 health domain entries
Exported 424 method entries


In [21]:
# Step 7: Build frequency-weighted term strings for TF-IDF
dict_researcher_mesh_health_domain = build_researcher_mesh_string(mesh_term_freq_list_health_domain)
dict_researcher_mesh_method = build_researcher_mesh_string(mesh_term_freq_list_method)

In [22]:
# Step 8: Compute TF-IDF scores
df_person = df_mesh_term[['First Name', 'Last Name', 'Person ID']].drop_duplicates()

print("Computing TF-IDF for Health Domain terms...")
run_mesh_tfidf(
    dict_researcher_mesh=dict_researcher_mesh_health_domain,
    df_person=df_person,
    directory=directory,
    postfix="HealthDomain"
)

print("Computing TF-IDF for Method terms...")
run_mesh_tfidf(
    dict_researcher_mesh=dict_researcher_mesh_method,
    df_person=df_person,
    directory=directory,
    postfix="Method"
)

Computing TF-IDF for Health Domain terms...
Computing TF-IDF for Method terms...


Unnamed: 0,mesh_term,tfidf_score,name
0,3' untranslated regions,0.003219,Chunhua Weng
1,abstracting and indexing,0.003219,Chunhua Weng
2,academic medical centers,0.008047,Chunhua Weng
3,access to information,0.003219,Chunhua Weng
4,adolescent,0.014484,Chunhua Weng
...,...,...,...
419,wearable electronic devices,0.003219,Chunhua Weng
420,web browser,0.003219,Chunhua Weng
421,workflow,0.004828,Chunhua Weng
422,writing,0.009656,Chunhua Weng


In [23]:
# Step 9: Generate research focus summaries using GPT-4
print("Generating Method summaries...")
dict_terms_method, dict_focus_method = generate_research_focus_summaries(
    postfix="Method",
    directory=directory,
    generate_gpt4_response=generate_gpt4_response,
    df_person=df_person
)

print("\nGenerating Health Domain summaries...")
dict_terms_hd, dict_focus_hd = generate_research_focus_summaries(
    postfix="HealthDomain",
    directory=directory,
    generate_gpt4_response=generate_gpt4_response,
    df_person=df_person
)

Generating Method summaries...


100%|██████████| 1/1 [00:03<00:00,  3.55s/it]



Generating Health Domain summaries...


100%|██████████| 1/1 [00:02<00:00,  2.44s/it]


In [None]:
# Step 10: Combine summaries and export final results
df_summary = combine_research_focus_summaries(
    directory=directory,
    postfix_hd="HealthDomain",
    postfix_m="Method",
    output_excel="Research_summary_byMesh.xlsx"
)

print(f"Generated profiles for {len(df_summary)} researchers")
print(f"Output saved to: {directory}/Research_summary_byMesh.xlsx")
df_summary