In [None]:
# =============================================================================
# PACKAGE INSTALLATION
# =============================================================================
# Install the required packages (skip this step if they are already installed in your environment).
# Use a GPU runtime if one is available.
!pip install biopython sentence-transformers pandas numpy requests openai

# =============================================================================
# IMPORTS AND INITIAL SETUP
# =============================================================================
import os
import re
import numpy as np
import pandas as pd

from Bio import Entrez, Medline
from sentence_transformers import SentenceTransformer, util
import openai

# Set your email (required by NCBI) and your OpenAI API key.
# Both values are taken from the ENTREZ_EMAIL / OPENAI_API_KEY environment
# variables when those are set, so real credentials do not have to be written
# into this file. The placeholders below are used otherwise and must be replaced.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "your.email@example.com")  # Replace with your email address.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")     # Replace with your OpenAI API key.

# =============================================================================
# PUBMED DATA FETCHING AND PARSING
# =============================================================================
# PMIDs of the target articles. These are the 9 papers used in our
# publications; replace them with your own PMIDs as needed.
target_pmids = [
    "11990444",
    "12186348",
    "11990441",
    "19053917",
    "20054593",
    "23484181",
    "23379539",
    "26556577",
    "31811645",
]

def fetch_pubmed_records(pmids):
    """
    Given a list of PMIDs, fetch the corresponding PubMed records and return them as a list of Medline records.

    Args:
        pmids: Collection of PMIDs passed as the ``id`` argument of ``Entrez.efetch``.

    Returns:
        A list with the records produced by ``Medline.parse``. An empty
        list/tuple/set of PMIDs yields an empty list without contacting NCBI.
    """
    # Nothing to fetch: avoid sending a request with an empty id parameter.
    if isinstance(pmids, (list, tuple, set, frozenset)) and not pmids:
        return []

    handle = Entrez.efetch(db="pubmed", id=pmids, rettype="medline", retmode="text")
    try:
        # The parser output is materialised into a list before the handle is
        # closed; the finally clause closes the handle even if parsing fails.
        return list(Medline.parse(handle))
    finally:
        handle.close()

def parse_pubmed_records(records):
    """
    Parse Medline records to extract Title, Abstract, Year, Journal, and PMID.

    Args:
        records: Iterable of dict-like Medline records. The fields read are
            "TI" (title), "AB" (abstract), "JT" (journal), "DP" (publication
            date) and "PMID"; any missing field becomes an empty string.

    Returns:
        A pandas DataFrame with the columns Title, Abstract, Year, Journal and
        PMID (always present, even when ``records`` is empty). Year is the
        first 19xx/20xx number found in the publication date, as a string.
    """
    columns = ["Title", "Abstract", "Year", "Journal", "PMID"]
    year_pattern = re.compile(r"\b(19|20)\d{2}\b")

    data = []
    for record in records:
        # Extract publication year from the date (if available).
        year_match = year_pattern.search(record.get("DP", ""))
        data.append({
            "Title": record.get("TI", ""),
            "Abstract": record.get("AB", ""),
            "Year": year_match.group(0) if year_match else "",
            "Journal": record.get("JT", ""),
            # Keep the PMID so that rows can later be matched and de-duplicated.
            "PMID": str(record.get("PMID", "")),
        })
    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(data, columns=columns)

# Fetch target records and parse them.
target_records = fetch_pubmed_records(target_pmids)
data_target = parse_pubmed_records(target_records)
data_target["Target"] = 1
# Label every row with the PMID reported by its own record rather than by its
# position in the request list, so rows cannot be mislabelled if PubMed
# returns fewer records than were requested.
data_target["PMID"] = [str(record.get("PMID", "")) for record in target_records]

missing_targets = sorted(set(target_pmids) - set(data_target["PMID"]))
if missing_targets:
    print(f"Warning: no PubMed record was returned for target PMIDs: {missing_targets}")

# Save target data. The output directory is created first because nothing
# earlier in the script creates it.
os.makedirs("data", exist_ok=True)
data_target.to_csv("data/target.csv", index=False)

# =============================================================================
# RETRIEVE ADDITIONAL PMIDs FROM PUBMED BY YEAR (CHUNKING)
# =============================================================================
def esearch_query(query, mindate, maxdate, retstart=0, retmax=10000):
    """
    Perform an ESearch call on PubMed for a given date range.

    Args:
        query: Search term sent to PubMed.
        mindate: Lower bound of the date range (converted with ``str``).
        maxdate: Upper bound of the date range (converted with ``str``).
        retstart: Index of the first result to return.
        retmax: Maximum number of results to return.

    Returns:
        The parsed ESearch result as returned by ``Entrez.read``.
    """
    # NOTE(review): no `datetype` is passed, so the date range is applied to
    # whatever date field NCBI uses by default - confirm that this is the
    # intended one (e.g. publication date).
    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        mindate=str(mindate),
        maxdate=str(maxdate),
        retstart=retstart,
        retmax=retmax
    )
    try:
        return Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()

def fetch_pmids_by_year_chunks(query, start_year, end_year, step=5):
    """
    Partition the query by date ranges and return a set of unique PMIDs.

    The range ``start_year``..``end_year`` (inclusive) is split into chunks of
    ``step`` years. PubMed's ESearch only serves the first 10,000 matches of a
    query, so a chunk with more matches than that is bisected into smaller
    year ranges instead of being paged through. A single year that still
    exceeds the limit is retrieved only up to the limit and a warning is
    printed.

    Args:
        query: Search term sent to PubMed.
        start_year: First year of the search range (integer).
        end_year: Last year of the search range (integer, inclusive).
        step: Number of years per initial chunk; must be at least 1.

    Returns:
        A set with the unique PMIDs found across all chunks.

    Raises:
        ValueError: If ``step`` is smaller than 1 (the year cursor would never advance).
    """
    if step < 1:
        raise ValueError("step must be a positive number of years.")

    max_retrievable = 10000  # ESearch limit on retrievable PubMed matches per query.
    all_pmids = set()

    # Stack of (first_year, last_year) ranges still to be searched. It is
    # reversed so that pop() hands the ranges out in chronological order.
    pending = [
        (year, min(year + step - 1, end_year))
        for year in range(start_year, end_year + 1, step)
    ]
    pending.reverse()

    while pending:
        chunk_start, chunk_end = pending.pop()
        print(f"Searching {chunk_start} to {chunk_end}")
        # A retmax=0 request only reports how many articles match.
        chunk_search = esearch_query(query, chunk_start, chunk_end, retmax=0)
        chunk_count = int(chunk_search["Count"])
        if chunk_count == 0:
            continue
        print(f"  Found {chunk_count} articles in {chunk_start}-{chunk_end}")

        if chunk_count > max_retrievable:
            if chunk_end > chunk_start:
                # Too many matches for one query: search the two halves instead.
                middle = (chunk_start + chunk_end) // 2
                pending.append((middle + 1, chunk_end))
                pending.append((chunk_start, middle))
                continue
            print(f"  Warning: {chunk_start} alone has more than {max_retrievable} matches; "
                  f"only the first {max_retrievable} can be retrieved.")

        sub_results = esearch_query(query, chunk_start, chunk_end, retstart=0, retmax=max_retrievable)
        pmids_batch = sub_results["IdList"]
        all_pmids.update(pmids_batch)
        print(f"    Retrieved {len(pmids_batch)} PMIDs; total so far {len(all_pmids)}")
    return all_pmids

# Example usage: collect the PMIDs of every article matching
# "periodontal regeneration" published between 1970 and 2025.
search_query = "periodontal regeneration"
start_year, end_year = 1970, 2025
pmid_set = fetch_pmids_by_year_chunks(search_query, start_year, end_year, step=5)
print("Total unique PMIDs retrieved:", len(pmid_set))
sample_pmids = list(pmid_set)[:10]
print("Sample PMIDs:", sample_pmids)

def fetch_pmids_in_batches(pmids, batch_size=200):
    """
    Fetch PubMed records for many PMIDs, requesting `batch_size` PMIDs at a time.

    Returns a single list with the records of all batches, in request order.
    """
    ordered = list(pmids)
    return [
        record
        for offset in range(0, len(ordered), batch_size)
        for record in fetch_pubmed_records(ordered[offset:offset + batch_size])
    ]

# Fetch records from the additional PMIDs.
# Sorting gives the pool a reproducible order (set iteration order is not
# stable between runs), which the seeded sampling further below relies on.
pmid_list = sorted(pmid_set)
pool_records = fetch_pmids_in_batches(pmid_list, batch_size=200)
data_pool = parse_pubmed_records(pool_records)
# Attach each record's own PMID so that the de-duplication below always has a
# "PMID" column to work with.
data_pool["PMID"] = [str(record.get("PMID", "")) for record in pool_records]

# Remove any duplicates already in the target set.
# .copy() makes the filtered frame independent, so adding the "Target" column
# does not write into a slice of the unfiltered frame.
data_pool = data_pool[~data_pool["PMID"].isin(set(data_target["PMID"]))].copy()
data_pool["Target"] = 0

# =============================================================================
# EMBEDDINGS AND SIMILARITY CALCULATION
# =============================================================================
# Load the sentence transformer model for encoding article texts.
# `model_name` is the model identifier handed to SentenceTransformer; the
# loaded model is kept in `embedding_model` and used by encode_text().
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_model = SentenceTransformer(model_name)

def encode_text(df):
    """
    Concatenate the Title and Abstract columns and encode them into embeddings.

    Missing values in either column are treated as empty strings, so a frame
    reloaded from CSV (where empty cells come back as NaN) can be encoded too.

    Args:
        df: DataFrame with "Title" and "Abstract" columns.

    Returns:
        The result of ``embedding_model.encode(..., convert_to_tensor=True)``
        for the texts "<Title> <Abstract>", one per row of ``df``.
    """
    titles = df["Title"].fillna("").astype(str)
    abstracts = df["Abstract"].fillna("").astype(str)
    texts = (titles + " " + abstracts).tolist()
    return embedding_model.encode(texts, convert_to_tensor=True)

# Encode both article sets with the same model.
target_embeddings, pool_embeddings = (
    encode_text(frame) for frame in (data_target, data_pool)
)
print("target_embeddings shape:", target_embeddings.shape)
print("pool_embeddings shape:", pool_embeddings.shape)

# Average the target embeddings into a single reference vector, then score
# every pool article by its cosine similarity to that reference.
mean_target_embedding = target_embeddings.mean(dim=0, keepdim=True)
similarity_matrix = util.cos_sim(pool_embeddings, mean_target_embedding)
similarities = similarity_matrix.squeeze()
data_pool["similarity"] = similarities.cpu().numpy()

# =============================================================================
# DIVIDE THE POOL INTO QUARTILES BASED ON SIMILARITY
# =============================================================================
# Quartile boundaries of the similarity scores.
similarity = data_pool["similarity"]
q1, q2, q3 = (similarity.quantile(level) for level in (0.25, 0.50, 0.75))

# Each subset covers one quartile; a boundary value belongs to the lower quartile.
df_q1 = data_pool.loc[similarity <= q1].copy()
df_q2 = data_pool.loc[(similarity > q1) & (similarity <= q2)].copy()
df_q3 = data_pool.loc[(similarity > q2) & (similarity <= q3)].copy()
df_q4 = data_pool.loc[similarity > q3].copy()

quartile_frames = (df_q1, df_q2, df_q3, df_q4)
print("Quartile shapes:", *(frame.shape for frame in quartile_frames))
print("Mean similarities:", *(frame["similarity"].mean() for frame in quartile_frames))

def merge_and_save(data_target, df_quartile, quartile_label, out_dir="data/"):
    """
    Stack the target records on top of one quartile subset and write the
    result to `<out_dir>/merged_quartile_<quartile_label>.csv`.

    The output directory is created if it does not exist. Nothing is returned.
    """
    # Make sure the destination exists before anything is written.
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, f"merged_quartile_{quartile_label}.csv")

    frames = [data_target, df_quartile]
    merged_df = pd.concat(frames, ignore_index=True)
    merged_df.to_csv(filename, index=False)
    print(f"Saved {filename} with shape {merged_df.shape}.")

# Write one merged file (targets + quartile subset) per quartile.
for quartile_label, quartile_df in (("1", df_q1), ("2", df_q2), ("3", df_q3), ("4", df_q4)):
    merge_and_save(data_target, quartile_df, quartile_label)

# =============================================================================
# CREATE SUB-SAMPLES FROM MERGED DATA
# =============================================================================
# Merged quartile files to read, and the reduced files to write for each.
input_files = [f"data/merged_quartile_{quartile}.csv" for quartile in range(1, 5)]
output_files = [f"data/red_merged_quartile_{quartile}.csv" for quartile in range(1, 5)]

# Number of rows in every reduced file (targets included).
total_sample_size = 200

# Build one reduced file per quartile: all target articles plus a seeded
# random sample of non-target articles, up to total_sample_size rows in total.
for input_file, output_file in zip(input_files, output_files):
    df = pd.read_csv(input_file)
    # Separate target (label 1) and non-target (label 0) articles.
    target_1_df = df[df["Target"] == 1]
    target_0_df = df[df["Target"] == 0]
    if len(target_1_df) > total_sample_size:
        raise ValueError(
            f"There are more than {total_sample_size} target articles. Adjust the sample size."
        )
    wanted_sample_size = total_sample_size - len(target_1_df)
    # Never ask for more rows than the quartile holds; sample() would raise.
    remaining_sample_size = min(wanted_sample_size, len(target_0_df))
    if remaining_sample_size < wanted_sample_size:
        print(f"Warning: {input_file} has only {len(target_0_df)} non-target articles; "
              f"the reduced file will have fewer than {total_sample_size} rows.")
    target_0_sample = target_0_df.sample(n=remaining_sample_size, random_state=42)
    reduced_df = pd.concat([target_1_df, target_0_sample], ignore_index=True)
    # Shuffle so that the target rows are not grouped at the top of the file.
    reduced_df = reduced_df.sample(frac=1.0, random_state=42).reset_index(drop=True)
    reduced_df.to_csv(output_file, index=False)
    print(f"Saved {output_file} with shape {reduced_df.shape}")

# =============================================================================
# GPT-3.5 TURBO CLASSIFICATION
# =============================================================================
# The function below uses GPT-3.5 Turbo to classify articles
# with a "soft approach" prompt.

def _extract_decision(answer):
    """Return the first ACCEPT/REJECT verdict found in *answer*, or "REJECT" if there is none."""
    # The word boundary keeps words such as "UNACCEPTABLE" from counting as an
    # acceptance; taking the first verdict ignores later mentions in an explanation.
    verdict = re.search(r"\b(ACCEPT|REJECT)", answer.strip().upper())
    return verdict.group(1) if verdict else "REJECT"


def classify_article_with_gpt3_soft(title, abstract):
    """
    Zero-shot classification using GPT-3.5 Turbo with a soft-approach prompt.

    Works with both generations of the ``openai`` package: the module-level
    ``openai.chat.completions`` client (openai>=1.0) and the legacy
    ``openai.ChatCompletion`` interface (openai<1.0).

    Args:
        title: Article title inserted into the prompt.
        abstract: Article abstract inserted into the prompt.

    Returns:
        "ACCEPT" if the model's answer starts its verdict with ACCEPT,
        otherwise "REJECT". Any error raised while calling the API is printed
        and also reported as "REJECT", so failed calls cannot be told apart
        from genuine rejections in the return value.
    """
    prompt = f"""
You are assisting in a systematic review on periodontal regeneration comparing
Emdogain (EMD) + bone graft (BG) versus BG alone.

Your task is to decide whether the following article should be **ACCEPTED**
or **REJECTED** based on the following “soft approach” criteria:

**Inclusion Criteria**:
1. Population: Adult periodontitis patients (≥18 years) with at least one intrabony or furcation defect.
2. Intervention: Regenerative surgical procedures involving EMD combined with any bone graft material (EMD+BG).
3. Comparison: Surgical procedures using bone graft alone.
4. Outcomes: Primary outcomes like CAL gain and PD reduction; secondary outcomes may include pocket closure, wound healing, gingival recession, tooth loss, PROMs, or adverse events.
5. Study Design: RCT (parallel or split-mouth), with ≥10 patients per arm and ≥6 months follow-up.

**Soft Approach**:
- If at least one of these criteria is explicitly met or strongly implied, and none are contradicted, then respond with ACCEPT.
- Otherwise, respond with REJECT.

Title: {title}
Abstract: {abstract}

Respond with exactly:
ACCEPT
or
REJECT
"""
    request = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.0,
        "max_tokens": 64,
    }
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: the legacy ChatCompletion entry point was removed.
            response = openai.chat.completions.create(**request)
            answer = response.choices[0].message.content
        else:
            # openai<1.0: legacy interface with dict-style responses.
            response = openai.ChatCompletion.create(**request)
            answer = response["choices"][0]["message"]["content"]
        # The message content may be empty/None; treat that as no verdict.
        return _extract_decision(answer or "")
    except Exception as e:
        # Deliberate best effort: one failed request must not stop the batch.
        print(f"OpenAI API error: {e}")
        return "REJECT"


# Reduced CSV files to be scored with the GPT-3.5 Turbo classification function.
reduced_files = [f"data/red_merged_quartile_{quartile}.csv" for quartile in range(1, 5)]

# GPT-3.5 Turbo scored results are written to this directory (created if missing).
gpt3_output_dir = "data/GPT3/"
os.makedirs(gpt3_output_dir, exist_ok=True)

# --- Soft Approach ---
# Score every article of every reduced file and store the 0/1 decision in a
# new "Accepted_GPT3" column of a copy written to gpt3_output_dir.
for csv_file in reduced_files:
    print(f"\nProcessing file (soft approach): {csv_file}")
    df = pd.read_csv(csv_file)
    # Empty cells are read back as NaN; the prompt needs plain strings.
    for text_column in ("Title", "Abstract"):
        df[text_column] = df[text_column].fillna("")
    df["Accepted_GPT3"] = [
        int(classify_article_with_gpt3_soft(title, abstract) == "ACCEPT")
        for title, abstract in zip(df["Title"], df["Abstract"])
    ]
    scored_name = os.path.basename(csv_file).replace(".csv", "_gpt3_scored.csv")
    output_filename = os.path.join(gpt3_output_dir, scored_name)
    df.to_csv(output_filename, index=False)
    print(f"Saved GPT-3.5 Turbo (soft) scored file to: {output_filename}")


# =============================================================================
# END OF SCRIPT
# =============================================================================
