## Get the query results of "fmri & love" from PubMed:

In [2]:
from pathlib import Path
import requests

# List of PMC article URLs
urls = [
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC4863427",
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC7223160",
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC7264388",
]

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://pmc.ncbi.nlm.nih.gov/",
}

for url in urls:
    pmcid = url.split("/")[-1]
    outfile = Path(f"{pmcid}.html")
    with requests.Session() as s:
        s.headers.update(headers)
        resp = s.get(url, timeout=30)
        resp.raise_for_status()
        if not resp.encoding:
            resp.encoding = "utf-8"
        outfile.write_text(resp.text, encoding=resp.encoding)
    print(f"Saved {outfile.resolve()} ({outfile.stat().st_size:,} bytes)")

Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC4863427.html (180,248 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7223160.html (357,542 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7223160.html (357,542 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7264388.html (205,454 bytes)
Saved C:\Users\USER\github-classroom\ntu-info\neurosynth-etl-11rachelh\PMC7264388.html (205,454 bytes)


# Extract PMIDs from a PubMed HTML page
Here we use **BeautifulSoup** and **regular expressions** to extract PMIDs from a saved PubMed HTML page.

In [9]:
from bs4 import BeautifulSoup
import re
from pathlib import Path
import pandas as pd

HTML_PATH = Path('front_psychol_fmri_love.html')  # change this if needed
html_text = HTML_PATH.read_text(encoding='utf-8', errors='ignore')

In [26]:
soup = BeautifulSoup(html_text, 'html.parser')
meta = soup.find('meta', attrs={'name': 'log_displayeduids'})
pmids_str=meta.get('content')
print(pmids_str)
pmids= re.findall(r'\d+', pmids_str)
# or simply pmids=pmids_str.split(',')
print(pmids)

26617535,27242579,32457675,32528365
['26617535', '27242579', '32457675', '32528365']


In [29]:
# See all the available models:
import json

url = "http://localhost:1234/v1/models"
response = requests.get(url)
models = response.json()
print(json.dumps(models, indent=4))

{
    "data": [
        {
            "id": "text-embedding-nomic-embed-text-v1.5",
            "object": "model",
            "owned_by": "organization_owner"
        },
        {
            "id": "meta-llama-3-8b-instruct",
            "object": "model",
            "owned_by": "organization_owner"
        },
        {
            "id": "gemma-3-12b-it-qat",
            "object": "model",
            "owned_by": "organization_owner"
        },
        {
            "id": "gemma-3-4b-it-qat",
            "object": "model",
            "owned_by": "organization_owner"
        },
        {
            "id": "qwen2.5-14b-instruct-mlx",
            "object": "model",
            "owned_by": "organization_owner"
        },
        {
            "id": "phi-3.5-mini-instruct",
            "object": "model",
            "owned_by": "organization_owner"
        },
        {
            "id": "llama-3.2-3b-instruct",
            "object": "model",
            "owned_by": "organization_owner"
 

In [31]:
# Test text -> text:

url = "http://localhost:1234/v1/chat/completions"

payload = {
    "model": "gemma-3-4b-it-qat",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "1+1=?"}
    ]
}

response = requests.post(url, json=payload)
data = response.json()
print(data["choices"][0]["message"]["content"])

1 + 1 = 2



In [32]:
# Test [image, text] -> text:

import base64

with open("32528365_Table3.jpg", "rb") as f:
    image_base64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "gemma-3-4b-it-qat",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's this?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    }
                }
            ]
        }
    ]
}

response = requests.post(url, json=payload)
data = response.json()
print(data["choices"][0]["message"]["content"])

This table presents the results of a structural analysis, likely from a neuroimaging study (such as fMRI). Here's a breakdown of what the table shows:

**Overall Purpose:** The table identifies "significant clusters of activity for the main effects of video type." This means researchers were examining how brain regions responded differently to different types of video content (e.g., action vs. nature, or educational vs. entertainment).

**Columns Explained:**

*   **Structural location:** This column indicates the brain region where a significant difference in activity was found.  The locations are described using abbreviations (e.g., "Medial frontal pole," "Sup. temporal gyrus").
*   **Voxels:** This represents the number of individual brain measurement points (voxels) within that region showing a significant difference in activity based on the video type.  A higher number indicates a more robust finding for that region.
*   **Z-max:** This is the maximum Z-score within a cluster. The

# Write your own [X, Y, Z] coordinate extractor
You can either extract the coordinates from *.html or *.pdf of the 3 target articles:

https://pmc.ncbi.nlm.nih.gov/articles/PMC4863427
https://pmc.ncbi.nlm.nih.gov/articles/PMC7223160
https://pmc.ncbi.nlm.nih.gov/articles/PMC7264388

You can assume you know what tables to extract, IF NEEDED. 

The sprint goal is to generate info_data.csv BY YOURSELF.

In [37]:
import pandas as pd
df = pd.read_csv('info_data.csv')
print(df.head())
print(df['Keywords'][0])

       PMID    PMCID                                           Keywords  \
0  27242579  4863427  AI; MPFC; aMCC; fMRI; intrasexual competition;...   
1  27242579  4863427  AI; MPFC; aMCC; fMRI; intrasexual competition;...   
2  27242579  4863427  AI; MPFC; aMCC; fMRI; intrasexual competition;...   
3  27242579  4863427  AI; MPFC; aMCC; fMRI; intrasexual competition;...   
4  27242579  4863427  AI; MPFC; aMCC; fMRI; intrasexual competition;...   

   Table   X   Y   Z  
0      2   8  12  58  
1      2 -30  22   4  
2      2  42  10   0  
3      2  -6  14  42  
4      2 -62 -22  34  
AI; MPFC; aMCC; fMRI; intrasexual competition; pain empathy


In [None]:
# Extraction handles multi-row headers (e.g., 'Coordinates' with x, y, z underneath), flattens headers, and extracts correct columns, including negative values
import re
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

html_files = [
    'PMC4863427.html',
    'PMC7223160.html',
    'PMC7264388.html',
]

results = []

def extract_keywords(soup, html_text):
    meta_keywords = soup.find('meta', attrs={'name': 'citation_keywords'})
    if meta_keywords and meta_keywords.get('content'):
        return meta_keywords.get('content')
    for tag in soup.find_all(['b', 'strong', 'h3', 'h4', 'p', 'span']):
        if tag.get_text(strip=True).lower().startswith('keywords'):
            next_text = ''
            text = tag.get_text(separator=' ', strip=True)
            if ':' in text:
                next_text = text.split(':', 1)[1].strip()
            if not next_text:
                sibling = tag.find_next_sibling(text=True)
                if sibling:
                    next_text = sibling.strip()
            if not next_text and tag.parent:
                sibling = tag.parent.find_next_sibling(text=True)
                if sibling:
                    next_text = sibling.strip()
            if next_text:
                return next_text
    kw_match = re.search(r'Keywords?[:\s]+([\w\s;,-]+)', html_text, re.IGNORECASE)
    if kw_match:
        return kw_match.group(1).strip()
    return ''

def is_number(val):
    try:
        # Remove spaces between minus and number, e.g., '- 36' -> '-36'
        val = re.sub(r'^-\s+', '-', val.strip())
        float(val)
        return True
    except Exception:
        return False

def find_xyz_indices_from_multiline_header(header_rows):
    n_cols = max(len(row) for row in header_rows)
    flat_headers = [''] * n_cols
    for row in header_rows:
        for i in range(n_cols):
            part = row[i].strip().lower() if i < len(row) else ''
            if part:
                if flat_headers[i]:
                    flat_headers[i] += ' ' + part
                else:
                    flat_headers[i] = part
    x_idx = y_idx = z_idx = None
    for i, h in enumerate(flat_headers):
        if x_idx is None and re.search(r'\bx\b', h):
            x_idx = i
        elif y_idx is None and re.search(r'\by\b', h):
            y_idx = i
        elif z_idx is None and re.search(r'\bz\b', h):
            z_idx = i
    if None not in (x_idx, y_idx, z_idx):
        return x_idx, y_idx, z_idx
    return None

for html_file in html_files:
    html_path = Path(html_file)
    if not html_path.exists():
        print(f"File not found: {html_file}")
        continue
    html_text = html_path.read_text(encoding='utf-8', errors='ignore')
    soup = BeautifulSoup(html_text, 'html.parser')

    pmcid = re.search(r'PMC\d+', html_file).group() if re.search(r'PMC\d+', html_file) else ''

    pmid = ''
    meta_pmid = soup.find('meta', attrs={'name': 'citation_pmid'})
    if meta_pmid:
        pmid = meta_pmid.get('content')
    else:
        pmid_match = re.search(r'PMID[:\s]+(\d+)', html_text)
        if pmid_match:
            pmid = pmid_match.group(1)

    keywords = extract_keywords(soup, html_text)

    tables = soup.find_all('table')
    for table_idx, table in enumerate(tables, 1):
        rows = table.find_all('tr')
        if not rows:
            continue
        header_rows = []
        for row in rows[:2]:
            cells = row.find_all(['th', 'td'])
            if all(cell.name == 'th' for cell in cells):
                header_rows.append([cell.get_text(strip=True) for cell in cells])
        if not header_rows:
            header_rows = [[cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]]
        xyz_indices = find_xyz_indices_from_multiline_header(header_rows)
        if xyz_indices:
            x_idx, y_idx, z_idx = xyz_indices
            for row in rows[len(header_rows):]:
                cols = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
                if len(cols) > max(x_idx, y_idx, z_idx):
                    x_val, y_val, z_val = cols[x_idx], cols[y_idx], cols[z_idx]
                    # Remove spaces between minus and number for each value
                    x_val = re.sub(r'^-\s+', '-', x_val.strip())
                    y_val = re.sub(r'^-\s+', '-', y_val.strip())
                    z_val = re.sub(r'^-\s+', '-', z_val.strip())
                    if all(is_number(v) for v in [x_val, y_val, z_val]):
                        x = float(x_val)
                        y = float(y_val)
                        z = float(z_val)
                        results.append({
                            'PMID': pmid,
                            'PMCID': pmcid.replace('PMC',''),
                            'Keywords': keywords,
                            'Table': table_idx,
                            'X': x,
                            'Y': y,
                            'Z': z,
                        })

df = pd.DataFrame(results)
df.to_csv('b11504006_info_data.csv', index=False)
print('Saved to b11504006_info_data.csv')

Saved to b11504006_info_data.csv
