1)  Import & load datasets

In [1]:
import pandas as pd

# Read the two source tables: POI metadata and the English descriptions
poi_df = pd.read_csv("Datasets/poi_info_updated.csv")
descr_df = pd.read_csv("Datasets/data_descr_en_updated.csv")

# Report the (rows, columns) of each table
print(f"POI dataset shape: {poi_df.shape}")
print(f"Descriptions dataset shape: {descr_df.shape}")

# Last expression of the cell: rich preview of the first POI rows
poi_df.head()



POI dataset shape: (78, 6)
Descriptions dataset shape: (77, 4)


Unnamed: 0,poi_id,poi_name,category_id,category_name,longitude,latitude
0,54,Basilica di Santa Anastasia,1,Chiese,16673.45.00,45.445.176.000.000.000
1,52,complesso del Duomo,1,Chiese,166142.21.00,4.544.707.660.000.000
2,70,Chiesa di San Bernardino,1,Chiese,163530.46.00,73195.54.00
3,74,Chiesa di Santa Maria in Organo,1,Chiese,729.57.00,74111.56.00
4,51,Chiesa di San Lorenzo,1,Chiese,165261.42.00,73567.17.00


2) Merge the two datasets on common IDs

In [2]:
# IDs that appear both as a POI id and as a description class reference
common_ids = set(poi_df["poi_id"]) & set(descr_df["classref"])

# Inner-join the rows restricted to the shared IDs, then keep only the
# columns used by the rest of the notebook
merged_df = (
    poi_df.loc[poi_df["poi_id"].isin(common_ids)]
    .merge(
        descr_df.loc[descr_df["classref"].isin(common_ids)],
        left_on="poi_id",
        right_on="classref",
        how="inner",
    )
    .loc[:, ["poi_id", "poi_name", "category_name", "descr_trad_value"]]
)

# Report the number of joined rows, then preview them
print(f"POIs with description available: {len(merged_df)}")
merged_df.head()


POIs with description available: 78


Unnamed: 0,poi_id,poi_name,category_name,descr_trad_value
0,54,Basilica di Santa Anastasia,Chiese,The church of St. Anastasia is a fine example ...
1,52,complesso del Duomo,Chiese,"The Cathedral, which is dedicated to Santa Mar..."
2,70,Chiesa di San Bernardino,Chiese,The Church of San Bernardino is a Catholic pla...
3,74,Chiesa di Santa Maria in Organo,Chiese,"The church, near the Organo gate, already exis..."
4,51,Chiesa di San Lorenzo,Chiese,San Lorenzo is a Romanesque Roman Catholic chu...


In [3]:
# Show the merged POI/description table (the saved output below is the notebook's truncated rendering)
display(merged_df)

Unnamed: 0,poi_id,poi_name,category_name,descr_trad_value
0,54,Basilica di Santa Anastasia,Chiese,The church of St. Anastasia is a fine example ...
1,52,complesso del Duomo,Chiese,"The Cathedral, which is dedicated to Santa Mar..."
2,70,Chiesa di San Bernardino,Chiese,The Church of San Bernardino is a Catholic pla...
3,74,Chiesa di Santa Maria in Organo,Chiese,"The church, near the Organo gate, already exis..."
4,51,Chiesa di San Lorenzo,Chiese,San Lorenzo is a Romanesque Roman Catholic chu...
...,...,...,...,...
73,33,Multisala Rivoli,Cinema,"A multi-screen cinema just off Piazza Bra, Riv..."
74,34,Cinema Fiume,Cinema,Part of a local network of art-house and first...
75,35,A.M.E.N,Discoteca,Set on the Torricelle hillside above the histo...
76,36,Berfi’s Club,Discoteca,"A staple of Verona’s club scene for decades, B..."


3) Setup & utils 

In [4]:
# === Cell 3: Setup & small text utils ===
import pandas as pd
import numpy as np
import spacy
from pathlib import Path
from datetime import datetime, timedelta, timezone
import json, re, time
from collections import Counter, defaultdict

# Personal knowledge base location: one parquet file per store, all under DATA_DIR
DATA_DIR = Path("data_store")
DATA_DIR.mkdir(exist_ok=True)
MEM_PATH = DATA_DIR.joinpath("memory_stream.parquet")
ENT_PATH = DATA_DIR.joinpath("entity_store.parquet")

# Minimal stopwords; expand as needed
STOPWORDS = set("""
a an the of and or for to in on with from by at as is are was were be been being
this that it its these those he she they we you him her them my your his her our their
all any each every no not but if then when where why how what which who whom whose
more most less least such so than too very just only now here there up down out over under
into onto through across along around before after during between among within without
above below behind beside beyond inside outside near far
""".split())

ENTITY_MIN_LEN = 3

def _normalize_token(tok: str) -> str:
    """Lowercase token, strip punctuation, remove short and stopwords."""
    t = re.sub(r"[^\w\-']", "", tok.lower()).strip("'")
    if len(t) < ENTITY_MIN_LEN: return ""
    if t in STOPWORDS: return ""
    if t.isdigit(): return ""
    return t

# Load English model (small and fast); the resulting pipeline is the
# module-level `nlp` object that extract_entities() calls.
# Run in terminal once if missing: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def extract_entities(text: str) -> list[str]:
    """
    Extract named entities from `text` using the module-level spaCy pipeline.

    Entities labelled DATE, TIME, CARDINAL or ORDINAL are dropped. The
    remaining entity strings are lowercased and de-duplicated.

    Parameters
    ----------
    text : str
        Free text to analyse. Anything that is not a non-blank string
        (None, "", NaN coming from a DataFrame, ...) yields an empty list.

    Returns
    -------
    list[str]
        Unique lowercase entity strings, in order of first appearance in
        the text.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    skipped_labels = {"DATE", "TIME", "CARDINAL", "ORDINAL"}
    doc = nlp(text)
    # ent.text is the entity surface string, ent.label_ its type (e.g. GPE, ORG)
    entities = [ent.text.lower() for ent in doc.ents if ent.label_ not in skipped_labels]

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return a hash-dependent order that changes between kernel sessions and
    # makes seeded sampling downstream non-reproducible.
    return list(dict.fromkeys(entities))

def _load_parquet(path: Path) -> pd.DataFrame:
    """Load a parquet file if it exists; otherwise return an empty DataFrame."""
    if path.exists():
        return pd.read_parquet(path)
    return pd.DataFrame()

def _save_parquet(df: pd.DataFrame, path: Path):
    """Write `df` to `path` as parquet without the index, replacing any existing file."""
    df.to_parquet(path=path, index=False)

def reset_entity_store(full: bool = True):
    """
    Reset the entity store and, optionally, the memory stream.

    - full=True  -> delete ENT_PATH and MEM_PATH, then recreate both as empty
                    parquet files with the expected columns
    - full=False -> delete and recreate only ENT_PATH; the memory stream is
                    left untouched

    Prints a status line for each step.
    """
    ent_path = Path(ENT_PATH)
    if ent_path.exists():
        ent_path.unlink()
        print(f"[OK] Entity store {ent_path} deleted.")
    else:
        print(f"[INFO] Entity store {ent_path} already empty.")

    # recreate an empty entity store so later reads find the expected schema
    empty_ent = pd.DataFrame(columns=["user_id","entity","count","first_seen","last_seen","sources_count"])
    _save_parquet(empty_ent, ENT_PATH)

    if full:
        mem_path = Path(MEM_PATH)
        if mem_path.exists():
            mem_path.unlink()
            print(f"[OK] Memory stream {mem_path} deleted.")
        else:
            print(f"[INFO] Memory stream {mem_path} already empty.")

        # Only a full reset may blank the memory stream; writing the empty
        # frame unconditionally would wipe the events even with full=False.
        empty_mem = pd.DataFrame(columns=["user_id","timestamp","source","text","meta"])
        _save_parquet(empty_mem, MEM_PATH)

    print("[DONE] Store succesfully resetted.")


In [6]:
#reset_entity_store(full=True)  # Reset both entity store and memory stream

4) Memory stream (append) + Entity store (aggregate)

In [7]:
# === Cell 4: Memory stream appenders & entity store rebuild ===

def append_memory(user_id: str,
                  source: str,           # "query" | "page" | "orcid"
                  text: str,
                  meta: dict | None = None,
                  ts: datetime | None = None):
    """Append a single event to the memory stream."""
    mem = _load_parquet(MEM_PATH)
    row = {
        "user_id": user_id,
        "timestamp": pd.to_datetime(ts or datetime.now(timezone.utc)),
        "source": source,
        "text": text or "",
        "meta": json.dumps(meta or {}, ensure_ascii=False),
    }
    mem = pd.concat([mem, pd.DataFrame([row])], ignore_index=True)
    _save_parquet(mem, MEM_PATH)

def rebuild_entity_store():
    """
    Aggregate memory_stream -> entity_store and write it to ENT_PATH.

    Produces one row per (user_id, entity) with:
      - count:         number of events in which the entity was extracted
      - first_seen:    earliest event timestamp for the entity
      - last_seen:     latest event timestamp for the entity
      - sources_count: JSON object mapping event source -> number of events

    For "orcid" events where no entity is extracted, the whole (stripped,
    lowercased) text is kept as a single entity. When the memory stream is
    empty, or no entity is found at all, an empty store with the expected
    columns is written.
    """
    columns = ["user_id","entity","count","first_seen","last_seen","sources_count"]

    mem = _load_parquet(MEM_PATH)
    if mem.empty:
        _save_parquet(pd.DataFrame(columns=columns), ENT_PATH)
        return

    records = []
    # Grouping by the column name (not a one-element list) yields scalar keys
    # regardless of the pandas version.
    for user_id, dfu in mem.groupby("user_id", dropna=False):
        counter = Counter()
        first_seen, last_seen = {}, {}
        src_counter = defaultdict(Counter)

        for _, r in dfu.iterrows():
            # non-string text (None/NaN read back from parquet) counts as empty
            text = r["text"] if isinstance(r["text"], str) else ""
            ents = extract_entities(text)
            if r["source"] == "orcid" and not ents:
                # keep the full ORCID keyword as a single entity
                kw = text.strip().lower()
                if kw:
                    ents = [kw]

            ts, src = r["timestamp"], r["source"]
            for e in ents:
                counter[e] += 1
                if e not in first_seen or ts < first_seen[e]:
                    first_seen[e] = ts
                if e not in last_seen or ts > last_seen[e]:
                    last_seen[e] = ts
                src_counter[e][src] += 1

        for e, c in counter.items():
            records.append({
                "user_id": user_id,
                "entity": e,
                "count": int(c),
                "first_seen": first_seen[e],
                "last_seen": last_seen[e],
                "sources_count": json.dumps(src_counter[e]),
            })

    # Passing `columns` keeps the schema even when no entity was extracted;
    # without it the frame has no columns and sort_values raises KeyError.
    ent = pd.DataFrame(records, columns=columns)
    ent = ent.sort_values(["user_id","entity","last_seen"])\
             .drop_duplicates(["user_id","entity"], keep="last")
    _save_parquet(ent, ENT_PATH)

# Peek at the current entity store. _load_parquet returns an empty frame when
# the file does not exist yet, so this cell also runs on a fresh checkout.
ent = _load_parquet(ENT_PATH)
display(ent.head())


Unnamed: 0,user_id,entity,count,first_seen,last_seen,sources_count


5) ORCID integration

In [8]:
# === Cell 5: ORCID integration with file cache ===
import requests
CACHE_DIR = Path("cache_orcid")
CACHE_DIR.mkdir(exist_ok=True)

def _cache_path(orcid_id: str) -> Path:
    return CACHE_DIR / f"{orcid_id}.json"

def get_orcid_profile_cached(orcid_id: str, ttl_hours: int = 24) -> dict | None:
    """Fetch ORCID record with a simple time-based cache."""
    p = _cache_path(orcid_id)
    if p.exists() and (time.time() - p.stat().st_mtime) < ttl_hours * 3600:
        try:
            return json.loads(p.read_text(encoding="utf-8"))
        except Exception:
            pass

    url = f"https://pub.orcid.org/v3.0/{orcid_id}/record"
    resp = requests.get(url, headers={"Accept": "application/json"}, timeout=15)
    resp.raise_for_status()
    data = resp.json()
    p.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    return data

def extract_keywords_from_orcid(data: dict, keywords_from_title: bool = False) -> list[str]:
    """
    Collect keywords from an ORCID record.

    Always includes the non-empty `content` of every entry under
    person -> keywords -> keyword. When `keywords_from_title` is True, the
    entities extracted from each work title under
    activities-summary -> works -> group -> work-summary are added as well.

    Returns the de-duplicated keywords as a sorted list.
    """
    record = data or {}
    person = record.get("person", {})
    keyword_items = (person.get("keywords") or {}).get("keyword", []) or []
    found = [item["content"] for item in keyword_items if (item or {}).get("content")]

    if keywords_from_title:
        works = (record.get("activities-summary") or {}).get("works") or {}
        for group in works.get("group", []) or []:
            for summary in (group or {}).get("work-summary", []) or []:
                title = (((summary or {}).get("title") or {}).get("title") or {}).get("value")
                if title:
                    found.extend(extract_entities(title))

    return sorted(set(found))

def insert_orcid_keywords_to_memory(user_id: str, orcid_id: str) -> list[str]:
    """
    Log the ORCID-derived terms of `orcid_id` as 'orcid' events for `user_id`.

    The terms are the profile keywords plus the entities found in work
    titles. Each term becomes one memory-stream event whose meta records the
    ORCID iD. Returns the list of terms, or [] when no profile is available.
    """
    profile = get_orcid_profile_cached(orcid_id)
    if profile:
        terms = extract_keywords_from_orcid(profile, keywords_from_title=True)
        for term in terms:
            append_memory(user_id=user_id, source="orcid", text=term, meta={"orcid_id": orcid_id})
        return terms
    return []


# Example usage: insert ORCID keywords into memory (keywords + title of works/publications)
# NOTE(review): every run of this cell appends the same terms again to the
# memory stream, so entity counts grow on re-execution — confirm this is intended.
terms = insert_orcid_keywords_to_memory("u1", "0000-0001-6092-6831")

# Re-aggregate the memory stream and show the resulting entity store
rebuild_entity_store()
ent = pd.read_parquet(ENT_PATH)
display(ent)



  mem = pd.concat([mem, pd.DataFrame([row])], ignore_index=True)


Unnamed: 0,user_id,entity,count,first_seen,last_seen,sources_count
0,u1,artdeco,1,2025-08-27 12:39:43.809239+00:00,2025-08-27 12:39:43.809239+00:00,"{""orcid"": 1}"
1,u1,automatic data personalization,1,2025-08-27 12:39:43.818490+00:00,2025-08-27 12:39:43.818490+00:00,"{""orcid"": 1}"
2,u1,cad,1,2025-08-27 12:39:43.826891+00:00,2025-08-27 12:39:43.826891+00:00,"{""orcid"": 1}"
3,u1,camus,1,2025-08-27 12:39:43.837821+00:00,2025-08-27 12:39:43.837821+00:00,"{""orcid"": 1}"
4,u1,context,1,2025-08-27 12:39:43.846668+00:00,2025-08-27 12:39:43.846668+00:00,"{""orcid"": 1}"
5,u1,context-aware recommendation system,1,2025-08-27 12:39:43.856520+00:00,2025-08-27 12:39:43.856520+00:00,"{""orcid"": 1}"
6,u1,context-awareness,1,2025-08-27 12:39:43.741752+00:00,2025-08-27 12:39:43.741752+00:00,"{""orcid"": 1}"
7,u1,data management,1,2025-08-27 12:39:43.778412+00:00,2025-08-27 12:39:43.778412+00:00,"{""orcid"": 1}"
8,u1,data science,1,2025-08-27 12:39:43.760160+00:00,2025-08-27 12:39:43.760160+00:00,"{""orcid"": 1}"
9,u1,database,1,2025-08-27 12:39:43.770142+00:00,2025-08-27 12:39:43.770142+00:00,"{""orcid"": 1}"


6) Logging helpers for queries and pages

In [None]:
# === Cell 6: Logging helpers for queries and pages ===

USER_ID = "u1"  # single demo user used by the example cells

def log_query_event(current_query: str, user_id: str):
    """Record `current_query` as a 'query' event (no metadata) in the user's memory stream."""
    append_memory(
        user_id=user_id,
        source="query",
        text=current_query,
        meta=None,
    )

def log_page_viewed_event(poi_row: pd.Series, user_id: str):
    """
    Record a 'page' event for a viewed POI.

    `poi_row` is a row of merged_df. The event text is
    "<poi_name> (<category_name>): <descr_trad_value>" and the metadata
    carries the POI id (as a string) and the POI name.
    """
    name = poi_row["poi_name"]
    category = poi_row["category_name"]
    description = poi_row["descr_trad_value"]
    append_memory(
        user_id=user_id,
        source="page",
        text=f"{name} ({category}): {description}",
        meta={"poi_id": str(poi_row["poi_id"]), "poi_name": name},
    )

# Demo: log one query and one page view, show the tail of the memory stream,
# then rebuild the entity store and show its first rows.
print("Example logging:")
log_query_event("What are the most famous churches in Verona?", USER_ID)
log_page_viewed_event(merged_df.iloc[0], USER_ID)  # log the first POI
display(pd.read_parquet(MEM_PATH).tail(3))
rebuild_entity_store()
ent = pd.read_parquet(ENT_PATH)
display(ent.head(15))



Example logging:


Unnamed: 0,user_id,timestamp,source,text,meta
25,u1,2025-08-27 12:39:43.972824+00:00,orcid,xquery,"{""orcid_id"": ""0000-0001-6092-6831""}"
26,u1,2025-08-27 12:39:54.294380+00:00,query,What are the most famous churches in Verona?,{}
27,u1,2025-08-27 12:39:54.303969+00:00,page,Basilica di Santa Anastasia (Chiese): The chur...,"{""poi_id"": ""54"", ""poi_name"": ""Basilica di Sant..."


Unnamed: 0,user_id,entity,count,first_seen,last_seen,sources_count
0,u1,altichiero,1,2025-08-27 12:39:54.303969+00:00,2025-08-27 12:39:54.303969+00:00,"{""page"": 1}"
1,u1,anastasia,1,2025-08-27 12:39:54.303969+00:00,2025-08-27 12:39:54.303969+00:00,"{""page"": 1}"
2,u1,artdeco,1,2025-08-27 12:39:43.809239+00:00,2025-08-27 12:39:43.809239+00:00,"{""orcid"": 1}"
3,u1,automatic data personalization,1,2025-08-27 12:39:43.818490+00:00,2025-08-27 12:39:43.818490+00:00,"{""orcid"": 1}"
4,u1,basilica,1,2025-08-27 12:39:54.303969+00:00,2025-08-27 12:39:54.303969+00:00,"{""page"": 1}"
5,u1,brusasorzi,1,2025-08-27 12:39:54.303969+00:00,2025-08-27 12:39:54.303969+00:00,"{""page"": 1}"
6,u1,cad,1,2025-08-27 12:39:43.826891+00:00,2025-08-27 12:39:43.826891+00:00,"{""orcid"": 1}"
7,u1,camus,1,2025-08-27 12:39:43.837821+00:00,2025-08-27 12:39:43.837821+00:00,"{""orcid"": 1}"
8,u1,chiese,1,2025-08-27 12:39:54.303969+00:00,2025-08-27 12:39:54.303969+00:00,"{""page"": 1}"
9,u1,context,1,2025-08-27 12:39:43.846668+00:00,2025-08-27 12:39:43.846668+00:00,"{""orcid"": 1}"


7) Retrieve personal entities for the current context

In [10]:
# === Cell 7: Retrieve personal entities for the current context  ===

from datetime import datetime, timezone, timedelta
import pandas as pd
import random
from typing import List

def pick_personal_entities_k_lamp(user_id: str,
                         query: str,
                         page_text: str,
                         strategy: str = "familiar",   # "familiar" | "unfamiliar" | "lapsed"
                         k: int = 5,
                         lapsed_days: int = 14,
                         seed: int | None = None) -> List[str]:
    """
    K-LaMP selection:
      - Context entities = entities(query) ∪ entities(page)
      - Look up (count, last_seen) in user's entity store
      - Sample k entities according to strategy:
          familiar:     sample ∝ count (exclude count==0)
          unfamiliar:   sample ∝ 1/(count+1)  (include unseen with high prob)
          lapsed:       keep last_seen < now-14d, sample ∝ count
    Sampling is WITHOUT replacement. Use `seed` for reproducibility.
    """
    # context entities (order-preserving unique)
    ctx = extract_entities(query) + extract_entities(page_text)
    ctx = list(dict.fromkeys([e for e in ctx if e]))
    if not ctx:
        return []

    # user store lookup
    ent = _load_parquet(ENT_PATH)
    ent_user = ent[ent["user_id"] == user_id].copy()
    ent_user.set_index("entity", inplace=True)

    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=lapsed_days)

    items = []  # (entity, count, last_seen)
    for e in ctx:
        if e in ent_user.index:   # check if entity exists in user's store
            row = ent_user.loc[e] # get the row for this entity
            cnt = int(row["count"]) # count of occurrences
            last_seen = pd.to_datetime(row["last_seen"], utc=True, errors="coerce") # last seen timestamp
        else:
            cnt = 0 # unseen entity
            last_seen = None
        items.append((e, cnt, last_seen))

    # candidates + weights

    if strategy == "familiar":
        cand = [(e, c, ls) for (e, c, ls) in items if c > 0]
        weights = [float(c) for (_, c, _) in cand]  # ∝ count

    elif strategy == "unfamiliar":
        cand = items[:]  # include unseen
        weights = [1.0 / (c + 1.0) for (_, c, _) in cand]  # ∝ 1/(count+1)

    elif strategy == "lapsed":
        cand = [(e, c, ls) for (e, c, ls) in items if (ls is not None and ls < cutoff)]
        weights = [float(c) for (_, c, _) in cand]  # ∝ count

    else:
        raise ValueError("strategy must be 'familiar', 'unfamiliar', or 'lapsed'")

    if not cand: # if no candidates after filtering
        return []

    if sum(weights) <= 0: 
        weights = [1.0] * len(cand)  # fallback uniform

    # weighted sampling without replacement
    rng = random.Random(seed)
    chosen: List[str] = []
    cand_e = [e for (e, _, _) in cand] # candidate entities
    cand_w = [float(w) for w in weights] # candidate weights

    for _ in range(min(k, len(cand_e))):    
        total = sum(cand_w) # sum of weights
        r = rng.random() * total # random threshold
        acc = 0.0   # reset accumulator for weights
        idx = 0 # index of chosen entity
         # find the index where the accumulated weight exceeds the random threshold
        for i, w in enumerate(cand_w):
            acc += w
            if r <= acc:
                idx = i
                break
        chosen.append(cand_e.pop(idx))
        cand_w.pop(idx)

    return chosen




8) Prompt builder for Gemini

In [21]:
# === Cell 8: Prompt builder for Gemini ===

def _load_memory() -> pd.DataFrame:
    expected = ["user_id","timestamp","source","text","meta"]
    mem = _load_parquet(MEM_PATH)
    for c in expected:
        if c not in mem.columns:
            mem[c] = pd.Series(dtype="object")
    mem["timestamp"] = pd.to_datetime(mem["timestamp"], utc=True, errors="coerce")
    return mem[expected]

def get_session_queries(user_id: str,
                        n: int | None = None,
                        hours: int | None = None,
                        order: str = "desc") -> list[str]:
    """Return the user's session queries."""
    mem = _load_memory()
    q = mem[(mem["user_id"] == user_id) & (mem["source"] == "query")].copy()

    if hours is not None:
        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
        q = q[q["timestamp"] >= cutoff]

    q = q.sort_values("timestamp", ascending=(order == "asc"))
    queries = [str(t) for t in q["text"].tolist()]

    if n is not None:
        queries = queries[:n]
    return queries

def get_latest_article(user_id: str) -> tuple[str, str]:
    """
    Return (title, text) of the user's most recent 'page' event.

    The title is the `poi_name` stored in the event meta; when that is
    missing (or the meta cannot be parsed) it falls back to the part of the
    text before the first ':' (at most 120 characters). Returns ("", "")
    when the user has no page events.
    """
    mem = _load_memory()
    is_user_page = (mem["user_id"] == user_id) & (mem["source"] == "page")
    pages = mem.loc[is_user_page]
    if pages.empty:
        return "", ""

    latest = pages.sort_values("timestamp", ascending=False).iloc[0]
    raw_text = latest["text"]

    try:
        meta = json.loads(latest["meta"] or "{}")
    except Exception:
        meta = {}

    title = meta.get("poi_name")
    if not title:
        title = str(raw_text).split(":")[0][:120] if isinstance(raw_text, str) else ""
    return title, str(raw_text or "")

def build_k_lamp_prompt_from_context(user_id: str,
                                     current_query: str,
                                     page_title: str,
                                     page_text: str,
                                     strategy: str = "familiar",
                                     k_entities: int = 5,
                                     personal_keywords: list[str] | None = None,
                                     n_session: int | None = None,     # None -> all queries
                                     max_article_chars: int = 1200) -> dict:
    """
    Build {system,user} messages as in K-LaMP, with 'Personal Entities' = entities
    sampled from the current context [query · page] according to `strategy`.
    """
    # System message (rules)
    system_msg = (
        "You are an AI assistant whose primary goal is to suggest a next search query, "
        "to help the user search and find information better on the search engine. "
        "Different queries and entities are separated by '|'."
    )

    # Session (last N or all)
    session_list = get_session_queries(user_id=user_id, n=n_session, order="desc")
    session_str = " | ".join(session_list or [])

    # Article
    art_title = page_title or ""
    art_text = (page_text or "")[:max_article_chars]

    # Personal Entities via K-LaMP sampler (context-dependent)
    personal_ents = pick_personal_entities_k_lamp(
        user_id=user_id,
        query=current_query,
        page_text=page_text,
        strategy=strategy,
        lapsed_days=14,
        k=k_entities,
        seed=42,              # set for reproducibility (optional)
    )
    personal_str = " | ".join(personal_ents)

    # Personal Keywords (static, e.g. from ORCID)
    personal_keywords_str = " | ".join(personal_keywords or [])



    # User message (payload)
    user_msg = (
        "Read the following query, session, article, user context entities, and user profile keywords of the user as the context information, "
        "which might be helpful and relevant to suggest the next query.\n"
        f"Query: {current_query}\n"
        f"Session: {session_str}\n"
        f"Article Title: {art_title}\n"
        f"Article Text: {art_text}\n\n"
        f"User Context Entities: {personal_str}\n\n"

        #f"The user is a Data Scientist\n"
        f"User Profile Keywords: {personal_keywords_str}\n\n"
        "Based on the above query, session, article, user context entities, and user profile keywords, please generate one next query "
        "suggestion with the rationale, in the format of\n"
        "Query Suggestion:\n"
        "Rationale:"
    )
    return {"system": system_msg, "user": user_msg}




9) End-to-end usage with merged_df and Gemini

In [20]:
# === Cell 9: end-to-end with K-LaMP prompt ===

import google.generativeai as genai
import os

# Use the API key from environment variable
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Choose a current query and a visited page
current_query = "What to do in Verona?"
poi_row = merged_df.iloc[21]

# Log to memory
log_query_event(current_query, USER_ID)
log_page_viewed_event(poi_row, USER_ID)

# Rebuild entity store so the sampler sees the events logged above
rebuild_entity_store()

# Build page title/text (same text layout as log_page_viewed_event)
page_title = poi_row["poi_name"]
page_text  = f"{poi_row['poi_name']} ({poi_row['category_name']}): {poi_row['descr_trad_value']}"

# Static profile keywords from the (cached) ORCID record
profile_user1 = get_orcid_profile_cached("0000-0001-6092-6831")
u1_keywords = extract_keywords_from_orcid(profile_user1, keywords_from_title=False)

# Build messages with strategy = familiar | unfamiliar | lapsed
msgs = build_k_lamp_prompt_from_context(
    user_id=USER_ID,
    current_query=current_query,
    page_title=page_title,
    page_text=page_text,
    strategy="familiar",     # "familiar" | "unfamiliar" | "lapsed"
    k_entities=5,
    personal_keywords=u1_keywords,  # from ORCID
    n_session=None
)

# Print FULL messages
print("\n\n=== SYSTEM ===\n")
print(msgs["system"])

print("\n=== USER ===\n")
print(msgs["user"])

# Call Gemini. The model is created once, here, because the system
# instruction is only known after the prompt has been built.
model = genai.GenerativeModel(
    model_name="models/gemini-1.5-pro-latest",
    system_instruction=msgs["system"]
)
resp = model.generate_content(msgs["user"])
print("\nGemini next", resp.text.strip())





=== SYSTEM ===

You are an AI assistant whose primary goal is to suggest a next search query, to help the user search and find information better on the search engine. Different queries and entities are separated by '|'.

=== USER ===

Read the following query, session, article, user context entities, and user profile keywords of the user as the context information, which might be helpful and relevant to suggest the next query.
 Give more weight to the user Profile keywords.
Query: What to do in Verona?
User Context Entities: monumenti | verona | romans | bc | german

User Profile Keywords: Context-awareness | Data science | Database | Ethics in Data Management | Personalization | Recommender Systems

Based on the above query, session, article, user context entities, and user profile keywords, please generate one next query suggestion with the rationale, in the format of
Query Suggestion:
Rationale:

Gemini next Query Suggestion: Recommender Systems for Tourist Attractions in Verona
