1)  Import & load datasets

In [40]:
import pandas as pd

# Read both source tables: POI metadata first, POI descriptions second
poi_df, descr_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Datasets/poi_info.csv", "Datasets/data_descr_en.csv")
)

# Report the (rows, columns) shape of each table
print("POI dataset shape:", poi_df.shape)
print("Descriptions dataset shape:", descr_df.shape)

# Last expression of the cell: rich preview of the first POI rows
poi_df.head()



POI dataset shape: (42, 6)
Descriptions dataset shape: (28, 4)


Unnamed: 0,poi_id,poi_name,category_id,category_name,longitude,latitude
0,54,Basilica di Santa Anastasia,1,Chiese,10.999825,45.445176
1,52,complesso del Duomo,1,Chiese,10.996794,45.447077
2,70,Chiesa di San Bernardino,1,Chiese,10.981125,45.438905
3,74,Chiesa di Santa Maria in Organo,1,Chiese,11.004314,45.444402
4,51,Chiesa di San Lorenzo,1,Chiese,10.99151,45.441134


2) Merge the two datasets on common IDs

In [41]:
# IDs that occur both as POI "poi_id" and as description "id"
common_ids = set(poi_df["poi_id"]) & set(descr_df["id"])

# NOTE(review): in the rendered preview the descriptions do not seem to belong to
# the POI they are joined with (e.g. "Porta Leoni" is paired with a text about the
# Archaeological Museum) — confirm that descr_df["id"] really refers to poi_id.

# Restrict both tables to the shared IDs, inner-join them, keep the four columns of interest
merged_df = (
    poi_df[poi_df["poi_id"].isin(common_ids)]
    .merge(
        descr_df[descr_df["id"].isin(common_ids)],
        how="inner",
        left_on="poi_id",
        right_on="id",
    )[["poi_id", "poi_name", "category_name", "descr_trad_value"]]
)

# Report the number of merged rows and preview them
print(f"POIs with description available: {merged_df.shape[0]}")
merged_df.head()


POIs with description available: 13


Unnamed: 0,poi_id,poi_name,category_name,descr_trad_value
0,74,Chiesa di Santa Maria in Organo,Chiese,The Basilica of San Zeno is without any doubt ...
1,63,Basilica di San Zeno,Chiese,"The building, dating back to the XIII century,..."
2,48,Porta Leoni,Monumenti,The Archeaeological Museum was founded in 1924...
3,77,Piazza Bra,Monumenti,"Among Europe's most ancient public museums, th..."
4,53,Biblioteca Capitolare,Monumenti,"Palazzo della Ragione, located between beautif..."


In [42]:
# Render the complete merged table rather than only a head() preview
display(merged_df)

Unnamed: 0,poi_id,poi_name,category_name,descr_trad_value
0,74,Chiesa di Santa Maria in Organo,Chiese,The Basilica of San Zeno is without any doubt ...
1,63,Basilica di San Zeno,Chiese,"The building, dating back to the XIII century,..."
2,48,Porta Leoni,Monumenti,The Archeaeological Museum was founded in 1924...
3,77,Piazza Bra,Monumenti,"Among Europe's most ancient public museums, th..."
4,53,Biblioteca Capitolare,Monumenti,"Palazzo della Ragione, located between beautif..."
5,58,Palazzo della Ragione o del Comune,Monumenti,"The Forum, centre of city life during Roman ti..."
6,46,Ponte Pietra,Monumenti,Walking through the Costa Arch you arrive in P...
7,47,Teatro Romano,Monumenti,It stretches over the ancient ford across the ...
8,55,Arche Scaligere,Monumenti,"It is the highest tower of Verona, 84 meters, ..."
9,43,Arco dei Gavi,Monumenti,"Near the small church of S. Maria Antica, whic..."


3) Fetch ORCID keywords from public API

In [None]:
import requests
import re

def clean_text(text):
    """Normalize the whitespace of ``text``.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.
    No other characters are removed. Falsy input (``None``, ``""``)
    yields the empty string.
    """
    if text:
        # split() with no argument discards surrounding whitespace and splits on runs of it
        return " ".join(text.split())
    return ""

def extract_terms_from_titles(titles):
    """Return the sorted, de-duplicated keywords found in ``titles``.

    Every title is lower-cased and split into word tokens; tokens that are
    in the small English stopword list or shorter than three characters
    are discarded.
    """
    stopwords = {"the", "and", "of", "in", "for", "to", "a", "on", "with"}
    tokens = (
        word
        for title in titles
        for word in re.findall(r"\b\w+\b", title.lower())
    )
    # The set removes duplicates; sorted() gives a stable, alphabetical list
    return sorted({word for word in tokens if len(word) > 2 and word not in stopwords})

def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict (guards JSON nulls)."""
    return value if isinstance(value, dict) else {}


def _affiliation_summaries(section, summary_key):
    """Collect the affiliation summary dicts of one activities section.

    Handles the nested layout (``affiliation-group`` -> ``summaries`` ->
    ``summary_key``) as well as the flat layout in which ``summary_key``
    directly holds a list of summaries.
    """
    section = _as_dict(section)
    found = []
    for group in section.get("affiliation-group") or []:
        for wrapper in _as_dict(group).get("summaries") or []:
            summary = _as_dict(wrapper).get(summary_key)
            if isinstance(summary, dict):
                found.append(summary)
    flat = section.get(summary_key)
    if isinstance(flat, list):
        found.extend(item for item in flat if isinstance(item, dict))
    return found


def _format_affiliation(summary):
    """Format one affiliation summary as ``"<role> @ <organization>"`` (or just the organization)."""
    org = _as_dict(summary.get("organization")).get("name") or ""
    role = summary.get("role-title") or ""
    return clean_text(f"{role} @ {org}" if role else org)


def get_orcid_profile(orcid_id):
    """Read and structure the public data of an ORCID profile.

    Args:
        orcid_id: ORCID iD string, e.g. ``"0000-0001-6092-6831"``.

    Returns:
        dict with the keys ``keywords``, ``bio``, ``work_titles``,
        ``work_topics_keywords``, ``employment`` and ``education``.
        Sections that are missing or ``null`` in the record yield empty
        values instead of raising.

    Raises:
        Exception: if the API answers with a status code other than 200.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    base_url = f"https://pub.orcid.org/v3.0/{orcid_id}"
    headers = {"Accept": "application/json"}

    # A timeout keeps the notebook from hanging forever on a stalled connection
    resp = requests.get(f"{base_url}/record", headers=headers, timeout=30)
    if resp.status_code != 200:
        raise Exception(f"Errore nella richiesta ORCID: {resp.status_code}")

    data = _as_dict(resp.json())
    person = _as_dict(data.get("person"))
    activities = _as_dict(data.get("activities-summary"))
    profile = {}

    # Keywords (the "keywords" object itself may be null)
    profile["keywords"] = [
        clean_text(_as_dict(kw).get("content", ""))
        for kw in _as_dict(person.get("keywords")).get("keyword") or []
    ]

    # Biography
    profile["bio"] = clean_text(_as_dict(person.get("biography")).get("content", ""))

    # Works - titles
    work_titles = []
    for group in _as_dict(activities.get("works")).get("group") or []:
        for summary in _as_dict(group).get("work-summary") or []:
            title_block = _as_dict(_as_dict(summary).get("title"))
            title = _as_dict(title_block.get("title")).get("value") or ""
            if title:
                work_titles.append(clean_text(title))
    profile["work_titles"] = work_titles
    profile["work_topics_keywords"] = extract_terms_from_titles(work_titles)

    # Employment and education share the same summary structure
    profile["employment"] = [
        _format_affiliation(summary)
        for summary in _affiliation_summaries(
            activities.get("employments"), "employment-summary"
        )
    ]
    profile["education"] = [
        _format_affiliation(summary)
        for summary in _affiliation_summaries(
            activities.get("educations"), "education-summary"
        )
    ]

    return profile



from pprint import pprint

# Fetch the public ORCID record of the two example users
orcid_id_1 = "0000-0001-6092-6831"
profilo1 = get_orcid_profile(orcid_id_1)

orcid_id_2 = "0000-0002-9809-1005"
profilo2 = get_orcid_profile(orcid_id_2)

# Keyword lists used as user profiles in later cells.
# The first assignment must read `profilo1`; the name `profilo` is never defined
# and raised NameError on a fresh kernel.
orcid_1_keywords = profilo1["keywords"]
orcid_2_keywords = profilo2["keywords"]

print("\nKeywords from ORCID profile 1:")
pprint(profilo1["keywords"])
print("\nKeywords from ORCID profile 2:")
pprint(profilo2["keywords"])


Keywords from ORCID profile 1:
['Database',
 'Data science',
 'Ethics in Data Management',
 'Recommender Systems',
 'Context-awareness',
 'Personalization']

Keywords from ORCID profile 2:
['diabetes, metabolism, pancreatic beta cell function, insulin-resistance']


In [44]:
## SIMPLER ORCID PROFILE

#import requests

#def get_orcid_keywords(orcid_id):
    #"""
    #Fetches keywords from the ORCID public API for a given user.
    #Returns a list of keywords.
    #"""
    #url = f"https://pub.orcid.org/v3.0/{orcid_id}/person"
    #headers = {"Accept": "application/json"}
    #try:
        #response = requests.get(url, headers=headers)
        #response.raise_for_status()
        #data = response.json()
        #keywords = data.get("keywords", {}).get("keyword", [])
        #return [kw["content"] for kw in keywords]
    #except Exception as e:
        #print(f"Error fetching ORCID keywords for {orcid_id}:", e)
        #return []

#orcid_id_1 = "0000-0001-6092-6831"
#orcid_id_2 = "0000-0002-9809-1005"

#orcid_1_keywords = get_orcid_keywords(orcid_id_1)
#orcid_2_keywords = get_orcid_keywords(orcid_id_2)

#print("ORCID 1 Keywords:", orcid_1_keywords)
#print("ORCID 2 Keywords:", orcid_2_keywords)



4) Define user query, visited POI, and build context

In [50]:
# The first ORCID keyword list acts as the user profile
user_profile = orcid_1_keywords

# Query the simulated user has just issued
current_query = "monuments in Verona"

# Treat one merged row as the POI being read (positional index 2; change it to try another)
sample = merged_df.iloc[2]
poi_name, poi_category, poi_description = (
    sample[column]
    for column in ("poi_name", "category_name", "descr_trad_value")
)

# Echo the input context
print("Current query:", current_query)
print("Visited POI:", poi_name, f"({poi_category})")
print("User profile (from ORCID):", user_profile)



Current query: monuments in Verona
Visited POI: Porta Leoni (Monumenti)
User profile (from ORCID): ['Database', 'Data science', 'Ethics in Data Management', 'Recommender Systems', 'Context-awareness', 'Personalization']


5) Build prompt for Gemini

In [51]:
# Assemble the LLM prompt from the current query, the visited POI (name, category,
# description) and the user's ORCID keywords.
# The template begins and ends with a newline, hence the .strip() when printing.
prompt = f"""
You are a smart assistant that suggests the next search query for a user.
The user just searched for: "{current_query}".
They are currently reading about: "{poi_name}", which is a {poi_category}.
Here is the description of the place:
"{poi_description}"

The user has a known interest in the following topics: {', '.join(user_profile)}.

Suggest the next query they might be interested in, considering the context and their interests.
Return only the query, without explanation.
"""

print(prompt.strip())



You are a smart assistant that suggests the next search query for a user.
The user just searched for: "monuments in Verona".
They are currently reading about: "Porta Leoni", which is a Monumenti.
Here is the description of the place:
"The Archeaeological Museum was founded in 1924 and hosts archaeological finds both from Verona and its surrounding areas (epigraphs, sculptures, mosaics, bronzes, etc.) as well as pieces from other collections.\n
About 600 works of art are exhibited in the museum and some 150 more can be seen displayed within the external cloister and the Theatre area whereas some thousands pieces are preserved in store-rooms.\n
The whole complex of buildings and cloisters of the former Jesuates convent is itself a monument worth visiting. Many finds, inscriptions and sarcophaguses are located in the Museum cloisters and in the Theatre external area"

The user has a known interest in the following topics: Database, Data science, Ethics in Data Management, Recommender Syst

6) Send the prompt to Gemini

In [52]:
from dotenv import load_dotenv
import os
import google.generativeai as genai

# Load the variables defined in .env
load_dotenv()
# Hand the Gemini API key read from the environment to the client library
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))


In [53]:
import google.generativeai as genai
import os

# Configure the client with the API key taken from the environment
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Instantiate the Gemini model used by the generation helper
model = genai.GenerativeModel(
    model_name="models/gemini-1.5-pro-latest",
)

def get_next_query_gemini(prompt):
    """Ask the module-level Gemini ``model`` to complete ``prompt``.

    Returns the generated text with surrounding whitespace stripped. If the
    request or the access to the response text raises, the error is printed
    and ``None`` is returned instead.
    """
    try:
        generated = model.generate_content(prompt).text
        return generated.strip()
    except Exception as err:
        print("Gemini API error:", err)
    return None


# Send the prompt to Gemini; suggested_query is None if the call failed
suggested_query = get_next_query_gemini(prompt)
print("Next suggested query (Gemini):", suggested_query)



Next suggested query (Gemini): "museum information retrieval systems"


7) Three scenarios: no profile, ORCID profile 1, ORCID profile 2

In [55]:
# Inputs shared by all three scenarios
current_query = "monuments in Verona"

# Use one row of the merged table (positional index 2) as the POI being read
poi_row = merged_df.iloc[2]
poi_name, poi_category, poi_description = (
    poi_row[column]
    for column in ("poi_name", "category_name", "descr_trad_value")
)

# 📦 Prompt construction function
def build_prompt(query, poi_name, category, description, user_keywords=None):
    """Build the Gemini input prompt from the query, the POI and an optional user profile.

    Args:
        query: The search query the user just issued.
        poi_name: Name of the POI the user is reading about.
        category: Category label of the POI.
        description: Free-text description of the POI.
        user_keywords: Optional interests of the user. Either an iterable of
            keywords or a single keyword string. ``None`` and blank entries
            are ignored; if nothing usable remains, the prompt states that
            the user has no known profile.

    Returns:
        The prompt as a single string.
    """
    # A bare string is one keyword, not an iterable of characters
    if isinstance(user_keywords, str):
        user_keywords = [user_keywords]

    # Skip None/blank entries so the prompt never contains "interested in: ." or ", ,"
    keywords = [
        str(keyword)
        for keyword in (user_keywords or [])
        if keyword is not None and str(keyword).strip()
    ]

    if keywords:
        profile_line = f"The user is interested in: {', '.join(keywords)}."
    else:
        profile_line = "The user has no known profile or preferences."

    return "\n".join(
        [
            "You are an assistant that suggests personalized next search queries.",
            f'The user just searched for: "{query}".',
            f'They are reading about: "{poi_name}" ({category}).',
            f'Description: "{description}"',
            "",  # blank line between the context and the profile sentence
            profile_line,
            "Suggest the next query they might be interested in. Only return the query.",
        ]
    )

# One prompt per scenario:
#   1 - user without ORCID profile, 2 - ORCID profile 1, 3 - ORCID profile 2
prompt_1, prompt_2, prompt_3 = (
    build_prompt(current_query, poi_name, poi_category, poi_description, keywords)
    for keywords in (None, orcid_1_keywords, orcid_2_keywords)
)

# Query Gemini once per prompt, in scenario order
result_1, result_2, result_3 = (
    get_next_query_gemini(scenario_prompt)
    for scenario_prompt in (prompt_1, prompt_2, prompt_3)
)

# Print the suggestion obtained in each scenario
print("\n--- SCENARIO 1 (no ORCID) ---\n", result_1)
print("\n--- SCENARIO 2 (ORCID 1) ---\n", result_2)
print("\n--- SCENARIO 3 (ORCID 2) ---\n", result_3)




--- SCENARIO 1 (no ORCID) ---
 Archaeological Museum Verona

--- SCENARIO 2 (ORCID 1) ---
 "dataset of roman monuments"

--- SCENARIO 3 (ORCID 2) ---
 roman ruins verona
