In [None]:
!pip install --upgrade pandas==2.1.4

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Column labels applied to the CSV. The trailing "a" and "b" labels cover two
# extra columns that are dropped immediately after loading.
column_names = [
    "Experience Name",
    "Description",
    "IntendedAge",
    "ContentDifficulty",
    "Topics",
    "GS Assistance",
    "Popularity",
    "Visitor Duration",
    "Copy",
    "More Copy",
    "a",
    "b",
]
data_df = pd.read_csv(
    "./Exhibit_Master_List(Data).csv",
    encoding="unicode_escape",
    names=column_names,
).drop(columns=["a", "b"])
data_df.head()

In [None]:
# Discard the first four rows, then renumber the remaining rows from 0.
data_df = data_df.iloc[4:].reset_index(drop=True)

In [None]:
# Display the frame as it stands after the leading rows were removed and the index reset.
data_df

In [None]:
# Collect every distinct topic label that appears in the "Topics" column.
# Each cell is treated as a ", "-separated list. A missing cell turns into the
# string "nan" once passed through str(), so "nan" and empty fragments are skipped.
unique_topics = set()
for raw_topics in data_df["Topics"]:
    for label in str(raw_topics).split(", "):
        if label not in ("nan", ""):
            unique_topics.add(label)

# Sort so the list order is identical on every run; the iteration order of a
# set of strings can differ from one interpreter session to the next.
topic_set = sorted(unique_topics)
topic_set

In [None]:
# Load the sentence-embedding model used to embed every topic and meta topic
# (all-MiniLM-L12-v2 from sentence-transformers).
import torch
from sentence_transformers import SentenceTransformer

model_name = "sentence-transformers/all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

In [None]:
# Fixed list of meta topics. Each raw topic label is later assigned to exactly
# one of these, and the position in this list determines the "topic_<n>" id.
META_TOPICS =  [
    "Science and Technology",
    "Nature and Environment",
    "Transportation and Engineering",
    "History and Archaeology",
    "Art and Music",
    "Geography and Places",
    "Physics and Mechanics",
    "Energy and Sustainability",
    "Interactive Activities"
]

# Emoji per meta topic; written to the "icon" field of the topic records.
# Keys must match the entries of META_TOPICS exactly.
META_TOPIC_EMOJIS = {
    "Science and Technology": "🔬",
    "Nature and Environment": "🌿",
    "Transportation and Engineering": "🚗",
    "History and Archaeology": "🏺",
    "Art and Music": "🎨",
    "Geography and Places": "🌍",
    "Physics and Mechanics": "⚙️",
    "Energy and Sustainability": "🌱",
    "Interactive Activities": "🎮"
}

# Hex colour per meta topic; written to the "color" field of the topic records.
# Keys must match the entries of META_TOPICS exactly.
META_TOPIC_COLORS = {
    "Science and Technology": "#FFD700",
    "Nature and Environment": "#228B22",
    "Transportation and Engineering": "#FF4500",
    "History and Archaeology": "#8A2BE2",
    "Art and Music": "#FF1493",
    "Geography and Places": "#1E90FF",
    "Physics and Mechanics": "#FF8C00",
    "Energy and Sustainability": "#32CD32",
    "Interactive Activities": "#FF69B4"
}

# Embed the meta-topic labels and the raw topic labels with the same model.
meta_topic_embeddings = model.encode(META_TOPICS)
topic_embeddings = model.encode(topic_set)

# Assign every topic to the meta topic whose embedding is nearest in
# Euclidean distance, and record the mapping in both directions.
topic_to_meta_topic = {}
meta_topic_to_topic = {name: [] for name in META_TOPICS}

for topic, topic_vec in zip(topic_set, topic_embeddings):
    best_name, best_distance = None, float("inf")
    for name, meta_vec in zip(META_TOPICS, meta_topic_embeddings):
        gap = np.linalg.norm(topic_vec - meta_vec)
        # Strict comparison: on a tie the earlier meta topic wins.
        if gap < best_distance:
            best_name, best_distance = name, gap

    topic_to_meta_topic[topic] = best_name
    meta_topic_to_topic[best_name].append(topic)

In [None]:
# Report how many topics were assigned to each meta topic
# (the dict was built in META_TOPICS order, so the listing follows that order).
for name, members in meta_topic_to_topic.items():
    print(name, len(members))

In [None]:
# Inspect which topics were assigned to the "Interactive Activities" meta topic.
meta_topic_to_topic["Interactive Activities"]

In [None]:
# One record per meta topic; the id is "topic_" followed by the position of
# the meta topic in META_TOPICS.
topic_objs = [
    {
        "id": "topic_" + str(position),
        "label": label,
        "icon": META_TOPIC_EMOJIS[label],
        "color": META_TOPIC_COLORS[label],
    }
    for position, label in enumerate(META_TOPICS)
]

topic_objs[:5]

In [None]:
import json

# Position of each meta topic inside META_TOPICS; its id is "topic_<position>".
meta_topic_position = {name: j for j, name in enumerate(META_TOPICS)}


def _none_if_missing(value):
    """Return None for a missing (NaN/None) cell, otherwise the value unchanged.

    Without this, json.dump/json.dumps emit the bare token NaN for missing
    cells, which strict JSON parsers reject.
    """
    return None if pd.isna(value) else value


# exhibit_topic_objs: one {"exhibit_id", "topic_id", "relevance"} record per
#                     (exhibit, meta topic) association.
# exhibit_objs:       one record per exhibit row.
exhibit_topic_objs = []
exhibit_objs = []
for i, row in data_df.iterrows():
    exhibit_id = "exhibit_" + str(i)

    # ", "-separated topic labels; a missing cell becomes "nan" via str() and
    # is skipped together with empty fragments.
    topics = [t for t in str(row["Topics"]).split(", ") if t not in ("nan", "")]

    # Count how many of the exhibit's topics fall under each meta topic and
    # keep one meta-topic id per raw topic (so ids may repeat in the list).
    counts = np.zeros(len(META_TOPICS))
    meta_topics = []
    for topic in topics:
        j = meta_topic_position[topic_to_meta_topic[topic]]
        counts[j] += 1
        meta_topics.append("topic_" + str(j))

    peak = counts.max()
    if peak > 0:
        # Scale so the most frequent meta topic gets relevance 1.0. The
        # division happens only in this branch, which avoids the 0/0
        # RuntimeWarning for exhibits that list no topics.
        for j, count in enumerate(counts):
            if count > 0:
                exhibit_topic_objs.append({
                    "exhibit_id": exhibit_id,
                    "topic_id": "topic_" + str(j),
                    "relevance": float(count / peak),
                })
    else:
        # No topics listed: create a weak association with every meta topic.
        for j in range(len(META_TOPICS)):
            exhibit_topic_objs.append({
                "exhibit_id": exhibit_id,
                "topic_id": "topic_" + str(j),
                "relevance": 0.1,
            })

    # Join only the copy fields that are present, so a missing cell does not
    # end up as the literal text "nan".
    copy_parts = [
        str(row[column])
        for column in ("Copy", "More Copy")
        if not pd.isna(row[column])
    ]

    details = {
        "text": "\n".join(copy_parts).strip(),
        "intended_age": _none_if_missing(row["IntendedAge"]),
        "difficulty": _none_if_missing(row["ContentDifficulty"]),
        "gs_assistance": _none_if_missing(row["GS Assistance"]),
        "popularity": _none_if_missing(row["Popularity"]),
        "visitor_duration": _none_if_missing(row["Visitor Duration"]),
    }

    exhibit_objs.append({
        "id": exhibit_id,
        "title": _none_if_missing(row["Experience Name"]),
        "description": _none_if_missing(row["Description"]),
        "image": "N/A",
        "topic_id": meta_topics,
        # Stored as a JSON-encoded string rather than as a nested object.
        "details": json.dumps(details),
    })



In [None]:
# Preview the first exhibit record.
exhibit_objs[:1]

In [None]:
# Preview the first 15 exhibit-to-topic association records.
exhibit_topic_objs[:15]

In [None]:
# Persist each collection to its own JSON file in the working directory.
for filename, payload in (
    ("topics.json", topic_objs),
    ("exhibits.json", exhibit_objs),
    ("exhibit_topics.json", exhibit_topic_objs),
):
    with open(filename, "w") as out_file:
        json.dump(payload, out_file)

In [None]:
# # import networkx
# import networkx as nx

# filtered_exhibit_topic_objs = [edge for edge in exhibit_topic_objs if edge["relevance"] > 0.1]
# filtered_exhibit_ids = set([edge["exhibit_id"] for edge in filtered_exhibit_topic_objs])
# filtered_exhibits = [exhibit for exhibit in exhibit_objs if exhibit["id"] in filtered_exhibit_ids]

# G = nx.Graph()

# for topic in topic_objs:
#     G.add_node(topic["id"], label=topic["label"], icon=topic["icon"], color=topic["color"], type="topic")

# for exhibit in filtered_exhibits:
#     G.add_node(exhibit["id"], label=exhibit["title"], description=exhibit["description"], image=exhibit["image"], details=exhibit["details"], type="exhibit")

# for edge in filtered_exhibit_topic_objs:
#     G.add_edge(edge["exhibit_id"], edge["topic_id"], relevance=edge["relevance"])

# nx.write_gexf(G, "exhibit_topics.gexf")

In [None]:
# Preview the first five meta-topic records.
topic_objs[:5]

In [None]:
# class Topic(Base):
#     __tablename__ = "topics"
   
#     id = Column(String, primary_key=True)
#     label = Column(String, nullable=False)
#     icon = Column(String, nullable=False)
#     color = Column(String, nullable=False)
 
# class TopicExhibitRelation(Base):
#     __tablename__ = "topic_exhibit_relations"
#     __table_args__ = (
#         PrimaryKeyConstraint("topic_id", "exhibit_id"),
#     )
#     topic_id = Column(String, ForeignKey("topics.id"), nullable=False)
#     exhibit_id = Column(String, ForeignKey("exhibits.id"), nullable=False)
#     strength = Column(Float, nullable=False)

# class Exhibit(Base):
#     __tablename__ = "exhibits"
   
#     id = Column(String, primary_key=True)
#     title = Column(String, nullable=False)
#     description = Column(String, nullable=False)
#     image = Column(String, nullable=False)
#     details = Column(JSON, nullable=True)