In [None]:
!pip install transformers datasets
!pip install networkx matplotlib
import networkx as nx
import matplotlib.pyplot as plt
!apt install graphviz libgraphviz-dev
!pip install pygraphviz

from sklearn.datasets import fetch_20newsgroups
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
!pip install faiss-cpu
import faiss
import numpy as np

!pip install plotly umap-learn
import umap
import plotly.express as px

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

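The code below sets up the vector database for the 20 Newsgroups dataset: it loads the posts, embeds them with a sentence-transformer model, and indexes the embeddings in FAISS.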

In [None]:
# Load the 20 Newsgroups corpus (subset="all") and keep the post texts for embedding.
newsgroups = fetch_20newsgroups(subset="all")
documents = newsgroups.data

In [None]:
# Vectorization for search/indexing in a retrieval system (RAG).
# Encoding the whole corpus is much faster on a GPU runtime, but the device is
# chosen at run time so the cell falls back to CPU instead of raising
# "Found no NVIDIA driver on your system" when no CUDA device is available.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# One embedding row per document, in the same order as `documents`.
vectors = model.encode(
    documents,
    batch_size=64,
    show_progress_bar=True,
    device=device
)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# Set up the FAISS vector index.
# The embeddings are L2-normalised first so that the inner product used by
# IndexFlatIP ranks results by cosine similarity.
embedding_dim = vectors.shape[1]
faiss.normalize_L2(vectors)
index = faiss.IndexFlatIP(embedding_dim)
index.add(vectors)

In [None]:
# Vector search: embed a sample query and print the 5 closest posts.
query = "Internet history"

# The index holds unit-length vectors, so the query is normalised as well; the
# scores in D are then cosine similarities, and the embedding matches what the
# Dash RAG callback does. FAISS expects float32 input.
query_vector = model.encode([query], normalize_embeddings=True).astype("float32")
D, I = index.search(query_vector, k=5)

# Show top results
for i in I[0]:
    if i < 0:
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # without this guard documents[-1] would silently print the last post.
        continue
    print(documents[i][:500])  # Preview first 500 chars of each match
    print("------")

From: ld231782@longs.lance.colostate.edu (L. Detweiler)
Subject: Privacy & Anonymity on the Internet FAQ (2 of 3)
Supersedes: <net-privacy/part2_733153240@GZA.COM>
Organization: TMP Enterprises
Lines: 1543
Expires: 21 May 1993 04:00:06 GMT
Reply-To: ld231782@longs.lance.colostate.edu
NNTP-Posting-Host: pad-thai.aktis.com
Summary: Email and account privacy, anonymity, file encryption, 
 academic computer policies, relevant legislation and references, 
 EFF, and other privacy and rights issues ass
------
From: ld231782@longs.lance.colostate.edu (L. Detweiler)
Subject: Privacy & Anonymity on the Internet FAQ (3 of 3)
Supersedes: <net-privacy/part3_733153240@GZA.COM>
Organization: TMP Enterprises
Lines: 1201
Expires: 21 May 1993 04:00:06 GMT
Reply-To: ld231782@longs.lance.colostate.edu
NNTP-Posting-Host: pad-thai.aktis.com
Summary: Notes on the use, history, and value of anonymous Usenet
 posting and email remailing services
X-Last-Updated: 1993/03/04

Archive-name: net-privacy/part3
Last-

In [None]:
# Reduce the embeddings to 2D for plotting.
# UMAP is stochastic: without a fixed seed the layout changes on every run, so
# the map could not be reproduced with Restart & Run All.
UMAP_SEED = 42
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=UMAP_SEED,
)
embedding = reducer.fit_transform(vectors)

# Colour by category: map each numeric target to its newsgroup name.
target_labels = [newsgroups.target_names[i] for i in newsgroups.target]

# Hover text: first 200 chars of each post, flattened onto one line.
hover_texts = [doc[:200].replace("\n", " ") for doc in documents]

fig = px.scatter(
    x=embedding[:, 0],
    y=embedding[:, 1],
    color=target_labels,
    hover_name=target_labels,
    hover_data={'Text': hover_texts},
    labels={'x': 'UMAP-1', 'y': 'UMAP-2'},
    title="Newsgroup Messages in Latent Space"
)


# Small, semi-transparent markers keep dense clusters readable.
fig.update_traces(marker=dict(size=5, opacity=0.6))
fig.show()



The code below builds a web app, served through an ngrok tunnel, that combines RAG search backed by an LLM with an interactive map of the document vectors.

In [None]:
# This program requires a Princeton AI Sandbox API key stored in Google Colab secrets (secret name: 'sandbox_api').
# If you use a regular OpenAI API key instead, the client below has to be changed.
# When this cell runs, click the ngrok link it prints to open the app.
# The site stays up as long as the tunnel is open, which is usually around an hour.
# See our slide videos for usage examples.

!pip install dash
!pip install jupyter-dash
!pip install dash pyngrok

from dash import Dash, html, dcc, Input, Output, State
import pandas as pd
from pyngrok import ngrok
import requests
from openai import AzureOpenAI
from google.colab import userdata


# Read the sandbox API key from the Colab secret 'sandbox_api' and stop early
# when it is empty.
sandbox_key = userdata.get('sandbox_api')
if sandbox_key:
    print('API key found')
else:
    raise ValueError('API key not found')

# Azure-style OpenAI client pointed at the Princeton sandbox endpoint.
client = AzureOpenAI(
    azure_endpoint="https://api-ai-sandbox.princeton.edu/",
    api_version="2024-02-01",
    api_key=sandbox_key,
)

# Newsgroup name for every post, plus a one-line 200-character preview.
labels = [newsgroups.target_names[target_idx] for target_idx in newsgroups.target]
hover_texts = [text[:200].replace('\n', ' ') for text in documents]

# Plotting frame: one row per post with its 2-D coordinates, label, preview
# and full text.
df = pd.DataFrame(dict(
    x=embedding[:, 0],
    y=embedding[:, 1],
    label=labels,
    preview=hover_texts,
    full_text=documents,
))
# Explicit row id so a clicked point can be mapped back to its post.
df['id'] = df.index

# UMAP scatter plot used by the Dash app, coloured by newsgroup.
# Keep 'id' as the second hover_data entry: the click callback reads the row id
# from customdata[1].
fig = px.scatter(
    df,
    x='x',
    y='y',
    color='label',
    hover_data={'preview': True, 'id': True},
    title='Newsgroups Latent Space',
    labels={'x': 'UMAP-1', 'y': 'UMAP-2'},
)
fig.update_traces(marker={'size': 5, 'opacity': 0.6})

# Dash app and page layout, top to bottom: title, scatter plot, panel for the
# clicked message, question input with submit button, and the answer panel.
app = Dash(__name__)
app.layout = html.Div(children=[
    html.H1("Explore Newsgroup Messages"),

    dcc.Graph(id='scatter-plot', figure=fig, style={'height': '80vh'}),

    # Scrollable box filled by the click callback.
    html.Div(
        id='clicked-text',
        style={
            'whiteSpace': 'pre-wrap',
            'border': '1px solid #ccc',
            'padding': '1em',
            'marginTop': '1em',
            'maxHeight': '300px',
            'overflowY': 'auto',
        },
    ),

    html.H2("Ask a Question About the Newsgroups"),
    dcc.Input(
        id='query-input',
        type='text',
        placeholder='Type your question...',
        debounce=True,
        style={'width': '60%'},
    ),
    html.Button('Submit', id='submit-btn'),

    # Box filled by the RAG callback with the answer and retrieved sources.
    html.Div(
        id='rag-response',
        style={
            'whiteSpace': 'pre-wrap',
            'border': '1px solid #aaa',
            'padding': '1em',
            'marginTop': '1em',
            'backgroundColor': '#f8f8f8',
        },
    ),
])

# Callback: show the full text of the scatter point that was clicked.
@app.callback(
    Output('clicked-text', 'children'),
    Input('scatter-plot', 'clickData'),
)
def display_text(clickData):
    """Return the full message for the clicked point.

    Args:
        clickData: Click payload of the 'scatter-plot' graph, or None when
            nothing has been clicked yet.

    Returns:
        The 'full_text' value of the clicked row in `df`, or a short prompt
        string when `clickData` is None.
    """
    if clickData is not None:
        clicked_point = clickData['points'][0]
        # The figure's hover_data lists 'preview' then 'id', so the row id is
        # read from the second customdata entry.
        row_id = clicked_point['customdata'][1]
        return df.loc[row_id, 'full_text']
    return "Click on a point to see the full message."

# Callback to query the RAG sandbox API

# Upper bound on how many characters of each retrieved post are put into the
# prompt. Some posts run to well over a thousand lines; sending five of them
# in full can exceed the model's context window and needlessly inflates cost.
MAX_CONTEXT_CHARS_PER_DOC = 6000


@app.callback(
    Output('rag-response', 'children'),
    Input('submit-btn', 'n_clicks'),
    State('query-input', 'value'),
    prevent_initial_call=True
)
def query_rag_api(n_clicks, query):
    """Answer a user question from the five most similar newsgroup posts.

    The query is embedded with the same sentence-transformer model as the
    corpus, the 5 nearest posts are retrieved from the FAISS index, and they
    are sent to the chat model in a prompt that tells it to answer only from
    those posts.

    Args:
        n_clicks: Click count of the submit button. Unused; it only triggers
            the callback.
        query: Current text of the question input (None or empty when the
            user has typed nothing).

    Returns:
        A string with the model's answer followed by 300-character snippets of
        the retrieved posts, a prompt to enter a question when the query is
        blank, or an error message when the API call fails.
    """
    # Treat whitespace-only input like an empty question.
    if not query or not query.strip():
        return "Please enter a question."

    # Embed query (normalised float32, matching the unit vectors in the index)
    query_vec = model.encode([query], normalize_embeddings=True).astype("float32")
    _, I = index.search(query_vec, 5)
    # FAISS pads with -1 when fewer than k neighbours exist; skip those ids so
    # documents[-1] is never picked up by accident.
    top_docs = [documents[i] for i in I[0] if i >= 0]

    # Compose strict grounded prompt from length-capped posts
    context = "\n\n".join(doc[:MAX_CONTEXT_CHARS_PER_DOC] for doc in top_docs)

    prompt = (
        "You are an assistant that answers ONLY based on the provided Usenet posts.\n\n"
        "Use the information from the posts as much as possible to answer the user's question.\n"
        "If the posts are completely unrelated to the question, reply with 'Insufficient information retrieved.'\n"
        "Do not use your own outside knowledge unless making a minor and reasonable inference.\n\n"
        f"Usenet Posts:\n{context}\n\n"
        f"User Question: {query}\n\n"
        "Answer:"
    )

    # Call Sandbox API via AzureOpenAI client
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4,
            max_tokens=300
        )
        answer = response.choices[0].message.content

        # Format and return both answer + sources
        source_snippets = "\n\n".join([
            f"Source {i+1}:\n{doc[:300]}...\n---" for i, doc in enumerate(top_docs)
        ])

        return f"**Answer:**\n{answer}\n\n**Retrieved Messages:**\n{source_snippets}"

    except Exception as e:
        # Deliberately broad: any API failure is shown in the page instead of
        # surfacing as an unhandled callback error.
        return f"Sandbox API error: {str(e)}"


# The ngrok auth token is read from a Colab secret named 'ngrok_token' instead
# of being hard-coded: a token embedded in a shared notebook lets anyone open
# tunnels on the owner's account. Any token that was ever stored in plain text
# in a notebook should be revoked in the ngrok dashboard and replaced.
ngrok_token = userdata.get('ngrok_token')
if not ngrok_token:
    raise ValueError('ngrok auth token not found')
ngrok.set_auth_token(ngrok_token)

# Close existing tunnels to avoid ERR_NGROK_324
for tunnel in ngrok.get_tunnels():
    ngrok.disconnect(tunnel.public_url)

# Expose port 8050
public_url = ngrok.connect(8050)
print(f"Dash app available at: {public_url}")

# Launch Dash app on the tunnelled port.
# Debug mode is off because the app is reachable from the public internet
# through the tunnel, and debug tooling should not be exposed there.
app.run(debug=False, port=8050)


Collecting dash
  Downloading dash-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-3.0.4-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading flask-3.0.3-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading werkzeug-3.0.6-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: Werkzeug, retry

<IPython.core.display.Javascript object>