In [27]:
from openai import OpenAI
import os
import pandas as pd
import json

# with open('open_ai_key') as f:
#     api_key = f.readline().strip()

# Read the OpenAI key from a local JSON secrets file and expose it through the
# environment variable of the same name.
KEYS = 'keys.json'
with open(KEYS) as f:
    secrets = json.load(f)
api_key = secrets['OPENAI_API_KEY']

os.environ['OPENAI_API_KEY'] = api_key
# IPython autoreload in mode 2: re-import modules before each cell execution so
# edits to imported source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Load the raw generated prompts and name the frame's single column `text`.
df = pd.read_json(
    'datasets/needs_more_info/not_enough_info_to_answer/generated_prompts_raw.json'
).set_axis(['text'], axis=1)
df

Unnamed: 0,text
0,I'm in a legal dispute with my neighbor over p...
1,I want to start a small business from home. Wh...
2,How can I improve the energy efficiency of my ...
3,What's the most effective way to market my new...
4,I'm planning a trip to Europe. What are the mu...
...,...
1995,I'm considering adopting a pet. Which type of ...
1996,My child is having difficulties at school. How...
1997,I'm looking to invest some money. What options...
1998,I need to give a presentation at work next wee...


In [29]:
# Prefix every prompt with its row label, e.g. "0. <prompt>".
# NOTE: this overwrites `text` in place, so re-running the cell prefixes again.
index_prefix = df.index.astype(str) + '. '
df['text'] = index_prefix + df['text']
df

Unnamed: 0,text
0,0. I'm in a legal dispute with my neighbor ove...
1,1. I want to start a small business from home....
2,2. How can I improve the energy efficiency of ...
3,3. What's the most effective way to market my ...
4,4. I'm planning a trip to Europe. What are the...
...,...
1995,1995. I'm considering adopting a pet. Which ty...
1996,1996. My child is having difficulties at schoo...
1997,1997. I'm looking to invest some money. What o...
1998,1998. I need to give a presentation at work ne...


In [30]:
# Location of the on-disk embedding cache. When this file exists the whole
# frame (text + embeddings) is taken from it and no API request is made.
embed_cache = '/tmp/emb.csv'

if os.path.exists(embed_cache):
    # Cache hit: the CSV stores each embedding as the text of a Python list
    # ("[0.1, -0.2, ...]"), so strip the brackets and parse the numbers back.
    df = pd.read_csv(embed_cache)
    df['ada_embedding'] = df['ada_embedding'].apply(lambda x: list(map(float, x[1:-1].split(','))))
else:
    client = OpenAI()

    def get_embedding(text, model="text-embedding-3-small"):
        """Return the embedding vector for a single text (newlines replaced by spaces)."""
        text = text.replace("\n", " ")
        return client.embeddings.create(input = [text], model=model).data[0].embedding

    def get_embeddings(texts, model="text-embedding-3-small", batch_size=256):
        """Embed many texts using one API request per batch instead of one per text.

        Args:
            texts: list of strings to embed.
            model: embedding model name passed to the API.
            batch_size: number of texts sent in each request.

        Returns:
            A list of embedding vectors in the same order as `texts`.
        """
        cleaned = [t.replace("\n", " ") for t in texts]
        vectors = []
        for start in range(0, len(cleaned), batch_size):
            response = client.embeddings.create(
                input=cleaned[start:start + batch_size], model=model
            )
            # Order by each item's `index` so vectors line up with the inputs.
            ordered = sorted(response.data, key=lambda item: item.index)
            vectors.extend(item.embedding for item in ordered)
        return vectors

    # Wrap in a Series so each row receives its own list-valued cell.
    df['ada_embedding'] = pd.Series(get_embeddings(df['text'].to_list()), index=df.index)

    # Persist text and embeddings so later runs skip the API calls.
    df.to_csv(embed_cache, index=False)

In [31]:
# Display the frame after the embedding step (it now carries `ada_embedding`).
df

Unnamed: 0,text,ada_embedding
0,0. I'm in a legal dispute with my neighbor ove...,"[-0.008853226900100708, 0.039722565561532974, ..."
1,1. I want to start a small business from home....,"[0.0034385218750685453, 0.015609797090291977, ..."
2,2. How can I improve the energy efficiency of ...,"[0.0051169986836612225, 0.01821347512304783, 0..."
3,3. What's the most effective way to market my ...,"[0.032402411103248596, 0.0036202692426741123, ..."
4,4. I'm planning a trip to Europe. What are the...,"[-0.020973818376660347, -0.013934730552136898,..."
...,...,...
1995,1995. I'm considering adopting a pet. Which ty...,"[0.05384993180632591, 0.0025528455153107643, 0..."
1996,1996. My child is having difficulties at schoo...,"[0.018268849700689316, -0.00016675358347129077..."
1997,1997. I'm looking to invest some money. What o...,"[-0.032239895313978195, -0.0034639977384358644..."
1998,1998. I need to give a presentation at work ne...,"[-0.007782508619129658, 0.017104871571063995, ..."


In [32]:
from sklearn.manifold import TSNE
import numpy as np
from plotly import graph_objects as go


# Collect the per-row embedding lists into one array and project it with t-SNE.
matrix = np.array(df['ada_embedding'].tolist())
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)

# One marker per prompt; hovering shows the prompt text.
prompt_points = go.Scatter(
    x=vis_dims[:, 0],
    y=vis_dims[:, 1],
    mode='markers',
    text=df['text'],
    hoverinfo='text',
)
fig = go.Figure(data=[prompt_points])
fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest',
)
fig

In [33]:
# Run k-means on the t-SNE coordinates (500 clusters), store each row's cluster
# label in `df['cluster']`, and plot the points together with the cluster centers.
# TODO running kmeans on tsne is not the best idea but works for now

from sklearn.cluster import KMeans, AgglomerativeClustering
kmeans = KMeans(n_clusters=500, random_state=42)
kmeans.fit(vis_dims)
df['cluster'] = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[:, 0], 
    y=vis_dims[:, 1], 
    mode='markers',
    text=df['text'],
    hoverinfo='text',
    name='Data'
))
fig.add_trace(go.Scatter(
    x=cluster_centers[:, 0], 
    y=cluster_centers[:, 1], 
    mode='markers',
    marker=dict(size=7, color='black'),
    # One hover label per center; a fixed count would leave most centers unlabeled.
    text=[f"Cluster {i}" for i in range(len(cluster_centers))],
    hoverinfo='text',
    name='Cluster Centers'
))

fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest' 
)
fig



In [34]:
# Euclidean distance from every point to every cluster center via broadcasting:
# rows index points, columns index cluster centers.
offsets = vis_dims[:, None, :] - cluster_centers[None, :, :]
distances = np.linalg.norm(offsets, axis=-1)
distances.shape

(2000, 500)

In [35]:
# For each cluster center (column), the row position of the nearest point.
closest_indices = distances.argmin(axis=0)
closest_indices

array([ 749, 1317,   32,  909, 1218,  863, 1391, 1958, 1608,  289, 1641,
        324,  991,  650,  445,  158, 1794,  255, 1434, 1057, 1540,  129,
       1410,  193, 1796, 1070,  229, 1138,  605, 1560, 1508,  729, 1168,
        281, 1419, 1387,  684, 1199,  517, 1612, 1762,  273,  709,  394,
       1507,  795, 1611,  170,  270, 1530,  225, 1688, 1236, 1363,  698,
        314,  330,  381,  214, 1861, 1286, 1892,  398,  175,  850,  525,
       1601, 1117,  560, 1187,  997,  451,  385,  756,  287, 1518,  593,
       1552,  886, 1476,  181, 1046, 1044, 1190,  335,  348, 1379,  687,
       1769, 1746, 1302,  437, 1659, 1590, 1817, 1807,  690,  998, 1188,
       1149,   41, 1253, 1584,  103, 1574,  817, 1710,  212,   88, 1834,
        548,  242,  471, 1463,  300,  294, 1808,  790,  260,  632,   79,
        645, 1877,  110,  497,  936, 1339, 1737,  376, 1748,  714, 1257,
       1551,  992,  603,  694,  977, 1388, 1570, 1510,  358,  771, 1426,
       1004, 1672,   53,  732,  668,  182, 1141, 16

In [36]:
# Flag the rows picked as cluster representatives.
# `closest_indices` holds row *positions* (it comes from argmin over an array),
# so build the mask positionally instead of treating the values as index labels.
representative_mask = np.zeros(len(df), dtype=bool)
representative_mask[closest_indices] = True
df['is_representative'] = representative_mask


In [37]:
# Number of rows flagged as representative.
df.is_representative.sum()

500

In [38]:
# Plot representatives (red), remaining points (blue) and cluster centers (black).
rep_mask = df['is_representative'].to_numpy(dtype=bool)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[rep_mask, 0], 
    y=vis_dims[rep_mask, 1], 
    mode='markers',
    text=df.loc[rep_mask, 'text'],
    hoverinfo='text',
    marker=dict(color='red', size=7),
    name='Representative'
))
fig.add_trace(go.Scatter(
    x=vis_dims[~rep_mask, 0], 
    y=vis_dims[~rep_mask, 1], 
    mode='markers',
    text=df.loc[~rep_mask, 'text'],
    hoverinfo='text',
    marker=dict(color='blue', size=5),
    name='Not Representative'
))
fig.add_trace(go.Scatter(
    x=cluster_centers[:, 0], 
    y=cluster_centers[:, 1], 
    mode='markers',
    marker=dict(size=7, color='black'),
    # One hover label per center; a fixed count would leave most centers unlabeled.
    text=[f"Cluster {i}" for i in range(len(cluster_centers))], 
    hoverinfo='text',
    name='Cluster Center'
))

# Same layout as the other t-SNE figures in this notebook.
fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest'
)
fig

In [39]:
# Texts of the rows flagged as representative, as a plain Python list.
df.loc[df['is_representative'], 'text'].to_list()

['2. How can I improve the energy efficiency of my house?',
 '13. My car is making a strange noise. What could be the problem?',
 "22. I'm considering adopting a dog. Which breed would be best for me?",
 '23. My child is struggling in school. How can I help them improve?',
 '32. I want to start a vegetable garden. What are the best plants to grow?',
 '41. My car is making a strange noise. Should I take it to a mechanic?',
 "50. My neighbor's dog keeps barking at night. What's the best way to handle this situation?",
 "51. I'm planning a surprise party for my friend. What should I do?",
 "53. There's a strange odor coming from somewhere in my house. What could be causing it?",
 "58. I've been asked to give a speech at an important event. How should I prepare?",
 '65. My child is struggling in school. How can I help them improve their performance?',
 '71. My coworker seems uncomfortable with my disability. How should I address this?',
 "74. I'm struggling with my daily tasks due to my co

In [None]:
# Texts of the rows flagged as representative (same listing as the preceding cell).
df.loc[df['is_representative'], 'text'].to_list()

# Status

The idea still needs some work. Sometimes the selected points are quite close to each other because they lie in between two cluster centers.