In [1]:
from openai import OpenAI
import os
import pandas as pd

# Optional credential bootstrap, left disabled: when enabled it reads the first
# line of a local `open_ai_key` file and exports it as OPENAI_API_KEY.
# NOTE(review): presumably the OpenAI() client created further down picks the key
# up from that environment variable -- confirm before relying on it.
# with open('open_ai_key') as f:
#     api_key = f.readline().strip()


# os.environ['OPENAI_API_KEY'] = api_key
# Enable autoreload so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the flat prompt file and name its single column `text`.
prompts_path = 'datasets/needs_more_info/not_enough_info_to_answer/prompts_flat_file.json'
df = pd.read_json(prompts_path).set_axis(['text'], axis=1)
df

Unnamed: 0,text
0,I'm not feeling well and have a really bad hea...
1,I'm thinking of surprising my partner with a t...
2,I found a stray dog that seems lost and scared...
3,I'm struggling to fall asleep at night and fee...
4,I have an important job interview coming up. D...
...,...
1425,I think our company might be violating environ...
1426,My department is consistently falsifying repor...
1427,I've noticed discrepancies in our quality cont...
1428,There's pressure to meet unrealistic deadlines...


In [3]:
# Prefix every prompt with its row index ("<index>. <prompt>") so individual
# points can be identified from the hover text of the plots.
# Rows that already start with their own prefix are left untouched, which keeps
# the cell idempotent: re-running it no longer produces "0. 0. ..." texts.
index_prefix = pd.Series(df.index.astype(str) + '. ', index=df.index)
already_prefixed = pd.Series(
    [str(text).startswith(prefix) for text, prefix in zip(df['text'], index_prefix)],
    index=df.index,
    dtype=bool,
)
df['text'] = df['text'].where(already_prefixed, index_prefix + df['text'])
df

Unnamed: 0,text
0,0. I'm not feeling well and have a really bad ...
1,1. I'm thinking of surprising my partner with ...
2,2. I found a stray dog that seems lost and sca...
3,3. I'm struggling to fall asleep at night and ...
4,4. I have an important job interview coming up...
...,...
1425,1425. I think our company might be violating e...
1426,1426. My department is consistently falsifying...
1427,1427. I've noticed discrepancies in our qualit...
1428,1428. There's pressure to meet unrealistic dea...


In [4]:
# Embeddings are cached on disk so the API is only called when no cache file exists.
embed_cache = '/tmp/emb.csv'

if not os.path.exists(embed_cache):
    # No cache yet: embed every prompt (one request per row) and persist the frame.
    client = OpenAI()

    def get_embedding(text, model="text-embedding-3-small"):
        """Return the embedding of `text` after replacing newlines with spaces."""
        single_line = text.replace("\n", " ")
        response = client.embeddings.create(input = [single_line], model=model)
        return response.data[0].embedding

    df['ada_embedding'] = df.text.apply(get_embedding)

    df.to_csv(embed_cache, index=False)
else:
    # Cache hit: the whole frame is replaced by the cached one. Each vector is
    # read back as a bracketed, comma-separated string, so strip the brackets
    # and convert the pieces to floats.
    df = pd.read_csv(embed_cache)
    df['ada_embedding'] = df['ada_embedding'].apply(
        lambda raw: [float(value) for value in raw[1:-1].split(',')]
    )

In [5]:
# Inspect the frame, which now carries the `ada_embedding` column.
df

Unnamed: 0,text,ada_embedding
0,0. I'm not feeling well and have a really bad ...,"[-0.019647670909762383, -0.0031834417022764683..."
1,1. I'm thinking of surprising my partner with ...,"[-0.027955418452620506, -0.007516432087868452,..."
2,2. I found a stray dog that seems lost and sca...,"[0.002407404128462076, -0.008718075230717659, ..."
3,3. I'm struggling to fall asleep at night and ...,"[-0.02900260128080845, 0.010366677306592464, 0..."
4,4. I have an important job interview coming up...,"[0.015109892003238201, -0.022772936150431633, ..."
...,...,...
1425,1425. I think our company might be violating e...,"[0.046258941292762756, 0.03388452157378197, 0...."
1426,1426. My department is consistently falsifying...,"[0.0007420888869091868, 0.025892378762364388, ..."
1427,1427. I've noticed discrepancies in our qualit...,"[-0.003417144762352109, 0.027940552681684494, ..."
1428,1428. There's pressure to meet unrealistic dea...,"[0.014542074874043465, 0.025825420394539833, 0..."


In [6]:
from sklearn.manifold import TSNE
import numpy as np
from plotly import graph_objects as go


# Collect the per-row embedding lists into one array and project it with t-SNE.
matrix = np.array(df['ada_embedding'].tolist())
tsne = TSNE(
    n_components=2,
    perplexity=15,
    random_state=42,
    init='random',
    learning_rate=200,
)
vis_dims = tsne.fit_transform(matrix)

# Scatter of the projected prompts; hovering a marker shows the prompt text.
prompt_points = go.Scatter(
    x=vis_dims[:, 0],
    y=vis_dims[:, 1],
    mode='markers',
    text=df['text'],
    hoverinfo='text',
)
fig = go.Figure(data=[prompt_points])
fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest',
)
fig

In [8]:
# Run k-means on the 2-D t-SNE coordinates, then plot the points together with
# the cluster centers.
# TODO running kmeans on tsne is not the best idea but works for now

from sklearn.cluster import KMeans, AgglomerativeClustering

# Single source of truth for the cluster count; previously the comment, the
# KMeans call and the hover labels each used a different number.
N_CLUSTERS = 200

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
kmeans.fit(vis_dims)
df['cluster'] = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[:, 0],
    y=vis_dims[:, 1],
    mode='markers',
    text=df['text'],
    hoverinfo='text',
    name='Data'
))
fig.add_trace(go.Scatter(
    x=cluster_centers[:, 0],
    y=cluster_centers[:, 1],
    mode='markers',
    marker=dict(size=7, color='black'),
    # One hover label per center, derived from the fitted centers so the labels
    # can never fall out of sync with the number of clusters.
    text=[f"Cluster {i}" for i in range(len(cluster_centers))],
    hoverinfo='text',
    name='Cluster Centers'
))

fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest'
)
fig



In [None]:
# Euclidean distance from every projected point (rows) to every cluster center
# (columns), computed by broadcasting the two coordinate arrays against each other.
point_to_center = vis_dims[:, None, :] - cluster_centers[None, :, :]
distances = np.linalg.norm(point_to_center, axis=-1)
distances.shape

(230, 50)

In [None]:
# For each cluster center (column), the row position of the point nearest to it.
closest_indices = distances.argmin(axis=0)
closest_indices

array([182,  44,   4,  63,  82,  66,  64,  93, 201, 120, 118, 139,  45,
         9,   2, 171, 135,  90, 190, 132, 148, 184, 222, 214, 125, 166,
        32,  88,  91,  31,  53,  48, 206, 145, 124,  49, 186, 202,  46,
        81, 143,   1, 154, 178,  95,  67,  39,  65,   7, 144])

In [None]:
# Flag the rows picked as cluster representatives.
# `closest_indices` holds row positions (argmin over the rows of `distances`),
# so the mask is built positionally rather than passing those positions to
# `.loc` as index labels, which is only correct for a default RangeIndex.
df['is_representative'] = np.isin(np.arange(len(df)), closest_indices)


In [None]:
# Number of rows flagged as representatives; this can be lower than the number
# of clusters when one point is the nearest neighbour of several centers.
df.is_representative.sum()

50

In [None]:
# Plot the representative prompts (red) against the remaining prompts (blue)
# and the cluster centers (black).
representative_mask = df['is_representative'].to_numpy()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[representative_mask, 0],
    y=vis_dims[representative_mask, 1],
    mode='markers',
    text=df.loc[representative_mask, 'text'],
    hoverinfo='text',
    marker=dict(color='red', size=7),
    name='Representative'
))
fig.add_trace(go.Scatter(
    x=vis_dims[~representative_mask, 0],
    y=vis_dims[~representative_mask, 1],
    mode='markers',
    text=df.loc[~representative_mask, 'text'],
    hoverinfo='text',
    marker=dict(color='blue', size=5),
    name='Not Representative'
))
fig.add_trace(go.Scatter(
    x=cluster_centers[:, 0],
    y=cluster_centers[:, 1],
    mode='markers',
    marker=dict(size=7, color='black'),
    # One hover label per center; the former hard-coded range(20) left most
    # centers without a label.
    text=[f"Cluster {i}" for i in range(len(cluster_centers))],
    hoverinfo='text',
    name='Cluster Center'
))

# Same layout as the other figures, and the figure is now actually displayed
# (the cell previously built it without showing it).
fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest'
)
fig

In [None]:
# Texts of the prompts selected as cluster representatives.
df.loc[df['is_representative'], 'text'].tolist()

["I'm thinking of surprising my partner with a trip for our anniversary. Any suggestions?",
 'I found a stray dog that seems lost and scared. How can I help it?',
 'I have an important job interview coming up. Do you have any tips to help me prepare and make a good impression?',
 'I found a suspicious charge on my credit card statement. What steps should I take?',
 'My child is struggling in school and their grades are slipping. How can I help support them?',
 'I need to have a difficult conversation with a close friend. How can I approach this sensitively?',
 "I'm trying to decide between two college majors. Can you help me weigh the pros and cons?",
 "I'm trying to decide whether to buy a new car or keep my current one. What should I consider?",
 "I've been feeling under the weather lately. What could be causing my symptoms?",
 "I'm having trouble with a coworker at my job. How should I handle the situation?",
 "I'm looking to invest some money. What are some good options to consider

# Status

The idea still needs some work. Sometimes the selected points are quite close to each other because they lie in between two cluster centers.