In [36]:
from openai import OpenAI
import os
import pandas as pd
import json

# with open('open_ai_key') as f:
#     api_key = f.readline().strip()

KEYS = 'keys.json'
with open(KEYS) as f:
    api_key = json.load(f)['OPENAI_API_KEY']

os.environ['OPENAI_API_KEY'] = api_key
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# df = pd.read_json('datasets/needs_more_info/not_enough_info_to_answer/generated_prompts_raw_openai4o.json')
# df12 = pd.read_json('datasets/needs_more_info/not_enough_info_to_answer/generated_prompts_raw_openai4o_t12.json')
# df14 = pd.read_json('datasets/needs_more_info/not_enough_info_to_answer/generated_prompts_raw_openai4o_t14.json')
# df = df.iloc[:250]
# df12 = df12.iloc[:250]
# df14 = df14.iloc[:250]

# df.columns = ['prompt']
# df12.columns = ['prompt']
# df14.columns = ['prompt']


# df['len'] = df['prompt'].apply(len)
# df12['len'] = df12['prompt'].apply(len)
# df14['len'] = df14['prompt'].apply(len)

# from plotly import graph_objects as go

# fig = go.Figure()
# fig.add_trace(go.Histogram(x=df['len'], name='t=1'))
# fig.add_trace(go.Histogram(x=df12['len'], name='t=12'))
# fig.add_trace(go.Histogram(x=df14['len'], name='t=14'))

In [38]:
# df = pd.read_json('datasets/needs_more_info/not_enough_info_to_answer/generated_prompts_raw_openai4o_t14.json')
df = pd.read_csv('datasets/needs_more_info/not_enough_info_to_answer/generated_prompts_with_relevance_scores.csv')

# df = df.iloc[:250]
# df['question'] = ['text']
# df.columns = ['text']
df.columns = ['text', 'correctness', 'relevance', 'harmonic_mean']

In [39]:
# sort by harmonic mean
df = df.sort_values('harmonic_mean', ascending=False)
# df = df[df['harmonic_mean'] > 0.9]
# len(df)
df = df.iloc[:1200]

In [40]:
# Sort the DataFrame alphabetically by the 'text' column
df_sorted = df.sort_values('text')

# Save the sorted DataFrame to a text file
df_sorted.to_csv('sorted_prompts.txt', sep='\t', index=False)

# Print confirmation message
print("Sorted prompts have been saved to 'sorted_prompts.txt'")

Sorted prompts have been saved to 'sorted_prompts.txt'


In [41]:
# add index to the text
# df['text'] = df.index.astype(str) + '. ' + df['text']
# df

In [42]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import OpenAI  # Make sure you have the openai library installed
from hashlib import md5


texts_hash = md5(str(df['text'].to_list()).encode('utf-8')).hexdigest()
embed_cache = f'/tmp/ada_embeddings_{texts_hash}.csv'

if os.path.exists(embed_cache):
    df = pd.read_csv(embed_cache)
    df['ada_embedding'] = df['ada_embedding'].apply(lambda x: list(map(float, x[1:-1].split(','))))
else:
    client = OpenAI()

    def get_embedding(text, model="text-embedding-3-small"):
        text = text.replace("\n", " ")
        return client.embeddings.create(input=[text], model=model).data[0].embedding

    texts = df['text'].tolist()
    ada_embeddings = [None] * len(texts)

    with ThreadPoolExecutor(max_workers=200) as executor:
        future_to_index = {executor.submit(get_embedding, text): i for i, text in enumerate(texts)}
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                ada_embeddings[index] = future.result()
            except Exception as exc:
                print(f'Text at index {index} generated an exception: {exc}')
                ada_embeddings[index] = None  # Handle the error gracefully

    df['ada_embedding'] = ada_embeddings
    df.to_csv(embed_cache, index=False)


In [43]:
df

Unnamed: 0,text,correctness,relevance,harmonic_mean,ada_embedding
0,What's the best insurance policy for my growin...,1.000000,1.000000,1.000000,"[0.006423709914088249, 0.0004597152001224458, ..."
1,I'm supposed to prepare a dinner for a friend ...,1.000000,1.000000,1.000000,"[-0.021176747977733612, -0.007679760456085205,..."
2,I've been having recurring technical issues wh...,1.000000,1.000000,1.000000,"[-0.034520719200372696, 0.002962463302537799, ..."
3,Can you help me determine what's causing the s...,1.000000,1.000000,1.000000,"[0.006374594289809465, 0.041301142424345016, 0..."
4,How should I design the menu for the new resta...,0.999999,1.000000,1.000000,"[-0.0008739224867895246, 0.0021725392434746027..."
...,...,...,...,...,...
1195,Our team is struggling to hit the performance ...,1.000000,0.994089,0.995855,"[-0.004060433246195316, 0.05132988095283508, 0..."
1196,I'm organizing a charity event to raise funds ...,1.000000,0.994089,0.995855,"[0.00041366074583493173, -0.00846783071756363,..."
1197,I want to keep in touch with family through so...,1.000000,0.994089,0.995855,"[0.02644508145749569, 0.0011357349576428533, -..."
1198,I’m thinking about baking a pie this weekend. ...,0.999999,0.994089,0.995855,"[0.016233228147029877, -0.023411614820361137, ..."


In [44]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
from plotly import graph_objects as go


matrix = np.array(df.ada_embedding.to_list())

pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(matrix)

tsne = TSNE(n_components=2, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(pca_result)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[:, 0], 
    y=vis_dims[:, 1], 
    mode='markers',
    text=df['text'],
    hoverinfo='text'
))
fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest'
)
fig

In [55]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from plotly import graph_objects as go

# Sample DataFrame with embeddings
# df = pd.DataFrame({
#     'ada_embedding': [list_of_embeddings],
#     'text': [corresponding_texts]
# })

# Convert embeddings to a numpy array
matrix = np.array(df['ada_embedding'].to_list())

# Step 1: Perform PCA to reduce dimensions to 50
pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(matrix)

# Step 2: Perform k-means clustering on the PCA-reduced data
n_clusters = 20  # Specify the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(pca_result)
df['cluster'] = kmeans.labels_

# Step 3: Apply t-SNE to the PCA-reduced data for visualization
tsne = TSNE(n_components=2, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(pca_result)

# Add the t-SNE results and cluster labels back to the DataFrame
df['tsne-2d-one'] = vis_dims[:, 0]
df['tsne-2d-two'] = vis_dims[:, 1]

# Visualization using Plotly
fig = go.Figure()

# Add data points with cluster labels as colors
fig.add_trace(go.Scatter(
    x=df['tsne-2d-one'],
    y=df['tsne-2d-two'],
    mode='markers',
    text=df['text'],
    hoverinfo='text',
    marker=dict(color=df['cluster'], colorscale='Viridis', size=5, opacity=0.7),
    name='Data'
))

fig.update_layout(
    title="t-SNE Visualization of Text Embeddings with k-Means Clustering",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest'
)

fig.show()


In [58]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
from plotly import graph_objects as go
from scipy.spatial.distance import cdist

# Sample DataFrame with embeddings
# df = pd.DataFrame({
#     'ada_embedding': [list_of_embeddings],
#     'text': [corresponding_texts]
# })

# Convert embeddings to a numpy array
matrix = np.array(df['ada_embedding'].to_list())

# Step 1: Perform PCA to reduce dimensions to 50
pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(matrix)

# Step 2: Perform k-means clustering on the PCA-reduced data
n_clusters = 200  # Specify the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(pca_result)
df['cluster'] = kmeans.labels_

# Step 3: Identify representative samples (closest to cluster centers)
representative_samples = []
for i in range(n_clusters):
    cluster_points = pca_result[df['cluster'] == i]
    cluster_center = kmeans.cluster_centers_[i]
    closest_index = cdist([cluster_center], cluster_points).argmin()
    representative_index = df[df['cluster'] == i].index[closest_index]
    representative_samples.append(representative_index)

df['is_representative'] = df.index.isin(representative_samples)

# Step 4: Apply t-SNE to the PCA-reduced data for visualization
tsne = TSNE(n_components=2, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(pca_result)

# Add the t-SNE results and cluster labels back to the DataFrame
df['tsne-2d-one'] = vis_dims[:, 0]
df['tsne-2d-two'] = vis_dims[:, 1]

# Visualization using Plotly
fig = go.Figure()

# Add data points with cluster labels as colors
fig.add_trace(go.Scatter(
    x=df['tsne-2d-one'],
    y=df['tsne-2d-two'],
    mode='markers',
    text=df['text'],
    hoverinfo='text',
    marker=dict(color=df['cluster'], colorscale='Viridis', size=5, opacity=0.7),
    name='Data'
))

# Add representative samples in a different color
representative_df = df[df['is_representative']]
fig.add_trace(go.Scatter(
    x=representative_df['tsne-2d-one'],
    y=representative_df['tsne-2d-two'],
    mode='markers',
    text=representative_df['text'],
    hoverinfo='text',
    marker=dict(color='red', size=10, opacity=0.9),
    name='Representative Samples'
))

fig.update_layout(
    title="t-SNE Visualization of Text Embeddings with k-Means Clustering",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest'
)

fig.show()


In [45]:
# run kmeans on tsne and get 50 clusters. Plot the data and cluster centers
# TODO running kmeans on tsne is not the best idea but works for now

from sklearn.cluster import KMeans, AgglomerativeClustering
kmeans = KMeans(n_clusters=200, random_state=42)
# kmeans.fit(vis_dims)
kmeans.fit(pca_result)
df['cluster'] = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[:, 0], 
    y=vis_dims[:, 1], 
    mode='markers',
    text=df['text'],
    hoverinfo='text',
    name='Data'
))
fig.add_trace(go.Scatter(
    x=cluster_centers[:, 0], 
    y=cluster_centers[:, 1], 
    mode='markers',
    marker=dict(size=7, color='black'),
    text=[f"Cluster {i}" for i in range(20)],
    hoverinfo='text',
    name='Cluster Centers'
))

fig.update_layout(
    title="t-SNE Visualization",
    xaxis_title="Component 1",
    yaxis_title="Component 2",
    hovermode='closest' 
)
fig



In [46]:
distances = np.linalg.norm(vis_dims[:, np.newaxis] - cluster_centers, axis=2)
distances.shape

(1200, 200)

In [47]:
closest_indices = np.argmin(distances, axis=0)
closest_indices

array([ 449,  581,  690,  886,  378, 1037,  804,  689,    5,   45, 1161,
         97,  491,  856,   74,  572, 1151,  409,  338,   13,  748,   49,
        795,  181,  481,  850,  636,  687,  742,  416,  233,  452,  545,
        654,  842,  878,  382,  993,  629,  867,  794,  324,  339,  950,
        633,  738,  872,  341, 1031,  866,  421,  710,  565,  331, 1056,
        672,  505,  377,  978, 1145,  512,  671, 1030,  983, 1186,  361,
        753, 1146,  496,  775,  330,  773,  927, 1083,  209,  182,  961,
         66,  994,  198, 1064,  273,  369,  320,   85,  568,  809,  678,
         50,  280,   48, 1069,  661,  556,   10,  129,  466,  145,  356,
        924,  195,   77,  519,  887,  156,  779,   70,  611,  761,  395,
       1118,  479,  802,  202,  848,  859,   16,  440,  975,  623,  460,
        113,  577,   27, 1089,  986,  806,  791, 1086,   21,  346,  332,
        949,  864, 1169,  366,  736,  227,  118,  294,  746,  189,  348,
       1115, 1139, 1124, 1112,  586,  451,  442,   

In [48]:
df['is_representative'] = False
df.loc[closest_indices, 'is_representative'] = True


In [49]:
df['is_representative'].sum()

200

In [53]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=vis_dims[df.is_representative, 0], 
    y=vis_dims[df.is_representative, 1], 
    mode='markers',
    text=df[df.is_representative]['text'],
    hoverinfo='text',
    marker=dict(color='red', size=7),
    name='Representative'
))
fig.add_trace(go.Scatter(
    x=vis_dims[~df.is_representative, 0], 
    y=vis_dims[~df.is_representative, 1], 
    mode='markers',
    text=df[~df.is_representative]['text'],
    hoverinfo='text',
    marker=dict(color='blue', size=5),
    name='Not Representative'
))
fig.add_trace(go.Scatter(
    x=cluster_centers[:, 0], 
    y=cluster_centers[:, 1], 
    mode='markers',
    marker=dict(size=7, color='black'),
    text=[f"Cluster {i}" for i in range(20)], 
    hoverinfo='text',
    name='Cluster Center'
))

In [51]:
df[df['is_representative']]['text'].to_list()

['I’ve recently decided to pursue a college degree online. Which programs might suit me the best?',
 "I'm in the market for a new car and need some advice on what to buy. What do you recommend?",
 "I'm looking to buy a gift for a friend. What should I consider?",
 "I want to plan a trip to Europe but I'm not sure what the best places would be given my interests and circumstances. Could you give me advice?",
 'I want to adopt a pet that suits my lifestyle. Can you make a recommendation?',
 'For my health regime, I’ve been thinking about switching diets completely. Could you help me figure out if it’s the right choice?',
 'My houseplants keep dying quickly. What am I doing wrong?',
 'I need help deciding the menu for my new restaurant. What dishes should I include?',
 'I’d love some help with meal planning for specific dietary needs. Could you help?',
 'When trying to decide on a home workout routine, can you suggest effective exercises based on my goals?',
 'We are developing new employ

In [52]:
df[df['is_representative']]['text'].to_list()

['I’ve recently decided to pursue a college degree online. Which programs might suit me the best?',
 "I'm in the market for a new car and need some advice on what to buy. What do you recommend?",
 "I'm looking to buy a gift for a friend. What should I consider?",
 "I want to plan a trip to Europe but I'm not sure what the best places would be given my interests and circumstances. Could you give me advice?",
 'I want to adopt a pet that suits my lifestyle. Can you make a recommendation?',
 'For my health regime, I’ve been thinking about switching diets completely. Could you help me figure out if it’s the right choice?',
 'My houseplants keep dying quickly. What am I doing wrong?',
 'I need help deciding the menu for my new restaurant. What dishes should I include?',
 'I’d love some help with meal planning for specific dietary needs. Could you help?',
 'When trying to decide on a home workout routine, can you suggest effective exercises based on my goals?',
 'We are developing new employ

# Status

The idea still needs some work. Sometimes points are selectes that are quite close to each other because they are inbetween two cluster centers.