In [133]:
from getpass import getpass
from pathlib import Path
from dotenv import load_dotenv
import os
import pandas as pd
from openai import OpenAI
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from typing import Optional
import matplotlib.patches as mpatches

# The OpenAI client resolves its API key when it is constructed, so the key has
# to be present in the environment *before* this line runs. On a fresh kernel
# the key would otherwise only be loaded by the validation cell further down,
# i.e. after this constructor has already failed.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    # Prompt without echoing the secret to the notebook output.
    os.environ["OPENAI_API_KEY"] = getpass("Enter a valid OpenAI API key: ")

client = OpenAI()

In [153]:
# Merge variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Keep an already-configured, non-empty key; otherwise ask for one interactively.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter a valid OpenAI API key: ")

# Cheap sanity check on the key prefix before any request is made.
assert os.environ["OPENAI_API_KEY"].startswith("sk-"), "Invalid OpenAI API key"


def get_embedding(text: str, model: str="text-embedding-ada-002"):
    """Return the embedding vector of ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced by single spaces before the request. The
    text is sent as a one-element batch through the module-level ``client`` and
    the vector of the first (only) result is returned.
    """
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(model=model, input=[flattened])
    first_result = response.data[0]
    return first_result.embedding


def embed_csv(path: str, save_csv: str, model: str="text-embedding-ada-002"):
    """Embed the ``thoughts`` column of a CSV and save the augmented table.

    Args:
        path: CSV file to read; it must contain a ``thoughts`` column.
        save_csv: Destination CSV for the table with the added
            ``thoughts_embd`` column (written without the index).
        model: Embedding model name forwarded to ``get_embedding``. Defaults
            to the model that was previously hard-coded here.

    Returns:
        The DataFrame with a ``thoughts_embd`` column holding one embedding
        (as returned by ``get_embedding``) per row.

    Note:
        ``get_embedding`` is called once per row. In the saved CSV each
        embedding is stored as the text form of the list, so it has to be
        parsed back when the file is reloaded.
    """
    df = pd.read_csv(path)
    df["thoughts_embd"] = df["thoughts"].apply(lambda text: get_embedding(text, model=model))
    df.to_csv(save_csv, index=False)
    return df


def process_embds(df: pd.DataFrame):
    """Stack the per-row vectors of ``df["thoughts_embd"]`` into one array.

    Each cell is converted with ``np.array`` and the results are stacked along
    a new leading axis, giving one row per DataFrame row.
    """
    row_vectors = [np.array(cell) for cell in df["thoughts_embd"]]
    return np.stack(row_vectors)


def transform_TSNE(df, scale_feats=True, apply_pca=True, n_components=2, init="pca",
                   pca_components=120, perplexity=30, random_state=42):
    """Project the embeddings stored in ``df`` to a low-dimensional t-SNE map.

    Args:
        df: DataFrame whose ``thoughts_embd`` column holds one vector per row
            (consumed through ``process_embds``).
        scale_feats: Standardise every feature before any projection.
        apply_pca: Reduce the features with PCA before running t-SNE.
        n_components: Dimensionality of the t-SNE output.
        init: t-SNE initialisation strategy.
        pca_components: Upper bound on the number of PCA components
            (previously a hard-coded 120).
        perplexity: Requested t-SNE perplexity (previously a hard-coded 30).
        random_state: Seed shared by PCA and t-SNE so reruns are reproducible.

    Returns:
        Array with one row per input row and ``n_components`` columns.
    """
    feats_array = process_embds(df)
    if scale_feats:
        feats_array = StandardScaler().fit_transform(feats_array)

    n_samples = feats_array.shape[0]

    if apply_pca:
        # PCA cannot extract more components than min(n_samples, n_features);
        # clamp so small tables do not raise.
        n_pca = min(pca_components, *feats_array.shape)
        pca = PCA(n_components=n_pca, random_state=random_state)
        feats_array = pca.fit_transform(feats_array)

    # t-SNE requires perplexity < n_samples; shrink it for small tables.
    if n_samples > 1:
        perplexity = min(perplexity, n_samples - 1)

    tsne = TSNE(
        n_components=n_components,
        init=init,
        perplexity=perplexity,
        random_state=random_state,
    )

    return tsne.fit_transform(feats_array)


def plot_tsne_embeddings(tsne, labels, filename: Optional[str]=None):
    """Draw a small, legend-free scatter plot of a 2-D t-SNE projection.

    Args:
        tsne: Array whose first two columns are the x and y coordinates.
        labels: One label per point; each distinct label gets its own shade
            of the "Purples" palette.
        filename: If given, the figure is also saved there (300 dpi,
            transparent background).

    Note:
        A later cell of this notebook defines a function with the same name;
        once that cell has run it replaces this one.
    """
    unique_labels = np.unique(labels)
    color_palette = sns.color_palette("Purples", len(unique_labels))
    label_color_map = dict(zip(unique_labels, color_palette))

    # Only the axes background is made transparent here; savefig(transparent=True)
    # takes care of the figure background on export.
    fig, ax = plt.subplots(figsize=(2, 2))
    ax.patch.set_facecolor('none')

    # Pass the labels themselves as `hue` and the label->colour dict as the
    # palette so every point is guaranteed to get the colour mapped to its label.
    sns.scatterplot(
        x=tsne[:, 0],
        y=tsne[:, 1],
        hue=labels,
        palette=label_color_map,
        edgecolor='none',
        s=14,
        legend=False,
        ax=ax,
    )

    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

    if filename:
        fig.savefig(filename, dpi=300, bbox_inches='tight', transparent=True)

    plt.show()
    # Release the figure; this function is called many times in loops.
    plt.close(fig)

In [1]:
BASE_PATH = Path("./Stats/CRC100K/KNN") ### Set your path here
# Every CSV below BASE_PATH whose path contains none of the excluded fragments.
paths = [
    p
    for p in BASE_PATH.rglob("*.csv")
    if all(fragment not in str(p) for fragment in ("_stats_", "ten", "_embd"))
]

# Output folder for the embedded CSVs and the figures.
SAVE_TO = "embeddings"
if not Path(SAVE_TO).exists():
    Path(SAVE_TO).mkdir(parents=True)

# NOTE(review): the slice skips the first discovered file; confirm this is intended.
for p in paths[1:]:
    print(p)
    stem = Path(p).stem

    # Embed the file (one API request per row) and project the vectors to 2-D.
    save_csv = os.path.join(SAVE_TO, stem + "_embd.csv")
    df = embed_csv(p, save_csv)
    tsne = transform_TSNE(df)
    labels = df["label"].values
    answers = df["answer"].values

    # Same projection coloured once by label and once by answer, each exported
    # as PDF and PNG.
    colourings = (
        (labels, "_labels_embd.pdf", "_labels_embd.png"),
        (answers, "_answers_embd.pdf", "_answers_embd.png"),
    )
    for values, pdf_suffix, png_suffix in colourings:
        save_pdf = os.path.join(SAVE_TO, stem + pdf_suffix)
        save_png = os.path.join(SAVE_TO, stem + png_suffix)
        plot_tsne_embeddings(tsne, values, filename=save_pdf)
        plot_tsne_embeddings(tsne, values, filename=save_png)

In [None]:
# Re-plot every file from the embeddings saved by the loop above, without
# calling the API again. The projection and labels are rebuilt per file from
# its own "<stem>_embd.csv"; reusing the `tsne`/`labels`/`answers` variables
# left over from the previous cell would write the last file's plot under
# every file name.
for p in paths[1:]:
    stem = Path(p).stem
    df = pd.read_csv(
        os.path.join(SAVE_TO, stem + "_embd.csv"),
        # The column is stored as the text form of a list of floats; parse it
        # numerically rather than evaluating it as code.
        converters={
            "thoughts_embd": lambda cell: [float(v) for v in cell.strip("[]").split(",")]
        },
    )
    tsne = transform_TSNE(df)
    labels, answers = df["label"].values, df["answer"].values

    save_pdf = os.path.join(SAVE_TO, stem + "_labels_embd.pdf")
    save_png = os.path.join(SAVE_TO, stem + "_labels_embd.png")
    plot_tsne_embeddings(tsne, labels, filename=save_pdf)
    plot_tsne_embeddings(tsne, labels, filename=save_png)

    save_png = os.path.join(SAVE_TO, stem + "_answers_embd.png")
    save_pdf = os.path.join(SAVE_TO, stem + "_answers_embd.pdf")
    plot_tsne_embeddings(tsne, answers, filename=save_pdf)
    plot_tsne_embeddings(tsne, answers, filename=save_png)

In [163]:
# Re-load: point `paths` at the CSV files found (recursively) under the
# embeddings folder. This rebinds the `paths` list built by an earlier cell.
paths = [csv_file for csv_file in Path("./GPT4VMed/embeddings").rglob("*.csv")]

In [202]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Optional

def plot_tsne_embeddings(tsne, labels, filename: Optional[str]=None):
    """Scatter-plot a 2-D t-SNE projection coloured by label, with a legend.

    Args:
        tsne: Array whose first two columns are the x and y coordinates.
        labels: One label per point; each distinct label gets its own colour
            from the "mako" palette.
        filename: If given, the figure is also saved there (300 dpi,
            transparent background).
    """
    unique_labels = np.unique(labels)
    color_palette = sns.color_palette("mako", len(unique_labels))
    label_color_map = dict(zip(unique_labels, color_palette))

    fig, ax = plt.subplots(figsize=(6, 6))  # Adjust the figsize as needed
    fig.patch.set_facecolor('none')
    fig.patch.set_alpha(0.0)
    ax.patch.set_facecolor('none')

    # Draw the points. Previously the scatter call was commented out, so the
    # figure contained only the legend. The automatic legend is disabled
    # because a custom one is built below.
    sns.scatterplot(
        x=tsne[:, 0],
        y=tsne[:, 1],
        hue=labels,
        palette=label_color_map,
        edgecolor='none',
        s=12,
        legend=False,
        ax=ax,
    )

    # Custom legend: one coloured rectangle per label, placed outside the axes.
    legend_handles = [
        mpatches.Patch(color=label_color_map[label], label=label)
        for label in unique_labels
    ]
    ax.legend(
        handles=legend_handles,
        title='Labels',
        frameon=False,
        bbox_to_anchor=(1.05, 1),
        loc='upper left',
    )

    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

    if filename:
        fig.savefig(filename, dpi=300, bbox_inches='tight', transparent=True)

    plt.show()
    # Release the figure; this function is called many times in loops.
    plt.close(fig)


In [2]:
# For every re-loaded CSV: rebuild the projection and export it coloured by
# label and by answer, as PDF and PNG, into SAVE_TO.
for p in paths:
    print(p)
    # NOTE(review): `eval` executes whatever text is stored in the column.
    # Acceptable only for files this notebook wrote itself; use a literal
    # parser for anything from elsewhere.
    df = pd.read_csv(p, converters={"thoughts_embd": eval})
    tsne = transform_TSNE(df)
    labels = df["label"].values
    answers = df["answer"].values

    stem = Path(p).stem
    colourings = (
        (labels, "_labels_embd.pdf", "_labels_embd.png"),
        (answers, "_answers_embd.pdf", "_answers_embd.png"),
    )
    for values, pdf_suffix, png_suffix in colourings:
        save_pdf = os.path.join(SAVE_TO, stem + pdf_suffix)
        save_png = os.path.join(SAVE_TO, stem + png_suffix)
        plot_tsne_embeddings(tsne, values, filename=save_pdf)
        plot_tsne_embeddings(tsne, values, filename=save_png)