Reference (t-SNE visualization in Python): https://builtin.com/data-science/tsne-python

In [1]:
import torch
from PIL import Image
import open_clip
import numpy as np
import os
from tqdm import tqdm
import time
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib
matplotlib.use('TkAgg') 
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns

def list_dir(path, list_name, extension, return_names=False):
    """Recursively collect files under ``path`` whose path ends with ``extension``.

    Matches are appended to ``list_name`` in place (in directory-traversal
    order) and a sorted copy of the accumulated list is returned.

    Args:
        path: Directory to search; sub-directories are searched recursively.
        list_name: List that receives the matches. Entries already present
            are kept and take part in the final sort.
        extension: Suffix (or tuple of suffixes) handed to ``str.endswith``.
        return_names: If True, collect bare file names instead of full paths.
            This now also applies to files found in sub-directories.

    Returns:
        A new sorted list with the contents of ``list_name``. If the entries
        cannot be ordered (``TypeError``), the error is printed and
        ``list_name`` itself is returned unsorted.
    """
    def _collect(directory):
        # Depth-first walk; results accumulate in the caller-supplied list.
        for entry in os.listdir(directory):
            entry_path = os.path.join(directory, entry)
            if os.path.isdir(entry_path):
                _collect(entry_path)
            elif entry_path.endswith(extension):
                list_name.append(entry if return_names else entry_path)

    _collect(path)

    # Sort once, after the whole tree has been visited.
    try:
        return sorted(list_name)
    except TypeError as e:
        # Only reachable when the caller pre-filled list_name with values
        # that cannot be compared with the collected strings.
        print(e)
        return list_name


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read every .npy file below Embedding_IPAdapter_default and pair it with a label.
src_dir = r'D:\Dataset\WeedData\weed_10_species\train2017_real_object_in_box\Embedding_IPAdapter_default'
embedding_paths = list_dir(src_dir, [], '.npy')
embeddings, ys = [], []
for x in tqdm(embedding_paths):
    # The label is the part of the file name before the first underscore.
    name = os.path.basename(x)
    sku = name.partition('_')[0]
    embedding = np.load(x)
    embeddings.append(embedding)
    ys.append(sku)
# Stack the per-file arrays / labels into the arrays used by the plotting cells.
embeddings_0, ys_0 = np.asarray(embeddings), np.asarray(ys)

100%|██████████| 2000/2000 [00:38<00:00, 52.27it/s]


In [None]:
# Read every .npy file below Embedding_BioCLIP and pair it with a label.
src_dir = r'D:\Dataset\WeedData\weed_10_species\train2017_real_object_in_box\Embedding_BioCLIP'
embedding_paths = list_dir(src_dir, [], '.npy')
embeddings, ys = [], []
for x in tqdm(embedding_paths):
    # The label is the part of the file name before the first underscore.
    name = os.path.basename(x)
    sku = name.partition('_')[0]
    embedding = np.load(x)
    embeddings.append(embedding)
    ys.append(sku)
# Stack the per-file arrays / labels into the arrays used by the plotting cells.
embeddings_1, ys_1 = np.asarray(embeddings), np.asarray(ys)

100%|██████████| 2000/2000 [00:37<00:00, 53.94it/s]


In [None]:
# Read .npy files below Embedding_BioCLIP_from_openclip_old, capped per label.
src_dir = r'D:\Dataset\WeedData\weed_10_species\train2017_real_object_in_box\Embedding_BioCLIP_from_openclip_old'
embedding_paths = list_dir(src_dir, [], '.npy')
embeddings, ys = [], []
sku_num_dict = {}
for x in tqdm(embedding_paths):
    # The label is the part of the file name before the first underscore.
    name = os.path.basename(x)
    sku = name.partition('_')[0]

    # Every file is counted, including the ones that are skipped below.
    sku_num_dict[sku] = sku_num_dict.get(sku, 0) + 1

    # NOTE(review): the counter is incremented before this test, so at most
    # 99 files per label are loaded -- confirm whether 100 was intended.
    if sku_num_dict[sku] < 100:
        embedding = np.load(x)
        ys.append(sku)
        embeddings.append(embedding)

# Stack the per-file arrays / labels into the arrays used by the plotting cells.
embeddings_2, ys_2 = np.asarray(embeddings), np.asarray(ys)

100%|██████████| 8526/8526 [00:11<00:00, 773.02it/s] 


In [4]:
def visualize_embedding(embeddings, ys, save_path=r'D:\test\tSNE.jpg', random_state=None):
    """Plot 2-D PCA and t-SNE projections of ``embeddings`` coloured by class.

    Two figures are created: a PCA scatter plot (axis labels include the
    explained-variance percentage of each component) and a t-SNE scatter
    plot. Only the t-SNE figure is written to ``save_path``. Both figures are
    then shown with a blocking ``plt.show``.

    Args:
        embeddings: numpy array that becomes 2-D ``(n_samples, n_features)``
            after ``squeeze()``.
        ys: numpy array with one class label per sample.
        save_path: Where the t-SNE figure is saved (dpi=800). Pass ``None``
            or an empty string to skip saving. The default is the path the
            notebook originally hard-coded.
        random_state: Optional int seed. When given, it seeds the plotting
            order permutation, PCA and t-SNE so a run can be reproduced.
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(pca, tsne)`` of the fitted scikit-learn estimators.
    """
    import inspect  # local import: only needed for the TSNE keyword probe below

    X = embeddings.squeeze()
    print(X.shape, ys.shape)

    feat_cols = ['pixel' + str(i) for i in range(X.shape[1])]
    label_title = 'Weed species'

    df = pd.DataFrame(X, columns=feat_cols)
    df[label_title] = ys
    # String and integer-coded copies of the label. They are not used by the
    # seaborn plots below but are kept so the frame layout is unchanged.
    df['label'] = df[label_title].astype(str)
    df['y_encoded'] = LabelEncoder().fit_transform(df[label_title])
    print('Size of the dataframe: {}'.format(df.shape))

    n_samples = df.shape[0]
    # Size the palette from the data instead of assuming exactly 10 classes.
    palette = sns.color_palette("hls", df[label_title].nunique())

    # Shuffle the rows so no class is systematically drawn on top of another.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    rndperm = rng.permutation(n_samples)

    n_components = 2

    # ---- PCA -------------------------------------------------------------
    pca = PCA(n_components=n_components, random_state=random_state)
    pca_result = pca.fit_transform(df[feat_cols].values)
    print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

    df['PCA-PC1'] = pca_result[:, 0]
    df['PCA-PC2'] = pca_result[:, 1]

    fig_pca, ax_pca = plt.subplots(figsize=(12, 8))
    ax_pca.set_title('PCA plot', fontsize=24, fontweight='bold')
    sns.scatterplot(
        x="PCA-PC1", y="PCA-PC2",
        hue=label_title,
        palette=palette,
        data=df.loc[rndperm, :],
        legend="full",
        alpha=0.3,
        ax=ax_pca,
    )
    ax_pca.legend(fontsize=16)
    evr1, evr2 = np.round(pca.explained_variance_ratio_[:2] * 100, 1)
    ax_pca.set_xlabel('Principal Component 1 (' + str(evr1) + '%)', fontsize=21)
    ax_pca.set_ylabel('Principal Component 2 (' + str(evr2) + '%)', fontsize=21)

    # ---- t-SNE -----------------------------------------------------------
    df_subset = df.loc[rndperm, :].copy()
    time_start = time.time()

    # scikit-learn renamed TSNE's ``n_iter`` to ``max_iter`` (1.5) and later
    # removed the old name; use whichever keyword this installation accepts.
    iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
    # t-SNE requires perplexity < n_samples; clamp it for very small inputs.
    perplexity = min(40, max(1, n_samples - 1))
    tsne = TSNE(
        n_components=n_components,
        verbose=1,
        perplexity=perplexity,
        random_state=random_state,
        **{iter_kw: 300}
    )
    tsne_results = tsne.fit_transform(df_subset[feat_cols].values)

    print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

    df_subset['tSNE-C1'] = tsne_results[:, 0]
    df_subset['tSNE-C2'] = tsne_results[:, 1]

    fig_tsne, ax_tsne = plt.subplots(figsize=(16, 12))
    ax_tsne.set_title('t-SNE plot', fontsize=24, fontweight='bold')
    sns.scatterplot(
        x="tSNE-C1", y="tSNE-C2",
        hue=label_title,
        palette=palette,
        data=df_subset,
        legend="full",
        alpha=0.3,
        ax=ax_tsne,
    )
    ax_tsne.legend(fontsize=16)
    ax_tsne.set_xlabel('t-SNE Component 1', fontsize=21)
    ax_tsne.set_ylabel('t-SNE Component 2', fontsize=21)

    if save_path:
        fig_tsne.savefig(save_path, dpi=800)
    plt.show(block=True)
    return pca, tsne

In [15]:
def visualize_embedding_tSNE(embeddings, ys, save_path=r'D:\test\tSNE.jpg',
                             random_state=None, legend_anchor=(0.5, 0.5)):
    """Plot a 2-D t-SNE projection of ``embeddings`` coloured by class.

    Classes are sorted and mapped to a fixed custom colour list, and the
    legend entries are listed in sorted label order under the title
    "Classes".

    Args:
        embeddings: numpy array that becomes 2-D ``(n_samples, n_features)``
            after ``squeeze()``.
        ys: numpy array with one class label per sample.
        save_path: Where the figure is saved (dpi=800). Pass ``None`` or an
            empty string to skip saving. The default is the path the
            notebook originally hard-coded.
        random_state: Optional int seed for the row permutation and t-SNE.
            ``None`` keeps the previous unseeded behaviour.
        legend_anchor: ``bbox_to_anchor`` for the legend (used with
            ``loc='center left'``). The default ``(0.5, 0.5)`` reproduces the
            original layout, which puts the legend over the middle of the
            axes; ``(1, 0.5)`` places it just outside the right edge.

    Returns:
        The fitted ``TSNE`` estimator.
    """
    import inspect  # local import: only needed for the TSNE keyword probe below

    X = embeddings.squeeze()
    print(X.shape, ys.shape)

    feat_cols = ['pixel' + str(i) for i in range(X.shape[1])]
    label_title = 'Weed species'

    df = pd.DataFrame(X, columns=feat_cols)
    df[label_title] = ys
    # String and integer-coded copies of the label. They are not used by the
    # plot below but are kept so the frame layout is unchanged.
    df['label'] = df[label_title].astype(str)
    df['y_encoded'] = LabelEncoder().fit_transform(df[label_title])
    print('Size of the dataframe: {}'.format(df.shape))

    n_samples = df.shape[0]
    # Shuffle the rows so no class is systematically drawn on top of another.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    rndperm = rng.permutation(n_samples)

    n_components = 2
    df_subset = df.loc[rndperm, :].copy()
    time_start = time.time()

    # scikit-learn renamed TSNE's ``n_iter`` to ``max_iter`` (1.5) and later
    # removed the old name; use whichever keyword this installation accepts.
    iter_kw = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
    # t-SNE requires perplexity < n_samples; clamp it for very small inputs.
    perplexity = min(40, max(1, n_samples - 1))
    tsne = TSNE(
        n_components=n_components,
        verbose=1,
        perplexity=perplexity,
        random_state=random_state,
        **{iter_kw: 300}
    )
    tsne_results = tsne.fit_transform(df_subset[feat_cols].values)

    print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

    df_subset['tSNE-C1'] = tsne_results[:, 0]
    df_subset['tSNE-C2'] = tsne_results[:, 1]

    unique_classes = sorted(df_subset[label_title].unique())
    custom_palette = [
        '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
        '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
        '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',
        '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080'
    ]
    # Cycle through the colours so that every class gets an entry even when
    # there are more classes than colours (a truncated map would leave some
    # hue levels without a colour).
    color_map = {
        cls: custom_palette[i % len(custom_palette)]
        for i, cls in enumerate(unique_classes)
    }

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        x="tSNE-C1", y="tSNE-C2",
        hue=label_title,
        palette=color_map,
        data=df_subset,
        legend="full",
        alpha=0.5,
        ax=ax,
    )

    # Rebuild the legend in sorted label order. Some seaborn versions add the
    # hue column name as a pseudo-entry; drop it wherever it appears.
    handles, labels = ax.get_legend_handles_labels()
    legend_items = sorted(
        ((lab, hnd) for lab, hnd in zip(labels, handles) if lab != label_title),
        key=lambda item: item[0],
    )
    if legend_items:
        sorted_labels, sorted_handles = zip(*legend_items)
        ax.legend(
            sorted_handles, sorted_labels,
            title="Classes",
            loc='center left',
            bbox_to_anchor=legend_anchor,
            fontsize=16,
            framealpha=1.0,
        )

    ax.set_xlabel('t-SNE Component 1', fontsize=16)
    ax.set_ylabel('t-SNE Component 2', fontsize=16)

    if save_path:
        fig.savefig(save_path, dpi=800)
    plt.show(block=True)
    return tsne

In [16]:
# Alternative (disabled): PCA + t-SNE figures via visualize_embedding.
# pca, tsne = visualize_embedding(embeddings_0, ys_0)
# t-SNE scatter for embeddings_0 / ys_0, the arrays built by the cell that
# reads the Embedding_IPAdapter_default directory.
tsne = visualize_embedding_tSNE(embeddings_0, ys_0)

(2000, 1024) (2000,)
Size of the dataframe: (2000, 1027)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.013s...




[t-SNE] Computed neighbors for 2000 samples in 0.214s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 0.178655
[t-SNE] KL divergence after 250 iterations with early exaggeration: 67.220161
[t-SNE] KL divergence after 300 iterations: 1.464453
t-SNE done! Time elapsed: 4.721506834030151 seconds


In [7]:
# Alternative (disabled): PCA + t-SNE figures via visualize_embedding.
# visualize_embedding(embeddings_1, ys_1)
# t-SNE scatter for embeddings_1 / ys_1, the arrays built by the cell that
# reads the Embedding_BioCLIP directory.
tsne = visualize_embedding_tSNE(embeddings_1, ys_1)

(2000, 512) (2000,)
Size of the dataframe: (2000, 515)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.003s...
[t-SNE] Computed neighbors for 2000 samples in 0.188s...




[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 0.268706
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.630497
[t-SNE] KL divergence after 300 iterations: 1.297085
t-SNE done! Time elapsed: 4.3275065422058105 seconds


In [21]:
# Alternative (disabled): PCA + t-SNE figures via visualize_embedding.
# visualize_embedding(embeddings_2, ys_2)
# t-SNE scatter for embeddings_2 / ys_2, the per-class-capped arrays built by
# the cell that reads the Embedding_BioCLIP_from_openclip_old directory.
tsne = visualize_embedding_tSNE(embeddings_2, ys_2)

(990, 512) (990,)
Size of the dataframe: (990, 515)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 990 samples in 0.004s...
[t-SNE] Computed neighbors for 990 samples in 0.084s...
[t-SNE] Computed conditional probabilities for sample 990 / 990
[t-SNE] Mean sigma: 0.295789




[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.070164
[t-SNE] KL divergence after 300 iterations: 0.662844
t-SNE done! Time elapsed: 1.9016997814178467 seconds


In [47]:
# ax = plt.figure(figsize=(16,10)).add_subplot(projection = '3d')
# ax.scatter(
#     xs=df.loc[rndperm,:]["pca-one"], 
#     ys=df.loc[rndperm,:]["pca-two"], 
#     zs=df.loc[rndperm,:]["pca-three"], 
#     c=df.loc[rndperm,:]["y"], 
#     cmap='tab10'
# )
# ax.set_xlabel('pca-one')
# ax.set_ylabel('pca-two')
# ax.set_zlabel('pca-three')
# plt.show()

In [48]:
# N = 8526
# N = 1000
# df_subset = df.loc[rndperm[:N],:].copy()

# df_subset = df.loc[rndperm,:].copy()

# data_subset = df_subset[feat_cols].values

# pca = PCA(n_components=3)
# pca_result = pca.fit_transform(data_subset)

# df_subset['pca-one'] = pca_result[:,0]
# df_subset['pca-two'] = pca_result[:,1] 
# df_subset['pca-three'] = pca_result[:,2]

# print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

Explained variation per principal component: [0.1829305  0.07413395 0.06695858]
