In [None]:
import open_clip
import torch
from PIL import Image

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os
import shutil
from tqdm import tqdm
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler

from scipy.spatial.distance import cosine
from scipy.stats import spearmanr, pearsonr


In [None]:
def create_tsne(df, feature_columns=None):
    """Project the feature columns of ``df`` onto two dimensions with t-SNE.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns; other columns may be present.
    feature_columns : sequence of str, optional
        Columns to feed to t-SNE. When omitted, every ``float64`` column of
        ``df`` is used, exactly as before this parameter existed.
        NOTE(review): that default also selects a float-typed ``label``
        column if ``df`` has one; pass the embedding column names explicitly
        to keep the target out of the projection.

    Returns
    -------
    tuple
        ``(tsne, tsne_results)``: the fitted ``TSNE`` estimator and the array
        returned by its ``fit_transform`` call.
    """
    if feature_columns is None:
        features = df.select_dtypes(include=[np.float64])
    else:
        features = df[list(feature_columns)]

    # Fixed random_state keeps the layout reproducible between runs.
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(features)

    return tsne, tsne_results

In [None]:
def capture_tsne(tsne_df, df):
    """Draw the t-SNE coordinates as a scatter plot, one series per label.

    ``df['label']`` is copied into ``tsne_df`` (the frame is modified in
    place and the assignment aligns on the index). ``TSNE1`` is then plotted
    against ``TSNE2`` separately for every distinct label value, and the
    figure is shown.
    """
    tsne_df['label'] = df['label']

    # One scatter call per label so that each label gets its own legend entry.
    for current in tsne_df["label"].unique():
        selected = tsne_df["label"] == current
        plt.scatter(
            tsne_df.loc[selected, "TSNE1"],
            tsne_df.loc[selected, "TSNE2"],
            label=current,
        )

    plt.title('T-SNE Visualization')
    plt.xlabel('TSNE1')
    plt.ylabel('TSNE2')
    plt.legend()

    plt.show()

<h1> Пробуем большой CLIP </h1>

In [None]:
# Dataset directory, two levels above the notebook's working directory.
data_path = os.path.join("..", "..", "datasets")

full_set = pd.read_csv(os.path.join(data_path, "full_set.csv"))
# Prefix every stored image path with the dataset directory.
full_set["img_path"] = full_set["img_path"].apply(
    lambda rel_path: os.path.join(data_path, rel_path)
)
full_set

In [None]:
# Directory from which the embedding CSV files are read.
emb_dir = os.path.join(data_path, "embeddings")

In [None]:
# Load one embedding table, attach the ratings as `label`, and plot its t-SNE projection.
name = "open_clip_ViT-G-14_img"
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Inner join on "id"; the rating is renamed because the plotting helper reads `label`.
merged_df = pd.merge(full_set[["id", "rating"]], df, on="id").rename(
    columns={"rating": "label"}
)

# Wrap the 2-D projection in a frame with the column names the plot expects.
tsne, tsne_results = create_tsne(merged_df)
tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

capture_tsne(tsne_df, merged_df)

In [None]:
# Same pipeline as for the other embedding tables, applied to the "_text" CSV.
name = "open_clip_ViT-G-14_text"
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Inner join on "id"; the rating is renamed because the plotting helper reads `label`.
merged_df = pd.merge(full_set[["id", "rating"]], df, on="id").rename(
    columns={"rating": "label"}
)

# Wrap the 2-D projection in a frame with the column names the plot expects.
tsne, tsne_results = create_tsne(merged_df)
tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

capture_tsne(tsne_df, merged_df)

In [None]:
# Same pipeline as for the other embedding tables, applied to the "_both" CSV.
name = "open_clip_ViT-G-14_both"
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Inner join on "id"; the rating is renamed because the plotting helper reads `label`.
merged_df = pd.merge(full_set[["id", "rating"]], df, on="id").rename(
    columns={"rating": "label"}
)

# Wrap the 2-D projection in a frame with the column names the plot expects.
tsne, tsne_results = create_tsne(merged_df)
tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

capture_tsne(tsne_df, merged_df)

<h1> Пробуем кластеризовать на img эмбеддингах </h1>

In [None]:
# Reload the "_img" embedding table; the frames built here feed the clustering cells.
name = "open_clip_ViT-G-14_img"
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# img_path is carried along as well; it is needed later when images are copied.
merged_df = pd.merge(full_set[["id", "rating", "img_path"]], df, on="id").rename(
    columns={"rating": "label"}
)

# Wrap the 2-D projection in a frame with the column names the plot expects.
tsne, tsne_results = create_tsne(merged_df)
tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

capture_tsne(tsne_df, merged_df)

In [None]:
# Train KMeans on the float64 columns of merged_df.
# NOTE(review): select_dtypes picks up every float64 column, so a float-typed
# `label` would be clustered on too - confirm that only embedding columns match.
n_clusters = 40
kmeans = KMeans(n_clusters=n_clusters, n_init="auto", max_iter=1000, random_state=0)
clusters = kmeans.fit_predict(merged_df.select_dtypes(include=[np.float64]))

# One cluster id per row of merged_df, in row order.
cluster_df = pd.DataFrame(clusters, columns=['cluster'])

# Re-use the existing t-SNE projection and draw one scatter series per cluster.
combined_df = pd.concat([tsne_df, cluster_df], axis=1)
unique_clusters = combined_df['cluster'].unique()
for cluster in unique_clusters:
    cluster_data = combined_df[combined_df['cluster'] == cluster]
    plt.scatter(cluster_data['TSNE1'], cluster_data['TSNE2'], label=f'cluster {cluster}')

plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.title('KMeans Clustering')
# The legend stays disabled, as in the original cell (presumably too crowded
# with this many clusters).
plt.show()

In [None]:
# Embedding feature columns are the ones whose name begins with "emb".
emb_col_names = [column for column in df.columns if column.startswith("emb")]
len(emb_col_names)

In [None]:
# Attach the KMeans cluster id to every row (aligned on the index with cluster_df).
merged_df = merged_df.assign(cluster=cluster_df["cluster"])
merged_df

In [None]:
# Compute the cluster centroids: the per-cluster mean of every embedding column.
centroids = merged_df.groupby("cluster", as_index=False).agg(
    dict.fromkeys(emb_col_names, "mean")
)
centroids


In [None]:
save_path = "cos_dist"
# Start from an empty output directory on every run.
if os.path.exists(save_path):
    shutil.rmtree(save_path)
os.mkdir(save_path)

# Rank the members of every cluster by cosine distance to the cluster centroid.
for index, row in centroids.iterrows():
    # iterrows() returns each row as a single-dtype Series, so an integer
    # cluster id stored next to float embeddings comes back as a float
    # (e.g. 3.0). Cast it so titles and folder names read "3", not "3.0".
    clstr = int(row["cluster"])
    clstr_centroid = row[emb_col_names].to_numpy(dtype=float)

    in_cluster = merged_df["cluster"] == clstr
    clstr_embs = merged_df.loc[in_cluster, emb_col_names].to_numpy()
    # Explicit copy: a "dist" column is added below, and writing into a slice
    # of merged_df would raise SettingWithCopyWarning.
    clstr_df = merged_df.loc[in_cluster, ["id", "img_path", "label"]].copy()

    cos_dists = np.array(
        [cosine(emb, clstr_centroid) for emb in clstr_embs], dtype=float
    )

    if len(clstr_df) >= 2:
        spearman_corr, spearman_p_value = spearmanr(clstr_df["label"], cos_dists)
        pearson_corr, pearson_p_value = pearsonr(clstr_df["label"], cos_dists)
    else:
        # pearsonr rejects inputs shorter than 2, which would abort the whole
        # loop on a single-member cluster; report NaN for such clusters.
        spearman_corr = spearman_p_value = float("nan")
        pearson_corr = pearson_p_value = float("nan")

    plt.title(f"Cluster {clstr}, sprmn: {spearman_corr} p-value: {spearman_p_value}; prsn: {pearson_corr}, p-value: {pearson_p_value}")
    plt.scatter(clstr_df["label"], cos_dists)
    plt.show()

    clstr_df["dist"] = cos_dists

    # Copy the five members farthest from the centroid into <save_path>/<cluster>/.
    os.mkdir(os.path.join(save_path, str(clstr)))
    for i, row1 in clstr_df.sort_values(by="dist", ascending=False).head(5).iterrows():
        shutil.copyfile(
            row1["img_path"],
            os.path.join(save_path, str(clstr), os.path.basename(row1["img_path"]))
        )



In [None]:
save_path = "l2_dist"
# Start from an empty output directory on every run.
if os.path.exists(save_path):
    shutil.rmtree(save_path)
os.mkdir(save_path)


# Rank the members of every cluster by L2 distance to the cluster centroid.
for index, row in centroids.iterrows():
    # iterrows() returns each row as a single-dtype Series, so an integer
    # cluster id stored next to float embeddings comes back as a float
    # (e.g. 3.0). Cast it so titles and folder names read "3", not "3.0".
    clstr = int(row["cluster"])
    clstr_centroid = row[emb_col_names].to_numpy(dtype=float)

    in_cluster = merged_df["cluster"] == clstr
    clstr_embs = merged_df.loc[in_cluster, emb_col_names].to_numpy()
    # Explicit copy: a "dist" column is added below, and writing into a slice
    # of merged_df would raise SettingWithCopyWarning.
    clstr_df = merged_df.loc[in_cluster, ["id", "img_path", "label"]].copy()

    # Row-wise Euclidean norm of the offset from the centroid.
    l2_dists = np.linalg.norm(clstr_embs - clstr_centroid, axis=1)

    if len(clstr_df) >= 2:
        spearman_corr, spearman_p_value = spearmanr(clstr_df["label"], l2_dists)
        pearson_corr, pearson_p_value = pearsonr(clstr_df["label"], l2_dists)
    else:
        # pearsonr rejects inputs shorter than 2, which would abort the whole
        # loop on a single-member cluster; report NaN for such clusters.
        spearman_corr = spearman_p_value = float("nan")
        pearson_corr = pearson_p_value = float("nan")

    plt.title(f"Cluster {clstr}, sprmn: {spearman_corr} p-value: {spearman_p_value}; prsn: {pearson_corr}, p-value: {pearson_p_value}")
    plt.scatter(clstr_df["label"], l2_dists)
    plt.show()

    clstr_df["dist"] = l2_dists

    # Copy the five members farthest from the centroid into <save_path>/<cluster>/.
    os.mkdir(os.path.join(save_path, str(clstr)))
    for i, row1 in clstr_df.sort_values(by="dist", ascending=False).head(5).iterrows():
        shutil.copyfile(
            row1["img_path"],
            os.path.join(save_path, str(clstr), os.path.basename(row1["img_path"]))
        )


