In [None]:
import open_clip
import torch
from PIL import Image

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os
import shutil
from tqdm import tqdm
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler

from scipy.spatial.distance import cosine
from scipy.stats import spearmanr, pearsonr

import torch.nn.functional as F


In [None]:
def create_tsne(df):
    """Fit a two-component t-SNE on the float64 columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only columns whose dtype is ``np.float64`` take part in
        the projection; every other column is ignored.

    Returns
    -------
    tuple
        ``(tsne, tsne_results)``: the ``TSNE`` estimator (created with
        ``n_components=2, random_state=42``) and whatever its
        ``fit_transform`` returned for the selected columns.
    """
    # Keep only the float64 features; ids, strings and integer columns drop out here.
    float_features = df.select_dtypes(include=[np.float64])

    # Fixed seed so repeated runs give the same layout.
    reducer = TSNE(n_components=2, random_state=42)

    return reducer, reducer.fit_transform(float_features)

In [None]:
def capture_tsne(tsne_df, df):
    """Scatter-plot 2-D t-SNE coordinates, one series per distinct label.

    Parameters
    ----------
    tsne_df : pandas.DataFrame
        Frame with ``TSNE1`` and ``TSNE2`` columns. It is modified in place:
        a ``label`` column is added (or overwritten) from ``df``.
    df : pandas.DataFrame
        Frame providing the ``label`` column.

    Returns
    -------
    None
        The figure is displayed through ``plt.show()``.
    """
    # Attach the labels to the coordinates; this writes into the caller's frame.
    tsne_df["label"] = df["label"]

    # One scatter call per label so that each label gets its own legend entry.
    for current_label in tsne_df["label"].unique():
        in_group = tsne_df["label"] == current_label
        plt.scatter(
            tsne_df.loc[in_group, "TSNE1"],
            tsne_df.loc[in_group, "TSNE2"],
            label=current_label,
        )

    plt.xlabel("TSNE1")
    plt.ylabel("TSNE2")
    plt.title("T-SNE Visualization")
    plt.legend()

    plt.show()

In [None]:
# Dataset root, resolved relative to the current working directory.
data_path = os.path.join("..", "..", "datasets")

# Load the full item table, then prefix every stored image path with the
# dataset root so the paths can be opened from this notebook.
full_set_csv = os.path.join(data_path, "full_set.csv")
full_set = pd.read_csv(full_set_csv)
full_set["img_path"] = full_set["img_path"].map(
    lambda rel_path: os.path.join(data_path, rel_path)
)
full_set

In [None]:
# Folder (under the dataset root) from which the embedding CSV is read below.
emb_dir = os.path.join(data_path, "embeddings")

In [None]:
# Base name of the embedding table to analyse; the file is "<name>.csv" in emb_dir.
name = "open_clip_ViT-G-14_img"
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Attach each item's rating to its embedding row through the shared "id" key
# (inner join), and expose the rating under the column name "label".
merged_df = pd.merge(
    full_set[["id", "rating"]],
    df,
    on="id",
    how="inner",
).rename(columns={"rating": "label"})

merged_df

In [None]:
# Keyword (text) embeddings, read from the current working directory.
# The "Unnamed: 0" column is discarded — presumably an index saved with the CSV.
key_w_embs = pd.read_csv("open_clip_ViT-G-14_key_word_embs.csv")
key_w_embs = key_w_embs.drop(columns=["Unnamed: 0"])
key_w_embs

In [None]:
# Embedding feature columns are recognised by their "emb" name prefix.
emb_col_names = [
    column for column in merged_df.columns if column.startswith("emb")
]

In [None]:
# Image embeddings as a standalone array: one row per merged_df row,
# one column per embedding feature.
img_embs = merged_df[emb_col_names].to_numpy(copy=True)
img_embs

In [None]:
# Keyword embeddings as a standalone array, one row per row of key_w_embs.
text_embs = key_w_embs.to_numpy(copy=True)
text_embs

In [None]:
# Image-by-keyword dot products, turned into a distribution over keywords
# (softmax along dim=1) for every image row.
# NOTE(review): the raw dot products are used as logits, with no normalisation
# or temperature applied here — confirm the stored embeddings are already
# normalised the way this comparison expects.
similarity = torch.tensor(img_embs @ text_embs.T)
probs = torch.softmax(similarity, dim=1)

In [None]:
# Display the softmax output computed in the previous cell.
probs

In [None]:
# `probs` was computed from the embedding rows of `merged_df`, so the id and
# label columns must come from that same frame. `merged_df` is an inner merge
# on "id": it can drop or duplicate rows relative to `full_set`, in which case
# taking id/rating from `full_set` would pair probabilities with the wrong item
# (or fail on a length mismatch).
# Convert the tensor to NumPy explicitly so the prob_* columns are plain float
# columns instead of relying on pandas to coerce tensor slices.
prob_values = probs.detach().cpu().numpy()
assert prob_values.shape[0] == len(merged_df), (
    "probs and merged_df are out of sync; re-run the cells above in order"
)

prob_df = pd.concat(
    [
        # reset_index so both parts share the same 0..n-1 row index.
        merged_df[["id", "label"]].reset_index(drop=True),
        pd.DataFrame(
            prob_values,
            columns=[f"prob_{i}" for i in range(prob_values.shape[1])],
        ),
    ],
    axis=1,
)
prob_df

In [None]:
# Pairwise correlations (pandas default method) between the label and every
# keyword probability; "id" is only an identifier, so it is left out.
corr_matrix = prob_df.drop(columns=["id"]).corr()
corr_matrix

In [None]:
import seaborn as sns

# Draw the correlation matrix as a heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", square=True, ax=ax)

# Set the axis limits explicitly so the heatmap spans the whole matrix
# (y runs top-to-bottom, from the row count down to 0).
n_rows, n_cols = corr_matrix.shape
ax.set_ylim(n_rows, 0)
ax.set_xlim(0, n_cols)

# Render the figure
plt.show()

In [None]:
# Correlation of every column with the label, in ascending order
# (the label's own self-correlation is part of the listing).
corr_matrix.loc[:, "label"].sort_values(ascending=True)