In [None]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os
import shutil
from tqdm import tqdm
from PIL import Image
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
import torch
import clip

In [None]:
def create_tsne(df):
    """Project the float64 columns of ``df`` onto two t-SNE components.

    Only columns whose dtype is ``np.float64`` are used as features; every
    other column of ``df`` is ignored.

    Returns:
        A ``(tsne, tsne_results)`` tuple: the ``TSNE`` estimator (built with
        ``n_components=2`` and ``random_state=42``) and whatever its
        ``fit_transform`` returned for the selected columns.
    """
    features = df.select_dtypes(include=[np.float64])
    model = TSNE(n_components=2, random_state=42)
    return model, model.fit_transform(features)

In [None]:
def save_clusters_by_column2(df, dst, cluster_col, cluster_nums):
    """Export every cluster to its own folder under ``dst`` and summarise ratings.

    ``dst`` is removed (if it already exists) and recreated. An ``info.csv``
    mapping each ``id`` to its cluster label is written into ``dst``. For each
    cluster number a sub-folder named ``<num>[mean_<mean>][std_<std>]`` is
    created; it receives a ``cl_info.csv`` (id, rating, generation prompt) and
    a copy of every image of that cluster. Each copied image has its rating
    tagged onto the file name, in front of the extension
    (``img.png`` -> ``img[2].png``), so the copies stay openable as images.

    Args:
        df: DataFrame with the columns ``id``, ``rating``, ``img_path`` and
            ``img_generation_promt``.
        dst: Destination directory. WARNING: deleted recursively if present.
        cluster_col: The cluster labels themselves (not a column name); it is
            compared against every cluster number to build the row mask for
            ``df``, so it has to line up with the rows of ``df``.
        cluster_nums: Iterable of the cluster numbers to export.

    Returns:
        DataFrame with one row per exported cluster and the columns
        ``cluster``, ``mean`` and ``std`` (of the rating) plus ``0``..``3``
        holding the number of images with that rating.
    """
    rating_values = range(4)  # the summary counts ratings 0, 1, 2 and 3
    # Materialise once: the numbers are iterated below and reused in the result,
    # which would silently break for one-shot iterables such as generators.
    cluster_nums = list(cluster_nums)

    # Always start from an empty destination directory.
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)

    pd.DataFrame({
        "id": df["id"],
        "cl": cluster_col
    }).to_csv(os.path.join(dst, "info.csv"), index=False)

    means = []
    stds = []
    nums = {rating: [] for rating in rating_values}

    for cluster_num in cluster_nums:
        cl_df = df[cluster_col == cluster_num]
        cl_rating = cl_df["rating"]

        for rating in rating_values:
            nums[rating].append(cl_df.loc[cl_rating == rating, "id"].count())

        cl_mean = cl_rating.mean()
        cl_std = cl_rating.std()
        means.append(cl_mean)
        stds.append(cl_std)

        cl_dir_path = os.path.join(dst, f"{cluster_num}[mean_{cl_mean}][std_{cl_std}]")
        os.mkdir(cl_dir_path)
        cl_df[["id", "rating", "img_generation_promt"]].to_csv(
            os.path.join(cl_dir_path, "cl_info.csv"), index=False
        )

        for img_path, rating in zip(cl_df["img_path"], cl_df["rating"]):
            # Put the rating tag before the extension so the copy is still
            # recognised as an image file.
            stem, ext = os.path.splitext(os.path.basename(img_path))
            shutil.copyfile(img_path, os.path.join(cl_dir_path, f"{stem}[{rating}]{ext}"))

    return pd.DataFrame({
        "cluster": cluster_nums,
        "mean": means,
        "std": stds,
        **nums
    })

In [None]:
# Load the master table and prefix every image path with the dataset folder.
data_path = os.path.join("..", "..", "datasets")
full_set = pd.read_csv(os.path.join(data_path, "full_set.csv"))
full_set["img_path"] = full_set["img_path"].map(
    lambda rel_path: os.path.join(data_path, rel_path)
)
full_set

In [None]:
# Folder with the embedding CSVs; the sections below read "<name>.csv" from it.
emb_dir = os.path.join(data_path, "embeddings")

<h1> ViT-B32_img </h1>

In [None]:
name = "ViT-B32_img"

# Embedding table for this model, read from "<name>.csv" in the embeddings folder.
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Join every embedding row with its rating (exposed under the name "label").
merged_df = (
    full_set[["id", "rating"]]
    .merge(df, on="id")
    .rename(columns={"rating": "label"})
)

# Use the shared helper rather than repeating the t-SNE setup inline; it
# projects the float64 columns of merged_df onto two components.
tsne, tsne_results = create_tsne(merged_df)

tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

In [None]:
# Train KMeans on the numerical (float64) columns
n_clusters = 40
kmeans = KMeans(n_clusters=n_clusters, n_init="auto", max_iter=1000, random_state=0)
clusters = kmeans.fit_predict(merged_df.select_dtypes(include=[np.float64]))

# Create a new DataFrame with the cluster labels
cluster_df = pd.DataFrame(clusters, columns=['cluster'])

# t-SNE coordinates next to the cluster label, and each cluster's mean position.
combined_df = pd.concat([tsne_df, cluster_df], axis=1)
centroids = combined_df.groupby('cluster').mean()

# NOTE(review): this pairs rows of full_set with labels computed from merged_df
# by position — it assumes both have the same rows in the same order; confirm.
cl_promt_df = pd.DataFrame({
    "id": full_set["id"],
    "cl": clusters,
    "promt": full_set["img_generation_promt"]
})

scatter = []
for cluster in centroids.index:
    cluster_data = combined_df[combined_df['cluster'] == cluster]

    # NOTE(review): this assignment used to have no right-hand side, which made
    # the whole cell a SyntaxError. Joining the cluster's prompts into one text
    # is the presumed intent — confirm; the value is not used further below.
    promt_cl_text = " ".join(
        cl_promt_df.loc[cl_promt_df["cl"] == cluster, "promt"].astype(str)
    )

    scatter.append(plt.scatter(cluster_data['TSNE1'], cluster_data['TSNE2'], label=f'cluster {cluster}'))

    # Label the cluster at its centroid in t-SNE space.
    centroid = centroids.loc[cluster]
    plt.annotate(f'cluster {cluster}', (centroid['TSNE1'], centroid['TSNE2']),
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=10, color='black')

plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.title('KMeans Clustering')
plt.show()


In [None]:
# Export the clusters to clusters/<name>/KMeans and keep the per-cluster rating summary.
cl_infos = save_clusters_by_column2(
    df=full_set,
    dst=os.path.join("clusters", name, "KMeans"),
    cluster_col=combined_df['cluster'],
    cluster_nums=range(n_clusters),
)

In [None]:
# Mean rating per cluster, one bar per cluster number.
plt.bar(x=cl_infos["cluster"], height=cl_infos["mean"])
plt.title("mean")
plt.show()


# for i in ["mean", "std", 0, 1, 2, 3]:
#     plt.bar(cl_infos["cluster"], cl_infos[i])
#     plt.title(i)
#     plt.show()

<h1> ViT-B32_text </h1>

In [None]:
name = "ViT-B32_text"

# The clusters for this section are exported only after they have been
# computed (see the save cell further down). Exporting here would write the
# previous section's `combined_df` into this section's folder.

# Embedding table for this model, read from "<name>.csv" in the embeddings folder.
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Join every embedding row with its rating (exposed under the name "label").
merged_df = (
    full_set[["id", "rating"]]
    .merge(df, on="id")
    .rename(columns={"rating": "label"})
)

# Use the shared helper rather than repeating the t-SNE setup inline; it
# projects the float64 columns of merged_df onto two components.
tsne, tsne_results = create_tsne(merged_df)

tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

In [None]:
# Fit KMeans on the float64 columns and colour the t-SNE map by cluster.
n_clusters = 40
kmeans = KMeans(n_clusters=n_clusters, n_init="auto", max_iter=1000, random_state=0)
clusters = kmeans.fit_predict(merged_df.select_dtypes(include=[np.float64]))

# One row per sample: t-SNE coordinates next to the assigned cluster label.
cluster_df = pd.DataFrame({'cluster': clusters})
combined_df = pd.concat([tsne_df, cluster_df], axis=1)

unique_clusters = combined_df['cluster'].unique()
scatter = []
for cluster in unique_clusters:
    in_cluster = combined_df['cluster'] == cluster
    cluster_data = combined_df[in_cluster]
    plt.scatter(
        cluster_data['TSNE1'],
        cluster_data['TSNE2'],
        label=f'cluster {cluster}',
    )

plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.title('KMeans Clustering')
plt.legend()

In [None]:
# Annotated view of the clustering fitted in the previous cell: the same
# scatter, with every cluster labelled at its centroid in t-SNE space.
# KMeans is not fitted again here — the same data with the same random_state
# would only reproduce the labels already stored in `combined_df`.
centroids = combined_df.groupby('cluster').mean()

scatter = []
for cluster in centroids.index:
    cluster_data = combined_df[combined_df['cluster'] == cluster]
    scatter.append(plt.scatter(cluster_data['TSNE1'], cluster_data['TSNE2'], label=f'cluster {cluster}'))

    # Label the cluster at its centroid.
    centroid = centroids.loc[cluster]
    plt.annotate(f'cluster {cluster}', (centroid['TSNE1'], centroid['TSNE2']),
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=10, color='black')

plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.title('KMeans Clustering')
plt.show()


In [None]:
# Export the clusters to clusters/<name>/KMeans and keep the per-cluster rating summary.
cl_infos = save_clusters_by_column2(
    df=full_set,
    dst=os.path.join("clusters", name, "KMeans"),
    cluster_col=combined_df['cluster'],
    cluster_nums=range(n_clusters),
)

In [None]:
# Mean rating per cluster, one bar per cluster number.
plt.bar(x=cl_infos["cluster"], height=cl_infos["mean"])
plt.title("mean")
plt.show()


# for i in ["mean", "std", 0, 1, 2, 3]:
#     plt.bar(cl_infos["cluster"], cl_infos[i])
#     plt.title(i)
#     plt.show()

<h1> ViT-B32_both </h1>

In [None]:
name = "ViT-B32_both"

# Embedding table for this model, read from "<name>.csv" in the embeddings folder.
df = pd.read_csv(os.path.join(emb_dir, f"{name}.csv"))

# Join every embedding row with its rating (exposed under the name "label").
merged_df = (
    full_set[["id", "rating"]]
    .merge(df, on="id")
    .rename(columns={"rating": "label"})
)

# Use the shared helper rather than repeating the t-SNE setup inline; it
# projects the float64 columns of merged_df onto two components.
tsne, tsne_results = create_tsne(merged_df)

tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])

In [None]:
# Fit KMeans on the float64 columns and colour the t-SNE map by cluster.
n_clusters = 40
kmeans = KMeans(n_clusters=n_clusters, n_init="auto", max_iter=1000, random_state=0)
clusters = kmeans.fit_predict(merged_df.select_dtypes(include=[np.float64]))

# One row per sample: t-SNE coordinates next to the assigned cluster label.
cluster_df = pd.DataFrame({'cluster': clusters})
combined_df = pd.concat([tsne_df, cluster_df], axis=1)

unique_clusters = combined_df['cluster'].unique()
scatter = []
for cluster in unique_clusters:
    in_cluster = combined_df['cluster'] == cluster
    cluster_data = combined_df[in_cluster]
    plt.scatter(
        cluster_data['TSNE1'],
        cluster_data['TSNE2'],
        label=f'cluster {cluster}',
    )

plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.title('KMeans Clustering')
plt.legend()

In [None]:
# Annotated view of the clustering fitted in the previous cell: the same
# scatter, with every cluster labelled at its centroid in t-SNE space.
# KMeans is not fitted again here — the same data with the same random_state
# would only reproduce the labels already stored in `combined_df`.
centroids = combined_df.groupby('cluster').mean()

scatter = []
for cluster in centroids.index:
    cluster_data = combined_df[combined_df['cluster'] == cluster]
    scatter.append(plt.scatter(cluster_data['TSNE1'], cluster_data['TSNE2'], label=f'cluster {cluster}'))

    # Label the cluster at its centroid.
    centroid = centroids.loc[cluster]
    plt.annotate(f'cluster {cluster}', (centroid['TSNE1'], centroid['TSNE2']),
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=10, color='black')

plt.xlabel('TSNE1')
plt.ylabel('TSNE2')
plt.title('KMeans Clustering')
plt.show()


In [None]:
# Export the clusters to clusters/<name>/KMeans and keep the per-cluster rating summary.
cl_infos = save_clusters_by_column2(
    df=full_set,
    dst=os.path.join("clusters", name, "KMeans"),
    cluster_col=combined_df['cluster'],
    cluster_nums=range(n_clusters),
)

In [None]:
# Mean rating per cluster, one bar per cluster number.
plt.bar(x=cl_infos["cluster"], height=cl_infos["mean"])
plt.title("mean")
plt.show()


# for i in ["mean", "std", 0, 1, 2, 3]:
#     plt.bar(cl_infos["cluster"], cl_infos[i])
#     plt.title(i)
#     plt.show()