In [2]:
# Установка необходимых библиотек
!pip install -U plotly scikit-learn tensorflow pandas numpy

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm import tqdm

# Initialize Plotly for use inside a Jupyter notebook
from plotly.offline import init_notebook_mode
import plotly.io as pio
init_notebook_mode(connected=True)
pio.renderers.default = 'plotly_mimetype+notebook'  # Default renderer used when figures are shown in Jupyter

# Configuration
img_dir = 'train/train_data/images'  # Directory that is scanned for .png images below
EMBEDDING_SIZE = 512  # Upper bound on the number of PCA components kept for the reduced embeddings
GROUP_SIZE = 26  # Number of consecutive (sorted) images that share one group label

# Load ResNet50 with ImageNet weights, without the classification head and with
# global average pooling, so that predict() yields one feature vector per image
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
print("Модель ResNet50 загружена")

# Embedding extraction
def extract_embeddings(image_paths, batch_size=32):
    """Compute an embedding vector for every image that can be processed.

    Each image is loaded and resized to 224x224, then images are preprocessed
    with ``preprocess_input`` and passed through the module-level ``model`` in
    batches. Running one ``model.predict`` call per batch instead of one per
    image removes most of the per-call overhead of ``predict``.

    Processing is best-effort: an image that cannot be loaded is reported on
    stdout and skipped; if inference fails for a batch, every image of that
    batch is reported and skipped.

    Args:
        image_paths: Iterable of paths to image files.
        batch_size: Number of images sent to ``model.predict`` at once.
            Must be at least 1.

    Returns:
        A tuple ``(embeddings, filenames)`` where ``embeddings`` is a NumPy
        array holding one flattened embedding per successfully processed
        image and ``filenames`` is the list of matching base file names, in
        the same order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    embeddings = []
    filenames = []
    pending_arrays = []  # decoded images waiting for the next predict() call
    pending_paths = []   # paths matching pending_arrays, index for index

    def _flush():
        """Run the model on the pending images and record their embeddings."""
        if not pending_arrays:
            return
        try:
            batch = preprocess_input(np.stack(pending_arrays))
            features = model.predict(batch, batch_size=batch_size, verbose=0)
        except Exception as e:
            # Keep the original best-effort contract: report and skip.
            for failed_path in pending_paths:
                print(f"Ошибка при обработке {failed_path}: {str(e)}")
        else:
            # One flattened row per image, same as flatten() on a single result.
            embeddings.extend(features.reshape(len(pending_paths), -1))
            filenames.extend(os.path.basename(p) for p in pending_paths)
        finally:
            pending_arrays.clear()
            pending_paths.clear()

    for img_path in tqdm(image_paths, desc="Обработка изображений"):
        try:
            img = image.load_img(img_path, target_size=(224, 224))
            x = image.img_to_array(img)
        except Exception as e:
            print(f"Ошибка при обработке {img_path}: {str(e)}")
            continue

        pending_arrays.append(x)
        pending_paths.append(img_path)
        if len(pending_arrays) >= batch_size:
            _flush()

    # Process the final, possibly partial, batch.
    _flush()

    return np.array(embeddings), filenames

# Collect the paths of all PNG images, sorted lexicographically
image_paths = sorted(
    os.path.join(img_dir, f)
    for f in os.listdir(img_dir)
    if f.lower().endswith('.png')
)

print(f"Найдено {len(image_paths)} изображений")

# Extract the embeddings
embeddings, filenames = extract_embeddings(image_paths)
print(f"Размер эмбеддингов: {embeddings.shape}")

# Reduce the dimensionality to at most EMBEDDING_SIZE with PCA.
# PCA cannot return more components than min(n_samples, n_features), so the
# requested size is capped by both dimensions of the embedding matrix.
pca_embedding = PCA(n_components=min(EMBEDDING_SIZE, *embeddings.shape))
embeddings_reduced = pca_embedding.fit_transform(embeddings)
print(f"Размер после PCA: {embeddings_reduced.shape}")

# Build the DataFrame describing every image.
# NOTE(review): groups are assigned purely by position in the sorted file
# list, so this assumes no image was skipped during extraction and that the
# lexicographic order matches the intended grouping - confirm.
group_numbers = [i // GROUP_SIZE for i in range(len(filenames))]
df = pd.DataFrame({
    'filename': filenames,
    'group': [f"G-{num}" for num in group_numbers],
    'group_num': group_numbers,
})

# PCA for the 2D visualization
pca_2d = PCA(n_components=2)
coords_2d = pca_2d.fit_transform(embeddings_reduced)
df['x'] = coords_2d[:, 0]
df['y'] = coords_2d[:, 1]

# PCA for the 3D visualization
pca_3d = PCA(n_components=3)
coords_3d = pca_3d.fit_transform(embeddings_reduced)
df['x3'] = coords_3d[:, 0]
df['y3'] = coords_3d[:, 1]
df['z3'] = coords_3d[:, 2]

# Group label annotations
def create_group_annotations(df, dim='2d'):
    """Build one text annotation per group, positioned at the group's mean point.

    Args:
        df: DataFrame with a ``group`` column plus the coordinate columns
            ``x``/``y`` (used when ``dim == '2d'``) or ``x3``/``y3``/``z3``
            (used for any other ``dim`` value).
        dim: ``'2d'`` for flat annotations (these also get a translucent
            white background); anything else yields 3D annotations.

    Returns:
        A list of annotation dicts, one per distinct group, in the order the
        groups first appear in ``df``.
    """
    is_flat = dim == '2d'
    # Maps each annotation coordinate key to the DataFrame column it averages.
    if is_flat:
        axis_columns = {'x': 'x', 'y': 'y'}
    else:
        axis_columns = {'x': 'x3', 'y': 'y3', 'z': 'z3'}

    result = []
    for label in df['group'].unique():
        members = df.loc[df['group'] == label]
        entry = {
            axis: members[column].mean()
            for axis, column in axis_columns.items()
        }
        entry['text'] = label
        entry['showarrow'] = False
        entry['font'] = {'size': 14, 'color': 'black'}
        if is_flat:
            entry['bgcolor'] = 'rgba(255,255,255,0.7)'
        result.append(entry)
    return result

# 2D scatter plot of the embeddings; the group labels and the fixed-size
# layout are attached right away by chaining update_layout
fig_2d = px.scatter(
    df,
    x='x',
    y='y',
    color='group',
    hover_name='filename',
    title='2D Projection of Image Embeddings',
    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2'},
).update_layout(
    annotations=create_group_annotations(df, '2d'),
    hoverlabel={'bgcolor': "white", 'font_size': 12},
    autosize=False,
    width=1200,
    height=800,
)

# 3D scatter plot; here the group labels live inside the scene
fig_3d = px.scatter_3d(
    df,
    x='x3',
    y='y3',
    z='z3',
    color='group',
    hover_name='filename',
    title='3D Projection of Image Embeddings',
    labels={'x3': 'PCA 1', 'y3': 'PCA 2', 'z3': 'PCA 3'},
).update_layout(
    scene={'annotations': create_group_annotations(df, '3d')},
    hoverlabel={'bgcolor': "white", 'font_size': 12},
    autosize=False,
    width=1200,
    height=800,
)

# Display both figures
fig_2d.show()
fig_3d.show()

# Persist the table (file names, groups and projected coordinates) - optional
df.to_csv('image_embeddings.csv', index=False)
print("Результаты сохранены в image_embeddings.csv")

Collecting plotly
  Using cached plotly-6.2.0-py3-none-any.whl.metadata (8.5 kB)
Collecting tensorflow
  Downloading tensorflow-2.20.0rc0-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.0.1-py3-none-any.whl.metadata (11 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (fr

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.2 which is incompatible.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
94765736/94765736 ━━━━━━━━━━━━━━━━━━━━ 16s 0us/step
Модель ResNet50 загружена
Найдено 13650 изображений


Обработка изображений: 100%|█████████████████████████████████████████████████████| 13650/13650 [38:37<00:00,  5.89it/s]


Размер эмбеддингов: (13650, 2048)
Размер после PCA: (13650, 512)


Результаты сохранены в image_embeddings.csv
