In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import cv2
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append("..")

from src.dataset_downloader.dataset_utils import get_dataset_dfs, load_images

In [None]:
def load_and_flatten_image(image_path, target_size=(28, 28)):
    """Read an image file as grayscale, resize it and flatten it to 1-D.

    Args:
        image_path: Path of the image file, handed to ``cv2.imread``.
        target_size: Output size forwarded to ``cv2.resize``; defaults to
            ``(28, 28)``.

    Returns:
        The resized grayscale image flattened into a one-dimensional array.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``.
    """
    grayscale = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if grayscale is not None:
        resized = cv2.resize(grayscale, target_size)
        return resized.flatten()
    # cv2.imread signals a failed read by returning None instead of raising.
    raise ValueError(f"Image at {image_path} could not be loaded.")

# Fetch the per-split tables for the dataset stored under ../data/mnist.
# NOTE(review): presumably a mapping of split name -> pandas DataFrame; confirm
# against get_dataset_dfs.
dfs = get_dataset_dfs("../data/mnist")
train_df, test_df = dfs['train'], dfs['test']

# Cast the class labels of both splits to integers (assigned back in place,
# so the objects held in `dfs` are updated as well).
for split_frame in (train_df, test_df):
    split_frame['class_name'] = split_frame['class_name'].astype(int)

In [None]:
# Load both splits with identical settings: grayscale, 28x28, max_workers=8.
image_load_options = dict(mode="grayscale", target_size=(28, 28), max_workers=8)
train_images = load_images(train_df['image_path'].values, **image_load_options)
test_images = load_images(test_df['image_path'].values, **image_load_options)

# Flatten every image into a vector, then stack the vectors into one matrix
# per split (one row per image).
train_image_vectors = [image.flatten() for image in train_images]
train_image_matrix = np.array(train_image_vectors)
test_image_vectors = [image.flatten() for image in test_images]
test_image_matrix = np.array(test_image_vectors)

# Fit KMeans on the training matrix, using one cluster per distinct class label.
n_classes = train_df['class_name'].nunique()
kmeans = KMeans(
    n_clusters=n_classes,
    n_init=10,
    max_iter=300,
    random_state=42,  # fixed seed so the clustering is repeatable
).fit(train_image_matrix)

# Keep the fitted centroids and the per-sample training assignments at hand.
cluster_centers, cluster_labels = kmeans.cluster_centers_, kmeans.labels_

In [None]:
from scipy.stats import mode
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

# Map each cluster to the most frequent true label (majority vote)
def map_clusters_to_labels(true_labels, cluster_labels):
    """Map every cluster id to the most frequent true label among its members.

    Majority vote: for each distinct value in ``cluster_labels`` the true
    labels of the samples assigned to that cluster are counted and the most
    common one becomes the cluster's label. Ties are resolved in favour of
    the smallest label, because ``np.unique`` returns sorted values and
    ``argmax`` picks the first maximum.

    Args:
        true_labels: 1-D array-like of ground-truth labels, one per sample.
        cluster_labels: 1-D array-like of cluster assignments, aligned
            element-wise with ``true_labels``.

    Returns:
        dict: ``{cluster_id: majority_label}`` with one entry per distinct
        cluster id; empty when the inputs are empty.
    """
    # Coerce to arrays so plain lists / pandas Series can be boolean-masked.
    true_labels = np.asarray(true_labels)
    cluster_labels = np.asarray(cluster_labels)

    label_mapping = {}
    for cluster in np.unique(cluster_labels):
        members = true_labels[cluster_labels == cluster]
        # Count occurrences with NumPy only: this also handles non-numeric
        # labels and does not depend on the SciPy `mode(keepdims=...)` API.
        values, counts = np.unique(members, return_counts=True)
        label_mapping[cluster] = values[counts.argmax()]
    return label_mapping

# Ground-truth labels of both splits, extracted once and reused below.
y_train = train_df['class_name'].values
y_test = test_df['class_name'].values

# Apply the cluster -> label mapping to the training data
cluster_to_label = map_clusters_to_labels(y_train, kmeans.labels_)
train_pred_labels = np.array([cluster_to_label[cluster_id] for cluster_id in kmeans.labels_])

# Compute the accuracy on the training data
train_accuracy = accuracy_score(y_train, train_pred_labels)
print(f"Trainingsgenauigkeit: {train_accuracy:.4f}")

# Apply KMeans to the test data (each sample is assigned to its nearest
# cluster, which is then translated into a label through the mapping)
test_cluster_labels = kmeans.predict(test_image_matrix)
test_pred_labels = np.array([cluster_to_label[cluster_id] for cluster_id in test_cluster_labels])

# Compute the accuracy on the test data
test_accuracy = accuracy_score(y_test, test_pred_labels)
print(f"Testgenauigkeit: {test_accuracy:.4f}")

# Optional: show the confusion matrix
test_confusion = confusion_matrix(y_test, test_pred_labels)
print("Konfusionsmatrix (Testdaten):")
print(test_confusion)
