In [2]:
import numpy as np
import polars as pl
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import csv

In [3]:
# Load the processed fusion table and split it into a feature matrix and labels.
# Every column except the last is cast to float and used as a feature; the last
# column is taken as the class label and left uncast.
data_path = Path("../data/tabular/train_fusion_processed.csv")
df = pl.read_csv(data_path)
cols = df.columns
arr = df.to_numpy()
feature_block, label_column = arr[:, :-1], arr[:, -1]
X = feature_block.astype(float)
y = label_column

In [4]:
# Standardize the feature matrix with a default StandardScaler: the scaler is
# fitted on the full matrix X and then applied to that same matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [5]:
# Class centroids: for each distinct label, the column-wise mean of the scaled
# feature rows that carry that label.
classes = np.unique(y)
centroids = {label: X_scaled[y == label].mean(axis=0) for label in classes}

In [6]:
# Distance of every sample to the centroid of its OWN class (Euclidean norm in
# the scaled feature space), followed by a per-class z-score of those distances:
# how unusual a sample's distance is compared with the other members of its class.
sample_classes = list(y)
distances = [
    np.linalg.norm(x_sample - centroids[cls])
    for x_sample, cls in zip(X_scaled, y)
]
# Legacy name: this list holds the RAW distances, not z-scores. It is kept only
# so that any cell still reading `z_scores` keeps working; the actual z-scores
# live in `z_scores_final`.
z_scores = list(distances)

# Convert to arrays once, outside the class loop (these conversions do not
# depend on the class being processed). `distances` / `sample_classes` stay lists.
distances_arr = np.asarray(distances, dtype=float)
sample_classes_arr = np.asarray(sample_classes)

# Compute z-scores per class
z_scores_final = np.zeros(len(distances), dtype=float)
for cls in classes:
    mask = sample_classes_arr == cls
    class_dists = distances_arr[mask]
    mu = np.mean(class_dists)
    sigma = np.std(class_dists)  # population standard deviation (numpy default ddof=0)
    # When all distances in a class are equal, sigma is 0: the z-scores for that
    # class stay at their initial 0 instead of dividing by zero.
    if sigma > 0:
        z_scores_final[mask] = (class_dists - mu) / sigma

print(f"Z-scores computed: (range from {np.min(z_scores_final):.3f} to {np.max(z_scores_final):.3f})")

Z-scores computed: (range from -2.400 to 4.686)


In [7]:
# Flag outliers: samples whose per-class distance z-score is above 3.
# The test is one-sided, so only samples unusually FAR from their centroid count.
outlier_mask = z_scores_final > 3.0
outlier_indices = np.flatnonzero(outlier_mask)
num_outliers = outlier_mask.sum()

print(f"Outliers detected (z > 3): {num_outliers} / {len(z_scores_final)} samples")
print(f"Outlier percentage: {100 * num_outliers / len(z_scores_final):.2f}%")

Outliers detected (z > 3): 35 / 4496 samples
Outlier percentage: 0.78%


In [8]:
# Save outliers to CSV: one row per flagged sample with its index, class label,
# distance to its class centroid, and per-class z-score.
outlier_path = Path("report/outliers.csv")
# Derive the directory from the target path so the two cannot drift apart.
outlier_path.parent.mkdir(parents=True, exist_ok=True)
# newline="" is what the csv module expects for its file objects; the encoding is
# pinned to UTF-8 so the report does not depend on the platform's default encoding.
with open(outlier_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["sample_idx", "class", "distance_to_centroid", "z_score"])
    writer.writerows(
        (idx, sample_classes[idx], distances[idx], z_scores_final[idx])
        for idx in outlier_indices
    )

print(f"Outliers saved to {outlier_path}")

Outliers saved to report\outliers.csv


In [9]:
# Within-class spread: for each class, the standard deviation of its
# sample-to-centroid distances (stored under the notebook's "intra_sims" name).
print("Intra-class similarity (std of within-class distances)")
labels_arr = np.array(sample_classes)
dists_arr = np.array(distances)
intra_sims = {}
for cls in classes:
    class_dists = dists_arr[labels_arr == cls]
    std_dist = np.std(class_dists)
    intra_sims[cls] = std_dist
    print(f"Class {cls}: std = {std_dist:.4f} (mean dist = {np.mean(class_dists):.4f})")

Intra-class similarity (std of within-class distances)
Class banana: std = 4.5546 (mean dist = 13.9758)
Class carrot: std = 4.9409 (mean dist = 15.2108)
Class cucumber: std = 3.2821 (mean dist = 16.4332)
Class mandarin: std = 3.9472 (mean dist = 14.3565)
Class tomato: std = 3.7648 (mean dist = 16.4506)


In [10]:
# Between-class separation: Euclidean distance between the centroids of every
# unordered pair of classes, keyed by (class_a, class_b) in `classes` order.
print("Inter-class similarity (distance between centroids)")
inter_sims = {}
for i, cls_a in enumerate(classes):
    # Only pair cls_a with the classes that come after it, so each pair appears once.
    for cls_b in classes[i + 1:]:
        dist = np.linalg.norm(centroids[cls_a] - centroids[cls_b])
        inter_sims[(cls_a, cls_b)] = dist
        print(f"Distance {cls_a} <-> {cls_b}: {dist:.4f}")

Inter-class similarity (distance between centroids)
Distance banana <-> carrot: 6.3678
Distance banana <-> cucumber: 9.2614
Distance banana <-> mandarin: 5.3631
Distance banana <-> tomato: 12.2052
Distance carrot <-> cucumber: 6.5839
Distance carrot <-> mandarin: 4.7803
Distance carrot <-> tomato: 8.3592
Distance cucumber <-> mandarin: 8.3212
Distance cucumber <-> tomato: 9.2963
Distance mandarin <-> tomato: 8.2233


In [11]:
# Summary report: re-prints the dataset size, the outlier count, the per-class
# spread, and the pairwise centroid distances computed in the cells above.
print("Intra- and inter-class similarity analysis")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
print("Preprocessing: StandardScaler")
classes_str = ", ".join(map(str, classes))
print(f"Classes: {classes_str}")

print("--- Outliers ---")
print(f"Total outliers (z > 3): {num_outliers} ({100*num_outliers/len(z_scores_final):.2f}%)")

print("--- Intra-class similarity (within-class std of distances) ---")
for cls in classes:
    print(f"Class {cls}: std = {intra_sims[cls]:.4f}")

print("--- Inter-class similarity (distance between centroids) ---")
for (cls_a, cls_b), dist in inter_sims.items():
    print(f"{cls_a} <-> {cls_b}: {dist:.4f}")

Intra- and inter-class similarity analysis
Dataset: 4496 samples, 282 features
Preprocessing: StandardScaler
Classes: banana, carrot, cucumber, mandarin, tomato
--- Outliers ---
Total outliers (z > 3): 35 (0.78%)
--- Intra-class similarity (within-class std of distances) ---
Class banana: std = 4.5546
Class carrot: std = 4.9409
Class cucumber: std = 3.2821
Class mandarin: std = 3.9472
Class tomato: std = 3.7648
--- Inter-class similarity (distance between centroids) ---
banana <-> carrot: 6.3678
banana <-> cucumber: 9.2614
banana <-> mandarin: 5.3631
banana <-> tomato: 12.2052
carrot <-> cucumber: 6.5839
carrot <-> mandarin: 4.7803
carrot <-> tomato: 8.3592
cucumber <-> mandarin: 8.3212
cucumber <-> tomato: 9.2963
mandarin <-> tomato: 8.2233
