In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import faiss

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance

# Read the feature table from 'dataBaseNormalised.csv' into a DataFrame
df = pd.read_csv('dataBaseNormalised.csv')

# Columns used for the distance calculation, split by how they are compared:
# list-valued ("hist") columns and plain scalar columns.
hist_features = [
    'A3',
    'D1',
    'D2',
    'D3',
    'D4',
]
single_val_features = [
    'surfaceAreaObj',
    'compactnessObj',
    'rectangularityObj',
    'diameterObj',
    'convexityObj',
    'eccentricityObj',
]

# Function to calculate distances between objects
def _parse_histogram(cell):
    """Turn one histogram cell into a 1-D NumPy array."""
    if isinstance(cell, str):
        # NOTE(security): eval() executes arbitrary code found in the CSV cell.
        # Kept for compatibility with the existing file format; if the cells
        # are plain list literals, ast.literal_eval is the safe replacement.
        cell = eval(cell)
    return np.asarray(cell)


def calculate_distances(df, hist_features, single_val_features):
    """Compute one pairwise distance matrix per feature column of ``df``.

    Args:
        df: DataFrame with one row per object.
        hist_features: Column names whose cells hold a sequence of numbers,
            either as a Python-literal string (e.g. ``"[0.1, 0.2]"``) or as
            an already-parsed sequence.
        single_val_features: Column names holding one number per object.

    Returns:
        dict mapping each feature name to an ``(n, n)`` float array, where
        ``n == len(df)`` and entry ``[i, j]`` is the distance between rows
        ``i`` and ``j`` for that feature: ``wasserstein_distance`` for hist
        features, absolute difference for single-value features.
    """
    distance_features = {}
    n_objects = len(df)

    # Earth mover's distance (hist features)
    for hist_feature in hist_features:
        # Parse every cell exactly once instead of once per side of the pair.
        histograms = [_parse_histogram(cell) for cell in df[hist_feature]]

        # NOTE(review): the histogram entries are passed as sample *values*.
        # If they are bin counts/frequencies, the usual EMD would pass bin
        # positions as values and the entries as weights - confirm intent.
        distance_matrix = np.zeros((n_objects, n_objects))
        for i, src in enumerate(histograms):
            # The distance is symmetric and zero on the diagonal, so only the
            # upper triangle is computed and then mirrored.
            for j in range(i + 1, n_objects):
                dist = wasserstein_distance(src, histograms[j])
                distance_matrix[i, j] = dist
                distance_matrix[j, i] = dist

        distance_features[hist_feature] = distance_matrix

    # Absolute difference (single-value features), via broadcasting
    for single_val_feature in single_val_features:
        values = df[single_val_feature].to_numpy(dtype=float)
        distance_features[single_val_feature] = np.abs(values[:, None] - values[None, :])

    return distance_features

# Function to standardize and combine distances
def standardize_and_combine_distances(df, distance_features, hist_features, single_val_features=None):
    """Z-score the hist-feature distance matrices and sum all matrices.

    Each hist-feature matrix is standardized with its own global mean and
    standard deviation; single-value matrices are added unchanged.

    Args:
        df: Unused; kept so existing callers keep working.
        distance_features: dict mapping feature name to a square distance
            matrix (all matrices must share one shape).
        hist_features: Names of the matrices to standardize before summing.
        single_val_features: Names of the matrices to add as-is. When
            ``None`` (default), every key of ``distance_features`` that is
            not in ``hist_features`` is used, in dict order. Previously this
            was read from a module-level global of the same name.

    Returns:
        A new float array holding the element-wise sum.

    Raises:
        ValueError: If there is no feature at all to combine.
    """
    if single_val_features is None:
        single_val_features = [
            name for name in distance_features if name not in hist_features
        ]

    feature_names = list(hist_features) + list(single_val_features)
    if not feature_names:
        raise ValueError("no distance features to combine")

    # Take the shape from any available matrix so an empty hist_features
    # list no longer fails on hist_features[0].
    shape = np.shape(distance_features[feature_names[0]])
    combined_distances = np.zeros(shape, dtype=float)

    # Standardize hist features and accumulate them
    for hist_feature in hist_features:
        matrix = np.asarray(distance_features[hist_feature], dtype=float)
        centered = matrix - np.mean(matrix)
        std = np.std(matrix)
        # A constant matrix has std == 0; it carries no ranking information,
        # so it contributes zeros instead of NaN/inf from a 0-division.
        if std > 0:
            combined_distances += centered / std

    # Single-value features are added without rescaling
    for single_val_feature in single_val_features:
        combined_distances += distance_features[single_val_feature]

    return combined_distances

# Per-feature pairwise distance matrices for every object in df
distance_features = calculate_distances(df, hist_features, single_val_features)

# Collapse the per-feature matrices into one combined matrix
combined_distances = standardize_and_combine_distances(df, distance_features, hist_features)

# Result table: one row per object, holding its name, its class, and the
# (name, class) pairs of all objects sorted by ascending combined distance
result_columns = ['name', 'class', 'ordered_distances']
ordered_distances_df = pd.DataFrame(index=df.index, columns=result_columns)

name_column = df['name']
class_column = df['class']

# Fill the table row by row, announcing each object as its ranking starts
for i, distances_row in enumerate(combined_distances):
    print(f"Calculating distances for object: {df.at[i, 'name']}")

    # Positions of all objects, from the smallest combined distance upwards
    ranking = np.argsort(distances_row)
    ranked_pairs = list(
        zip(
            name_column.iloc[ranking].tolist(),
            class_column.iloc[ranking].tolist(),
        )
    )

    ordered_distances_df.at[i, 'name'] = df.at[i, 'name']
    ordered_distances_df.at[i, 'class'] = df.at[i, 'class']
    ordered_distances_df.at[i, 'ordered_distances'] = ranked_pairs

# Show the finished table
print(ordered_distances_df)
