# Kolmogorov-Smirnov (K-S) test for query minimum euclidean distance distributions

In [9]:
# Imports
import pandas as pd
from scipy.stats import ks_2samp
from itertools import combinations

# Load Datasets

In [2]:
# Load the ditance profiles from the CSV file
distance_profiles = pd.read_csv('data/distance_profiles1.csv')

# Extract the minimum euclidean distance between selected queries and all trips

In [3]:
# The best queries from each impact trip based on the average minimum Euclidean distance
selected_query_num = [7, 13, 21, 22, 34]

In [4]:
# Only keep the best queries in the distance_profiles
distance_profiles = distance_profiles[distance_profiles['query_num'].isin(selected_query_num)]

In [6]:
distance_profiles.head()

Unnamed: 0,trip_num,query_num,distance_profile,min_euclidean_distance
39468,0,7,[22.88592902 29.41060894 18.69057109 22.443871...,18.690571
39469,1,7,[25.52705058 24.82942687 25.48248208 21.368863...,14.168484
39470,2,7,[25.2655633 26.57540994 23.82764794 ... 21.86...,9.316708
39471,3,7,[22.47321033 22.7510133 21.93033253 21.162031...,13.410349
39472,4,7,[22.90302297 22.41664689 24.09112514 24.734674...,13.69116


In [7]:
# Create a dictionary that contains the query number as the key, and then all the minimum distances in an array as the value
distance_profiles_dict = {}
for query_num in selected_query_num:
    distance_profiles_dict[query_num] = distance_profiles[distance_profiles['query_num'] == query_num]['min_euclidean_distance'].values

In [8]:
# Show the dictionary
distance_profiles_dict

{7: array([18.69057109, 14.16848367,  9.3167081 , ...,  8.80665509,
        12.04036232, 10.46606536]),
 13: array([19.36887936, 11.30584006, 11.10926285, ..., 11.62525096,
        14.32147987, 10.8132696 ]),
 21: array([15.68953368, 13.42646348, 10.11526571, ..., 12.34749472,
        13.51178227,  9.001042  ]),
 22: array([16.3010793 , 12.09697571, 10.17847698, ..., 10.59114888,
        10.71156309,  8.20341393]),
 34: array([14.9955456 , 13.88959424, 10.97533621, ..., 12.63657566,
        10.21786138, 10.31009396])}

# Perform K-S Statistical Test

In [13]:
# Perform K-S test on each pair of queries
for query_a, query_b in combinations(distance_profiles_dict.keys(), 2):
    dist_a = distance_profiles_dict[query_a]
    dist_b = distance_profiles_dict[query_b]

    # Make sure the two distributions have the same length
    min_len = min(len(dist_a), len(dist_b))
    dist_a = dist_a[:min_len]
    dist_b = dist_b[:min_len]
    
    # Perform K-S test
    ks_stat, p_value = ks_2samp(dist_a, dist_b)
    
    # Output results
    print(f"K-S Test between Query {query_a} and Query {query_b}:")
    print(f"KS Statistic: {ks_stat:.4f}, P-value: {p_value:.20f}")
    
    if p_value > 0.05:
        print(f"The distributions of Query {query_a} and Query {query_b} are similar (fail to reject H₀).\n")
    else:
        print(f"The distributions of Query {query_a} and Query {query_b} are significantly different (reject H₀).\n")


K-S Test between Query 7 and Query 13:
KS Statistic: 0.2928, P-value: 0.00000000000000000000
The distributions of Query 7 and Query 13 are significantly different (reject H₀).

K-S Test between Query 7 and Query 21:
KS Statistic: 0.1406, P-value: 0.00000000000000000000
The distributions of Query 7 and Query 21 are significantly different (reject H₀).

K-S Test between Query 7 and Query 22:
KS Statistic: 0.1391, P-value: 0.00000000000000000000
The distributions of Query 7 and Query 22 are significantly different (reject H₀).

K-S Test between Query 7 and Query 34:
KS Statistic: 0.1859, P-value: 0.00000000000000000000
The distributions of Query 7 and Query 34 are significantly different (reject H₀).

K-S Test between Query 13 and Query 21:
KS Statistic: 0.2259, P-value: 0.00000000000000000000
The distributions of Query 13 and Query 21 are significantly different (reject H₀).

K-S Test between Query 13 and Query 22:
KS Statistic: 0.4249, P-value: 0.00000000000000000000
The distributions o