In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc

In [2]:
df = pd.read_csv('/Similarity/similarity.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   BN                  108 non-null    object 
 1   Starter             108 non-null    int64  
 2   Synthesizer         108 non-null    object 
 3   Augmentation        108 non-null    int64  
 4   Dataset             108 non-null    object 
 5   Column Shapes       108 non-null    float64
 6   Column Pair Trends  108 non-null    float64
 7   Average             108 non-null    float64
 8   KL improvement      108 non-null    float64
 9   Logical             108 non-null    int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 8.6+ KB


In [3]:
variables = ['Average', 'Column Shapes', 'Column Pair Trends']

for var in variables:
    # Calculate the ROC curve
    fpr, tpr, thresholds = roc_curve(df['Logical'], df[var])
    
    # Calculate the AUC (Area Under the Curve)
    roc_auc = auc(fpr, tpr)
    optimal_idx = np.argmin(np.sqrt(fpr**2 + (1-tpr)**2))
    optimal_threshold = thresholds[optimal_idx]
    print(f'Optimal cut-off for {var}: {optimal_threshold}')

Optimal cut-off for Average: 95.8
Optimal cut-off for Column Shapes: 95.86
Optimal cut-off for Column Pair Trends: 94.18


Now that the Cut-ff was calculated we can generate two arrays based on this Cut-off and perform a permutation test

In [6]:
# Filter the data based on the 'Average' column
kl_high_similarity = df.loc[df['Average'] >= 95.8, 'KL improvement'].values
kl_low_similarity = df.loc[df['Average'] < 95.8, 'KL improvement'].values

# Calculate the observed test statistic
observed_diff = np.mean(kl_high_similarity) - np.mean(kl_low_similarity)
print(f'Observed difference in means: {observed_diff:.5f}')

# Combine the datasets
combined = np.concatenate([kl_high_similarity, kl_low_similarity])


Observed difference in means: 2.37160


In [7]:
# Perform the permutation test
n_permutations = 10000
print(f'Number of permutations: {n_permutations}')
permuted_diffs = []
for _ in range(n_permutations):
    permuted = np.random.permutation(combined)
    permuted_high_similarity = permuted[:len(kl_high_similarity)]
    permuted_low_similarity = permuted[len(kl_high_similarity):]
    permuted_diff = np.mean(permuted_high_similarity) - np.mean(permuted_low_similarity)
    permuted_diffs.append(permuted_diff)

# Calculate the P-value
p_value = np.mean(np.abs(permuted_diffs) >= np.abs(observed_diff))
print(f'P-value: {p_value:.5f}')

Number of permutations: 10000
P-value: 0.00000
