# Bitwise Hash Re-weighting

This notebook is a bit rougher, but allows for the replication of the hash-bit-position weighting used in the paper.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all" #type:ignore
# Plot theme used while exploring; the alternatives are 'dark_background' and 'default'.
plt.style.use('dark_background')
sns.set_context('paper')

%reload_ext autoreload
%autoreload 2

from demo00_conf import *

# Load hashes

In [None]:
from phaser.utils import load_labelencoders, bin2bool

# Label encoders are read from ./demo_outputs/; the keys used in this notebook
# are 'a' (algorithms), 't' (transforms) and 'm' (metrics).
le = load_labelencoders(path="./demo_outputs/", filename="LabelEncoders.bz2")

ALGORITHMS = le['a'].classes_
TRANSFORMS = le['t'].classes_
METRICS = le['m']  # the encoder itself; `.classes_` is read where the names are needed
# METR_dict is not defined here: it arrives through the star import of the demo config.

# Hash table produced earlier and stored under ./demo_outputs/
df_h = load("./demo_outputs/Hashes.df.bz2")

# Row count per transform (expected: 250k each).
df_h.groupby(['transformation']).count()

In [None]:
# # Convert binary to boolean for distance computation
# for a in le["a"].classes_:
#     df_h[a] = df_h[a].apply(bin2bool)

## Remove "bad" hashes that sum to 0 as this causes issues (Wavehash, largely)

In [None]:
# # nasty hack to find hashes with all bits set to False
# mask = df_h['wave'].apply(lambda x: sum(x)) == 0
# bad_filenames = df_h[mask]['filename'].unique()
# print(f"Found {len(bad_filenames)} bad filenames. Removing from main hashes")

# Drop every file that has an all-zero hash in any hash column, since such
# hashes can cause issues with distance metrics. Filtering happens column by
# column, so each printed count refers to the rows still left at that point.
hash_columns = df_h.columns[2:]  # hash columns are taken to start at position 2
for algo_col in hash_columns:
    is_all_zero = df_h[algo_col].apply(lambda bits: sum(bits)) == 0
    bad_filenames = df_h.loc[is_all_zero, "filename"].unique()

    print(f"{len(bad_filenames)} bad hashes found for {algo_col}")

    if len(bad_filenames) == 0:
        continue
    # Remove all rows (every transform) belonging to the affected files.
    df_h = df_h[~df_h["filename"].isin(bad_filenames)]

## Subset if desired

In [None]:
# subset_size = 250_000

# unique_files = sorted(df_h['filename'].unique())
# sampled_files = np.random.choice(unique_files, subset_size, replace=False)

# df_h_sub = df_h[df_h['filename'].isin(sampled_files)]
# df_h_sub.groupby(['transformation']).count()
# df_h = df_h_sub

# Compute distances

In [None]:
# Announce which distance metrics (the keys of METR_dict) are about to be used.
print("Computing distances using the following metrics", METR_dict.keys(), sep="\n")

In [None]:
from phaser.similarities import IntraDistance, InterDistance, find_inter_samplesize

In [None]:
# Intra distances: rows produced here are tagged with class 1 (set_class=1).
INTRA = IntraDistance(
    METR_dict,
    le,
    progress_bar=True,
    set_class=1,
)
dist_intra = INTRA.fit(df_h)

In [None]:
# Preview the first rows of the intra-distance table.
dist_intra.head()

In [None]:
# Sanity check: number of intra-distance rows per (algo, metric) pair.
dist_intra.groupby(['algo','metric']).count()

In [None]:
# Size the inter-distance sample from the number of distinct files.
# The count is taken directly from the array of unique filenames; no
# arithmetic is applied to the filenames themselves.
n_unique_files = len(df_h["filename"].unique())
n_samples = find_inter_samplesize(n_unique_files)
print(n_samples)

# Inter distances: rows produced here are tagged with class 0 (set_class=0).
INTER = InterDistance(METR_dict, le, set_class=0, n_samples=n_samples, progress_bar=True)
dist_inter = INTER.fit(df_h)

In [None]:
# Preview the first rows of the inter-distance table.
dist_inter.head()

In [None]:
# Sanity check: number of inter-distance rows per (algo, metric) pair.
dist_inter.groupby(['algo','metric']).count()

In [None]:
# Number of distinct 'fileA' values in the intra- and inter-distance tables.
# Both values are displayed because ast_node_interactivity is set to "all".
len(dist_intra['fileA'].unique())
len(dist_inter['fileA'].unique())

In [None]:
# Stack intra- and inter-distances into the single frame handed to ComputeMetrics.
df_d = pd.concat([dist_intra,dist_inter])

## Compute metrics

In [None]:
# Build every (algorithm, transform, metric) combination to evaluate.
# 'orig' is left out of the transform axis.
transforms_without_orig = [t for t in TRANSFORMS if t != 'orig']
triplet_grid = np.meshgrid(ALGORITHMS, transforms_without_orig, METRICS.classes_)
# Transpose + reshape flattens the grid into one row per combination, 3 columns wide.
triplets = np.array(triplet_grid).T.reshape(-1,3)

print(f"Number of triplets to analyse: {len(triplets)}")

In [None]:
from phaser.evaluation import ComputeMetrics

# Evaluate the unweighted distances for every triplet.
# fit() yields two objects here: `m` (per-triplet metrics, with AUC/EER columns
# used below) and `b` (later passed to make_bit_weights and indexed by
# "<algo>_<transform>_<metric>" keys).
# NOTE(review): presumably analyse_bits=True is what populates `b` — confirm in phaser.
cm = ComputeMetrics(le, df_d, df_h, analyse_bits=True, n_jobs=1)
m, b = cm.fit(triplets, weighted=False)

In [None]:
# Mean and standard deviation of AUC/EER, first per algorithm and then per
# (algorithm, transform) pair.
score_cols = ['AUC','EER']
summary_stats = ['mean','std']
m.groupby(['Algorithm'])[score_cols].agg(summary_stats)
m.groupby(['Algorithm','Transform'])[score_cols].agg(summary_stats)

# Compute distances with bit weights

In [None]:
from phaser.evaluation import make_bit_weights
# Derive bit weights from the bit analysis `b` returned by the unweighted run.
# The result is keyed by strings such as 'phash_Hamming' (see the plots below).
bit_weights = make_bit_weights(b, le)

In [None]:
# Repeat the intra/inter distance computation, this time passing bit_weights.
# The same n_samples as in the unweighted run is reused for the inter distances.
INTRA_w = IntraDistance(METR_dict, le, set_class=1, bit_weights=bit_weights, progress_bar=True)
dist_intra_w = INTRA_w.fit(df_h)
INTER_w = InterDistance(METR_dict, le, set_class=0, n_samples=n_samples, bit_weights=bit_weights, progress_bar=True)
dist_inter_w = INTER_w.fit(df_h)

# Evaluate the weighted distances on the same triplets as the unweighted run.
df_d_w = pd.concat([dist_intra_w, dist_inter_w])
# NOTE(review): analyse_bits=False yet two values are unpacked from fit() —
# confirm that fit() still returns a pair in this mode.
cm_w = ComputeMetrics(le, df_d_w, df_h, analyse_bits=False, n_jobs=1)
# NOTE(review): weighted=False is passed although the distances are bit-weighted —
# confirm this is the intended setting.
m_w, b_w = cm_w.fit(triplets, weighted=False)

# Compare performance

In [None]:
# Print one LaTeX table per run: mean/std of AUC and EER for each
# (Algorithm, Metric) group, rendered without the index.
for heading, metrics_frame in (
    ("Performance without bit weights", m),
    ("Performance WITH bit weights", m_w),
):
    print(heading)
    grouped = metrics_frame.groupby(['Algorithm', 'Metric'])[['AUC','EER']]
    print(grouped.agg(['mean','std']).to_latex(index=False))

In [None]:
# Pair each unweighted AUC with its weighted counterpart.
# Joining on the (Algorithm, Transform, Metric) key keeps the AUC columns
# numeric and does not rely on `m` and `m_w` sharing the same row order.
key_cols = ['Algorithm', 'Transform', 'Metric']
cols = key_cols + ['AUC_noW', 'AUC_wW']
_df = m[key_cols + ['AUC']].merge(
    m_w[key_cols + ['AUC']],
    on=key_cols,
    suffixes=('_noW', '_wW'),
    validate='one_to_one',  # fail loudly if a triplet occurs more than once
)[cols]

# Positive values mean the bit weights raised the AUC.
_df['Improvement'] = _df['AUC_wW'] - _df['AUC_noW']

# Switch to the light theme for the figure that is written to disk.
plt.style.use('default')
sns.set_context('paper')

fig, ax = plt.subplots(1,1, figsize=FIGSIZE, constrained_layout=True)
# Only the Hamming rows are plotted.
ax = sns.barplot(data=_df[_df['Metric']=='Hamming'], x='Algorithm', y='Improvement', hue='Transform', palette='Set2', ax=ax)
# Reuse the handles of the legend seaborn created for the bar plot
handles = ax.legend_.legend_handles #type:ignore

for handle, txt in zip(handles, ax.legend_.texts): #type:ignore
    # Shorten each legend label to the text before its first underscore
    handle.set_label(txt.get_text().split("_")[0]) #type:ignore
# Rebuild the legend with the relabelled handles.
_ = ax.legend(handles=handles , loc="upper right", title='Transform')
_ = ax.grid(axis='y', alpha=0.35)

fig.savefig("./demo_outputs/figs/AUC_weight_improvements.pdf")

# Display the weighted metrics table.
m_w


# Visualising bit weights 

In [None]:
from phaser.plotting import bit_weights_ax

In [None]:
# Plot the bit analysis entry for one triplet (phash / Flip_Horizontal / Hamming)
# taken from `b`, and save it as a PDF.
fig, ax = plt.subplots(1,1, figsize=(5,1.5), constrained_layout=True)
_ = bit_weights_ax(b["phash_Flip_Horizontal_Hamming"], ax=ax)
fig.savefig("./demo_outputs/figs/bit_w_phash_Flip_Horizontal_Hamming.pdf")

In [None]:
# List the keys available in bit_weights.
list(bit_weights.keys())

In [None]:
# Plot the 'phash_Hamming' entry of bit_weights (wrapped in a DataFrame for
# bit_weights_ax) and save it as a PDF.
# NOTE(review): the output name says "median" — presumably these weights are a
# median across transforms; confirm against make_bit_weights.
fig, ax = plt.subplots(1,1, figsize=(5,1.5), constrained_layout=True)
_ = bit_weights_ax(pd.DataFrame(bit_weights['phash_Hamming']), ax=ax)
fig.savefig("./demo_outputs/figs/bit_w_phash_median.pdf")