In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from termcolor import colored
from scipy import stats
from matplotlib import pyplot as plt
import matplotlib.style as style
%matplotlib widget
sns.set_theme(style="darkgrid")
style.use('seaborn-colorblind')

We have the following hash functions:

In [2]:
# hashes.json is read as line-delimited JSON: every non-blank line is parsed
# as one record, and each record is expected to carry a "name" key.
with open("./hashes.json", "r") as ifile:
    # Stray blank lines are not valid JSON documents, so skip them instead of
    # letting json.loads raise on them.
    res = [json.loads(line) for line in ifile if line.strip()]

# Index the records by hash name; if a name occurs twice the later record wins.
results = {record["name"]: record for record in res}
print("available hashes to test:", results.keys())

available hashes to test: dict_keys(['naive_modulo', 'naive_bitmask'])


# functions you can use

In [3]:
def quick_analyze(name):
    '''
    Print a short text report for one hash function.

    input: hash name (a key of the global `results` dict)
    output: nothing is returned; prints the average hashing time followed by
            the two groups of chi squared scores, each score coloured green
            when it passes that group's cutoff and red otherwise
    '''
    record = results[name]
    print(f"average hashing speed in ns, lower is better: {record['avg_time']}")

    def report(keys, is_good):
        # One line per score: the key, then the colour-coded value.
        for key in keys:
            score = record[key]
            print(key, colored(score, "green" if is_good(score) else "red"))

    print("chi squared according to wiki page, we want it to be between [0.95, 1.05]")
    report(
        ["chi_wiki_prime_iter", "chi_wiki_prime_rand", "chi_wiki_two_iter", "chi_wiki_two_rand"],
        # Both bounds are exclusive: exactly 0.95 or 1.05 is reported red.
        lambda score: 0.95 < score < 1.05,
    )

    print("chi squared according to other source, we want it to be as close to 0 as possible")
    print("I have marked those > 0.2 as bad, idk what a good cutoff value is")
    report(
        ["chi_other_prime_iter", "chi_other_prime_rand", "chi_other_two_iter", "chi_other_two_rand"],
        # Exactly 0.2 is already reported red.
        lambda score: score < 0.2,
    )

def plots(name, n_samples=20000):
    '''
    Draw the diagnostic figures for one hash function.

    input:
        name: hash name (a key of the global `results` dict)
        n_samples: divisor that turns each bit-histogram count into a
            fraction (presumably the number of keys hashed per test -
            TODO confirm against the benchmark that writes hashes.json)
    output: nothing is returned; three figures are shown in order:
        1. bar plots of the per-bit fraction for the four bithistogram_* tests
        2. scatter plots of the four raw_* per-bucket series
        3. histograms of the "skiena_prime" and "skiena_two" entries
    '''
    res = results[name]

    bit_cols = [
        "bithistogram_prime_rand",
        "bithistogram_prime_iter",
        "bithistogram_two_rand",
        "bithistogram_two_iter",
    ]
    # Build one long-form frame per test and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # a loop copies it on every iteration anyway.
    per_test = []
    for col in bit_cols:
        counts = np.asarray(res[col])
        per_test.append(pd.DataFrame({
            "bits": np.arange(len(counts)),
            "total": counts / n_samples,
            "test": col,
        }))
    bitgram = pd.concat(per_test, ignore_index=True)

    print("We want the occurence of 1 for each to be around 0.5. I should technically do a chi2 test on it. we;ll figure it out somehow")
    grid = sns.FacetGrid(bitgram, col="test", height=3.5, aspect=0.9)
    # Only bit positions 0..19 are drawn, even when the histogram is longer.
    grid.map(sns.barplot, "bits", "total", order=range(0, 20), label="test")
    plt.show()
    # TODO: a per-bit significance test (e.g. a binomial test of
    # total * n_samples successes out of n_samples with p=0.5) was sketched
    # here but needs someone with stats knowledge to validate it before its
    # results can be trusted.

    print("This is just plots of how much each bin would be filled. you shouldn't be able to spot a pattern")
    # Column in the record -> legend label of its panel, in panel order
    # (row by row: top-left, top-right, bottom-left, bottom-right).
    raw_labels = {
        "raw_prime_rand": "prime random",
        "raw_prime_iter": "prime iterative",
        "raw_two_rand": "2^n random",
        "raw_two_iter": "2^n iterative",
    }
    # Wrapping each list in a Series pads shorter columns with NaN instead of
    # raising when the four series differ in length.
    raw = pd.DataFrame({col: pd.Series(res[col]) for col in raw_labels})
    buckets = np.arange(len(raw))
    # NOTE: a long-form FacetGrid + scatterplot version of this figure is
    # slower but produces nicer plots; the wide frame is used here for speed.
    scatter_fig, scatter_axes = plt.subplots(2, 2, figsize=(10, 5), gridspec_kw=dict(width_ratios=[1, 1]))
    for ax, (col, label) in zip(scatter_axes.flat, raw_labels.items()):
        sns.scatterplot(data=raw, x=buckets, y=col, ax=ax, s=10, label=label)
    scatter_fig.tight_layout()
    plt.show()

    hist_fig, hist_axes = plt.subplots(1, 2, figsize=(10, 5), gridspec_kw=dict(width_ratios=[1, 1]))
    sns.histplot(data=res, x="skiena_prime", ax=hist_axes[0], label="prime")
    sns.histplot(data=res, x="skiena_two", ax=hist_axes[1], label="2^n")
    hist_fig.tight_layout()
    plt.show()

In [4]:
# Text summary first, then the diagnostic figures, for the "naive_modulo" hash.
quick_analyze("naive_modulo")
plots("naive_modulo")

average hashing speed in ns, lower is better: 8.92144
chi squared according to wiki page, we want it to be between [0.95, 1.05]
chi_wiki_prime_iter [32m0.9545884578997161[0m
chi_wiki_prime_rand [32m1.001112762985989[0m
chi_wiki_two_iter [32m0.9545878279398056[0m
chi_wiki_two_rand [32m0.9985617259288854[0m
chi squared according to other source, we want it to be as close to 0 as possible
I have marked those > 0.2 as bad, idk what a good cutoff value is
chi_other_prime_iter [32m0.0[0m
chi_other_prime_rand [31m0.4934797179166714[0m
chi_other_two_iter [32m0.0[0m
chi_other_two_rand [31m0.40499206298200113[0m
We want the occurence of 1 for each to be around 0.5. I should technically do a chi2 test on it. we;ll figure it out somehow


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

This is just plots of how much each bin would be filled. you shouldn't be able to spot a pattern


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [5]:
# Text summary first, then the diagnostic figures, for the "naive_bitmask" hash.
quick_analyze("naive_bitmask")
plots("naive_bitmask")

average hashing speed in ns, lower is better: 6.82064
chi squared according to wiki page, we want it to be between [0.95, 1.05]
chi_wiki_prime_iter [31m917.3595080416272[0m
chi_wiki_prime_rand [32m1.027823579763031[0m
chi_wiki_two_iter [31m930.9958716207218[0m
chi_wiki_two_rand [32m0.9995560882496559[0m
chi squared according to other source, we want it to be as close to 0 as possible
I have marked those > 0.2 as bad, idk what a good cutoff value is
chi_other_prime_iter [31m1.0[0m
chi_other_prime_rand [31m1.0[0m
chi_other_two_iter [31m1.0[0m
chi_other_two_rand [32m0.18845772684254813[0m
We want the occurence of 1 for each to be around 0.5. I should technically do a chi2 test on it. we;ll figure it out somehow


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

This is just plots of how much each bin would be filled. you shouldn't be able to spot a pattern


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …