In [1]:
using BIPs
using Statistics
using Pkg.Artifacts

In [2]:
# Location of the input `.h5` file read below; hard-coded absolute path on the author's machine.
sample_path = "/home/josemm/MyDocs/DataLake/raw/mini_val.h5"

"/home/josemm/MyDocs/DataLake/raw/mini_val.h5"

In [3]:
# Load the jets and their raw labels from `sample_path`.
sample_jets, sample_labels = BIPs.read_data("TQ", sample_path)
# Turn the raw labels into Bools: `true` where the raw value equals 1.0.
# `b == 1.0` already yields a `Bool`, so no `reinterpret` is needed.
sample_labels = [b == 1.0 for b in sample_labels]
length(sample_labels)

40299

In [4]:
# Transform the loaded jets with `data2hyp` and report how many were produced.
sample_transf_jets = data2hyp(sample_jets)
println("Transformed jets: $(length(sample_transf_jets))")

Transformed jets: 40299


In [5]:
# Build the embedding function and its specification list (order 4, 7 levels).
f_bip, specs = build_ip(order=4, levels=7)

"""
    bip_data(dataset_jets)

Evaluate `f_bip` on every jet in `dataset_jets` and collect the results in a matrix
with one row per jet.

The first column of the raw embedding is dropped before returning (presumably a
trivial/constant feature -- confirm), so the result has `length(specs) - 1` columns.

Relies on the globals `f_bip` and `specs` returned by `build_ip`.
"""
function bip_data(dataset_jets)
    storage = zeros(length(dataset_jets), length(specs))
    # `enumerate` counts rows from 1 regardless of how `dataset_jets` is indexed.
    for (i, jet) in enumerate(dataset_jets)
        storage[i, :] = f_bip(jet)
    end
    return storage[:, 2:end]
end

bip_data (generic function with 1 method)

In [6]:
# Embed every transformed jet with `bip_data` (one row per jet).
embedded_sample = bip_data(sample_transf_jets)
println("Embedded sample jets correclty")

Embedded sample jets correclty


In [9]:
# Keep only the leading jets (at most 5000), presumably to keep the UMAP fit below
# manageable. The count is clamped to what is actually available so a smaller sample
# does not raise a BoundsError, and features and labels are truncated together.
n_keep = min(5000, size(embedded_sample, 1), length(sample_labels))
embedded_sample = embedded_sample[1:n_keep, :]
sample_labels = sample_labels[1:n_keep]

5000-element Vector{Bool}:
 1
 1
 0
 0
 0
 0
 0
 0
 1
 1
 ⋮
 1
 1
 1
 0
 0
 1
 0
 0
 1

In [7]:
using PyCall
@pyimport umap as py_umap;
@pyimport sklearn.mixture as sk_mixture

In [10]:
using Statistics, LinearAlgebra
"""
    scale(A)

Standardize each column of `A`: subtract the column mean and divide by the column
standard deviation (as computed by `std` along `dims=1`).

Columns whose standard deviation is zero are only centered (divided by one), so a
constant column becomes all zeros instead of `NaN`.
"""
function scale(A)
    μ = mean(A, dims=1)
    σ = std(A, dims=1)
    # 0/0 would turn every entry of a constant column into NaN; divide by one instead.
    σ = map(s -> iszero(s) ? one(s) : s, σ)
    return (A .- μ) ./ σ
end
# Standardize the embedded features column-wise with `scale`.
n_embedded_sample = scale(embedded_sample)
println("Scaled embedded sample jets")

Scaled embedded sample jets


In [29]:
# Fit UMAP (Python package, called through PyCall) on the standardized features and
# keep the resulting embedding. Method access uses `.fit_transform`, matching the
# `py_umap.UMAP` access on the same line; `obj[:name]` indexing is deprecated in PyCall.
umap = py_umap.UMAP(n_neighbors=200, min_dist=0.5).fit_transform(n_embedded_sample)
print("UMAP done")

InterruptException: InterruptException:

In [19]:
using Plots
using DataFrames

In [14]:
# Scatter the first two UMAP components, one series per class.
# `sample_labels` is a Bool vector, so it (and its negation) can be used directly as
# row masks on `umap`.
bkg = .!sample_labels
# Only x and y are passed: a third positional vector would make Plots draw a 3-D scatter.
# `markersize` sets the dot size; in Plots.jl `size` is the figure size in pixels.
p = scatter(umap[bkg, 1], umap[bkg, 2], color="red", markersize=10, label="Background")
p = scatter!(p, umap[sample_labels, 1], umap[sample_labels, 2], color="Blue", markersize=10, label="Signal")





In [20]:
# Tabulate the first two UMAP components next to the class label of each row.
df = DataFrame(
    comp1 = umap[:, 1],
    comp2 = umap[:, 2],
    label = sample_labels,
)

Unnamed: 0_level_0,comp1,comp2,label
Unnamed: 0_level_1,Float32,Float32,Bool
1,0.115081,10.0002,1
2,1.12446,8.05329,1
3,13.0707,3.70003,0
4,14.4694,2.57212,0
5,-0.439803,6.82153,0
6,14.7446,-0.601483,0
7,9.84765,7.33783,0
8,0.787353,11.1745,0
9,13.4167,9.29626,1
10,0.611978,10.7088,1


In [24]:
using DelimitedFiles, CSV

# Dump the UMAP matrix as comma-separated text; the relative path resolves against
# the current working directory.
writedlm( "FileName.csv",  umap, ',')

In [22]:
# Number of jets originally loaded (`sample_jets` itself is never truncated).
length(sample_jets)

40299

In [27]:
# Save the UMAP matrix as CSV without a header row (hard-coded absolute output path).
CSV.write("/home/josemm/MyDocs/DataLake/umap2.csv",  Tables.table(umap), writeheader=false)

"/home/josemm/MyDocs/DataLake/umap2.csv"

In [28]:
# Save the Bool class labels as CSV with a header row (hard-coded absolute output path).
CSV.write("/home/josemm/MyDocs/DataLake/labels2.csv",  Tables.table(sample_labels), writeheader=true)

"/home/josemm/MyDocs/DataLake/labels2.csv"