In [56]:
from astropy.table import Table

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
%matplotlib notebook

import numpy as np
import time
from sklearn import neighbors
from sklearn import decomposition
import joblib
from vast.voidfinder._voidfinder_cython_find_next import MaskChecker
from vast.voidfinder.distance import z_to_comoving_dist
from vast.voidfinder import ra_dec_to_xyz
import pickle
import pandas as pd

Update:
- Trying to figure out if we can plot ra v dec, probs not
- If not, needed to figure out how to work with ra_dec_to_xyz() without Rgal variable (Douglass)

In [2]:
mask_file_name = "/Users/lorenzomendoza/Desktop/Research/Function/NSA_main_mask.pickle"

In [3]:
temp_infile = open(mask_file_name, "rb")
mask, mask_resolution = pickle.load(temp_infile)
temp_infile.close()

## V$^2$ void catalog

Generating galaxy array, .npy

In [4]:
V2_galzones = Table.read("/Users/lorenzomendoza/Desktop/Research/Function/V2_REVOLVER-nsa_v1_0_1_galzones.dat",format='ascii.commented_header')
V2_zonevoids = Table.read("/Users/lorenzomendoza/Desktop/Research/Function/V2_REVOLVER-nsa_v1_0_1_zonevoids.dat",format='ascii.commented_header')

# V2_gz = np.zeros(len(V2_zonevoids['zone']),dtype=int)
# V2_gz[V2_zonevoids['zone'] > -1] = 1
'''
for i in range(len(V2_gz)):
    if V2_galzones['zone'][i] > -1:
        #V2_gz[i] = V2_zonevoids['void1'][V2_galzones['zone'][i]]
        V2_gz[i] = 1
''';

In [5]:
V2_gz = np.zeros(len(V2_galzones['zone']),dtype=int)

for i in range(len(V2_gz)):
    
    if V2_zonevoids['void1'][V2_galzones['zone'][i]] > -1:
        V2_gz[i] = 1

In [6]:
len(V2_gz)

194125

In [7]:
V2_gz

array([0, 0, 1, ..., 1, 0, 1])

Extract galaxy void membership. This returns the points in the data set that are V2 voids. 

If returns -1, then not V2

In [8]:
file_name = "/Users/lorenzomendoza/Desktop/Research/Function/V2_nsa_v1_0_1_gal.txt"

In [9]:
data_table_vl = Table.read(file_name, format = "ascii.commented_header")

In [10]:
omega_M = np.float32(0.3)
h = np.float32(1.0)
Rgal = z_to_comoving_dist(data_table_vl['redshift'].astype(np.float32),omega_M,h)
data_table_vl['Rgal'] = Rgal

In [11]:
z_boolean = data_table_vl['redshift']>0 #Edge Case: 513626 = [[-0.  0.  0.]]
data_table_vl = data_table_vl[z_boolean]

In [12]:
galaxies_xyz = ra_dec_to_xyz(data_table_vl)

In [13]:
data_table_vl['x'] = galaxies_xyz[:,0]
data_table_vl['y'] = galaxies_xyz[:,1]
data_table_vl['z'] = galaxies_xyz[:,2]

In [14]:
# create boolean mask
boolmask = np.isin(data_table_vl['index'], V2_galzones['gal'])

# assign values using boolean indexing
V2_galzones['x'] = data_table_vl['x'][boolmask]
V2_galzones['y'] = data_table_vl['y'][boolmask]
V2_galzones['z'] = data_table_vl['z'][boolmask]

## Remove all galaxies outside the mask

For each galaxy, check to see if it is in the mask.  If it is, keep it!  If it is not, throw it away.
1. Define the variables `rmin` and `rmax`. Values are input by user. 
2. Create a boolean array called `galaxies_boolean`, with the same length as `data_table_vl`, filled with `True` values.
3. Create a `MaskChecker` object using the values `0`, `mask`, `mask_resolution`, `rmin`, and `rmax`.
4. Iterate over each element `i` in `data_table_vl`.
5. Define a variable `curr_gal` as the `i`-th element of the `galaxies_xyz` array reshaped into a 1x3 array.
6. Call the `not_in_mask` method of the `mask_checker` object on `curr_gal`, and assign the result to the variable `not_in_mask`.
7. Set the `i`-th element of the `galaxies_boolean` array to the opposite of the boolean value of `not_in_mask`.
8. After the loop has completed, print the time to run the loop.
9. Filter the `data_table_vl` array using the `galaxies_boolean` array to create a new array called `galaxies_in_mask`.
10. Print the sum of the `True` values in `galaxies_boolean`, the sum of the `False` values in `galaxies_boolean`, the shape of `galaxies_boolean`, and the `galaxies_in_mask` array.


In [15]:
def calc_volume_boundaries(void_cat_A, void_cat_B):
    """Compute the boundaries of the minimal rectangular volume (parallelpiped)
    that completely contains two void catalogs.
    
    Parameters
    ----------
    void_cat_A : astropy.Table
        Table of void data from first catalog.
    void_cat_B : astropy.Table
        Table of void data from second catalog.
        
    Returns
    -------
    x_min : float
    x_max : float
    y_min : float
    y_max : float
    z_min : float
    z_max : float
    """
    x_min = np.minimum(np.min(void_cat_A['x']), np.min(void_cat_B['x']))
    x_max = np.maximum(np.max(void_cat_A['x']), np.max(void_cat_B['x']))
    
    y_min = np.minimum(np.min(void_cat_A['y']), np.min(void_cat_B['y']))
    y_max = np.maximum(np.max(void_cat_A['y']), np.max(void_cat_B['y']))

    z_min = np.minimum(np.min(void_cat_A['z']), np.min(void_cat_B['z']))
    z_max = np.maximum(np.max(void_cat_A['z']), np.max(void_cat_B['z']))

    return x_min, x_max, y_min, y_max, z_min, z_max

def generate_grid_points(x_min, x_max, y_min, y_max, z_min, z_max):
    """Creates a dense rectangular grid of points in 3D for the void volume calculation.
    
    Returns
    -------
    xyz : list
        2D list of points in 3D space.
    """
    
    x_range = np.arange(x_min, x_max) # default spacing: 1 Megaparsec
    y_range = np.arange(y_min, y_max)
    z_range = np.arange(z_min, z_max)


    # Creating a meshgrid from the ranges to 
    X,Y,Z = np.meshgrid(x_range,y_range,z_range)

    x_points = np.ravel(X)
    y_points = np.ravel(Y)
    z_points = np.ravel(Z)
    
    point_coords = np.array([x_points, y_points, z_points])
    
    return point_coords

xmin, xmax, ymin, ymax, zmin, zmax = calc_volume_boundaries(galaxies_in_mask, galaxies_in_mask)

Calls the `calc_volume_boundaries` function on the `galaxies_in_mask` array and assigns the returned values to the variables `xmin`, `xmax`, `ymin`, `ymax`, `zmin`, and `zmax`.

---

pts = generate_grid_points(xmin, xmax, ymin, ymax, zmin, zmax)

Calls the `generate_grid_points` function with the values `xmin`, `xmax`, `ymin`, `ymax`, `zmin`, and `zmax` as arguments and assigns the returned value to the `pts` variable.

---

b = pts.shape
print(b)

Gets the shape of the `pts` array and assigns it to the `b` variable. The shape of the array is printed.


MaskChecker

In [31]:
def mask_point_filter(pts, mask, mask_resolution, rmin=0, rmax=312.89816):
    start_time = time.time()
    points_boolean = np.ones(pts.shape[1], dtype = bool)

    mask_checker = MaskChecker(0,
                            mask,
                            mask_resolution,
                            rmin,
                            rmax)

    for i in range(pts.shape[1]):
        curr_pt = pts[:,i]
        not_in_mask = mask_checker.not_in_mask(curr_pt)
        points_boolean[i] = not bool(not_in_mask)

    points_in_mask = pts[:,points_boolean]
    print('Time taken:', time.time() - start_time)
    print('Points in Mask Shape:', points_in_mask.shape)
    print('Sum of Points IN:', np.sum(points_boolean))
    print('Sum of Points OUT:', np.sum(~points_boolean))
    print('Boolean Shape:', points_boolean.shape)
    print('Points in Mask:',points_in_mask)
    return points_in_mask, points_boolean


In [17]:
def kd_tree(void_cat):
    """We are creating a function to make a KDTree to find the number of points in 
    and out of a catalogue.
    
    Parameters
    ----------
    point_coords: ndarray has a shape of (3,N)
        This is the list of points to query the given void catalogue. N is the number of points given. 
    void_cat: Astropy Table
        This is the given void catalogue.
    
    Returns
    -------
    true_inside: ndarray of shape (N,1)
        Is this the boolean array of length N (same length as point_coords). True means that 1 point 
        is inside the hole.
    """
#############
    cx = void_cat['x']
    cy = void_cat['y']
    cz = void_cat['z']

    sphere_coords = np.array([cx, cy, cz])

    #The .T is meant to transpose the array from (3,1054) to (1054,3)
    sphere_tree = neighbors.KDTree(sphere_coords.T)
    # print("KDTree")

##############
    
    return sphere_tree

In [18]:
def point_query(point_coords, sphere_tree, void_cat):
    # print("Starting Query")
    #Void cat classifcation
    true_inside = np.zeros(point_coords.shape[1])

    idx = sphere_tree.query(point_coords.T, k = 1, return_distance=False)
    
    #true_inside = void_cat[idx]
    for i in range(len(idx)):
        true_inside[i] = void_cat[idx[i]]
    
    return true_inside

In [19]:
# start_time = time.time()

# (var, n_points) = points_in_mask.shape

# # Takes about 1.5 mins per query
# points_in_mask_copy = points_in_mask.copy()

# kdTree_V1 = kd_tree(V2_galzones)
# kdTree_V2 = kd_tree(V2_galzones)

# true_inside_V1 = point_query(points_in_mask_copy, kdTree_V1, V2_gz)
# count_in_V1 = np.sum(true_inside_V1)
# count_out_V1 = n_points - count_in_V1

# true_inside_V2 = point_query(points_in_mask_copy, kdTree_V2, V2_gz)
# count_in_V2 = np.sum(true_inside_V2)
# count_out_V2 = n_points - count_in_V2

# inside_both = np.sum(np.logical_and(true_inside_V1, true_inside_V2))
# inside_neither = np.sum(np.logical_not(np.logical_or(true_inside_V1, true_inside_V2)))
# inside_V1 = np.sum(np.logical_and(true_inside_V1, np.logical_not(true_inside_V2)))
# inside_V2 = np.sum(np.logical_and(true_inside_V2, np.logical_not(true_inside_V1)))




# print(time.time() - start_time)
# print('\nNumber of points inside V1:', count_in_V1)
# print('\nNumber of points outside V2:', count_out_V1)
# print('\nNumber of points inside V1:', count_in_V2)
# print('\nNumber of points outside V2:', count_out_V2)
# print("\nThis is the total number of points: {}".format(n_points))
# # print("\nThis is the total number of points in Delaunay: {}".format(total_DEL))

In [20]:


# def point_query(point_coords, sphere_tree, void_cat):
#     # Reduce dimensionality using PCA
#     pca = decomposition.PCA(n_components=10)
#     point_coords_pca = pca.fit_transform(point_coords)

#     # Use a pre-built index
#     if sphere_tree is None:
#         sphere_tree = neighbors.BallTree(point_coords_pca)
    
#     # Parallelize the computation
#     true_inside = joblib.Parallel(n_jobs=-1)(
#         joblib.delayed(_query_single_point)(
#             point_coords_pca[i], sphere_tree, void_cat) for i in range(point_coords.shape[0])
#     )

#     return np.array(true_inside)

# def _query_single_point(point, sphere_tree, void_cat):
#     idx = sphere_tree.query(point.reshape(1, -1), k=1, return_distance=False)
#     return void_cat[idx[0]]

In [35]:
def count_points(points_in_mask, galzones_V1, galzones_V2, V2_gz):
    start_time = time.time()
    (var, n_points) = points_in_mask.shape

    # Takes about 1.5 mins per query
    points_in_mask_copy = points_in_mask.copy()

    kdTree_V1 = kd_tree(galzones_V1)
    kdTree_V2 = kd_tree(galzones_V2)

    true_inside_V1 = point_query(points_in_mask_copy, kdTree_V1, V2_gz)
    count_in_V1 = np.sum(true_inside_V1)
    count_out_V1 = n_points - count_in_V1

    true_inside_V2 = point_query(points_in_mask_copy, kdTree_V2, V2_gz)
    count_in_V2 = np.sum(true_inside_V2)
    count_out_V2 = n_points - count_in_V2

    inside_both = np.sum(np.logical_and(true_inside_V1, true_inside_V2))
    inside_neither = np.sum(np.logical_not(np.logical_or(true_inside_V1, true_inside_V2)))
    inside_V1 = np.sum(np.logical_and(true_inside_V1, np.logical_not(true_inside_V2)))
    inside_V2 = np.sum(np.logical_and(true_inside_V2, np.logical_not(true_inside_V1)))
    



    print("Runtime:", time.time() - start_time)
    print('\nNumber of points inside V1:', count_in_V1)
    print('\nNumber of points outside V2:', count_out_V1)
    print('\nNumber of points inside V1:', count_in_V2)
    print('\nNumber of points outside V2:', count_out_V2)
    print("\nThis is the total number of points: {}".format(n_points))
    # print("\nThis is the total number of points in Delaunay: {}".format(total_DEL))
    return (count_in_V1, count_out_V1, count_in_V2, count_out_V2, inside_both, inside_neither, inside_V1, inside_V2, n_points)

In [22]:
xmin, xmax, ymin, ymax, zmin, zmax = calc_volume_boundaries(V2_galzones, V2_galzones)

#This line makes creates the points in between 



In [23]:
pts = generate_grid_points(xmin, xmax, ymin, ymax, zmin, zmax)

b = pts.shape
print(b)

(3, 64749960)


In [33]:
points_in_mask, points_boolean = mask_point_filter(pts, mask, mask_resolution)


Time taken: 85.28129005432129
Points in Mask Shape: (3, 23119361)
Sum of Points IN: 23119361
Sum of Points OUT: 41630599
Boolean Shape: (64749960,)
Points in Mask: [[-108.17809884 -108.17809884 -108.17809884 ... -138.17809884
  -138.17809884 -138.17809884]
 [-293.30349342 -293.30349342 -293.30349342 ...  270.69650658
   270.69650658  270.69650658]
 [  -5.14470225   -4.14470225   -3.14470225 ...   71.85529775
    72.85529775   73.85529775]]


In [36]:
(count_in_V1, count_out_V1, count_in_V2, count_out_V2, inside_both, inside_neither, inside_V1, inside_V2, n_points) = count_points(points_in_mask, V2_galzones, V2_galzones, V2_gz)

Runtime: 192.09627866744995

Number of points inside V1: 20857709.0

Number of points outside V2: 2261652.0

Number of points inside V1: 20857709.0

Number of points outside V2: 2261652.0

This is the total number of points: 23119361


In [47]:
r_V1 = count_in_V1 / n_points
r_V2 = count_in_V2 / n_points
r_V1_V2 = np.sum(inside_both) / n_points
r_not_V1_V2 = np.sum(inside_neither) / n_points
r_V1_not_V2 = np.sum(inside_V1) / n_points
r_V2_not_V1 = np.sum(inside_V2) / n_points

average_V1 = np.mean(count_in_V1)
r_average_V1 = average_V1 / n_points
std_V1 = np.std(count_in_V1)
r_std_V1 = std_V1 / n_points
print("\nNumber of points inside V1: {}".format(count_in_V1))
print("\nNumber of points outside V1: {}".format(count_out_V1))
print("\nStandard Deviation of V1: {}".format(std_V1))

print('\nRatio of  V1 Points:', r_average_V1)
print('\nRatio SD of V1:', r_std_V1)
      
average_V2 = np.mean(count_in_V2)
r_average_V2 = average_V2 / n_points
std_V2 = np.std(count_in_V2)
r_std_V2 = std_V2 / n_points
print("\nNumber of points inside V2: {}".format(count_in_V2))
print("\nNumber of points outside V2: {}".format(count_out_V2))
print("\nStandard Deviation of V2: {}".format(std_V2))
print('\nRatio of V2 Points:', r_average_V2)
print('\nRatio SD of V2:', r_std_V2)


average_inside = np.mean(inside_both)
r_average_inside = average_inside / n_points

std_both = np.std(inside_both)
r_std_both = std_both / n_points
print("\nNumber of points inside both: {}".format(inside_both))
print("\nNumber of points outside both: {}".format(inside_neither))
print("\nStandard Deviation of both: {}".format(std_both))
print('\nRatio of Points Inside:', r_average_inside)
print('\nRatio SD of Both:', r_std_both)


average_outside = np.mean(inside_neither)
r_average_outside = average_outside / n_points

std_outside = np.std(inside_neither)
r_std_outside = std_outside / n_points
print("\nNumber of points inside neither: {}".format(inside_neither))
print("\nNumber of points outside neither: {}".format(inside_neither))
print('\nRatio of Points Outside:', r_average_outside)
print('\nRatio SD of Neither:', r_std_outside)


# average_in_V1 = np.mean(inside_V1)
# r_average_in_V1 = average_in_V1 / n_points

# std_in_V1 = np.std(inside_neither)
# r_std_in_V1 = std_in_V1 / n_points


# print('\nRatio of Points in V1:', r_average_in_V1)
# print('\nRatio SD:', r_std_in_V1)


# average_in_V2 = np.mean(inside_V2)
# r_average_in_V2 = average_in_V2 / n_points

# std_in_V2 = np.std(inside_neither)
# r_std_in_V2 = std_in_V2 / n_points

# print('\nRatio of Points in V2:', average_in_V2)
# print('\nRatio SD:', r_std_in_V2)


Number of points inside V1: 20857709.0

Number of points outside V1: 2261652.0

Standard Deviation of V1: 0.0

Ratio of  V1 Points: 0.9021749779329974

Ratio SD of V1: 0.0

Number of points inside V2: 20857709.0

Number of points outside V2: 2261652.0

Standard Deviation of V2: 0.0

Ratio of V2 Points: 0.9021749779329974

Ratio SD of V2: 0.0

Number of points inside both: 20857709

Number of points outside both: 2261652

Standard Deviation of both: 0.0

Ratio of Points Inside: 0.9021749779329974

Ratio SD of Both: 0.0

Number of points inside neither: 2261652

Number of points outside neither: 2261652

Ratio of Points Outside: 0.09782502206700264

Ratio SD of Neither: 0.0


In [44]:
average_V2 = np.mean(count_in_V2)
r_average_V2 = average_V2 / n_points
std_V2 = np.std(count_in_V2)
r_std_V2 = std_V2 / n_points
print("\nNumber of points inside V2: {}".format(count_in_V2))
print("\nNumber of points outside V2: {}".format(count_out_V2))
print("\nStandard Deviation of V2: {}".format(std_V2))
print('\nRatio of V2 Points:', r_average_V2)
print('\nRatio SD:', r_std_V2)





Number of points inside V2: 20857709.0

Number of points outside V2: 2261652.0

Standard Deviation of V2: 0.0

Ratio of V2 Points: 0.9021749779329974

Ratio SD: 0.0


In [45]:
average_inside = np.mean(inside_both)
r_average_inside = average_inside / n_points

std_both = np.std(inside_both)
r_std_both = std_both / n_points
print("\nNumber of points inside both: {}".format(inside_both))
print("\nNumber of points outside both: {}".format(inside_neither))
print("\nStandard Deviation of both: {}".format(std_both))
print('\nRatio of Points Inside:', r_average_inside)
print('\nRatio SD:', r_std_both)





Number of points inside both: 20857709

Number of points outside both: 2261652

Standard Deviation of both: 0.0

Ratio of Points Inside: 0.9021749779329974

Ratio SD: 0.0


In [46]:
average_outside = np.mean(inside_neither)
r_average_outside = average_outside / n_points

std_outside = np.std(inside_neither)
r_std_outside = std_outside / n_points
print("\nNumber of points inside neither: {}".format(inside_neither))
print("\nNumber of points outside neither: {}".format(inside_neither))
print("\nStandard Deviation of neither: {}".format(std_outside))
print('\nRatio of Points Outside:', r_average_outside)
print('\nRatio SD:', r_std_outside)


Number of points inside neither: 2261652

Number of points outside neither: 2261652

Standard Deviation of neither: 0.0

Ratio of Points Outside: 0.09782502206700264

Ratio SD: 0.0


In [60]:
def calculate_ratios_and_stats(count_in_V1, count_out_V1, count_in_V2, count_out_V2, inside_both, inside_neither, inside_V1, inside_V2, n_points):
    r_V1 = count_in_V1 / n_points
    r_V2 = count_in_V2 / n_points
    r_V1_V2 = np.sum(inside_both) / n_points
    r_not_V1_V2 = np.sum(inside_neither) / n_points
    r_V1_not_V2 = np.sum(inside_V1) / n_points
    r_V2_not_V1 = np.sum(inside_V2) / n_points

    average_V1 = np.mean(count_in_V1)
    r_average_V1 = average_V1 / n_points
    std_V1 = np.std(count_in_V1)
    r_std_V1 = std_V1 / n_points

    average_V2 = np.mean(count_in_V2)
    r_average_V2 = average_V2 / n_points
    std_V2 = np.std(count_in_V2)
    r_std_V2 = std_V2 / n_points

    average_inside = np.mean(inside_both)
    r_average_inside = average_inside / n_points
    std_both = np.std(inside_both)
    r_std_both = std_both / n_points

    average_outside = np.mean(inside_neither)
    r_average_outside = average_outside / n_points
    std_outside = np.std(inside_neither)
    r_std_outside = std_outside / n_points

    results = {
        'V1': [average_V1, std_V1, r_V1, r_average_V1, r_std_V1, r_V1_not_V2],
        'V2': [average_V2, std_V2, r_V2, r_average_V2, r_std_V2, r_V2_not_V1],
        'Both': [average_inside, std_both, r_V1_V2, r_average_inside, r_std_both, "N/A"],
        'Neither': [average_outside, std_outside, r_not_V1_V2, r_average_outside, r_std_outside, "N/A"],
    }
    # Create a pandas DataFrame with the results
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Average', 'SD', 'Ratio', 'Ratio Average', 'Ratio SD', 'In V1 & Not V2', 'In V2 & Not V1'])
    
    return results_df


In [61]:
results = calculate_ratios_and_stats(count_in_V1, count_out_V1, count_in_V2, count_out_V2, inside_both, inside_neither, inside_V1, inside_V2, n_points)


In [62]:
results

Unnamed: 0,Average,SD,Ratio,Ratio Average,Ratio SD
V1,20857709.0,0.0,0.902175,0.902175,0.0
V2,20857709.0,0.0,0.902175,0.902175,0.0
Both,20857709.0,0.0,0.902175,0.902175,0.0
Neither,2261652.0,0.0,0.097825,0.097825,0.0
