From 96fb1bbdbc1c06ec3d23c454e196b83ec3f53c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Tue, 23 Jul 2024 15:42:05 -0700 Subject: [PATCH 1/5] Removing parquet cosmosim notebook --- conf.py | 4 +- index.md | 1 + tutorials/cosmosims/CosmoDC2_Parquet.md | 250 ------------------------ 3 files changed, 2 insertions(+), 253 deletions(-) delete mode 100644 tutorials/cosmosims/CosmoDC2_Parquet.md diff --git a/conf.py b/conf.py index 4ba368c1..9218b8bd 100644 --- a/conf.py +++ b/conf.py @@ -1,5 +1,3 @@ - - # Configuration file for the Sphinx documentation builder. # # -- Path setup -------------------------------------------------------------- @@ -40,7 +38,7 @@ # MyST-NB configuration nb_execution_timeout = 900 -nb_execution_excludepatterns = ['CosmoDC2_Parquet.md', 'Parallelize_Convolution.md'] +nb_execution_excludepatterns = ['Parallelize_Convolution.md'] # -- Options for HTML output ------------------------------------------------- diff --git a/index.md b/index.md index 072df7fa..0181cdde 100644 --- a/index.md +++ b/index.md @@ -87,6 +87,7 @@ maxdepth: 1 tutorials/openuniversesims/openuniverse2024_roman_simulated_timedomainsurvey tutorials/openuniversesims/openuniverse2024_roman_simulated_wideareasurvey + ``` ## Generally useful techniques diff --git a/tutorials/cosmosims/CosmoDC2_Parquet.md b/tutorials/cosmosims/CosmoDC2_Parquet.md deleted file mode 100644 index 2290715b..00000000 --- a/tutorials/cosmosims/CosmoDC2_Parquet.md +++ /dev/null @@ -1,250 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.16.2 -kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - -# Working with CosmoDC2 Parquet files - -This notebook shows how to work with the bulk-downloadable CosmoDC2 files in Parquet format. Goals include: - -* how to read in a single file -* how to plot galaxy properties -* how to read in selected columns -* how to read from all of the files - -+++ - -### Imports - -The pyarrow package is the underlying engine for reading these files - -```{code-cell} ipython3 -import os - -from astropy.table import Table -import dask.dataframe as dd -import numpy as np -import matplotlib.colors as clr -import matplotlib.pyplot as plt -import pandas as pd -import pyarrow.parquet as pq -``` - -```{code-cell} ipython3 -%matplotlib inline -``` - -### Set data location - -+++ - -Modify the `inpath` to your location for the Parquet files. - -```{code-cell} ipython3 -inpath = '/stage/irsa-data-download10/parquet-work/CosmoDC2' -``` - -Select a single file for the first demonstrations. The files are numbered by HEALPix number with NSIDE=32. The example here is near the center of the 440 sq deg covered by the simulations. - -```{code-cell} ipython3 -infile = 'cosmoDC2_healpix09560.parquet' -``` - -```{code-cell} ipython3 -joined = os.path.join(inpath, infile) -``` - -### Get all column names and types - -+++ - -Read a table of zero length to get the column names. - -```{code-cell} ipython3 -schema = Table.read(joined, schema_only=True) -``` - -The column names are in case-sensitive alphabetical order here. - -```{code-cell} ipython3 -schema -``` - -Alternatively, use the pyarrow package directly. - -```{code-cell} ipython3 -schema = pq.read_schema(os.path.join(inpath, infile)) -``` - -```{code-cell} ipython3 -len(schema) -``` - -Capitalized magnitudes are absolute magnitudes. Descriptions of the columns are not included in the Parquet files, but are in a separate file (to be specified). - -```{code-cell} ipython3 -print(schema) -``` - -### Read only the redshifts as a single Astropy table - -Reading a single column is faster with Parquet files. - -```{code-cell} ipython3 -tab = Table.read(joined, include_names=['redshift']) -``` - -```{code-cell} ipython3 -tab -``` - -### Read in all data from a single file - -```{code-cell} ipython3 -d=pd.read_parquet(joined) -``` - -```{code-cell} ipython3 -len(d) -``` - -### Plot number counts vs halo mass - -```{code-cell} ipython3 -plt.figure() -h,xbins = np.histogram(np.log10(d['halo_mass']),bins=40) -xbins_avg = (xbins[1:]+xbins[:-1])/2.0 -plt.semilogy(xbins_avg, h) -plt.ylabel(r'Galaxy Count') -plt.xlabel(r'log10( M$_{\rm{halo}}$ / M$_\odot)$') -plt.show() -``` - -### Plot g-r colors vs redshift - -```{code-cell} ipython3 -plt.figure() -gal_clr = d['mag_g_sdss']-d['mag_r_sdss'] -plt.hist2d(d['redshift'], gal_clr, bins=100, cmap='PuBu', norm=clr.LogNorm()) -plt.colorbar(label='population density') -plt.ylabel('Observed g-r') -plt.xlabel('redshift') -plt.title('Galaxy Colors in Clusters') -plt.tight_layout() -``` - -### Plot r-i colors vs redshift - -```{code-cell} ipython3 -plt.figure() -gal_clr = d['mag_r_sdss']-d['mag_i_sdss'] -plt.hist2d(d['redshift'], gal_clr, bins=100, cmap='PuBu',norm=clr.LogNorm()) -plt.colorbar(label='population density') -plt.ylabel('r-i') -plt.xlabel('redshift') -plt.title('Galaxy Colors in Clusters') -plt.tight_layout() -plt.show() -``` - -### Read one Parquet file using Pandas, and plot a histogram of redshifts - -```{code-cell} ipython3 -df = pd.read_parquet(joined, columns=['redshift']) -``` - -```{code-cell} ipython3 -num_bins = 20 -n, bins, patches = plt.hist(df.redshift, num_bins, - facecolor='blue', alpha=0.5) -plt.xlabel('Redshift') -plt.ylabel('Number') -plt.title('Redshift Histogram CosmoDC2 Mock Catalog V1 Abridged') -``` - -### Count the number of records in all the files - -+++ - -This section assumes you have downloaded all the files, or at least multiple files. - -```{code-cell} ipython3 -ddf = dd.read_parquet(os.path.join(inpath, '*parquet'), - engine='pyarrow', - columns=['stellar_mass']) -``` - -```{code-cell} ipython3 -%%time -len(ddf) -``` - -### Read multiple files using Dask, and plot the galaxy main sequence in a redshift interval - -+++ - -This section assumes you have downloaded all the files, or at least multiple files. -Read in stellar mass and absolute magnitudes in a redshift interval from 0.5 to 0.54. - -```{code-cell} ipython3 -ddf = dd.read_parquet(os.path.join(inpath, '*parquet'), - engine='pyarrow', - columns=['stellar_mass', - 'Mag_true_g_lsst_z0', - 'Mag_true_r_lsst_z0', - 'Mag_true_i_lsst_z0', - 'Mag_true_z_lsst_z0', - 'Mag_true_y_lsst_z0', - 'redshift'], - filters=[('redshift', '>', 0.5), - ('redshift', '<', 0.54)]) -``` - -The next cell does the data read and can take several minutes. - -```{code-cell} ipython3 -%%time -df2d = ddf.compute() -``` - -```{code-cell} ipython3 -df2d.head() -``` - -```{code-cell} ipython3 -len(df2d) -``` - -Since this results in almost 4 million galaxies, we will construct 2D histograms -rather than scatter plots. - -```{code-cell} ipython3 - -plt.hist2d(df2d.Mag_true_r_lsst_z0, - (df2d.Mag_true_g_lsst_z0 - df2d.Mag_true_r_lsst_z0), - bins=200, cmap='plasma', cmax = 500) - -# Plot a colorbar with label. -cb = plt.colorbar() -cb.set_label('Number') - -# Add title and labels to plot. -plt.title('True magnitudes and colors in 0.5 < z < 0.54') -plt.xlabel('LSST Mag r') -plt.ylabel('LSST rest-frame g-r color') - -# Show the plot. -plt.show() -``` - -```{code-cell} ipython3 - -``` From b2b1a582ea97976dc018394696e2e04f96c6ca2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Wed, 24 Jul 2024 13:46:04 -0700 Subject: [PATCH 2/5] CONT: adding cosmoDC2 TAP access tutorial --- tutorials/cosmodc2/cosmoDC2_TAP_access.md | 183 ++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 tutorials/cosmodc2/cosmoDC2_TAP_access.md diff --git a/tutorials/cosmodc2/cosmoDC2_TAP_access.md b/tutorials/cosmodc2/cosmoDC2_TAP_access.md new file mode 100644 index 00000000..647c4b9f --- /dev/null +++ b/tutorials/cosmodc2/cosmoDC2_TAP_access.md @@ -0,0 +1,183 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + + + +This tutorial demonstrates how to access the CosmoDC2 Mock V1 catalogs. More information about these catalogs can be found here: https://irsa.ipac.caltech.edu/Missions/cosmodc2.html + +# Use IRSA's Virtual Observatory Table Access Protocol (TAP) Service to access these catalogs. + +This catalog can be accessed through IRSA's Table Access Protocol (TAP) service. See https://www.ivoa.net/documents/TAP/ for details on the protocol. This service can be accessed through Python using the pvyo library. + +```{code-cell} ipython3 +import pyvo as vo +``` + +```{code-cell} ipython3 +service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") +``` + +## List the available DC2 tables + +```{code-cell} ipython3 +tables = service.tables +for tablename in tables.keys(): + if not "tap_schema" in tablename: + if "dc2" in tablename: + tables[tablename].describe() +``` + +## Choose the DC2 catalog you want to work with. + +IRSA currently offers 3 versions of the DC2 catalog. + +* ``cosmodc2mockv1_new`` has been optimized to make searches with constraints on stellar mass and redshift fast. + +* ``cosmodc2mockv1`` has been optimized to make searches with spatial constraints fast. + +* ``cosmodc2mockv1_heavy`` is the same as ``cosmodc2mockv1_new``, except that it does not contain galaxies with stellar masses <= 10^7 solar masses. + +If you are new to the DC2 catalog, we recommend that you start with ``cosmodc2mockv1_heavy`` + +```{code-cell} ipython3 +# Choose the abridged table to start with. +# Queries should be faster on smaller tables. + +tablename = 'cosmodc2mockv1_heavy' +``` + +## How many rows are in the chosen table? + +With TAP, you can query catalogs with constraints specified in IVOA Astronomical Data Query Language (ADQL; https://www.ivoa.net/documents/latest/ADQL.html), which is based on SQL. + +```{code-cell} ipython3 +# For example, this snippet of ADQL counts the number of elements in +# the redshift column of the table we chose. +adql = f"SELECT count(redshift) FROM {tablename}" +adql +``` + +```{code-cell} ipython3 +# In order to use TAP with this ADQL string using pyvo, you can do the following: +result = service.search(adql) +result +``` + +The above query shows that there are 597,488,849 redshifts in this table. + ++++ + +## What is the default maximum number of rows returned by the service? + +This service will return a maximum of 2 billion rows by default. + +```{code-cell} ipython3 +service.maxrec +``` + +This default maximum can be changed, and there is no hard upper limit to what it can be changed to. + +```{code-cell} ipython3 +print(service.hardlimit) +``` + +## List the columns in the chosen table + +This table contains 301 columns. + +```{code-cell} ipython3 +columns = tables[tablename].columns +print(len(columns)) +``` + +Let's learn a bit more about them. + +```{code-cell} ipython3 +for col in columns: + print(f'{f"{col.name}":30s} {col.description}') +``` + +## Create a histogram of redshifts + +Let's figure out what redshift range these galaxies cover. Since we found out above that it's a large catalog, we can start with a spatial search over a small area of 0.1 deg. The ADQL that is needed for the spatial constraint is: + +```{code-cell} ipython3 +adql = f"SELECT redshift FROM {tablename} WHERE CONTAINS(POINT('ICRS', RAMean, DecMean), CIRCLE('ICRS',54.218205903,-37.497959343,.1))=1" +adql +``` + +Now we can use the previously-defined service to execute the query with the spatial contraint. + +```{code-cell} ipython3 +cone_results = service.search(adql) +``` + +```{code-cell} ipython3 +# Plot a histogram +import numpy as np +import matplotlib.mlab as mlab +import matplotlib.pyplot as plt + +num_bins = 20 +# the histogram of the data +n, bins, patches = plt.hist(cone_results['redshift'], num_bins, + facecolor='blue', alpha = 0.5) +plt.xlabel('Redshift') +plt.ylabel('Number') +plt.title('Redshift Histogram CosmoDC2 Mock Catalog V1 abridged') +``` + +We can easily see form this plot that the simulated galaxies go out to z = 3. + ++++ + +## Visualize galaxy colors at z ~ 0.5 + +Now let's visualize the galaxy main sequence at z = 2.0. First, we'll do a narrow redshift cut with no spatial constraint. + +Let's do it as an asynchronous search since this might take awhile. + +```{code-cell} ipython3 +service = vo.dal.TAPService("https://irsa.ipac.caltech.edu/TAP") +adql = f"SELECT Mag_true_r_sdss_z0, Mag_true_g_sdss_z0, redshift FROM {tablename} WHERE redshift > 0.5 and redshift < 0.54" +results = service.run_async(adql) +``` + +```{code-cell} ipython3 +len(results['mag_true_r_sdss_z0']) +``` + +```{code-cell} ipython3 +# Since this results in almost 4 million galaxies, +# we will construct a 2D histogram rather than a scatter plot. +plt.hist2d(results['mag_true_r_sdss_z0'], results['mag_true_g_sdss_z0']-results['mag_true_r_sdss_z0'], + bins=200, cmap='plasma', cmax=500) + +# Plot a colorbar with label. +cb = plt.colorbar() +cb.set_label('Number') + +# Add title and labels to plot. +plt.xlabel('SDSS Mag r') +plt.ylabel('SDSS rest-frame g-r color') + +# Show the plot. +plt.show() +``` + +## About this notebook + + * Author: Vandana Desai (IRSA Science Lead) + * Updated: 2024-07-24 + * Contact: https://irsa.ipac.caltech.edu/docs/help_desk.html From f3b9b38615326abf80ce6cb0b30a3151e7798047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Wed, 24 Jul 2024 13:48:50 -0700 Subject: [PATCH 3/5] Adding cosmoDC2 tutorial to the page --- index.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/index.md b/index.md index 0181cdde..3c25e0e4 100644 --- a/index.md +++ b/index.md @@ -25,7 +25,6 @@ tutorials/irsa-sia-examples/siav2_seip ``` -