In [1]:
%pylab inline

import numpy as np
import pandas as pd
import dask.dataframe as dd

# Print the NumPy and pandas versions this notebook was run with.
print(np.__version__, pd.__version__)

Populating the interactive namespace from numpy and matplotlib
1.11.0 0.18.1


Read the 15 GB dataframe stored in an HDF5 file into a dask dataframe.  This allows us to perform out-of-core operations, i.e. to work on data that is too large to fit in memory.

In [2]:
# Open the 'dithered' key of dithered.hdf as a dask dataframe.
# NOTE(review): presumably nothing is loaded until .compute()/.head() is
# called on it — confirm against the dask read_hdf documentation.
df = dd.read_hdf('dithered.hdf', key='dithered')

In [3]:
# List the dask dataframe's columns and their dtypes.
df.info()

<class 'dask.dataframe.core.DataFrame'>
Data columns (total 166 columns):
FLAGS1                                       uint64
FLAGS2                                       uint64
FLAGS3                                       int64
id                                           int64
coord_ra                                     float64
coord_dec                                    float64
parent                                       int64
deblend_nChild                               int64
deblend_psfCenter_x                          float64
deblend_psfCenter_y                          float64
deblend_psfFlux                              float64
base_GaussianCentroid_x                      float64
base_GaussianCentroid_y                      float64
base_NaiveCentroid_x                         float64
base_NaiveCentroid_y                         float64
base_SdssCentroid_x                          float64
base_SdssCentroid_y                          float64
base_SdssCentroid_xSigma           

Now run dask operations and collect the results (which will fit in memory) into ordinary pandas DataFrames.  The cells below select a single patch and keep only some of the columns in the result.

In [4]:
# Only these columns are pulled into memory by the selections that follow;
# the rest of the wide table is left on disk.
requested_columns = [
    'patch',
    'footprint',
    'base_PsfFlux_flux',
    'base_PsfFlux_fluxSigma',
]

This returns all of the entries (no selection), but only the requested columns.

In [5]:
# Build the column projection on the dask dataframe first, then materialize
# it with .compute() as an in-memory pandas DataFrame (every row is kept).
lazy_columns = df[requested_columns]
selected_columns = lazy_columns.compute()
selected_columns.shape

(12311932, 4)

This applies a selection (a single patch) and returns only the requested columns.

In [6]:
# Patch labels are stored with literal single quotes around them
# (e.g. '8,12'), so the value compared against must include those quotes.
# A plain boolean mask expresses this directly and avoids the backslash-escaped,
# doubly nested quoting that a string-evaluated query() expression needs.
in_patch = df['patch'] == "'10,10'"

# Filter the rows, keep only the requested columns, then materialize the
# (small) result as an in-memory pandas DataFrame.
selected_patch = df[in_patch][requested_columns].compute()
selected_patch.shape

(79076, 4)

This selects the first 100 entries and keeps all of the columns.

In [7]:
# head() hands back an ordinary pandas DataFrame (see the first_100.info()
# output further down), so no .compute() call is needed here.
# NOTE(review): dask's head() reads only the first partition by default —
# confirm that partition always holds at least 100 rows.
first_100 = df.head(100)
first_100.shape

(100, 166)

In [8]:
# Preview the first five rows of the 100-row sample.
first_100.head(5)

Unnamed: 0,FLAGS1,FLAGS2,FLAGS3,id,coord_ra,coord_dec,parent,deblend_nChild,deblend_psfCenter_x,deblend_psfCenter_y,...,modelfit_CModel_initial_apCorr,modelfit_CModel_initial_apCorrSigma,base_GaussianFlux_apCorr,base_GaussianFlux_apCorrSigma,modelfit_CModel_dev_apCorr,modelfit_CModel_dev_apCorrSigma,base_ClassificationExtendedness_value,footprint,projectId,patch
0,11970879316054508036,10378695580359589885,766,70381629079553,1.722391,-0.520165,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,1,1,"'8,12'"
1,2533274791051780,13868985135874965504,640,70381629079554,1.72146,-0.513888,0,4,,,...,0.992812,0.0,1.066794,0.0,0.992824,0.0,,2,1,"'8,12'"
2,11970879316012564996,10378695580359589885,766,70381629079555,1.720959,-0.520203,0,0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,3,1,"'8,12'"
3,2463768135275119108,9541026117278760941,640,70381629079556,1.720945,-0.519593,0,0,,,...,0.994504,0.0,1.070237,0.0,0.993761,0.0,,4,1,"'8,12'"
4,11682355097525813764,9559040515788242941,640,70381629079557,1.720937,-0.519563,0,0,,,...,0.994491,0.0,1.070189,0.0,0.993735,0.0,,5,1,"'8,12'"


Let's look at the size of the resulting data frames.

In [9]:
# Row count, dtypes and memory usage of the single-patch selection.
selected_patch.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79076 entries, 260580 to 339655
Data columns (total 4 columns):
patch                     79076 non-null object
footprint                 79076 non-null int64
base_PsfFlux_flux         79076 non-null float64
base_PsfFlux_fluxSigma    79076 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 3.0+ MB


In [10]:
# Row count, dtypes and memory usage of the all-rows, four-column selection.
selected_columns.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12311932 entries, 0 to 12311931
Data columns (total 4 columns):
patch                     object
footprint                 int64
base_PsfFlux_flux         float64
base_PsfFlux_fluxSigma    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 469.7+ MB


In [11]:
# Row count, dtypes and memory usage of the 100-row, all-column sample.
first_100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Columns: 166 entries, FLAGS1 to patch
dtypes: float64(153), int64(10), object(1), uint64(2)
memory usage: 130.5+ KB
