In [1]:
"""
    These are the modules we'll be using. 
    The %pip command is a special Jupyter command that allows us to load packages locally for the notebook,
    instead of installing them
"""

%pip install numpy
%pip install h5py
%pip install pandas
%pip install openpyxl

import numpy as np
import h5py
import pandas

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no long

In [2]:
# First step, let's generate some data
Npts = 100
gen_rand_dat = lambda : np.random.default_rng().standard_normal(Npts)
x = gen_rand_dat()
y = gen_rand_dat()
z = gen_rand_dat()

"""
    We also want to give our data attributes so people looking at the data can know what is being presented
"""
headers = {'x': 'bees', 'y': 'dogs', 'z': 'pineapples'}

In [3]:
"""
    First option, storing data as a comma-separated values (CSV) file. Two ways to do this. 
    
    First way is without Numpy. We're going to first write the header to a string.
    Then for each element of (x,y,z), we append the value to the string, tstr.
"""
tstr = ''
nkeys = len(headers.keys())
for ikey,akey in enumerate(headers):
    if ikey == nkeys - 1:
        # if this is the last entry, write a newline character, \n
        lchar = '\n'
    else:
        # otherwise, use the data separator. here a comma for CSV
        lchar = ','
    tstr += '{:}{:}'.format(headers[akey],lchar)
    
for i in range(len(x)):
    tstr += '{:}, {:}, {:}\n'.format(x[i],y[i],z[i])
    
"""
    Now we can actually write the data to a file.
    'tmp_fl' is a variable that stores the file we are writing to.
    'w' means open the file in write mode, and '+' means make a new file if it doesn't exist.
    If you just want to read data from a file, replace 'w+' with 'r'.
"""
with open('./CSV_std_python.csv','w+') as tmp_fl:
    tmp_fl.write(tstr)

In [4]:
"""
    Second way, use Numpy. This is extraodrinarily easy when your data is saved as a Numpy array
    
    We still have to make the header string, but this is the same as before
"""
hstr = ''
nkeys = len(headers.keys())
for ikey,akey in enumerate(headers):
    if ikey == nkeys - 1:
        lchar = ''
    else:
        lchar = ','
    hstr += '{:}{:}'.format(headers[akey],lchar)
    
"""
    And that's it! Notice that Numpy defaults to scientific format for saving a file.
    You can use the fmt option to specify a different numeric format, see here:
        https://numpy.org/doc/stable/reference/generated/numpy.savetxt.html
        
    Numpy expects the data to be in an array format, so we transpose the individual columns (x,y,z) into an array, 
    tdat
"""
tdat = np.transpose((x,y,z))
numpy_file = 'CSV_numpy.csv'
np.savetxt(numpy_file,tdat,delimiter=',',header = hstr)

In [5]:
"""
    To read in data from a CSV using Numpy, we use genfromtxt
    
    We need to tell numpy how many header (string) lines are present so it can skip them
    There are more options, like specifying whether a column should be integers, floats, etc.
"""
newdat = np.genfromtxt(numpy_file,delimiter=',',skip_header=1)
print(newdat)

[[ 0.38703258 -1.16858028 -1.31020582]
 [-1.21626495 -1.23521801  0.32991304]
 [ 0.46626521  1.48910423 -1.72275824]
 [-1.04870721  0.27853479 -1.6564019 ]
 [-2.10637432  1.33789622 -0.36462733]
 [ 0.04158944  0.7399749   0.29959579]
 [-1.1888016  -0.22744891  2.43123419]
 [ 1.4269652   2.83265892  1.09923481]
 [ 1.02505994 -1.1448108  -0.38191974]
 [-0.02653877 -0.3193336   2.0913984 ]
 [-0.37915203  0.74051077  1.33936682]
 [ 0.29320314  0.23724278 -1.78254977]
 [-1.86603777  0.85166762 -0.47350371]
 [ 1.31241293  1.00067909 -1.19665822]
 [-1.0168521  -2.06286192  0.25191383]
 [ 1.26287022 -0.33143395 -0.23282828]
 [ 0.64480542 -1.39291442 -1.12213131]
 [-1.59130327  0.64512638  1.58083265]
 [ 0.21657878 -0.20515405 -1.47987031]
 [ 1.7564697   0.50582011 -0.56682325]
 [ 1.86468056 -0.32250499 -0.8266571 ]
 [-0.18247334  0.020184   -1.20162144]
 [-0.35250307 -0.6230851  -1.1708306 ]
 [ 0.81592922 -0.63999838 -0.40699362]
 [-1.08266074  1.12603847 -2.42828016]
 [-0.98877644  0.49532996

In [6]:
"""
    If you're collaborating with less tech savvy people, it's best to use either CSV, 
    which can be read by Excel, or Excel itself.
    
    Pandas lets you make excel files directly and is very handy for this.
    Notice that Pandas also wants the data as an array, so we reuse tdat
    We can specify the column names using the columns keyword
"""
tmppd = pandas.DataFrame(tdat, columns=[headers[akey] for akey in headers])

"""
    Now we can save this to an excel file
    
    index = False turns off the default indexing used by Pandas (0, 1,..., N_data)
"""
pandas_file = 'some_data.xlsx'
xclw = pandas.ExcelWriter(pandas_file,engine='xlsxwriter')
tmppd.to_excel(xclw,index=False)
xclw.save()

In [7]:
"""
    Another nice feature of Pandas is reading CSV's into numpy arrays. 
    This lets you manipulate data easily.
"""

tmpdat = pandas.read_excel(pandas_file)
print(tmpdat,'\n\n')

"""
    Access any single column like a dictionary
"""
print('x',tmpdat[headers['x']],'\n\n')

"""
    And operate on any two columns elementwise
"""
pdiff = 200*(tmpdat[headers['x']] - tmpdat[headers['y']])/(tmpdat[headers['x']] + tmpdat[headers['y']])
print(pdiff)

        bees      dogs  pineapples
0   0.387033 -1.168580   -1.310206
1  -1.216265 -1.235218    0.329913
2   0.466265  1.489104   -1.722758
3  -1.048707  0.278535   -1.656402
4  -2.106374  1.337896   -0.364627
..       ...       ...         ...
95 -1.579770  0.041551    0.533954
96  1.551070  1.305702   -0.562450
97 -1.939305 -1.363541   -0.864700
98 -0.125230  0.085147    1.045807
99  0.155444 -0.854324   -0.230752

[100 rows x 3 columns] 


x 0     0.387033
1    -1.216265
2     0.466265
3    -1.048707
4    -2.106374
        ...   
95   -1.579770
96    1.551070
97   -1.939305
98   -0.125230
99    0.155444
Name: bees, Length: 100, dtype: float64 


0     -398.085197
1       -1.546253
2     -104.618493
3      344.661006
4      896.387426
         ...     
95     210.805068
96      17.178012
97      34.864694
98    1049.687892
99    -288.967194
Length: 100, dtype: float64


In [9]:
"""
    Last, if you have a huge data file (like the charge density on a grid), you might want to use an HDF5 file.
    This stores large data efficiently by keywords
"""
hdf5_file = 'practice_HDF5.hdf5'
tfl = h5py.File(hdf5_file, 'w')

# HDF5 is hierarchical (like a directory), so we first make a group

tgrp = tfl.create_group('Main data')
# Next we make the datasets for x, y, and z. These are accessible by the header of x, y, and z
tgrp.create_dataset(headers['x'],data=x)
tgrp.create_dataset(headers['y'],data=y)
tgrp.create_dataset(headers['z'],data=z)

# Suppose we have related data, but want to store it separately. 
# We computed the percent difference between x and y earlier, let's save that
tgrp2 = tfl.create_group('Aux data')
pdset = tgrp2.create_dataset('Percent diff',data=np.array(pdiff))

# We can also add attributes to data. You could store computational parameters here, or notes
tgrp.attrs['Note'] = 'The main data for this paper'
pdset.attrs['Metadata'] = 'Percent difference between x and y'

tfl.close()

In [13]:
"""
    Now reading in an HDF5 file
"""

tfl2 = h5py.File(hdf5_file, "r")

# we can see all available groups in the HDF5 file
print(tfl2.keys())

# and any attributes
for akey in tfl2.keys():
    print(akey,tfl2[akey].attr)

# now let's open one data set

<KeysViewHDF5 ['Aux data', 'Main data']>


AttributeError: 'Group' object has no attribute 'attr'