# Xarray
## Paul Eldridge

Xarray is a Python package for working with labeled, multi-dimensional arrays. It is especially well suited to netCDF files.

In [3]:
import datetime
import numpy as np
import pandas as pd
import xarray as xr


### Since this class is based off of the geosciences, we will use a geoscience example. First, we will generate a random temperature dataset, in Kelvin, using numpy

In [6]:
# Synthetic "temperatures" in Kelvin: standard-normal noise centred on 273 K,
# stored in a plain NumPy array of shape (5, 3, 4).
data = np.random.randn(5, 3, 4) + 273
data

array([[[273.6276138 , 272.38656799, 274.26968778, 275.37802285],
        [272.69200018, 273.28536818, 272.88865279, 272.5870791 ],
        [272.15289789, 272.76111158, 274.26321078, 270.686693  ]],

       [[272.95691258, 272.32355586, 273.83247776, 273.06207312],
        [271.72048339, 273.10794271, 273.554869  , 273.27342346],
        [270.4025347 , 271.54893124, 274.81983597, 272.57025397]],

       [[271.20032766, 272.8855977 , 272.613939  , 272.29489548],
        [273.18285194, 272.4851436 , 273.74161712, 274.25606379],
        [273.3037821 , 271.74287482, 271.72422839, 271.4740035 ]],

       [[272.53933419, 272.14297363, 273.06960744, 273.37355491],
        [271.85167663, 274.62733533, 273.49968885, 271.98429329],
        [273.55391066, 272.61100173, 272.72590508, 273.89568052]],

       [[273.33813296, 273.56308935, 271.56004419, 273.60148362],
        [272.22217027, 273.75816271, 271.89071361, 274.97672194],
        [273.03118213, 271.18603253, 271.90092721, 273.75082201]]])

### This dataset is only a numpy array at the moment, so we will need to cast it into an xarray "data array"

In [5]:
# Wrap the raw NumPy array in an xarray DataArray (no dimension names yet).
temps = xr.DataArray(data=data)
temps

# Next, we will assign dimension names to our data array. This is where Xarray data arrays begin to shine over numpy arrays, as this isn't something we can do with numpy arrays.

In [7]:
# Rebuild the DataArray, this time giving each of the three axes a name.
temps = xr.DataArray(data=data, dims=('time', 'lat', 'lon'))
temps

# Xarray is great with "n-dimensional" arrays; in our use case, these dimensions will usually be time, latitude, and longitude. We will also store one or more variables that will be plotted for some latitude and longitude at some time. We are creating, in a sense, a "slideshow" of graphs.

### Now, we will make spacetime coordinates. The times will be handled using pandas and datetime, and the space (lat/lons) will be done with numpy. We will then add these to our Xarray DataArray.

In [8]:
# Five consecutive daily timestamps starting on 2018-01-01 (a pandas DatetimeIndex).
times = pd.date_range(start='2018-01-01', periods=5, freq='D')
times

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# np.linspace(start, stop, num): the last argument is the NUMBER of evenly
# spaced points (both endpoints included), not the step between them.
lons = np.linspace(start=-120, stop=-60, num=4)  # 4 sample longitudes, 20 degrees apart
lats = np.linspace(start=25, stop=55, num=3)     # 3 sample latitudes, 15 degrees apart

# Build the DataArray with both dimension names and the coordinate values
# that label each position along those dimensions.
temps = xr.DataArray(
    data=data,
    coords={'time': times, 'lat': lats, 'lon': lons},
    dims=('time', 'lat', 'lon'),
)

temps

### Next, we will add attributes to our Data Array. We want to know what our data means. 

In [12]:
# Attach attributes (metadata) describing what the values mean.
temps.attrs.update({'units': 'kelvin', 'standard_name': 'air_temperature'})

temps

### Like numpy, it is easy to perform mathematical operations in Xarray. Here, we convert our temperatures from Kelvin to Celsius. Notice that the attributes do not carry over.

In [14]:
# Element-wise arithmetic on a DataArray produces a new DataArray.
# NOTE(review): the notebook text says the attrs (units, standard_name) are not
# carried over to the result; this depends on xarray's keep_attrs setting —
# confirm for the installed version.
tempC = temps - 273.15 # Converting from Kelvin to Celsius
tempC

# Now, we will add another variable that shares these dimensions. Let's do pressures.

In [15]:
# Synthetic pressures: normal noise with standard deviation 2 around 1013.25,
# on the same (5, 3, 4) grid as the temperatures.
pressureRand = np.random.randn(5, 3, 4) * 2 + 1013.25

# Build a DataArray (not a Dataset) on the same time/lat/lon coordinates,
# with its metadata supplied up front.
pressures = xr.DataArray(
    pressureRand,
    coords=[times, lats, lons],
    dims=['time', 'lat', 'lon'],
    attrs={'units': 'hPa', 'standard_name': 'air_pressure'},
)

pressures

In [17]:
# Bundle both DataArrays into a single Dataset, keyed by variable name.
ds = xr.Dataset(data_vars=dict(Temperatures=temps, Pressures=pressures))
ds

# Combining the DataArrays into a Dataset. These arrays share the coordinates.

### If we want to examine one of our variables, we can call that specific array in the following ways:

In [18]:
# Two ways to pull a single variable out of the Dataset.
# Only the last expression in a cell is displayed, so just the second form renders.
ds.Pressures  # attribute-style access

# ...OR...

ds['Pressures']  # dictionary-style access

# What if we want to analyze some variable at a specific time or place?

In [19]:
# .sel() selects data by coordinate value. It takes at least one named
# coordinate and returns the data matching it.
named_selection = temps.sel({'time': '2018-01-02'})
named_selection

In [22]:
# One label-based slice per dimension; keyword order does not affect the result.
temps.sel(
    time=slice('2018-01-01', '2018-01-03'),
    lat=slice(25, 45),
    lon=slice(-110, -70),
)

# In this example, we are slicing along coordinates for a specific range of times, lats, and lons.


In [23]:
# Label-based lookup with .loc[]: a single key is matched against the first
# dimension, which is `time` for this array.
temps.loc['2018-01-01']

In [24]:
# .loc[] takes one label-based selector per dimension, in dimension order.
temps.loc[
    slice('2018-01-01', '2018-01-01'),  # time
    slice(23, 45),                      # lat
    slice(-110, -70),                   # lon
]

# Another selection method, .loc[], similar to pandas

# Like pandas, we can import files directly into a dataset. These files will typically be netCDF files.

In [None]:
# Bare file name, so it is resolved against the current working directory.
filepath = 'NOAAGlobalTemp_v5.0.0_gridded_s188001_e202212_c20230108T133308.nc'

# NOTE(review): this rebinds `ds`, replacing the synthetic Temperatures/Pressures
# Dataset built earlier in the notebook. Re-run that earlier cell before using
# `ds.Pressures` again.
ds = xr.open_dataset(filepath)
ds

# Importing a dataset into Xarray.