# Chlorophyll A Data Vis Workbook
### Amy Phung, August 2020

This notebook provides some minor revisions to the original figures presented in the extended abstract notebook. Notably:
+ At the time of writing, the original flow-through data did not have a fully up-to-date satellite dataset as a basis of comparison. The data was compared to the most recent data at the time as a stand-in. In this notebook, this figure is updated using currently available data.
+ The .nc files are now directly used (significant time & space improvement over converting data to a csv)

## Setup

In [133]:
from netCDF4 import Dataset
import numpy as np
import pandas as pd
import glob
import datetime
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## BGC Argo
The satellite data for the BGC Argo comparison was fully up-to-date in the original, so the original results are loaded here

In [34]:
compiled_results = pd.read_csv('data/compiled-data/3-31results2.csv', index_col=0)
compiled_results.head()

Unnamed: 0,Src,Lat,Long,Chl-A,Timestamp,Station,Sat Chl-A,Sat Lat,Sat Long,Sat File
0,BGC,-32.465,185.242,0.598395,1501384000.0,1,0.220572,-32.479168,185.270828,data/satellite-data/autogenerated-csv\V2017209...
1,BGC,-32.267,185.3,0.628195,1502250000.0,2,0.276144,-32.270832,185.3125,data/satellite-data/autogenerated-csv\V2017217...
2,BGC,-32.298,185.256,0.415293,1503110000.0,3,0.211113,-32.3125,185.270828,data/satellite-data/autogenerated-csv\V2017225...
3,BGC,-32.56,185.326,0.51482,1503982000.0,4,0.181649,-32.5625,185.395828,data/satellite-data/autogenerated-csv\V2017233...
4,BGC,-32.979,185.25101,0.363909,1504851000.0,5,0.204564,-32.979168,185.270828,data/satellite-data/autogenerated-csv\V2017249...


## Satellite Data Update
Fully up-to-date satellite data was not available when creating the original figures, so those figures are updated here

#### Satellite Data Class

In [122]:
class SatelliteDataset():
    def __init__(self, directory):
        self.lookup_df = self._createLookupTable(directory)
        
    def _parseTimestamps(self, files):
        ts_starts, ts_ends = [], []

        for f in files:
            # Parse filename to find date
            start_idx = f.find("V") + 1 
            year_st  = int(f[start_idx:start_idx + 4]) # Year is 4 chars long 
            day_st   = int(f[start_idx + 4:start_idx + 7]) # Day is 3 chars long
            year_end = int(f[start_idx + 7:start_idx + 11]) 
            day_end  = int(f[start_idx + 11:start_idx + 14]) 

            # Convert from date to timestamp
            time_st = datetime.datetime(year_st, 1, 1) \
                      + datetime.timedelta(day_st - 1)
            time_end = datetime.datetime(year_end, 1, 1) \
                      + datetime.timedelta(day_end)
        
            # Add to output array
            ts_starts.append(time_st.timestamp())
            ts_ends.append(time_end.timestamp())
            
        return ts_starts, ts_ends
    
    def _createLookupTable(self, directory):
        """ Creates lookup table for finding appropriate file by time """
        file_list = glob.glob('%s/*.nc' % directory)
        ts_starts, ts_ends = self._parseTimestamps(file_list)
        
        l_df = pd.DataFrame([file_list, ts_starts, ts_ends]).T
        l_df.columns=["Filename", "Time Start", "Time End"]
        l_df = l_df.sort_values(by=["Time Start"])
        l_df.reset_index(inplace=True, drop=True)
        return l_df
    
    def findBestFile(self, timestamp):
        """For a given timestamp, finds the best satellite data file"""

        valid_st = self.lookup_df[(timestamp >= self.lookup_df["Time Start"])]
        if len(valid_st) == 0: # Timestamp is earlier than available data
            print("Warning: Timestamp is earlier than available data. Using first available file")
            return str(self.lookup_df.iloc[0]["Filename"])
        
        valid_end = valid_st[timestamp < valid_st["Time End"]]
        if len(valid_end) == 0: # Timestamp is later than available data
            print("Warning: Timestamp is later than available data. Using last available file")
            return str(self.lookup_df.iloc[-1]["Filename"])
        return str(valid_end.iloc[-1]["Filename"])

In [125]:
# Initialize satellite dataset
sat_data = SatelliteDataset('data/satellite-data')

In [147]:
# Create list of new satellite files
new_sat_list = compiled_results[compiled_results["Src"] == "FT"]["Timestamp"].parallel_apply(sat_data.findBestFile)

#### Compute new chl-a, lats, long values for all flow-through points

In [223]:
def lookupChlorophyll(df_row, lons, lats, chls):
    """Looks up nearest satellite point for each flow-through 
    point and returns the corresponding chlorophyll value"""
   
    
    lon_idx = abs(lons-df_row["Long"]).argmin()
    lat_idx = abs(lats-df_row["Lat"]).argmin()
    
    chl_val = chls[lat_idx][lon_idx]
    
    return pd.Series(chl_val)

temp_sat_list = []

for f in new_sat_list.unique():
    fh = Dataset(f, mode='r')
    lons = fh.variables['lon'][:]
    lats = fh.variables['lat'][:]
    chls = fh.variables['chlor_a'][:]
    fh.close()

    # Extract flow-through data with current sat file name
    new_sat_df = compiled_results[compiled_results["Src"] == "FT"] \
                                 [new_sat_list == f].head() \
                                 .parallel_apply(lookupChlorophyll, axis=1, args=(lons,lats,chls,))
    temp_sat_list.append(new_sat_df)

In [225]:
new_sat_data = pd.concat(temp_sat_list)
new_sat_data

Unnamed: 0,0
93,1.068479
94,1.068479
95,1.068479
96,1.068479
97,1.068479
7919,0.091004
7920,0.089683
7921,0.089683
7922,0.089683
7923,0.089683


In [164]:
[new_sat_list[new_sat_list == f] == f]

[93      True
 94      True
 95      True
 96      True
 97      True
 98      True
 99      True
 100     True
 101     True
 102     True
 103     True
 104     True
 105     True
 106     True
 107     True
 108     True
 109     True
 110     True
 111     True
 112     True
 113     True
 114     True
 115     True
 116     True
 117     True
 118     True
 119     True
 120     True
 121     True
 122     True
         ... 
 7889    True
 7890    True
 7891    True
 7892    True
 7893    True
 7894    True
 7895    True
 7896    True
 7897    True
 7898    True
 7899    True
 7900    True
 7901    True
 7902    True
 7903    True
 7904    True
 7905    True
 7906    True
 7907    True
 7908    True
 7909    True
 7910    True
 7911    True
 7912    True
 7913    True
 7914    True
 7915    True
 7916    True
 7917    True
 7918    True
 Name: Timestamp, Length: 7826, dtype: bool]

In [153]:
# compiled_results[compiled_results["Src"] == "FT"]

In [120]:
# compiled_results[compiled_results["Src"] == "FT"]
print(sat_data.lookup_df)
print(sat_data.findBestFile(1584763200.0))

In [None]:




my_example_nc_file = 'data/satellite-data/V20200412020048.L3m_8D_SNPP_CHL.x_chlor_a.nc'
fh = Dataset(my_example_nc_file, mode='r')

lons = fh.variables['lon'][:]
lats = fh.variables['lat'][:]
chls = fh.variables['chlor_a'][:]

fh.close()

In [8]:
fh.variables.keys()

odict_keys(['chlor_a', 'lat', 'lon', 'palette'])

In [27]:
lon, lat = np.meshgrid(lons, lats)

chl=np.squeeze(chls)

print(lon.shape)
print(lat.shape)
print(chl.shape)
# df = pd.DataFrame([lon, lat, chl])

(480, 8640)
(480, 8640)
(480, 8640)


In [16]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8630,8631,8632,8633,8634,8635,8636,8637,8638,8639
0,-179.979,-179.938,-179.896,-179.854,-179.812,-179.771,-179.729,-179.688,-179.646,-179.604,...,179.604172,179.645844,179.687515,179.729172,179.770844,179.812515,179.854172,179.895844,179.937515,179.979172
1,-30.0208,-30.0625,-30.1042,-30.1458,-30.1875,-30.2292,-30.2708,-30.3125,-30.3542,-30.3958,...,,,,,,,,,,
2,"[0.05987856, 0.059922937, 0.060528573, 0.06586...","[0.060319662, 0.057855785, 0.056257866, 0.0559...","[0.057511065, 0.05571203, 0.05496283, 0.057378...","[0.06274808, 0.059540287, 0.058732588, 0.06194...","[0.07171011, 0.07580612, 0.06978024, 0.0661913...","[0.06983888, 0.06841403, 0.06936561, 0.0703758...","[0.07597647, 0.06938917, 0.06562619, 0.0648491...","[0.07352852, 0.067037374, 0.06266423, 0.061499...","[0.064043775, 0.06602199, 0.066545814, 0.06487...","[0.071973175, 0.081027456, 0.08886131, 0.09520...",...,,,,,,,,,,


In [9]:
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap

# Get some parameters for the Stereographic Projection
lon_0 = lons.mean()
lat_0 = lats.mean()

m = Basemap(width=5000000,height=3500000,
            resolution='l',projection='stere',\
            lat_ts=40,lat_0=lat_0,lon_0=lon_0)

# Because our lon and lat variables are 1D,
# use meshgrid to create 2D arrays
# Not necessary if coordinates are already in 2D arrays.
lon, lat = np.meshgrid(lons, lats)
xi, yi = m(lon, lat)

# Plot Data
cs = m.pcolor(xi,yi,np.squeeze(tmax))

# Add Grid Lines
m.drawparallels(np.arange(-80., 81., 10.), labels=[1,0,0,0], fontsize=10)
m.drawmeridians(np.arange(-180., 181., 10.), labels=[0,0,0,1], fontsize=10)

# Add Coastlines, States, and Country Boundaries
m.drawcoastlines()
m.drawstates()
m.drawcountries()

# Add Colorbar
cbar = m.colorbar(cs, location='bottom', pad="10%")
cbar.set_label(tmax_units)

# Add Title
plt.title('DJF Maximum Temperature')

plt.show()

ImportError: No module named 'mpl_toolkits.basemap'