


![Simons CMAP logo](https://cmap.readthedocs.io/en/latest/_static/CMAP_logos/CMAP_logo_High_Res.png) 
# In this notebook we will download environmental data using [Simons CMAP](https://simonscmap.com).

## We will create a dataframe that has all the latitude and longitude values that we want to sample, and then use these as targets to sample CMAP

In [1]:
# Set a working directory
#!pip install GitPython
import git
import os

# Search upward from the current directory for the enclosing Git repository,
# then switch the process working directory to that repository's working tree.
# The relative 'data_ingest/...' paths used later in this notebook are resolved
# against this directory.
repo = git.Repo('.', search_parent_directories=True)
repo_root = repo.working_tree_dir
os.chdir(repo_root)

In [2]:
#!pip install global_land_mask
import pandas as pd
import numpy as np
from global_land_mask import globe

## Creating a DataFrame that has every ocean (non-land) lat/lon point we want to sample.


In [3]:
# Candidate grid: whole-degree latitudes from -80 to 80 and longitudes from
# -180 to 180 (both ends inclusive).
lat_list = list(range(-80, 81, 1))
lon_list = list(range(-180, 181, 1))

# Keep only the grid nodes that the land mask does not classify as land.
# Iteration order is latitude-major, longitude-minor.
ocean_points = [
    (grid_lat, grid_lon)
    for grid_lat in lat_list
    for grid_lon in lon_list
    if not globe.is_land(grid_lat, grid_lon)
]

# Parallel coordinate lists: lat[k], lon[k] describe the k-th retained point.
lat = [point[0] for point in ocean_points]
lon = [point[1] for point in ocean_points]

In [4]:
# Assemble the sampling grid from the parallel coordinate lists and show it,
# together with the dtypes before and after the cast below.
predictors = pd.DataFrame({'lat': lat, 'lon': lon})
print(predictors)
print(predictors.dtypes)
# The coordinates start out as integers; store them as 64-bit floats instead.
predictors = predictors.astype({'lat': 'float64', 'lon': 'float64'})
print(predictors.dtypes)

       lat  lon
0      -80 -180
1      -80 -179
2      -80 -178
3      -80 -177
4      -80 -176
...    ...  ...
39989   80  176
39990   80  177
39991   80  178
39992   80  179
39993   80  180

[39994 rows x 2 columns]
lat    int64
lon    int64
dtype: object
lat    float64
lon    float64
dtype: object


#### Picking an arbitrary date and time to sample. This can be changed, but you will then have to re-sample CMAP (the query cells below hard-code the matching dates and month).

In [5]:
# Stamp every grid point with the same calendar date: build a column of the
# date string first, then parse it into pandas datetimes.
date_labels = pd.Series('2023-04-10', index=predictors.index)
predictors['date'] = pd.to_datetime(date_labels)

In [6]:
import ephem
# Finding the time in GMT that corresponds to the sunrise at the given location
def find_sunrise(row):
    """Return the most recent sunrise before ``row['date']`` at the row's position.

    Parameters
    ----------
    row : mapping-like (for example one row of a DataFrame)
        Must provide ``'lat'`` and ``'lon'`` (degrees) and ``'date'``.

    Returns
    -------
    str or None
        ``str()`` of the value returned by ``Observer.previous_rising`` for the
        Sun, or ``None`` when ephem reports that the Sun is circumpolar at this
        location and date (it never rises or never sets), so that there is no
        sunrise to report. ``pd.to_datetime`` turns ``None`` into ``NaT``.
    """
    obs = ephem.Observer()
    # Pass coordinates as strings on purpose: ephem parses strings as degrees,
    # whereas bare floats would be interpreted as radians.
    obs.lat = str(row['lat'])
    obs.long = str(row['lon'])
    obs.date = row['date']

    try:
        sunrise = obs.previous_rising(ephem.Sun())
    except ephem.CircumpolarError:
        # Polar day / polar night: without this guard a single high-latitude
        # row aborts the whole DataFrame.apply when the sampling date changes.
        return None
    return str(sunrise)

# Evaluate find_sunrise once per grid point (row-wise) and store the results
# in a new 'sunrise' column.
sunrise_values = predictors.apply(find_sunrise, axis=1)
predictors['sunrise'] = sunrise_values

In [7]:
# Parse the sunrise values into pandas timestamps.
sunrise_times = pd.to_datetime(predictors['sunrise'])
predictors['sunrise'] = sunrise_times
# The prediction time at every location is four hours after its sunrise.
predictors['time'] = sunrise_times + pd.Timedelta(hours=4)

print(predictors)

        lat    lon       date             sunrise                time
0     -80.0 -180.0 2023-04-10 2023-04-09 20:52:39 2023-04-10 00:52:39
1     -80.0 -179.0 2023-04-10 2023-04-09 20:48:37 2023-04-10 00:48:37
2     -80.0 -178.0 2023-04-10 2023-04-09 20:44:35 2023-04-10 00:44:35
3     -80.0 -177.0 2023-04-10 2023-04-09 20:40:33 2023-04-10 00:40:33
4     -80.0 -176.0 2023-04-10 2023-04-09 20:36:32 2023-04-10 00:36:32
...     ...    ...        ...                 ...                 ...
39989  80.0  176.0 2023-04-10 2023-04-09 14:25:26 2023-04-09 18:25:26
39990  80.0  177.0 2023-04-10 2023-04-09 14:21:29 2023-04-09 18:21:29
39991  80.0  178.0 2023-04-10 2023-04-09 14:17:31 2023-04-09 18:17:31
39992  80.0  179.0 2023-04-10 2023-04-09 14:13:34 2023-04-09 18:13:34
39993  80.0  180.0 2023-04-10 2023-04-09 14:09:37 2023-04-09 18:09:37

[39994 rows x 5 columns]


In [8]:
# SeaFlow's standard measurement depth, applied uniformly to every grid point
seaflow_depth = 5.0
predictors['depth'] = seaflow_depth

In [9]:
# Save the sampling targets as CSV, without the DataFrame index column.
predictors.to_csv('data_ingest/data/original/predictors.csv', index=False)

### Sampling CMAP
We will be using the exact same variables that we used for the SeaFlow data.

In [10]:
import pycmap

# SECURITY: never commit the CMAP API key in the notebook. Export it as the
# CMAP_API_KEY environment variable before starting Jupyter. Any key that was
# previously hard-coded in this cell should be treated as exposed and revoked.
# When the variable is unset, None is passed through, which is pycmap's own
# default for `token`.
# NOTE(review): pycmap is expected to fall back to a previously registered key
# in that case; confirm against the pycmap documentation.
api = pycmap.API(token=os.environ.get('CMAP_API_KEY'))

# Format the prediction time as text. Re-parsing with pd.to_datetime first keeps
# this cell re-runnable: on a second run the column already holds strings, and
# the `.dt` accessor would otherwise raise.
predictors['time'] = pd.to_datetime(predictors['time']).dt.strftime('%Y-%m-%d %H:%M:%S')


In [11]:
# ALK_darwin_clim from the Darwin nutrient climatology table: month 4, depths
# strictly between 4 and 6, inside the -80..80 / -180..180 lat/lon box.
alk_query = '''
    SELECT month, lat, lon, depth, ALK_darwin_clim FROM tblDarwin_Nutrient_Climatology
    WHERE
    lat >= -80 AND lat <= 80 AND
    lon >= -180 AND lon <= 180 AND
    month = 4 AND
    depth > 4 AND depth < 6
    ORDER BY lat, lon
    '''
alk = api.query(alk_query)

In [12]:
# sss_smap from the tblSSS_NRT_cl1 table inside the -80..80 / -180..180 lat/lon
# box, for timestamps from 2023-04-09 (inclusive) up to 2023-04-12 (exclusive).
sss_query = '''
    SELECT [time], lat, lon, sss_smap FROM tblSSS_NRT_cl1
    WHERE
    lat >= -80 AND lat <= 80 AND
    lon >= -180 AND lon <= 180 AND
    [time] >= '2023-04-09' AND [time] < '2023-04-12'
    ORDER BY [time], lat, lon
    '''
sss = api.query(sss_query)

In [13]:
# sst from the tblSST_AVHRR_OI_NRT table inside the -80..80 / -180..180 lat/lon
# box, for timestamps from 2023-04-10 (inclusive) up to 2023-04-11 (exclusive).
sst_query = '''
    SELECT [time], lat, lon, sst FROM tblSST_AVHRR_OI_NRT
    WHERE
    lat >= -80 AND lat <= 80 AND
    lon >= -180 AND lon <= 180 AND
    [time] >= '2023-04-10' AND [time] < '2023-04-11'
    ORDER BY [time], lat, lon
    '''
sst = api.query(sst_query)

In [14]:
# fe, o2, no3, po4 and si from the tblPisces_Forecast_cl1 table for 2023-04-10,
# at depths strictly between 4 and 6. The ROUND(...) = ... conditions keep only
# rows whose lat and lon are whole numbers, matching the whole-degree grid.
pisces_query = '''
    SELECT [time], lat, lon, fe, o2, no3, po4, si FROM tblPisces_Forecast_cl1
    WHERE
    lat >= -80 AND lat <= 80 AND
    lon >= -180 AND lon <= 180 AND
    ROUND(lat) = lat AND
    ROUND(lon) = lon AND
    [time] >= '2023-04-10' AND [time] < '2023-04-11' AND
    depth > 4 AND depth < 6
    ORDER BY [time], lat, lon
    '''
pisces = api.query(pisces_query)

In [15]:
import pyarrow as pa
import pyarrow.parquet as pq

In [16]:
# Convert each query result (a pandas DataFrame) into a pyarrow Table.
# The conversions run in the order sss, sst, alk, pisces.
sss_table, sst_table, alk_table, pisces_table = (
    pa.Table.from_pandas(frame) for frame in (sss, sst, alk, pisces)
)

In [17]:
# Write each Arrow table to its own Parquet file under data_ingest/data/original.
parquet_outputs = [
    (sss_table, 'data_ingest/data/original/sss.parquet'),
    (sst_table, 'data_ingest/data/original/sst.parquet'),
    (alk_table, 'data_ingest/data/original/alk.parquet'),
    (pisces_table, 'data_ingest/data/original/pisces.parquet'),
]
for arrow_table, parquet_path in parquet_outputs:
    pq.write_table(arrow_table, parquet_path)

In [18]:
# pd.DataFrame.to_csv(alk, 'data_ingest/data/original/alk.csv', index=False)
# pd.DataFrame.to_csv(sss, 'data_ingest/data/original/sss.csv', index=False)
# pd.DataFrame.to_csv(sst, 'data_ingest/data/original/sst.csv', index=False)
# pd.DataFrame.to_csv(pisces, 'data_ingest/data/original/pisces.csv', index=False)