


![Simons CMAP logo](https://cmap.readthedocs.io/en/latest/_static/CMAP_logos/CMAP_logo_High_Res.png) 
# In this notebook we will download SeaFlow and environmental data using [Simons CMAP](https://simonscmap.com).

Below are the datasets that will be used.
The end goal is to create a dataset that has the variables listed below.



#### SeaFlow:
 - time
 - lat
 - lon
 - biomass
 - CruiseName


In this notebook we will also use <u>depth</u> and <u>cruise</u> to match with other available CMAP dataframes. 


#### Mercator Pisces Biogeochemistry blended observations and model:
 - NO3
 - PO4
 - Fe
 - Si
 - Alk


#### NOAA Satellite Sea Surface Temperature: 
 - sst


#### SMAP Satellite Sea Surface Salinity:
 - sss


#### Copernicus Satellite SSH derived current speed:
 - ugos 
 - vgos




# Loading Functions

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Make the root of the enclosing git repository the working directory, so the
# relative data paths used in later cells resolve.
# Requires GitPython:  pip install GitPython
import git
import os

# Walk upward from the current directory until a git repository is found.
repo = git.Repo('.', search_parent_directories=True)
repo_root = repo.working_tree_dir

os.chdir(repo_root)


## Creating Real Time data set
### Datetime is in UTC

## The covariate SeaFlow dataset contains all cruises on which SeaFlow has been run as of 2023

In [3]:

import pandas as pd

covari_path = 'data_ingest/data/original/SeaFlow_cmap_v1/SeaFlow_cmap_v1.6.xlsx'

# Shorter, easier-to-read column names that replace the header of the sheet.
covari_cols = [
    'time', 'lat', 'lon', 'depth', 'cruise',
    'abundance_prochloro', 'abundance_synecho', 'abundance_picoeuk', 'abundance_croco',
    'diam_prochloro', 'diam_synecho', 'diam_picoeuk', 'diam_croco',
    'Qc_prochloro', 'Qc_synecho', 'Qc_picoeuk', 'Qc_croco',
    'biomass_prochloro', 'biomass_synecho', 'biomass_picoeuk', 'biomass_croco',
]

# Read the first sheet of the workbook into a pandas DataFrame.
covari_raw = pd.read_excel(covari_path, sheet_name=0, names=covari_cols)

# Skip the first 2 rows: they are a repeat when the sheet is loaded this way.
covari = covari_raw.iloc[2:]

covari.head(3)

Unnamed: 0,time,lat,lon,depth,cruise,abundance_prochloro,abundance_synecho,abundance_picoeuk,abundance_croco,diam_prochloro,...,diam_picoeuk,diam_croco,Qc_prochloro,Qc_synecho,Qc_picoeuk,Qc_croco,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco
2,2010-05-21 20:17:20,44.2505,-124.1677,5,W1005A,0.082249,2.352316,5.753303,0.045237,1.058705,...,0.826935,1.744986,0.208296,0.038762,0.091634,0.629223,0.017132,0.09118,0.527201,0.028464
3,2010-05-21 20:23:20,44.2519,-124.1685,5,W1005A,0.164716,2.437801,5.583881,0.037061,0.793225,...,0.826297,1.527685,0.082608,0.038834,0.091452,0.446465,0.013607,0.094669,0.510656,0.016547
4,2010-05-21 20:26:20,44.2519,-124.1687,5,W1005A,0.041195,2.44287,5.487189,0.028837,0.655232,...,0.836938,1.575126,0.050267,0.039896,0.094522,0.483121,0.002071,0.097461,0.518658,0.013932


In [4]:
# Cruise KOK1512 has no lat/lon values, so its rows are removed.
has_position = covari['cruise'].ne('KOK1512')
covari = covari.loc[has_position]

In [5]:
def ChangeObjectTypes(df):
    """
    Return a copy of ``df`` with column dtypes set for CMAP colocalization.

    The columns 'PopulationName', 'cruise' and 'time' (when present) are cast
    to strings; every other column is converted with ``pd.to_numeric``.

    The conversion is done on a copy, so the caller's frame is never modified.
    This also avoids pandas' SettingWithCopyWarning when ``df`` is a filtered
    slice of another DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be converted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a non-string column holds a
        value that cannot be parsed as a number.
    """
    string_columns = {'PopulationName', 'cruise', 'time'}

    converted = df.copy()
    for column in converted.columns:
        if column in string_columns:
            # changing to string
            converted[column] = converted[column].astype(str)
        else:
            # changing to numeric type
            converted[column] = pd.to_numeric(converted[column])
    return converted
# Convert the SeaFlow columns to string / numeric dtypes and rebind `covari` to the result.
covari = ChangeObjectTypes(covari)       

## Keeping only the data variables that we will be using for the machine learning model

In [6]:
# Columns retained for the machine learning model.
model_columns = [
    'time', 'lat', 'lon', 'depth', 'cruise',
    'abundance_prochloro', 'abundance_synecho', 'abundance_picoeuk', 'abundance_croco',
    'diam_prochloro', 'diam_synecho', 'diam_picoeuk', 'diam_croco',
    'Qc_prochloro', 'Qc_synecho', 'Qc_picoeuk', 'Qc_croco',
    'biomass_prochloro', 'biomass_synecho', 'biomass_picoeuk', 'biomass_croco',
]

# Keep those columns, discard any row with a missing value, and renumber the
# index from 0.
covari = covari.loc[:, model_columns].dropna().reset_index(drop=True)
covari

Unnamed: 0,time,lat,lon,depth,cruise,abundance_prochloro,abundance_synecho,abundance_picoeuk,abundance_croco,diam_prochloro,...,diam_picoeuk,diam_croco,Qc_prochloro,Qc_synecho,Qc_picoeuk,Qc_croco,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco
0,2010-05-21 20:17:20,44.2505,-124.1677,5,W1005A,0.082249,2.352316,5.753303,0.045237,1.058705,...,0.826935,1.744986,0.208296,0.038762,0.091634,0.629223,0.017132,0.091180,0.527201,0.028464
1,2010-05-21 20:23:20,44.2519,-124.1685,5,W1005A,0.164716,2.437801,5.583881,0.037061,0.793225,...,0.826297,1.527685,0.082608,0.038834,0.091452,0.446465,0.013607,0.094669,0.510656,0.016547
2,2010-05-21 20:26:20,44.2519,-124.1687,5,W1005A,0.041195,2.442870,5.487189,0.028837,0.655232,...,0.836938,1.575126,0.050267,0.039896,0.094522,0.483121,0.002071,0.097461,0.518658,0.013932
3,2010-05-21 20:29:20,44.2519,-124.1686,5,W1005A,0.123568,2.397221,5.465828,0.057665,0.863134,...,0.828218,2.163342,0.102344,0.037872,0.092002,1.100616,0.012646,0.090787,0.502864,0.063467
4,2010-05-21 20:38:20,44.2530,-124.1752,5,W1005A,0.123396,2.644784,5.569264,0.024679,0.756068,...,0.846435,1.430712,0.072723,0.039015,0.097314,0.377199,0.008974,0.103186,0.541965,0.009309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121471,2012-02-06 03:57:38,34.5645,-122.2759,5,Tokyo_4,199.265903,286.071037,209.833832,0.090759,0.438957,...,0.930074,3.097573,0.017883,0.029180,0.124096,2.765768,3.563395,8.347600,26.039451,0.251017
121472,2012-02-06 04:06:39,34.5572,-122.2139,5,Tokyo_4,185.432911,254.462842,199.292507,0.091798,0.435216,...,0.920438,0.732544,0.017492,0.029180,0.120805,0.067027,3.243594,7.425268,24.075548,0.006153
121473,2012-02-06 04:24:40,34.5421,-122.0920,5,Tokyo_4,165.296363,235.585959,191.344464,0.092170,0.437831,...,0.903773,1.965274,0.017764,0.028334,0.115243,0.855090,2.936403,6.675133,22.051099,0.078813
121474,2012-02-06 04:27:40,34.5399,-122.0718,5,Tokyo_4,146.827747,204.563909,190.944719,0.092022,0.445211,...,0.910481,3.351962,0.018547,0.028721,0.117462,3.390411,2.723253,5.875356,22.428844,0.311991


In [7]:
# Inspect the dtype of every column in the cleaned frame.
covari.dtypes

time                    object
lat                    float64
lon                    float64
depth                    int64
cruise                  object
abundance_prochloro    float64
abundance_synecho      float64
abundance_picoeuk      float64
abundance_croco        float64
diam_prochloro         float64
diam_synecho           float64
diam_picoeuk           float64
diam_croco             float64
Qc_prochloro           float64
Qc_synecho             float64
Qc_picoeuk             float64
Qc_croco               float64
biomass_prochloro      float64
biomass_synecho        float64
biomass_picoeuk        float64
biomass_croco          float64
dtype: object

# Using Simons CMAP to gather additional features

#### First installing and importing pycmap 

In [8]:
# !pip install pycmap
import pycmap

#### Setting API

In [9]:
# Read the CMAP API key from the environment instead of hardcoding it in the
# notebook, so the credential is never committed with the file.
# Set it before starting Jupyter, e.g.  export CMAP_API_KEY=<your key>
# A missing variable raises KeyError('CMAP_API_KEY').
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and revoked.
api = pycmap.API(token=os.environ['CMAP_API_KEY'])

### Prepping covariate data for colocalization using Simons CMAP

In [12]:
# Build a time-indexed copy for resampling. `covari` itself is left untouched,
# so this cell can be re-run safely: setting the index in place on `covari`
# would make a second run fail because the 'time' column is gone.
covari_by_time = (covari
                  .assign(time=pd.to_datetime(covari['time']))
                  .set_index('time')
                 )

# Separate numeric and object (string) columns
covari_cmap_numeric = covari_by_time.select_dtypes(include=[np.number])
covari_cmap_non_numeric = covari_by_time.select_dtypes(exclude=[np.number])

# Average numeric columns by hour, take first data point in each hour for object columns.
# 'h' is the hourly frequency alias; the upper-case 'H' spelling is deprecated in recent pandas.
# NOTE(review): the resampling runs on one global timeline, not per cruise, so
# rows from different cruises that fall in the same hour would be averaged
# together -- confirm that cruises never overlap in time.
covari_cmap_numeric_hourly = covari_cmap_numeric.resample('h').mean()
covari_cmap_non_numeric_hourly = covari_cmap_non_numeric.resample('h').first()

# Recombine, drop the hours that contain no data (all-NaN after resampling),
# and turn the time index back into a regular column.
covari_cmap_hourly = (pd
                      .concat([covari_cmap_non_numeric_hourly, covari_cmap_numeric_hourly], axis=1)
                      .dropna()
                      .reset_index()
                     )
covari_cmap_hourly.head(3)


Unnamed: 0,time,cruise,lat,lon,depth,abundance_prochloro,abundance_synecho,abundance_picoeuk,abundance_croco,diam_prochloro,...,diam_picoeuk,diam_croco,Qc_prochloro,Qc_synecho,Qc_picoeuk,Qc_croco,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco
0,2009-11-07 23:00:00,TN243,47.455531,-122.408963,5.0,0.296867,0.181126,6.632747,0.261147,0.701349,...,1.92703,5.283986,0.060232,0.19843,0.813686,11.023746,0.017932,0.035915,5.395694,2.877982
1,2009-11-08 00:00:00,TN243,47.4261,-122.3961,5.0,0.068834,0.178967,4.130016,0.199617,0.645218,...,1.971641,6.57395,0.048309,0.25352,0.862262,19.274369,0.003325,0.045372,3.561155,3.8475
2,2009-11-08 17:00:00,TN243,47.343433,-122.540133,5.0,0.068775,0.090553,1.940754,0.111194,0.706921,...,1.998109,5.294829,0.061427,0.206915,0.895059,11.140859,0.004225,0.018923,1.720945,1.247962


In [13]:
# Build the frame passed to CMAP as a new object whose 'time' column is
# formatted as 'YYYY-MM-DD HH:MM:SS' text.
# `.assign` returns a new frame, so `covari_cmap_hourly` keeps its datetime
# column and this cell can be re-run without failing on `.dt`.
# NOTE(review): presumably pycmap.Sample wants time as a string -- confirm.
covari = covari_cmap_hourly.assign(
    time=covari_cmap_hourly['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
)

## Querying CMAP
This takes ~2 hours, so only run it if you need new data. The repository already contains this data, so running this cell is not needed for the model to run.

In [14]:
# CMAP tables to colocalize with the SeaFlow observations.
# Each entry maps a CMAP table name to the variables requested from it and to
# the matching tolerances, given in this order:
#   temporal [days], meridional [deg], zonal [deg], vertical [m]
targets = {
    # Darwin Biogeochemistry Climatology Model
    "tblDarwin_Nutrient_Climatology": {
        "variables": ["ALK_darwin_clim"],
        "tolerances": [1, 0.5, 0.5, 5],
    },
    # CMAP sea surface salinity
    "tblSSS_NRT_cl1": {
        "variables": ['sss'],
        "tolerances": [7, 1, 1, 6],
    },
    # CMAP sea surface temperature
    "tblSST_AVHRR_OI_NRT": {
        "variables": ['sst'],
        "tolerances": [1, 0.5, 0.5, 5],
    },
    # CMAP SSH velocity
    "tblAltimetry_REP_Signal": {
        "variables": ['ugos', 'vgos'],
        "tolerances": [1, 0.5, 0.5, 5],
    },
    # Pisces 2011-19
    "tblPisces_NRT": {
        "variables": ['Fe', 'O2', 'NO3', 'PO4', 'Si'],
        "tolerances": [7, 1, 1, 5],
    },
    # Pisces 2019-21
    "tblPisces_Forecast": {
        "variables": ['fe', 'o2', 'no3', 'po4', 'si'],
        "tolerances": [1, 0.5, 0.5, 5],
    },
    # Pisces 2020-24
    "tblPisces_Forecast_cl1": {
        "variables": ['fe', 'o2', 'no3', 'po4', 'si'],
        "tolerances": [1, 0.5, 0.5, 5],
    },
}

# The hourly SeaFlow frame is the source whose rows get matched.
source = covari

# Colocalize every target table against the source rows.
# The keyword is spelled `replaceWithMonthlyClimatolog` on purpose: it is kept
# exactly as the call expects it.
covari_cmap = pycmap.Sample(
    source=source,
    targets=targets,
    replaceWithMonthlyClimatolog=False,
)

Gathering metadata .... 
Sampling starts
Sampling finished                                                                                                    

In [23]:
# Preview the first 10 rows of the colocalized frame.
covari_cmap.head(10)


Unnamed: 0,time,cruise,lat,lon,depth,abundance_prochloro,abundance_synecho,abundance_picoeuk,abundance_croco,diam_prochloro,...,CMAP_fe_tblPisces_Forecast,CMAP_o2_tblPisces_Forecast,CMAP_no3_tblPisces_Forecast,CMAP_po4_tblPisces_Forecast,CMAP_si_tblPisces_Forecast,CMAP_fe_tblPisces_Forecast_cl1,CMAP_o2_tblPisces_Forecast_cl1,CMAP_no3_tblPisces_Forecast_cl1,CMAP_po4_tblPisces_Forecast_cl1,CMAP_si_tblPisces_Forecast_cl1
28,2010-05-21 18:00:00,W1005A,44.4933,-124.1368,5.0,0.246577,1.652478,13.346651,0.141894,0.907278,...,,,,,,,,,,
29,2010-05-21 20:00:00,W1005A,44.251625,-124.195262,5.0,0.102905,2.73311,5.297041,0.039618,0.724296,...,,,,,,,,,,
30,2010-05-21 21:00:00,W1005A,44.252489,-124.252939,5.0,0.086831,3.083692,5.410113,0.061919,0.869899,...,,,,,,,,,,
31,2010-05-21 22:00:00,W1005A,44.252175,-124.290925,5.0,0.143867,2.961071,6.720226,0.109911,0.982521,...,,,,,,,,,,
32,2010-05-21 23:00:00,W1005A,44.250991,-124.4568,5.0,0.299187,4.599585,6.459321,0.128636,0.878229,...,,,,,,,,,,
33,2010-05-22 00:00:00,W1005A,44.251764,-124.711009,5.0,0.636155,3.779168,4.939703,0.034048,0.58751,...,,,,,,,,,,
34,2010-05-22 01:00:00,W1005A,44.252125,-124.834563,5.0,0.411412,3.605853,6.233767,0.029839,0.752117,...,,,,,,,,,,
35,2010-05-22 02:00:00,W1005A,44.251705,-124.974795,5.0,0.643085,4.429017,10.323529,0.024465,0.639234,...,,,,,,,,,,
36,2010-05-22 03:00:00,W1005A,44.251595,-125.150547,5.0,0.194922,2.900974,7.308691,0.019701,0.887,...,,,,,,,,,,
37,2010-05-22 04:00:00,W1005A,44.248485,-125.16516,5.0,0.404941,2.785428,8.0156,0.017269,0.697642,...,,,,,,,,,,


### Checking for NaN values

In [24]:
# Count the missing values in each column of the colocalized frame.
covari_cmap.isna().sum()

time                                                      0
cruise                                                    0
lat                                                       0
lon                                                       0
depth                                                     0
abundance_prochloro                                       0
abundance_synecho                                         0
abundance_picoeuk                                         0
abundance_croco                                           0
diam_prochloro                                            0
diam_synecho                                              0
diam_picoeuk                                              0
diam_croco                                                0
Qc_prochloro                                              0
Qc_synecho                                                0
Qc_picoeuk                                                0
Qc_croco                                

In [22]:
# Remove the cruises that have NaN values.
cruises_with_nan = ['TN243', 'TN291']
keep_rows = ~covari_cmap['cruise'].isin(cruises_with_nan)
covari_cmapp = covari_cmap[keep_rows]
covari_cmap = covari_cmapp

## Saving as a CSV

In [25]:
# Write the colocalized table to a CSV file, leaving out the row index.
output_csv = 'data_ingest/data/modified/Seaflow_covariates_CMAP.csv'
covari_cmap.to_csv(output_csv, index=False)
