


![Python logo](https://cmap.readthedocs.io/en/latest/_static/CMAP_logos/CMAP_logo_High_Res.png) 
# In this notebook we will download SeaFlow and enviormental data Using [Simons CMAP](https://simonscmap.com).

Below are the datasets that will be used
The End goalEnd goal is to create a dataset that has these variables below.



#### SeaFlow:
- time
- lat
- lon
- biomass
- CruiseName
- Temperature
- Salinity

In this notebook we will also use <u> depth, and cruise</u> to match with other avalaible CMAP dataframes. 


#### Darwin Biogeochemistry Climatology Model
- NO3
- PO4
- Fe
- Si
- chl
- and more


# Loading Functions

In [1]:
import pandas as pd
import numpy as np


## Creating Real Time data set
### Datetime is in UTC

In [2]:
# Set a working directory
#!pip install GitPython
import git
import os

repo = git.Repo('.', search_parent_directories=True)


os.chdir(repo.working_tree_dir)


## The Covariate Seaflow dataset is averaged per hour for every Picoplankton population

In [3]:
covari_path = 'data/original/SeaFlow_covariates.csv'

#changing column naes so that the date is just called data
covari_cols = ['time', 'PopulationName', 'lat',
       'lon', 'CellAbundance_10^6_cells_per_L',
       'Biomass_pgC_per_L', 'CellQuotas_fgC_per_cell',
       'CellDiameter_micrometer', 'salin', 'temp',
       'cruisename', 'Light_micromolQuanta_m2_s', 'SiO4_micromol_per_L',
       'NO3NO2', 'PO4', 'Fe',
       'SatChl', 'MixedLayerDepth_m']

#reading in the csv to a pandas df
covari = (pd
          .read_csv(covari_path, names=covari_cols)
          #need to get rid of the first row as it is a repeat when loaded as such
          .tail(-1)
         )

covari.head(3)

Unnamed: 0,time,PopulationName,lat,lon,CellAbundance_10^6_cells_per_L,Biomass_pgC_per_L,CellQuotas_fgC_per_cell,CellDiameter_micrometer,salin,temp,cruisename,Light_micromolQuanta_m2_s,SiO4_micromol_per_L,NO3NO2,PO4,Fe,SatChl,MixedLayerDepth_m
1,2016-04-20T00:00:00Z,Prochlorococcus,,,253.186148,9.232478116,0.036465179,0.57858,34.67912841,25.74054659,KOK1606,1764.076136,,,,,,26.96643939
2,2016-04-20T00:00:00Z,Synechococcus,,,1.588988684,0.279172358,0.175691848,1.06425,34.67912841,25.74054659,KOK1606,1764.076136,,,,,,26.96643939
3,2016-04-20T00:00:00Z,nanoeukaryotes (2-5µm),,,1.332641539,3.079117581,2.310536998,2.88901,34.67912841,25.74054659,KOK1606,1764.076136,,,,,,26.96643939


#### Making time format CMAP appropriate

In [4]:
covari['time'] = covari['time'].str.replace('Z', '')

### Adding a depth column so that we can get climatoligcal data from a depths we are intersted in
Seaflow observes data at 7m depth

In [5]:
covari['depth'] = 7

### Need to adjust columns that the dtypes are correct

In [6]:
def ChangeObjectTypes(df):
    """
    Changing the object types so that it works best for CMAP colocalizatin of data
    """
    for column in df:
        if column == 'PopulationName' or column == 'cruisename' or column == 'time':
            #changing to string
            df[column] = df[column].astype(str)
            
        else:
            #changing to numeric type
            df[column] = pd.to_numeric(df[column])
    return df
covari = ChangeObjectTypes(covari)       

## Keeping only the data varaibles that we will be using for the machine learning model

In [7]:
covari = (covari[['time', 'PopulationName', 'lat','lon',
                 'Biomass_pgC_per_L','salin', 'depth', 'temp','cruisename']]
          .dropna()
          .reset_index(drop=True)
         )
covari

Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,depth,temp,cruisename
0,2016-04-20T07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,7,24.351745,KOK1606
1,2016-04-20T07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,7,24.351745,KOK1606
2,2016-04-20T07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,7,24.351745,KOK1606
3,2016-04-20T07:00:00,picoeukaryotes (< 2µm),21.520326,-158.326984,0.701902,34.893785,7,24.351745,KOK1606
4,2016-04-20T08:00:00,Prochlorococcus,21.662710,-158.323430,9.309387,34.902376,7,24.339265,KOK1606
...,...,...,...,...,...,...,...,...,...
10906,2021-12-30T00:00:00,picoeukaryotes (< 2µm),32.673493,-117.545342,3.774488,33.468151,7,15.189021,TN398
10907,2021-12-30T01:00:00,Prochlorococcus,32.682100,-117.660321,0.874599,33.478846,7,15.327302,TN398
10908,2021-12-30T01:00:00,Synechococcus,32.682100,-117.660321,9.707579,33.478846,7,15.327302,TN398
10909,2021-12-30T01:00:00,nanoeukaryotes (2-5µm),32.682100,-117.660321,2.428084,33.478846,7,15.327302,TN398


In [8]:
covari.dtypes

time                  object
PopulationName        object
lat                  float64
lon                  float64
Biomass_pgC_per_L    float64
salin                float64
depth                  int64
temp                 float64
cruisename            object
dtype: object

# Using SimonCMAP to gather additional features

### Our climatological data will come from the 

#### First installing and importing pycmap 

In [9]:
# !pip install pycmap
import pycmap

### Prepping covariate data for colocalization using Simon's CMAP

#### Setting API

In [10]:
api = pycmap.API(token='<6e1eb1d3-d364-4dfb-9121-8c23369dbbbe>')

### Investigating what variables are available from the Darwin Nutrient Climatology model

In [11]:
api.get_catalog()

Unnamed: 0,Variable,Table_Name,Long_Name,Unit,Make,Sensor,Process_Level,Study_Domain,Temporal_Resolution,Spatial_Resolution,...,Dataset_Name,Dataset_Short_Name,Data_Source,Distributor,Dataset_Description,Acknowledgement,Dataset_ID,ID,Unstructured_Dataset_Metadata,Unstructured_Variable_Metadata
0,Fe,tblPisces_NRT,Mole concentration of dissolved iron in sea water,mmol/m^3,Model,Blend,Near-Real-Time,Biogeochemistry,Weekly,1/2Â° X 1/2Â°,...,Mercator-Pisces Biogeochemistry and Weekly For...,Mercator_Pisces_Biogeochem_Climatology,MERCATOR BIOMER4V1R2,http://marine.copernicus.eu,"""Produced by Mercator Ocean in Toulouse, Franc...",Data provided by: E.U. Copernicus Marine Servi...,17,39,,
1,PP,tblPisces_NRT,Net primary productivity of Carbon per unit vo...,g/m^3/day,Model,Blend,Near-Real-Time,Biogeochemistry,Weekly,1/2Â° X 1/2Â°,...,Mercator-Pisces Biogeochemistry and Weekly For...,Mercator_Pisces_Biogeochem_Climatology,MERCATOR BIOMER4V1R2,http://marine.copernicus.eu,"""Produced by Mercator Ocean in Toulouse, Franc...",Data provided by: E.U. Copernicus Marine Servi...,17,40,,
2,Si,tblPisces_NRT,Mole concentration of Silicate in sea water,umol/L,Model,Blend,Near-Real-Time,Biogeochemistry,Weekly,1/2Â° X 1/2Â°,...,Mercator-Pisces Biogeochemistry and Weekly For...,Mercator_Pisces_Biogeochem_Climatology,MERCATOR BIOMER4V1R2,http://marine.copernicus.eu,"""Produced by Mercator Ocean in Toulouse, Franc...",Data provided by: E.U. Copernicus Marine Servi...,17,41,,
3,NO3,tblPisces_NRT,Mole concentration of Nitrate in sea water,mmol/m^3,Model,Blend,Near-Real-Time,Biogeochemistry,Weekly,1/2Â° X 1/2Â°,...,Mercator-Pisces Biogeochemistry and Weekly For...,Mercator_Pisces_Biogeochem_Climatology,MERCATOR BIOMER4V1R2,http://marine.copernicus.eu,"""Produced by Mercator Ocean in Toulouse, Franc...",Data provided by: E.U. Copernicus Marine Servi...,17,42,,
4,CHL,tblPisces_NRT,Mass concentration of Chlorophyll in sea water,mg/m^3,Model,Blend,Near-Real-Time,Biogeochemistry,Weekly,1/2Â° X 1/2Â°,...,Mercator-Pisces Biogeochemistry and Weekly For...,Mercator_Pisces_Biogeochem_Climatology,MERCATOR BIOMER4V1R2,http://marine.copernicus.eu,"""Produced by Mercator Ocean in Toulouse, Franc...",Data provided by: E.U. Copernicus Marine Servi...,17,43,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9162,pigment20,tblBATS_Pigment,a-Carotene,ng/kg,Observation,HPLC,Reprocessed,Biogeochemistry,Irregular,Irregular,...,Bermuda Atlantic Time-Series Study (BATS) Pigm...,BATS_Pigments,Bermuda Institute of Ocean Sciences,Bermuda Institute of Ocean Sciences,The BATS (Bermuda Atlantic Time-series Study) ...,Bermuda Institute of Ocean Sciences,678,17097,,
9163,pigment21,tblBATS_Pigment,b-Carotene,ng/kg,Observation,HPLC,Reprocessed,Biogeochemistry,Irregular,Irregular,...,Bermuda Atlantic Time-Series Study (BATS) Pigm...,BATS_Pigments,Bermuda Institute of Ocean Sciences,Bermuda Institute of Ocean Sciences,The BATS (Bermuda Atlantic Time-series Study) ...,Bermuda Institute of Ocean Sciences,678,17098,,
9164,Cruise_ID,tblBATS_Pigment,Cruise ID,,Observation,Uncategorized,Reprocessed,Biogeochemistry,Irregular,Irregular,...,Bermuda Atlantic Time-Series Study (BATS) Pigm...,BATS_Pigments,Bermuda Institute of Ocean Sciences,Bermuda Institute of Ocean Sciences,The BATS (Bermuda Atlantic Time-series Study) ...,Bermuda Institute of Ocean Sciences,678,17099,,
9165,niskin_flag,tblBATS_Pigment,Niskin/Goflo Quality Flag,,Observation,Uncategorized,Reprocessed,Biogeochemistry,Irregular,Irregular,...,Bermuda Atlantic Time-Series Study (BATS) Pigm...,BATS_Pigments,Bermuda Institute of Ocean Sciences,Bermuda Institute of Ocean Sciences,The BATS (Bermuda Atlantic Time-series Study) ...,Bermuda Institute of Ocean Sciences,678,17100,,


In [12]:
api.get_dataset_metadata('tblDarwin_Nutrient_Climatology')

Unnamed: 0,Variable,Table_Name,Long_Name,Unit,Make,Sensor,Process_Level,Study_Domain,Temporal_Resolution,Spatial_Resolution,...,Dataset_Short_Name,Data_Source,Distributor,Dataset_Description,Acknowledgement,Dataset_ID,ID,Visualize,Keywords,Refrences
0,ALK_darwin_clim,tblDarwin_Nutrient_Climatology,ALK concentration (climatology),mmol eq,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,171,1,"alk, ALK_darwin_clim, bio, biogeochem, biogeoc...",
1,CDOM_darwin_clim,tblDarwin_Nutrient_Climatology,CDOM concentration (climatology),mmol C/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,173,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
2,DIC_darwin_clim,tblDarwin_Nutrient_Climatology,DIC concentration (climatology),mmol C/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,146,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
3,DOC_darwin_clim,tblDarwin_Nutrient_Climatology,DOC concentration (climatology),mmol C/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,162,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
4,DOFe_darwin_clim,tblDarwin_Nutrient_Climatology,DOfe concentration (climatology),mmol Fe,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,165,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
5,DON_darwin_clim,tblDarwin_Nutrient_Climatology,DON concentration (climatology),mmol N/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,163,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
6,DOP_darwin_clim,tblDarwin_Nutrient_Climatology,DOP concentration (climatology),mmol P/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,164,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
7,FeT_darwin_clim,tblDarwin_Nutrient_Climatology,FeT concentration (climatology),mmol Fe,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,161,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
8,NH4_darwin_clim,tblDarwin_Nutrient_Climatology,NH4 concentration (climatology),mmol N/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,156,1,"ammonium, bio, biogeochem, biogeochemistry, bi...",
9,NO2_darwin_clim,tblDarwin_Nutrient_Climatology,NO2 concentration (climatology),mmol N/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,157,1,"bio, biogeochem, biogeochemistry, biogo, blend...",


In [13]:
covari.head(3)

Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,depth,temp,cruisename
0,2016-04-20T07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,7,24.351745,KOK1606
1,2016-04-20T07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,7,24.351745,KOK1606
2,2016-04-20T07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,7,24.351745,KOK1606


In [14]:
targets = {
        
        # Darwin Biogeochemistry Climatology Model
        "tblDarwin_Nutrient_Climatology": {
                          "variables": ["SiO2_darwin_clim", "POSi_darwin_clim", "PON_darwin_clim",
                                        "POFe_darwin_clim", "POC_darwin_clim", "PO4_darwin_clim",
                                        "PIC_darwin_clim", "O2_darwin_clim", "NO3_darwin_clim",
                                        "NO2_darwin_clim", "NH4_darwin_clim", "FeT_darwin_clim",
                                        "DOP_darwin_clim", "DON_darwin_clim", "DOFe_darwin_clim",
                                        "DOC_darwin_clim", "DIC_darwin_clim", "CDOM_darwin_clim",
                                        "ALK_darwin_clim"],
            # Tolerance varaibles/order: temporal [days], meridional [deg], zonal [deg], and vertical [m]
                          "tolerances": [1, 0.5, 0.5, 5]
                         }
        }


source = covari

covari_cmap = pycmap.Sample(
              source=source, 
              targets=targets, 
              replaceWithMonthlyClimatolog=False
             )

Gathering metadata .... 
Sampling starts
Sampling finished                                                                                                    

In [15]:
covari_cmap

Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,depth,temp,cruisename,CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology,...,CMAP_NO2_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_NH4_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_FeT_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOP_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DON_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOFe_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DIC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_CDOM_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology
0,2016-04-20T07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
1,2016-04-20T07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
2,2016-04-20T07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
3,2016-04-20T07:00:00,picoeukaryotes (< 2µm),21.520326,-158.326984,0.701902,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
4,2016-04-20T08:00:00,Prochlorococcus,21.662710,-158.323430,9.309387,34.902376,7,24.339265,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,2021-12-30T00:00:00,picoeukaryotes (< 2µm),32.673493,-117.545342,3.774488,33.468151,7,15.189021,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775
10907,2021-12-30T01:00:00,Prochlorococcus,32.682100,-117.660321,0.874599,33.478846,7,15.327302,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775
10908,2021-12-30T01:00:00,Synechococcus,32.682100,-117.660321,9.707579,33.478846,7,15.327302,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775
10909,2021-12-30T01:00:00,nanoeukaryotes (2-5µm),32.682100,-117.660321,2.428084,33.478846,7,15.327302,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775


### Checking for NaN values

In [16]:
covari_cmap.isna().sum()

time                                                    0
PopulationName                                          0
lat                                                     0
lon                                                     0
Biomass_pgC_per_L                                       0
salin                                                   0
depth                                                   0
temp                                                    0
cruisename                                              0
CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology    0
CMAP_POSi_darwin_clim_tblDarwin_Nutrient_Climatology    0
CMAP_PON_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_POFe_darwin_clim_tblDarwin_Nutrient_Climatology    0
CMAP_POC_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_PO4_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_PIC_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_O2_darwin_clim_tblDarwin_Nutrient_Climatology      0
CMAP_NO3_darwi

## Saving as a CSV

In [17]:
#saving as a CSV file
covari_cmap.to_csv('data/modified/Seaflow_covariates_CMAP.csv', index=False)
