In [1]:
# Set a working directory
#!pip install GitPython
import git
import os

# Find the git repository that contains the current directory (walking up
# through parent directories) and make its working tree the working
# directory, so the relative data paths used below resolve from the repo root.
repo = git.Repo(os.curdir, search_parent_directories=True)
os.chdir(repo.working_tree_dir)

## In this notebook we will be testing whether the random forest models can predict on cruise KM2206

### Loading in ship data

In [31]:
import pandas as pd

# Underway ship log for cruise KM2206 (tab-separated .sfl file).
mola_ship_path = 'data/Testing Data/KM2206_740_740.sfl'

# Read from `mola_ship_path`. The previous code passed `mola_path`, a name that
# is not defined in the visible notebook, so the cell raised NameError on a
# fresh kernel. The tab delimiter is written as an explicit escape so it
# cannot be silently turned into spaces by an editor.
mola_ship = pd.read_csv(mola_ship_path, delimiter='\t')

# Rename the ship-log columns to the lower-case names used by the SeaFlow table.
mola_ship = mola_ship.rename(columns={'DATE': 'time', 'LAT':'lat', 'LON':'lon', 'SALINITY': 'salin', 'OCEAN TEMP': 'temp', 'PAR':'par'})
# NOTE(review): 'temp' is renamed above but not selected here, so ocean
# temperature is dropped from the ship table — confirm this is intended.
mola_ship = mola_ship[['time', 'lat', 'lon', 'salin', 'par' ]]
mola_ship

Unnamed: 0,time,lat,lon,salin,par
0,2022-06-06T10:06:59+00:00,21.247083,-158.013283,34.643600,0.001000
1,2022-06-06T10:09:59+00:00,21.247150,-158.021900,34.659666,0.001000
2,2022-06-06T10:13:00+00:00,21.247300,-158.028000,34.670898,0.001000
3,2022-06-06T10:14:37+00:00,21.247420,-158.038440,34.660119,0.001000
4,2022-06-06T10:17:38+00:00,21.247533,-158.047917,34.624100,0.000667
...,...,...,...,...,...
13518,2022-07-05T06:15:42+00:00,20.714900,-159.692850,34.719999,0.001000
13519,2022-07-05T07:20:23+00:00,20.773267,-159.503383,,
13520,2022-07-05T07:23:24+00:00,20.776350,-159.494283,34.756183,0.001000
13521,2022-07-05T07:26:24+00:00,20.779283,-159.485383,,


## Loading in seaflow data

In [28]:
import pandas as pd

# SeaFlow biomass estimates for cruise KM2206.
mola_seaflow_path = 'data/Testing Data/SeaFlow_KM2206.csv'

# Columns kept from the SeaFlow file: position/time keys plus the four
# per-population biomass columns.
seaflow_columns = [
    'time', 'lat', 'lon', 'depth',
    'biomass_prochloro', 'biomass_synecho', 'biomass_picoeuk', 'biomass_croco',
]

mola_seaflow = pd.read_csv(mola_seaflow_path).loc[:, seaflow_columns]

mola_seaflow

Unnamed: 0,time,lat,lon,depth,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco
0,2022-06-06T10:14:37+00:00,21.247420,-158.038440,5,3.365156,0.222397,0.229292,0.000000
1,2022-06-06T10:17:38+00:00,21.247533,-158.047917,5,3.654160,0.386374,0.322980,0.000000
2,2022-06-06T10:20:38+00:00,21.247567,-158.058183,5,3.841185,0.784832,0.673194,0.068327
3,2022-06-06T10:23:38+00:00,21.247583,-158.068533,5,4.208837,0.752714,0.726844,0.000000
4,2022-06-06T10:26:38+00:00,21.247600,-158.078667,5,4.194555,0.582385,1.173581,0.000000
...,...,...,...,...,...,...,...,...
12960,2022-07-05T06:00:40+00:00,20.703600,-159.732817,5,8.209827,0.351032,1.188342,0.198866
12961,2022-07-05T06:03:41+00:00,20.706067,-159.724233,5,7.918806,0.327101,0.591730,0.134931
12962,2022-07-05T06:06:41+00:00,20.708500,-159.715717,5,8.150971,0.368326,1.482961,0.141464
12963,2022-07-05T06:09:41+00:00,20.710900,-159.707183,5,8.018070,0.340827,1.251484,0.096195


## Merging the ship and seaflow data

In [32]:
# The SeaFlow table already carries lat/lon, so drop the ship's copies and
# join the remaining ship measurements onto SeaFlow rows with an identical
# timestamp (default inner join: unmatched rows on either side are discarded).
ship_measurements = mola_ship.drop(columns=['lat', 'lon'])
mola = pd.merge(mola_seaflow, ship_measurements, on='time')
mola

Unnamed: 0,time,lat,lon,depth,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco,salin,par
0,2022-06-06T10:14:37+00:00,21.247420,-158.038440,5,3.365156,0.222397,0.229292,0.000000,34.660119,0.001000
1,2022-06-06T10:17:38+00:00,21.247533,-158.047917,5,3.654160,0.386374,0.322980,0.000000,34.624100,0.000667
2,2022-06-06T10:20:38+00:00,21.247567,-158.058183,5,3.841185,0.784832,0.673194,0.068327,34.617950,0.001000
3,2022-06-06T10:23:38+00:00,21.247583,-158.068533,5,4.208837,0.752714,0.726844,0.000000,34.621282,0.001000
4,2022-06-06T10:26:38+00:00,21.247600,-158.078667,5,4.194555,0.582385,1.173581,0.000000,34.622901,0.000833
...,...,...,...,...,...,...,...,...,...,...
12960,2022-07-05T06:00:40+00:00,20.703600,-159.732817,5,8.209827,0.351032,1.188342,0.198866,34.725817,0.001000
12961,2022-07-05T06:03:41+00:00,20.706067,-159.724233,5,7.918806,0.327101,0.591730,0.134931,34.718984,0.001000
12962,2022-07-05T06:06:41+00:00,20.708500,-159.715717,5,8.150971,0.368326,1.482961,0.141464,34.717750,0.001000
12963,2022-07-05T06:09:41+00:00,20.710900,-159.707183,5,8.018070,0.340827,1.251484,0.096195,34.717384,0.001000


#### Making time format CMAP appropriate

In [37]:
# Parse the timestamp strings, then write them back as 'YYYY-MM-DDTHH:MM:SS'
# text; the format string has no offset field, so the '+00:00' suffix is dropped.
parsed_times = pd.to_datetime(mola['time'])
mola['time'] = parsed_times.dt.strftime('%Y-%m-%dT%H:%M:%S')
mola

Unnamed: 0,time,lat,lon,depth,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco,salin,par
0,2022-06-06T10:14:37,21.247420,-158.038440,5,3.365156,0.222397,0.229292,0.000000,34.660119,0.001000
1,2022-06-06T10:17:38,21.247533,-158.047917,5,3.654160,0.386374,0.322980,0.000000,34.624100,0.000667
2,2022-06-06T10:20:38,21.247567,-158.058183,5,3.841185,0.784832,0.673194,0.068327,34.617950,0.001000
3,2022-06-06T10:23:38,21.247583,-158.068533,5,4.208837,0.752714,0.726844,0.000000,34.621282,0.001000
4,2022-06-06T10:26:38,21.247600,-158.078667,5,4.194555,0.582385,1.173581,0.000000,34.622901,0.000833
...,...,...,...,...,...,...,...,...,...,...
12960,2022-07-05T06:00:40,20.703600,-159.732817,5,8.209827,0.351032,1.188342,0.198866,34.725817,0.001000
12961,2022-07-05T06:03:41,20.706067,-159.724233,5,7.918806,0.327101,0.591730,0.134931,34.718984,0.001000
12962,2022-07-05T06:06:41,20.708500,-159.715717,5,8.150971,0.368326,1.482961,0.141464,34.717750,0.001000
12963,2022-07-05T06:09:41,20.710900,-159.707183,5,8.018070,0.340827,1.251484,0.096195,34.717384,0.001000


### Adjusting the columns so that their dtypes are correct

In [38]:
def ChangeObjectTypes(df):
    """
    Cast the columns of ``df`` to the dtypes used for CMAP colocalization.

    The column named 'time' is converted to strings; every other column is
    passed through ``pd.to_numeric``. The frame is modified in place and the
    same object is returned.
    """
    for name in df.columns:
        values = df[name]
        # 'time' stays textual; everything else must be numeric.
        df[name] = values.astype(str) if name == 'time' else pd.to_numeric(values)
    return df
# Cast 'time' to str and every other column to a numeric dtype; the function
# edits the frame in place and returns the same object.
mola = ChangeObjectTypes(mola)       

In [39]:
# Show the per-column dtypes after the conversion above.
mola.dtypes

time                  object
lat                  float64
lon                  float64
depth                  int64
biomass_prochloro    float64
biomass_synecho      float64
biomass_picoeuk      float64
biomass_croco        float64
salin                float64
par                  float64
dtype: object

# Using Simons CMAP to gather additional features

### Our climatological data will come from the Darwin Nutrient Climatology model (`tblDarwin_Nutrient_Climatology`)

#### First installing and importing pycmap 

In [40]:
# !pip install pycmap
import pycmap

### Prepping the `mola` data for colocalization using Simons CMAP

#### Setting API

In [41]:
# Read the Simons CMAP API key from the environment instead of committing it
# to the notebook. The key that was previously hardcoded in this cell should
# be treated as exposed and rotated.
cmap_api_key = os.environ.get('CMAP_API_KEY')
if not cmap_api_key:
    raise RuntimeError(
        "Set the CMAP_API_KEY environment variable to your Simons CMAP API key "
        "before running this cell."
    )
api = pycmap.API(token=cmap_api_key)

### Investigating what variables are available from the Darwin Nutrient Climatology model

In [43]:
# Fetch the catalog metadata for the Darwin nutrient climatology table; the
# 'Variable' column of the result lists the variable names available to sample.
api.get_dataset_metadata('tblDarwin_Nutrient_Climatology')

Unnamed: 0,Variable,Table_Name,Long_Name,Unit,Make,Sensor,Process_Level,Study_Domain,Temporal_Resolution,Spatial_Resolution,...,Dataset_Short_Name,Data_Source,Distributor,Dataset_Description,Acknowledgement,Dataset_ID,ID,Visualize,Keywords,Refrences
0,ALK_darwin_clim,tblDarwin_Nutrient_Climatology,ALK concentration (climatology),mmol eq,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,171,1,"alk, ALK_darwin_clim, bio, biogeochem, biogeoc...",
1,CDOM_darwin_clim,tblDarwin_Nutrient_Climatology,CDOM concentration (climatology),mmol C/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,173,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
2,DIC_darwin_clim,tblDarwin_Nutrient_Climatology,DIC concentration (climatology),mmol C/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,146,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
3,DOC_darwin_clim,tblDarwin_Nutrient_Climatology,DOC concentration (climatology),mmol C/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,162,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
4,DOFe_darwin_clim,tblDarwin_Nutrient_Climatology,DOfe concentration (climatology),mmol Fe,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,165,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
5,DON_darwin_clim,tblDarwin_Nutrient_Climatology,DON concentration (climatology),mmol N/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,163,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
6,DOP_darwin_clim,tblDarwin_Nutrient_Climatology,DOP concentration (climatology),mmol P/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,164,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
7,FeT_darwin_clim,tblDarwin_Nutrient_Climatology,FeT concentration (climatology),mmol Fe,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,161,1,"bio, biogeochem, biogeochemistry, biogo, blend...",
8,NH4_darwin_clim,tblDarwin_Nutrient_Climatology,NH4 concentration (climatology),mmol N/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,156,1,"ammonium, bio, biogeochem, biogeochemistry, bi...",
9,NO2_darwin_clim,tblDarwin_Nutrient_Climatology,NO2 concentration (climatology),mmol N/,Model,Blend,Reprocessed,Biogeochemistry,Monthly Climatology,1/2Â° X 1/2Â°,...,Darwin-MITgcm_Climatology,http://darwinproject.mit.edu,http://darwinproject.mit.edu,This version of the model is modified from Dut...,Data provided by: http://darwinproject.mit.edu/,21,157,1,"bio, biogeochem, biogeochemistry, biogo, blend...",


In [44]:
# Quick look at the first rows of the frame that will be used as the sampling source.
mola.head(3)

Unnamed: 0,time,lat,lon,depth,biomass_prochloro,biomass_synecho,biomass_picoeuk,biomass_croco,salin,par
0,2022-06-06T10:14:37,21.24742,-158.03844,5,3.365156,0.222397,0.229292,0.0,34.660119,0.001
1,2022-06-06T10:17:38,21.247533,-158.047917,5,3.65416,0.386374,0.32298,0.0,34.6241,0.000667
2,2022-06-06T10:20:38,21.247567,-158.058183,5,3.841185,0.784832,0.673194,0.068327,34.61795,0.001


In [None]:
# Variables requested from the Darwin biogeochemistry climatology model.
darwin_clim_variables = [
    "SiO2_darwin_clim",
    "POSi_darwin_clim",
    "PON_darwin_clim",
    "POFe_darwin_clim",
    "POC_darwin_clim",
    "PO4_darwin_clim",
    "PIC_darwin_clim",
    "O2_darwin_clim",
    "NO3_darwin_clim",
    "NO2_darwin_clim",
    "NH4_darwin_clim",
    "FeT_darwin_clim",
    "DOP_darwin_clim",
    "DON_darwin_clim",
    "DOFe_darwin_clim",
    "DOC_darwin_clim",
    "DIC_darwin_clim",
    "CDOM_darwin_clim",
    "ALK_darwin_clim",
]

# Map each CMAP table to the variables to sample and the matching tolerances.
# Tolerance order: temporal [days], meridional [deg], zonal [deg], vertical [m].
targets = {
    "tblDarwin_Nutrient_Climatology": {
        "variables": darwin_clim_variables,
        "tolerances": [1, 0.5, 0.5, 5],
    },
}

# Colocalize the target variables with the rows of the merged cruise frame.
# The keyword below is spelled `replaceWithMonthlyClimatolog` on purpose;
# it is passed to pycmap exactly as the original cell wrote it.
source = mola
mola_cmap = pycmap.Sample(source=source, targets=targets, replaceWithMonthlyClimatolog=False)

Gathering metadata .... 
Sampling starts
Sampling tblDarwin_Nutrient_Climatology ... 163 / 12965                                                  

In [15]:
# Display the frame returned by pycmap.Sample.
# NOTE(review): the saved output under this cell has columns such as
# PopulationName, cruisename and Biomass_pgC_per_L that do not exist in `mola`,
# and the sampling cell's saved output stops at "163 / 12965" — the output
# looks stale; re-run the notebook from a fresh kernel before trusting it.
mola_cmap

Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,depth,temp,cruisename,CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology,...,CMAP_NO2_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_NH4_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_FeT_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOP_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DON_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOFe_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DIC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_CDOM_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology
0,2016-04-20T07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
1,2016-04-20T07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
2,2016-04-20T07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
3,2016-04-20T07:00:00,picoeukaryotes (< 2µm),21.520326,-158.326984,0.701902,34.893785,7,24.351745,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
4,2016-04-20T08:00:00,Prochlorococcus,21.662710,-158.323430,9.309387,34.902376,7,24.339265,KOK1606,-0.022845,...,0.295276,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,2021-12-30T00:00:00,picoeukaryotes (< 2µm),32.673493,-117.545342,3.774488,33.468151,7,15.189021,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775
10907,2021-12-30T01:00:00,Prochlorococcus,32.682100,-117.660321,0.874599,33.478846,7,15.327302,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775
10908,2021-12-30T01:00:00,Synechococcus,32.682100,-117.660321,9.707579,33.478846,7,15.327302,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775
10909,2021-12-30T01:00:00,nanoeukaryotes (2-5µm),32.682100,-117.660321,2.428084,33.478846,7,15.327302,TN398,0.363296,...,0.202274,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775


### Checking for NaN values

In [16]:
# Count missing values per column of the colocalized frame.
mola_cmap.isna().sum()

time                                                    0
PopulationName                                          0
lat                                                     0
lon                                                     0
Biomass_pgC_per_L                                       0
salin                                                   0
depth                                                   0
temp                                                    0
cruisename                                              0
CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology    0
CMAP_POSi_darwin_clim_tblDarwin_Nutrient_Climatology    0
CMAP_PON_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_POFe_darwin_clim_tblDarwin_Nutrient_Climatology    0
CMAP_POC_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_PO4_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_PIC_darwin_clim_tblDarwin_Nutrient_Climatology     0
CMAP_O2_darwin_clim_tblDarwin_Nutrient_Climatology      0
CMAP_NO3_darwi

## Saving as a CSV

In [17]:
# Write the colocalized frame to CSV, omitting the index column.
# NOTE(review): "molaates" in the file name looks like a find-and-replace
# artifact — confirm the intended output path (and that it does not clash
# with an existing dataset) before relying on this file.
mola_cmap.to_csv('data/modified/Seaflow_molaates_CMAP.csv', index=False)
