# Loading Functions

In [1]:
import pandas as pd
import numpy as np


## Creating Real Time data set
### Datetime is in UTC

In [2]:
realtime_path = '/Users/cristianswift/Desktop/Spring-Quarter-2022-2023/SeniorThesis/data/original/SeaFlow_realtime_TN413.csv'

#changing column naes so that the date is just called data
realtime_cols = ['time',
 'lat',
 'lon',
 'population',
 'abundance_cells_per_microliter',
 'diameter_micrometer']

#reading in the csv to a pandas df
realtime = pd.read_csv(realtime_path, names=realtime_cols)
# getting rid of the first row since it has the column names repeated
realtime = realtime.tail(-1)
realtime

Unnamed: 0,time,lat,lon,population,abundance_cells_per_microliter,diameter_micrometer
1,2023-02-25T05:45:57Z,21.3067,-157.0366,picoeuk,14.30323085044253,1.55786005995454
2,2023-02-25T05:51:57Z,21.3148,-157.0362,prochloro,59.90896889937337,0.663476451861207
3,2023-02-25T05:54:58Z,21.3142,-157.0321,picoeuk,20.71698119210922,1.5760000524204352
4,2023-02-25T05:54:58Z,21.3142,-157.0321,prochloro,64.44967710586306,0.670356077823434
5,2023-02-25T05:57:58Z,21.3133,-157.0266,prochloro,68.45117621283211,0.6805887892801286
...,...,...,...,...,...,...
10570,2023-03-11T08:30:41Z,-19.4207,-181.8025,synecho,0.25541483661504516,1.07457158316671
10571,2023-03-11T08:33:41Z,-19.4208,-181.792,picoeuk,4.370431648746329,1.985346246474965
10572,2023-03-11T08:36:41Z,-19.4207,-181.7817,picoeuk,4.427190501327449,2.01327502068565
10573,2023-03-11T08:36:41Z,-19.4207,-181.7817,prochloro,92.26151487061243,0.572558873170014


## Averaging data over 10 minute resolution so that SeaFlow and Underway CTD data match

In [3]:
realtime['abundance_cells_per_microliter'] = pd.to_numeric(realtime['abundance_cells_per_microliter'])

realtime = realtime.dropna(subset=['time','lat', 'lon']).reset_index().drop('index',axis=1)
realtime['time'] = realtime['time'].str[:-1]
realtime['lat'] = realtime['lat'].astype(float)
realtime['lon'] = realtime['lon'].astype(float)
realtime['abundance_cells_per_microliter'] = realtime['abundance_cells_per_microliter'].astype(float)
realtime['diameter_micrometer'] = realtime['diameter_micrometer'].astype(float)
realtime['depth'] = 0

print(realtime.dtypes)
realtime.head(4)

time                               object
lat                               float64
lon                               float64
population                         object
abundance_cells_per_microliter    float64
diameter_micrometer               float64
depth                               int64
dtype: object


Unnamed: 0,time,lat,lon,population,abundance_cells_per_microliter,diameter_micrometer,depth
0,2023-02-25T05:45:57,21.3067,-157.0366,picoeuk,14.303231,1.55786,0
1,2023-02-25T05:51:57,21.3148,-157.0362,prochloro,59.908969,0.663476,0
2,2023-02-25T05:54:58,21.3142,-157.0321,picoeuk,20.716981,1.576,0
3,2023-02-25T05:54:58,21.3142,-157.0321,prochloro,64.449677,0.670356,0


In [4]:
#averaging over every 10 min
realtime['time'] = pd.to_datetime(realtime['time'])
#this drops the phytoplankton population categorical column, need to recalulate
realtime_avg = (realtime
                .groupby('population')
                .resample('10min', on='time')
                .mean(numeric_only=False)
                .reset_index()
               )
realtime_avg
# making time a string object again
realtime_avg['time'] = realtime_avg['time'].astype(str)
realtime_avg['time'] = realtime_avg['time'].str.replace(' ', 'T')

  .mean(numeric_only=False)
  .mean(numeric_only=False)
  .mean(numeric_only=False)


# Using SimonCMAP to gather additional features

#### First installing and importing pycmap 

In [9]:
# !pip install pycmap
import pycmap

### Prepping realtime data for colocalization using Simon's CMAP

#### Setting API

In [28]:
api = pycmap.API(token='<6e1eb1d3-d364-4dfb-9121-8c23369dbbbe>')

In [29]:
targets = {
        
        # BioGeoChemical Numerical Near-Real-Time Model
        "tblPisces_Forecast_cl1": {
                          "variables": ["NO3", "PO4", "Fe", "Si", "chl", "nppv", "temp", "salin"],
                          "tolerances": [4, 0.5, 0.5, 5]
                         }
        }


source = realtime_avg

realtime_cmap = pycmap.Sample(
              source=source, 
              targets=targets, 
              replaceWithMonthlyClimatolog=False
             )


Gathering metadata .... 
Sampling starts
Sampling finished                                                                                                    

In [31]:
#saving as a CSV file
realtime_cmap.to_csv('/Users/cristianswift/Desktop/Spring-Quarter-2022-2023/SeniorThesis/data/modified/realtime_cmap.csv', index=False)


## The given Covariate dataset has a lot of NaNs values for nutrient information retrieved from CMAP, we will try to use Two Pisces Climatological models to fill in the gaps.

In [6]:
covari_path = '/Users/cristianswift/Desktop/Spring-Quarter-2022-2023/SeniorThesis/data/original/SeaFlow_covariates.csv'
covari = pd.read_csv(covari_path, usecols=lambda column: column != 'Unnamed: 0')
#keeping only relavent columns
covari = covari[['date_UTC', 'PopulationName', 'Latitude_decimalDegree', 'Longitude_decimalDegree',
        'Biomass_pgC_per_L', 'Salinity_psu', 'Temperature_degC']]


In [7]:
# rename for easier use and making time a string object to be manipulated
covari.rename(columns={
    'date_UTC': 'time', 'Latitude_decimalDegree': 'lat', 'Longitude_decimalDegree': 'lon'}, inplace=True)
covari['time'] = covari['time'].astype(str)
#making date time format CMAP appropriate
covari['time'] = covari['time'].str.replace('Z', '')
covari

Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,Salinity_psu,Temperature_degC
0,2016-04-20T00:00:00,Prochlorococcus,,,9.232478,34.679128,25.740547
1,2016-04-20T00:00:00,Synechococcus,,,0.279172,34.679128,25.740547
2,2016-04-20T00:00:00,nanoeukaryotes (2-5µm),,,3.079118,34.679128,25.740547
3,2016-04-20T00:00:00,picoeukaryotes (< 2µm),,,0.606572,34.679128,25.740547
4,2016-04-20T01:00:00,Prochlorococcus,,,10.175402,34.704286,25.624466
...,...,...,...,...,...,...,...
12182,2021-12-30T00:00:00,picoeukaryotes (< 2µm),32.673493,-117.545342,3.774488,33.468151,15.189021
12183,2021-12-30T01:00:00,Prochlorococcus,32.682100,-117.660321,0.874599,33.478846,15.327302
12184,2021-12-30T01:00:00,Synechococcus,32.682100,-117.660321,9.707579,33.478846,15.327302
12185,2021-12-30T01:00:00,nanoeukaryotes (2-5µm),32.682100,-117.660321,2.428084,33.478846,15.327302


In [10]:
targets = {
        
        # BioGeoChemical Numerical Near-Real-Time Model
        "tblPisces_NRT": {
                          "variables": ["NO3", "PO4", "Fe", "Si", "chl"],
                          "tolerances": [4, 0.5, 0.5, 5]
                         },
    
    "tblPisces_Forecast": {
                          "variables": ["NO3", "PO4", "Fe", "Si", "chl"],
                          "tolerances": [4, 0.5, 0.5, 5]
        }
}
import pycmap
api = pycmap.API(token='<6e1eb1d3-d364-4dfb-9121-8c23369dbbbe>')

source = covari
covari_cmap = pycmap.Sample(
              source=source, 
              targets=targets, 
              replaceWithMonthlyClimatolog=False
             )


Gathering metadata .... 
Sampling starts
Sampling finished                                                                                                    

Saving as a CSV to be accesible later.

In [18]:
covari_cmap.to_csv('/Users/cristianswift/Desktop/Spring-Quarter-2022-2023/SeniorThesis/data/modified/covari_cmap.csv', index=False)
