# Loading Functions

In [1]:
import pandas as pd
import numpy as np


## Creating Real Time data set
### Datetime is in UTC

In [2]:
# Set a working directory
import os

directory_path = '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/'
os.chdir(directory_path)


In [3]:
realtime_path = 'data/original/SeaFlow_realtime_TN413.csv'

#changing column naes so that the date is just called data
realtime_cols = ['time',
 'lat',
 'lon',
 'population',
 'abundance_cells_per_microliter',
 'diameter_micrometer']

#reading in the csv to a pandas df
realtime = pd.read_csv(realtime_path, names=realtime_cols)
# getting rid of the first row since it has the column names repeated
realtime = realtime.tail(-1)
realtime

Unnamed: 0,time,lat,lon,population,abundance_cells_per_microliter,diameter_micrometer
1,2023-02-25T05:45:57Z,21.3067,-157.0366,picoeuk,14.30323085044253,1.55786005995454
2,2023-02-25T05:51:57Z,21.3148,-157.0362,prochloro,59.90896889937337,0.663476451861207
3,2023-02-25T05:54:58Z,21.3142,-157.0321,picoeuk,20.71698119210922,1.5760000524204352
4,2023-02-25T05:54:58Z,21.3142,-157.0321,prochloro,64.44967710586306,0.670356077823434
5,2023-02-25T05:57:58Z,21.3133,-157.0266,prochloro,68.45117621283211,0.6805887892801286
...,...,...,...,...,...,...
10570,2023-03-11T08:30:41Z,-19.4207,-181.8025,synecho,0.25541483661504516,1.07457158316671
10571,2023-03-11T08:33:41Z,-19.4208,-181.792,picoeuk,4.370431648746329,1.985346246474965
10572,2023-03-11T08:36:41Z,-19.4207,-181.7817,picoeuk,4.427190501327449,2.01327502068565
10573,2023-03-11T08:36:41Z,-19.4207,-181.7817,prochloro,92.26151487061243,0.572558873170014


## Averaging data over 10 minute resolution so that SeaFlow and Underway CTD data match

In [4]:
realtime['abundance_cells_per_microliter'] = pd.to_numeric(realtime['abundance_cells_per_microliter'])

realtime = realtime.dropna(subset=['time','lat', 'lon']).reset_index().drop('index',axis=1)
realtime['time'] = realtime['time'].str[:-1]
realtime['lat'] = realtime['lat'].astype(float)
realtime['lon'] = realtime['lon'].astype(float)
realtime['abundance_cells_per_microliter'] = realtime['abundance_cells_per_microliter'].astype(float)
realtime['diameter_micrometer'] = realtime['diameter_micrometer'].astype(float)
realtime['depth'] = 0

print(realtime.dtypes)
realtime.head(4)

time                               object
lat                               float64
lon                               float64
population                         object
abundance_cells_per_microliter    float64
diameter_micrometer               float64
depth                               int64
dtype: object


Unnamed: 0,time,lat,lon,population,abundance_cells_per_microliter,diameter_micrometer,depth
0,2023-02-25T05:45:57,21.3067,-157.0366,picoeuk,14.303231,1.55786,0
1,2023-02-25T05:51:57,21.3148,-157.0362,prochloro,59.908969,0.663476,0
2,2023-02-25T05:54:58,21.3142,-157.0321,picoeuk,20.716981,1.576,0
3,2023-02-25T05:54:58,21.3142,-157.0321,prochloro,64.449677,0.670356,0


In [5]:
#averaging over every 10 min
realtime['time'] = pd.to_datetime(realtime['time'])
#this drops the phytoplankton population categorical column, need to recalulate
realtime_avg = (realtime
                .groupby('population')
                .resample('10min', on='time')
                .mean(numeric_only=False)
                .reset_index()
               )
realtime_avg
# making time a string object again
realtime_avg['time'] = realtime_avg['time'].astype(str)
realtime_avg['time'] = realtime_avg['time'].str.replace(' ', 'T')

  .mean(numeric_only=False)
  .mean(numeric_only=False)
  .mean(numeric_only=False)


# Using SimonCMAP to gather additional features

#### First installing and importing pycmap 

In [6]:
# !pip install pycmap
import pycmap

### Prepping realtime data for colocalization using Simon's CMAP

#### Setting API

In [7]:
api = pycmap.API(token='<6e1eb1d3-d364-4dfb-9121-8c23369dbbbe>')

In [8]:
targets = {
        
        # BioGeoChemical Numerical Near-Real-Time Model
        "tblPisces_Forecast_cl1": {
                          "variables": ["NO3", "PO4", "Fe", "Si", "chl", "nppv", "temp", "salin"],
                          "tolerances": [4, 0.5, 0.5, 5]
                         }
        }


source = realtime_avg

realtime_cmap = pycmap.Sample(
              source=source, 
              targets=targets, 
              replaceWithMonthlyClimatolog=False
             )


Gathering metadata .... 


KeyboardInterrupt: 

## Saving as a CSV

In [None]:
#saving as a CSV file
realtime_cmap.to_csv('data/modified/Seaflow_realtime_CMAP.csv', index=False)


In [None]:
realtime_cmap.isna().sum()