# Loading Functions

In [4]:
import pandas as pd
import numpy as np


## Creating Real Time data set
### Datetime is in UTC

In [5]:
# Set a working directory
import os

directory_path = '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/'
os.chdir(directory_path)


In [11]:
tn413_path = 'data/original/SeaFlow_realtime_TN413.csv'

#changing column naes so that the date is just called data
tn413_cols = ['time',
 'lat',
 'lon',
 'population',
 'abundance_cells_per_microliter',
 'diameter_micrometer']

#reading in the csv to a pandas df
tn413 = pd.read_csv(tn413_path, names=tn413_cols)
# getting rid of the first row since it has the column names repeated
tn413 = tn413.tail(-1)
tn413

Unnamed: 0,time,lat,lon,population,abundance_cells_per_microliter,diameter_micrometer
1,2023-02-25T05:45:57Z,21.3067,-157.0366,picoeuk,14.30323085044253,1.55786005995454
2,2023-02-25T05:51:57Z,21.3148,-157.0362,prochloro,59.90896889937337,0.663476451861207
3,2023-02-25T05:54:58Z,21.3142,-157.0321,picoeuk,20.71698119210922,1.5760000524204352
4,2023-02-25T05:54:58Z,21.3142,-157.0321,prochloro,64.44967710586306,0.670356077823434
5,2023-02-25T05:57:58Z,21.3133,-157.0266,prochloro,68.45117621283211,0.6805887892801286
...,...,...,...,...,...,...
10570,2023-03-11T08:30:41Z,-19.4207,-181.8025,synecho,0.25541483661504516,1.07457158316671
10571,2023-03-11T08:33:41Z,-19.4208,-181.792,picoeuk,4.370431648746329,1.985346246474965
10572,2023-03-11T08:36:41Z,-19.4207,-181.7817,picoeuk,4.427190501327449,2.01327502068565
10573,2023-03-11T08:36:41Z,-19.4207,-181.7817,prochloro,92.26151487061243,0.572558873170014


## Averaging data over 10 minute resolution so that SeaFlow and Underway CTD data match

In [12]:
tn413['abundance_cells_per_microliter'] = pd.to_numeric(tn413['abundance_cells_per_microliter'])

tn413 = tn413.dropna(subset=['time','lat', 'lon']).reset_index().drop('index',axis=1)
tn413['time'] = tn413['time'].str[:-1]
tn413['lat'] = tn413['lat'].astype(float)
tn413['lon'] = tn413['lon'].astype(float)
tn413['abundance_cells_per_microliter'] = tn413['abundance_cells_per_microliter'].astype(float)
tn413['diameter_micrometer'] = tn413['diameter_micrometer'].astype(float)
tn413['depth'] = 0

print(tn413.dtypes)
tn413.head(4)

time                               object
lat                               float64
lon                               float64
population                         object
abundance_cells_per_microliter    float64
diameter_micrometer               float64
depth                               int64
dtype: object


Unnamed: 0,time,lat,lon,population,abundance_cells_per_microliter,diameter_micrometer,depth
0,2023-02-25T05:45:57,21.3067,-157.0366,picoeuk,14.303231,1.55786,0
1,2023-02-25T05:51:57,21.3148,-157.0362,prochloro,59.908969,0.663476,0
2,2023-02-25T05:54:58,21.3142,-157.0321,picoeuk,20.716981,1.576,0
3,2023-02-25T05:54:58,21.3142,-157.0321,prochloro,64.449677,0.670356,0


In [13]:
#averaging over every 10 min
tn413['time'] = pd.to_datetime(tn413['time'])
#this drops the phytoplankton population categorical column, need to recalulate
tn413_avg = (tn413
                .groupby('population')
                .resample('10min', on='time')
                .mean(numeric_only=False)
                .reset_index()
               )
tn413_avg
# making time a string object again
tn413_avg['time'] = tn413_avg['time'].astype(str)
tn413_avg['time'] = tn413_avg['time'].str.replace(' ', 'T')

  .mean(numeric_only=False)
  .mean(numeric_only=False)
  .mean(numeric_only=False)


# Using SimonCMAP to gather additional features

#### First installing and importing pycmap 

In [14]:
# !pip install pycmap
import pycmap

### Prepping tn413 data for colocalization using Simon's CMAP

#### Setting API

In [15]:
api = pycmap.API(token='<6e1eb1d3-d364-4dfb-9121-8c23369dbbbe>')

In [16]:
targets = {
        
        # BioGeoChemical Numerical Near-Real-Time Model
        "tblPisces_Forecast_cl1": {
                          "variables": ["NO3", "PO4", "Fe", "Si", "chl", "nppv", "temp", "salin"],
                          "tolerances": [4, 0.5, 0.5, 5]
                         }
        }


source = tn413_avg

tn413_cmap = pycmap.Sample(
              source=source, 
              targets=targets, 
              replaceWithMonthlyClimatolog=False
             )


Gathering metadata .... 
Sampling starts
Sampling tblPisces_Forecast_cl1 ... 58 / 5891                                                  

KeyboardInterrupt: 

## Saving as a CSV

In [None]:
tn413_cmap.isna().sum()

In [None]:
#saving as a CSV file
tn413_cmap.to_csv('data/modified/Seaflow_TN413_CMAP.csv', index=False)
