# Loading Functions

In [1]:
# Optional environment setup: uncomment the lines below to install the
# dependencies (pandas and numpy are pinned; seaborn is not).
# !pip3 install pandas==2.0.0
# !pip3 install seaborn
# !pip3 install numpy==1.20.3

In [2]:
import os

import matplotlib.cbook as cbook
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns

## Creating Real Time data set that includes CMAP and Underway data

### Loading in realtime data that has colocalized CMAP features

In [3]:
# Set the working directory to the project root so the relative `data/...`
# paths used in later cells resolve.
# Set the SEAFLOW_ML_DIR environment variable to point at your own checkout;
# when it is unset, the original author's local path is used as the default.
directory_path = os.environ.get(
    'SEAFLOW_ML_DIR',
    '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/',
)
# Raises FileNotFoundError if the directory does not exist, which surfaces a
# bad path here rather than as a missing-file error in a later cell.
os.chdir(directory_path)


In [22]:
realtime_cmap_path = 'data/modified/Seaflow_TN413_CMAP.csv'
# Read the SeaFlow/CMAP table into a DataFrame.
# The timestamp column is selected by name: positional column 0 in this file is
# `population`, so a positional `parse_dates=[0]` leaves `time` as plain strings.
realtime_cmap = pd.read_csv(realtime_cmap_path, parse_dates=['time'])
# Take a peek at the first three rows
realtime_cmap.head(3)


Unnamed: 0,population,time,lat,lon,abundance_cells_per_microliter,diameter_micrometer,depth,CMAP_NO3_tblPisces_Forecast_cl1,CMAP_PO4_tblPisces_Forecast_cl1,CMAP_Fe_tblPisces_Forecast_cl1,CMAP_Si_tblPisces_Forecast_cl1,CMAP_chl_tblPisces_Forecast_cl1
0,picoeuk,2023-02-25T05:40:00,21.3067,-157.0366,14.303231,1.55786,0.0,0.000562,0.212505,0.000433,2.092356,0.072018
1,picoeuk,2023-02-25T05:50:00,21.3142,-157.0321,20.716981,1.576,0.0,0.000562,0.212505,0.000433,2.092356,0.072018
2,picoeuk,2023-02-25T06:00:00,21.3123,-157.0168,22.590023,1.606258,0.0,0.000562,0.212505,0.000433,2.092356,0.072018


### Loading in cruise underway CTD data

In [23]:
underway_path = 'data/original/TN413-underway.tab'
# Read the tab-separated underway file (skipping its first 6 lines) and clean it
# in a single chain, so no intermediate frame is modified in place.
underway = (
    pd.read_csv(underway_path, sep='\t', skiprows=6)
    # drop the unnecessary columns (conductivity, par)
    .drop(columns=['conductivity', 'par'])
    # remove the last character of every timestamp string
    # NOTE(review): presumably a trailing 'Z' (UTC designator) -- confirm against the raw file
    .assign(time=lambda frame: frame['time'].str[:-1])
    # drop rows with a missing value in any remaining column (e.g. temp or salinity)
    .dropna()
    # convert the cleaned timestamp strings to datetimes
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
)
underway

Unnamed: 0,time,lat,lon,temp,salinity
339,2023-02-25 04:20:02,21.2259,-157.0856,26.5682,34.8463
340,2023-02-25 04:21:02,21.2251,-157.0827,25.0258,35.0090
341,2023-02-25 04:22:02,21.2243,-157.0801,24.5832,34.9939
342,2023-02-25 04:23:02,21.2236,-157.0774,24.3960,34.9996
343,2023-02-25 04:24:02,21.2230,-157.0746,24.3011,35.0086
...,...,...,...,...,...
21925,2023-03-14 02:55:32,-19.0392,178.8369,29.0697,34.7659
21926,2023-03-14 02:56:32,-19.0428,178.8369,29.0781,34.7757
21927,2023-03-14 02:57:32,-19.0464,178.8368,29.0831,34.7773
21928,2023-03-14 02:58:32,-19.0500,178.8368,29.0807,34.7699


In [24]:
# (rows, columns) of the cleaned underway frame
underway.shape

(21438, 5)

In [25]:
# Preview the first four rows of the cleaned underway frame
underway.head(4)

Unnamed: 0,time,lat,lon,temp,salinity
339,2023-02-25 04:20:02,21.2259,-157.0856,26.5682,34.8463
340,2023-02-25 04:21:02,21.2251,-157.0827,25.0258,35.009
341,2023-02-25 04:22:02,21.2243,-157.0801,24.5832,34.9939
342,2023-02-25 04:23:02,21.2236,-157.0774,24.396,34.9996


## Joining realtime_cmap and underway data into one dataframe

### First we need to average the underway data frame to a 10 min resolution

In [26]:
# Average the numeric underway columns into 10-minute bins keyed on `time`.
# The datetime conversion is applied through assign(), which returns a new
# frame, so `underway` itself is left unchanged by this cell.
# resample(..., on='time') makes `time` the index of the result.
underway_avg = (
    underway
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .resample('10min', on='time')
    .mean(numeric_only=True)
)
underway_avg.shape

(2440, 4)

### Now we can merge the underway CTD and Seaflow/CMAP data into one dataframe called TN413, named after the cruise from which the observational data comes.

In [27]:
# Make sure the join key on the SeaFlow/CMAP side is a datetime
realtime_cmap['time'] = pd.to_datetime(realtime_cmap['time'])
# Join on `time` with the default inner join. lat/lon are removed from the
# averaged underway frame because realtime_cmap already has its own lat/lon.
tn413 = pd.merge(
    left=realtime_cmap,
    right=underway_avg.drop(columns=['lat', 'lon']),
    left_on=['time'],
    right_on=['time'],
)

In [28]:
# Count missing values per column after the merge
tn413.isna().sum()

population                            0
time                                  0
lat                                1398
lon                                1398
abundance_cells_per_microliter     1398
diameter_micrometer                1398
depth                              1398
CMAP_NO3_tblPisces_Forecast_cl1    2256
CMAP_PO4_tblPisces_Forecast_cl1    2256
CMAP_Fe_tblPisces_Forecast_cl1     2256
CMAP_Si_tblPisces_Forecast_cl1     2256
CMAP_chl_tblPisces_Forecast_cl1    2256
temp                                  0
salinity                              0
dtype: int64

### Renaming columns of both the tn413 and covari dataframes so they are simpler and consistent with each other.

In [29]:
# Drop every row that still has a missing value, then shorten the column
# names to the ones used by the model.
tn413 = tn413.dropna().rename(
    columns={
        'CMAP_NO3_tblPisces_Forecast_cl1': 'NO3NO2',
        'CMAP_PO4_tblPisces_Forecast_cl1': 'PO4',
        'CMAP_Fe_tblPisces_Forecast_cl1': 'Fe',
        'CMAP_Si_tblPisces_Forecast_cl1': 'SiO4',
        'CMAP_chl_tblPisces_Forecast_cl1': 'Satellite_CHL',
        'salinity': 'salin',
    }
)
# Confirm that no missing values remain
tn413.isna().sum()

population                        0
time                              0
lat                               0
lon                               0
abundance_cells_per_microliter    0
diameter_micrometer               0
depth                             0
NO3NO2                            0
PO4                               0
Fe                                0
SiO4                              0
Satellite_CHL                     0
temp                              0
salin                             0
dtype: int64

In [30]:
# Write the merged table to disk as CSV, leaving out the row index
tn413.to_csv(
    path_or_buf='data/modified/Seaflow_TN413_CMAP_ingested.csv',
    index=False,
)
