# In this notebook we will ingest and combine all of our data into one usable dataframe  

### Loading Pandas to manipulate our data

In [12]:
# !pip3 install pandas==2.0.0

In [2]:
import pandas as pd

## Setting working directory

In [1]:
# Set a working directory
#!pip install GitPython
import git
import os

# Find the git repository that contains the current directory (searching parent
# directories as needed) and switch into its working tree root, so the relative
# 'data/...' paths used below resolve when the notebook is started from anywhere
# inside the repository.
repo = git.Repo(path='.', search_parent_directories=True)
os.chdir(repo.working_tree_dir)


## Creating Real Time data set that includes CMAP and PAR data

### Loading in covariate data that has colocalized CMAP features

In [5]:
covari_cmap_path = 'data/modified/Seaflow_covariates_CMAP.csv'

# Read the covariate table and tidy it in a single chain (no in-place mutation).
covari_cmap = (
    pd.read_csv(covari_cmap_path)
    # parse the 'time' strings into pandas datetimes
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    # 'depth' was only used to look up the CMAP Darwin Nutrient Climatology data
    .drop(columns='depth')
)

# peek at the first three rows
covari_cmap.head(3)


Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,temp,cruisename,CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_POSi_darwin_clim_tblDarwin_Nutrient_Climatology,...,CMAP_NO2_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_NH4_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_FeT_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOP_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DON_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOFe_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DIC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_CDOM_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.295276,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.295276,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665
2,2016-04-20 07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,0.295276,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665


### Loading in cruise PAR CTD data

In [18]:
par_path = 'data/original/EnvironmentalData.csv'

# Read the environmental table and reduce it to the PAR measurements in one chain.
par = (
    pd.read_csv(par_path)
    # discard the columns that are not needed downstream
    .drop(columns=['salinity', 'temp', 'SiO4', 'NO3_NO2',
                   'PO4', 'Fe', 'PP', 'chl', 'MLD',
                   'cruise'])
    # 'date' -> 'time' so the column name matches the covariates dataframe
    .rename(columns={'date': 'time'})
    # convert the time column to pandas datetimes
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    # remove every row that still contains a missing value
    .dropna()
)

# show the rows in chronological order (display only; `par` itself is not reordered)
par.sort_values(by='time')

Unnamed: 0,par,time,lat,lon
92,1020.094444,2011-10-26 17:00:00,47.000000,-127.700000
454,992.973494,2011-10-26 22:00:00,46.300000,-128.900000
458,962.789610,2011-10-26 23:00:00,46.200000,-129.200000
464,1025.041818,2011-10-27 00:00:00,46.000000,-129.400000
489,1025.442857,2011-10-27 01:00:00,45.900000,-129.700000
...,...,...,...,...
935,1722.103000,2021-12-29 21:00:00,32.648323,-118.079951
936,1117.570500,2021-12-29 22:00:00,32.709330,-118.180315
949,568.868550,2021-12-29 23:00:00,32.806110,-118.294397
944,127.692150,2021-12-30 00:00:00,32.900902,-118.407436


## Joining covari_cmap and par data into one dataframe

### Now we can merge the PAR data and the SeaFlow/CMAP data into one dataframe called `combined`

In [6]:

# Inner-join the covariates with PAR on the shared 'time' column. par's own
# lat/lon are dropped first, so only covari_cmap's lat/lon appear in the result.
# NOTE(review): the join key is 'time' alone -- if `par` holds more than one row
# per timestamp, covariate rows would be duplicated; confirm timestamps are unique.
combined = covari_cmap.merge(
    par.drop(columns=['lat', 'lon']),
    on='time',
    how='inner',
)

In [7]:
# Display the merged frame to sanity-check the join (display only, nothing is modified).
combined

Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,temp,cruisename,CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_POSi_darwin_clim_tblDarwin_Nutrient_Climatology,...,CMAP_NH4_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_FeT_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOP_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DON_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOFe_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DIC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_CDOM_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology,par
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.019300
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.019300
2,2016-04-20 07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.019300
3,2016-04-20 07:00:00,picoeukaryotes (< 2µm),21.520326,-158.326984,0.701902,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.019300
4,2016-04-20 08:00:00,Prochlorococcus,21.662710,-158.323430,9.309387,34.902376,24.339265,KOK1606,-0.022845,-0.000127,...,1.282981,0.000015,0.013734,0.248717,0.000017,1.648093,1697.874775,0.000034,1954.876650,0.020382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11126,2021-12-30 00:00:00,picoeukaryotes (< 2µm),32.673493,-117.545342,3.774488,33.468151,15.189021,TN398,0.363296,0.099231,...,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,127.692150
11127,2021-12-30 01:00:00,Prochlorococcus,32.682100,-117.660321,0.874599,33.478846,15.327302,TN398,0.363296,0.099231,...,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,1.012800
11128,2021-12-30 01:00:00,Synechococcus,32.682100,-117.660321,9.707579,33.478846,15.327302,TN398,0.363296,0.099231,...,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,1.012800
11129,2021-12-30 01:00:00,nanoeukaryotes (2-5µm),32.682100,-117.660321,2.428084,33.478846,15.327302,TN398,0.363296,0.099231,...,0.242743,0.000497,0.164132,2.736920,0.000178,19.695796,1819.587625,0.000756,2008.417775,1.012800


## Saving the combined dataframe as a CSV file

In [8]:
# Persist the merged table; index=False keeps the row index out of the file.
combined.to_csv(path_or_buf='data/modified/Seaflow_covari_CMAP_PAR.csv', index=False)