## In this notebook we clean the data of any NaN values and get it ready for machine learning applications.

### Loading in covariate dataset that has CMAP features

In [1]:
# Set a working directory
import os

# Project root against which the relative data paths in this notebook are resolved.
# Defaults to the original author's checkout; set the SEAFLOW_ML_DIR environment
# variable to run the notebook from a different machine or location.
directory_path = os.environ.get(
    'SEAFLOW_ML_DIR',
    '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/',
)
# Raises FileNotFoundError if the directory does not exist.
os.chdir(directory_path)


In [2]:
import pandas as pd

covari_path = 'data/modified/Seaflow_covari_CMAP_PAR.csv'
# Read the CSV into a DataFrame, parsing the first column as datetimes
covari = pd.read_csv(covari_path, parse_dates=[0])
# Peek at the first five rows
covari.head()


Unnamed: 0,time,PopulationName,lat,lon,Biomass_pgC_per_L,salin,temp,cruisename,CMAP_SiO2_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_POSi_darwin_clim_tblDarwin_Nutrient_Climatology,...,CMAP_NH4_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_FeT_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOP_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DON_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOFe_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DOC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_DIC_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_CDOM_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology,par
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
2,2016-04-20 07:00:00,nanoeukaryotes (2-5µm),21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
3,2016-04-20 07:00:00,picoeukaryotes (< 2µm),21.520326,-158.326984,0.701902,34.893785,24.351745,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.0193
4,2016-04-20 08:00:00,Prochlorococcus,21.66271,-158.32343,9.309387,34.902376,24.339265,KOK1606,-0.022845,-0.000127,...,1.282981,1.5e-05,0.013734,0.248717,1.7e-05,1.648093,1697.874775,3.4e-05,1954.87665,0.020382


### Convert the columns that contain numbers to floats

In [3]:
def ChangeObjectTypes(df):
    """Coerce every column of ``df`` to its intended dtype.

    The frame is modified in place and also returned.

    * ``'time'`` is converted with ``pd.to_datetime``; unparseable values
      become ``NaT``.
    * ``'PopulationName'`` and ``'cruisename'`` are converted to strings.
      Missing values stay missing instead of turning into the literal
      text ``'nan'``/``'None'``, so later NaN checks can still see them.
    * Every other column is converted with ``pd.to_numeric``; unparseable
      values become ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with converted columns.
    """
    text_columns = ('PopulationName', 'cruisename')
    for column in df.columns:
        if column == 'time':
            # changing to datetime
            df[column] = pd.to_datetime(df[column], errors='coerce')

        elif column in text_columns:
            # changing to string; record the missing entries first because
            # astype(str) would otherwise stringify them
            missing = df[column].isna()
            df[column] = df[column].astype(str).mask(missing)

        else:
            # changing to numeric type
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df
# Apply the dtype conversion to the loaded covariate frame and rebind the result.
covari = ChangeObjectTypes(covari)

### Now all of the objects are in the right format

In [4]:
# Print the frame, then its column dtypes, each followed by blank-line padding
for report in (covari, covari.dtypes):
    print(report)
    print('\n')

                     time          PopulationName        lat         lon  \
0     2016-04-20 07:00:00         Prochlorococcus  21.520326 -158.326984   
1     2016-04-20 07:00:00           Synechococcus  21.520326 -158.326984   
2     2016-04-20 07:00:00  nanoeukaryotes (2-5µm)  21.520326 -158.326984   
3     2016-04-20 07:00:00  picoeukaryotes (< 2µm)  21.520326 -158.326984   
4     2016-04-20 08:00:00         Prochlorococcus  21.662710 -158.323430   
...                   ...                     ...        ...         ...   
11126 2021-12-30 00:00:00  picoeukaryotes (< 2µm)  32.673493 -117.545342   
11127 2021-12-30 01:00:00         Prochlorococcus  32.682100 -117.660321   
11128 2021-12-30 01:00:00           Synechococcus  32.682100 -117.660321   
11129 2021-12-30 01:00:00  nanoeukaryotes (2-5µm)  32.682100 -117.660321   
11130 2021-12-30 01:00:00  picoeukaryotes (< 2µm)  32.682100 -117.660321   

       Biomass_pgC_per_L      salin       temp cruisename  \
0              10.520443  

### Changing names of columns to be easier to read

In [5]:
# Regular expression capturing the text between the first two underscores,
# e.g. 'CMAP_SiO2_darwin_clim_...' -> 'SiO2'
pattern = r'_(.*?)_'

# Select the CMAP covariate columns by name rather than by position, so the
# cell keeps working if columns are added or reordered and is a no-op when it
# is re-run on already-renamed columns.
columns_to_rename = covari.columns[covari.columns.str.startswith('CMAP_', na=False)]

# expand=False returns one short name per selected column, including when
# only a single column is selected.
short_names = columns_to_rename.str.extract(pattern, expand=False)

# Shorten the CMAP names, then give the population and biomass columns friendlier names
covari = (
    covari
    .rename(columns=dict(zip(columns_to_rename, short_names)))
    .rename(columns={"PopulationName": "population", "Biomass_pgC_per_L": "biomass"})
)
# Print the updated column names
print(covari.columns)


Index(['time', 'population', 'lat', 'lon', 'biomass', 'salin', 'temp',
       'cruisename', 'SiO2', 'POSi', 'PON', 'POFe', 'POC', 'PO4', 'PIC', 'O2',
       'NO3', 'NO2', 'NH4', 'FeT', 'DOP', 'DON', 'DOFe', 'DOC', 'DIC', 'CDOM',
       'ALK', 'par'],
      dtype='object')


## Saving machine learning ready dataset

In [6]:
# Write the cleaned frame to disk without the row index
clean_covari_path = 'data/modified/data_clean_covari.csv'
covari.to_csv(clean_covari_path, index=False)