## In this notebook we clean the data of any NaN values and get it ready for the next step towards machine learning applications.

### Loading in covariate dataset that has CMAP features

In [10]:
# Set a working directory
#!pip install GitPython
import git
import os

# Find the git repository that contains the current directory (searching
# upward through parent directories) and make its root the working directory,
# so the relative data paths used below resolve from the repository root.
repo = git.Repo(path='.', search_parent_directories=True)
os.chdir(repo.working_tree_dir)


In [11]:
import pandas as pd

# Location of the covariate CSV, relative to the repository root
covari_path = 'data_ingest/data/modified/Seaflow_covari_CMAP_PAR.csv'

# Read the CSV into a DataFrame, parsing the first column as datetimes
covari = pd.read_csv(covari_path, parse_dates=[0])

# Take a peek at the first five rows
covari.head()

Unnamed: 0,time,cruise,lat,lon,abundance_prochloro,abundance_synecho,abundance_picoeuk,abundance_croco,diam_prochloro,diam_synecho,...,CMAP_sss_tblSSS_NRT,CMAP_sst_tblSST_AVHRR_OI_NRT,CMAP_ugos_tblAltimetry_REP_Signal,CMAP_vgos_tblAltimetry_REP_Signal,CMAP_Fe_tblPisces_NRT,CMAP_O2_tblPisces_NRT,CMAP_NO3_tblPisces_NRT,CMAP_PO4_tblPisces_NRT,CMAP_Si_tblPisces_NRT,hours_since_sunrise
0,2015-05-22 22:00:00,KM1508,21.3434,-158.2737,135.216812,2.021318,1.456863,0.006307,0.534793,1.043773,...,34.571716,25.653118,0.005764,-0.132531,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,6.129444
1,2015-05-22 23:00:00,KM1508,21.343533,-158.273744,136.856649,2.437622,1.774607,0.007009,0.539107,1.04922,...,34.571716,25.653118,0.005764,-0.132531,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,7.129444
2,2015-05-23 00:00:00,KM1508,21.346175,-158.27415,130.873523,3.810792,2.01813,0.006307,0.572932,1.054689,...,34.609317,25.646243,-0.002256,-0.132022,8.8e-05,216.794167,4.269278e-07,0.345151,9.464704,8.129722
3,2015-05-23 03:00:00,KM1508,21.87535,-158.252225,146.197107,1.348372,1.919261,0.006308,0.575585,1.027903,...,34.626952,25.24666,-0.154771,0.091308,0.000102,217.801038,5.199637e-07,0.350773,9.548164,11.146667
4,2015-05-23 04:00:00,KM1508,22.001275,-158.2243,136.327069,2.091172,1.868817,0.006308,0.592049,1.031783,...,34.626952,25.24666,-0.154771,0.091308,9.1e-05,218.341288,5.843342e-07,0.35319,9.569982,12.152222


## Dropping unnecessary columns

In [12]:
# List every column name so the ones to drop can be identified
covari.columns

Index(['time', 'cruise', 'lat', 'lon', 'abundance_prochloro',
       'abundance_synecho', 'abundance_picoeuk', 'abundance_croco',
       'diam_prochloro', 'diam_synecho', 'diam_picoeuk', 'diam_croco',
       'Qc_prochloro', 'Qc_synecho', 'Qc_picoeuk', 'Qc_croco',
       'biomass_prochloro', 'biomass_synecho', 'biomass_picoeuk',
       'biomass_croco', 'CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology',
       'CMAP_sss_tblSSS_NRT', 'CMAP_sst_tblSST_AVHRR_OI_NRT',
       'CMAP_ugos_tblAltimetry_REP_Signal',
       'CMAP_vgos_tblAltimetry_REP_Signal', 'CMAP_Fe_tblPisces_NRT',
       'CMAP_O2_tblPisces_NRT', 'CMAP_NO3_tblPisces_NRT',
       'CMAP_PO4_tblPisces_NRT', 'CMAP_Si_tblPisces_NRT',
       'hours_since_sunrise'],
      dtype='object')

In [13]:
# Remove the per-population abundance_*, diam_* and Qc_* columns;
# the biomass_* columns and the CMAP covariates are kept.
columns_to_drop = [
    'abundance_prochloro', 'abundance_synecho', 'abundance_picoeuk', 'abundance_croco',
    'diam_prochloro', 'diam_synecho', 'diam_picoeuk', 'diam_croco',
    'Qc_prochloro', 'Qc_synecho', 'Qc_picoeuk', 'Qc_croco',
]
covari = covari.drop(columns=columns_to_drop)

In [14]:
# Check which columns remain after the drop
covari.columns

Index(['time', 'cruise', 'lat', 'lon', 'biomass_prochloro', 'biomass_synecho',
       'biomass_picoeuk', 'biomass_croco',
       'CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology',
       'CMAP_sss_tblSSS_NRT', 'CMAP_sst_tblSST_AVHRR_OI_NRT',
       'CMAP_ugos_tblAltimetry_REP_Signal',
       'CMAP_vgos_tblAltimetry_REP_Signal', 'CMAP_Fe_tblPisces_NRT',
       'CMAP_O2_tblPisces_NRT', 'CMAP_NO3_tblPisces_NRT',
       'CMAP_PO4_tblPisces_NRT', 'CMAP_Si_tblPisces_NRT',
       'hours_since_sunrise'],
      dtype='object')

### Need to adjust columns that contain numbers to floats

In [15]:
def ChangeObjectTypes(df):
    """
    Return a copy of ``df`` with every column cast to its appropriate dtype.

    - ``time`` is converted to pandas datetimes.
    - ``PopulationName`` and ``cruise`` are converted to strings. Missing
      values stay missing instead of becoming the literal text
      ``'nan'``/``'None'``, so ``isna()``/``dropna()`` can still find them.
    - All other columns, which hold continuous measurements, are converted
      to a numeric type.

    Values that cannot be parsed as a datetime or as a number are coerced to
    ``NaT``/``NaN`` (``errors='coerce'``) rather than raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order, and the
        converted dtypes.
    """
    string_columns = ('PopulationName', 'cruise')

    # Work on a copy so the caller's frame is left untouched and re-running
    # the calling cell always starts from the same input.
    converted = df.copy()

    for column in converted.columns:
        values = converted[column]
        if column == 'time':
            # changing to datetime
            converted[column] = pd.to_datetime(values, errors='coerce')
        elif column in string_columns:
            # changing to string; .where() restores NaN wherever the original
            # value was missing, since astype(str) would stringify it
            converted[column] = values.astype(str).where(values.notna())
        else:
            # changing to numeric type
            converted[column] = pd.to_numeric(values, errors='coerce')
    return converted
# Apply the dtype conversions and rebind `covari` to the returned frame
covari = ChangeObjectTypes(covari)

### Now all of the objects are in the right format

In [16]:

# Print the dtype of every column to confirm the conversions took effect
print(covari.dtypes)


time                                                   datetime64[ns]
cruise                                                         object
lat                                                           float64
lon                                                           float64
biomass_prochloro                                             float64
biomass_synecho                                               float64
biomass_picoeuk                                               float64
biomass_croco                                                 float64
CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology           float64
CMAP_sss_tblSSS_NRT                                           float64
CMAP_sst_tblSST_AVHRR_OI_NRT                                  float64
CMAP_ugos_tblAltimetry_REP_Signal                             float64
CMAP_vgos_tblAltimetry_REP_Signal                             float64
CMAP_Fe_tblPisces_NRT                                         float64
CMAP_O2_tblPisces_NR

### Changing names of columns to be easier to read

In [17]:
# Regular expression capturing the text between the first two underscores,
# e.g. 'CMAP_sst_tblSST_AVHRR_OI_NRT' -> 'sst'
pattern = r'_(.*?)_'

# Select the CMAP covariate columns by their prefix instead of by position.
# A positional slice breaks if columns are added or reordered, and on a re-run
# of this cell it would pick up the already-shortened names, which do not match
# the pattern and would therefore be renamed to NaN.
columns_to_rename = covari.columns[covari.columns.str.startswith('CMAP_')]

# expand=False returns an Index of the captured names (one capture group),
# which stays iterable even when only a single column is selected.
short_names = columns_to_rename.str.extract(pattern, expand=False)

# Skip any column whose name did not match the pattern so it keeps its
# original name rather than being renamed to NaN.
has_match = short_names.notna()
cmap_renames = dict(zip(columns_to_rename[has_match], short_names[has_match]))
covari = covari.rename(columns=cmap_renames)

# Shorten the population names. Keys that are not present in the frame
# (e.g. abundance_* columns that were dropped earlier) are ignored by rename.
covari = covari.rename(columns={'abundance_prochloro': 'abundance_pro',
       'abundance_synecho':'abundance_syn', 'abundance_picoeuk':'abundance_pico',
       'biomass_prochloro':'biomass_pro', 'biomass_synecho':'biomass_syn', 'biomass_picoeuk': 'biomass_pico'})

# Print the updated column names
print(covari.columns)


Index(['time', 'cruise', 'lat', 'lon', 'biomass_pro', 'biomass_syn',
       'biomass_pico', 'biomass_croco', 'ALK', 'sss', 'sst', 'ugos', 'vgos',
       'Fe', 'O2', 'NO3', 'PO4', 'Si', 'hours_since_sunrise'],
      dtype='object')


In [18]:
# Display the frame (pandas truncates the middle rows) to check the renamed columns
covari

Unnamed: 0,time,cruise,lat,lon,biomass_pro,biomass_syn,biomass_pico,biomass_croco,ALK,sss,sst,ugos,vgos,Fe,O2,NO3,PO4,Si,hours_since_sunrise
0,2015-05-22 22:00:00,KM1508,21.343400,-158.273700,4.024661,0.337763,0.555395,0.009181,1952.641800,34.571716,25.653118,0.005764,-0.132531,0.000088,216.794167,4.269278e-07,0.345151,9.464704,6.129444
1,2015-05-22 23:00:00,KM1508,21.343533,-158.273744,4.167834,0.413687,0.720884,0.013144,1952.641800,34.571716,25.653118,0.005764,-0.132531,0.000088,216.794167,4.269278e-07,0.345151,9.464704,7.129444
2,2015-05-23 00:00:00,KM1508,21.346175,-158.274150,4.654360,0.654208,0.635654,0.008443,1952.641800,34.609317,25.646243,-0.002256,-0.132022,0.000088,216.794167,4.269278e-07,0.345151,9.464704,8.129722
3,2015-05-23 03:00:00,KM1508,21.875350,-158.252225,5.259977,0.216436,0.526369,0.022610,1953.802750,34.626952,25.246660,-0.154771,0.091308,0.000102,217.801038,5.199637e-07,0.350773,9.548164,11.146667
4,2015-05-23 04:00:00,KM1508,22.001275,-158.224300,5.273034,0.340149,0.625376,0.020750,1952.970325,34.626952,25.246660,-0.154771,0.091308,0.000091,218.341288,5.843342e-07,0.353190,9.569982,12.152222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,2022-03-30 01:00:00,KM2204,22.444933,-158.061668,8.526858,0.555619,3.524146,0.056690,1958.648025,34.968639,24.744993,-0.032912,0.018123,0.000457,0.000457,4.574313e-04,0.000457,0.000457,8.546111
4676,2022-03-30 02:00:00,KM2204,22.251114,-158.159919,8.726998,0.505494,3.717675,0.066258,1958.648025,34.964966,24.799995,-0.021673,-0.042388,0.000457,0.000457,4.574313e-04,0.000457,0.000457,9.538611
4677,2022-03-30 03:00:00,KM2204,22.113106,-158.228699,8.541367,0.792685,3.731822,0.046099,1957.562875,34.962398,24.894578,-0.048631,-0.033696,0.000554,0.000554,5.543368e-04,0.000554,0.000554,10.533333
4678,2022-03-30 04:00:00,KM2204,21.914814,-158.328896,7.745151,0.869759,3.936218,0.071808,1958.336425,34.962398,24.894578,-0.048631,-0.033696,0.000660,0.000660,6.599417e-04,0.000660,0.000660,11.525556


## Saving clean dataset

In [21]:
# Write the processed frame to CSV, relative to the repository root;
# index=False leaves the row index out of the file.
clean_path = 'data_ingest/data/modified/RF_ready_covari.csv'
covari.to_csv(clean_path, index=False)