#  In this notebook we will clean up the global prediction data so it is ready for the RF model

In [1]:
# Set a working directory
#!pip install GitPython
import git
import os

# Locate the enclosing git repository, searching upward from the current directory
repo = git.Repo('.', search_parent_directories=True)


# Make the repository root the working directory so the relative data paths
# used further down resolve regardless of where the notebook was started
os.chdir(repo.working_tree_dir)

In [2]:
#!pip install global_land_mask
import pandas as pd
import numpy as np
from global_land_mask import globe


In [3]:
# Load the original CMAP predictor table; index_col=0 uses the first CSV column as the row index
predictors_cmap = pd.read_csv('data_ingest/data/original/predictors_cmap.csv', index_col=0)

In [4]:
# List the column labels to see which variables were loaded
predictors_cmap.columns


Index(['lat', 'lon', 'time', 'depth', 'CMAP_sss_smap_tblSSS_NRT_cl1',
       'CMAP_sst_tblSST_AVHRR_OI_NRT', 'CMAP_ugos_tblAltimetry_REP_Signal',
       'CMAP_vgos_tblAltimetry_REP_Signal', 'CMAP_fe_tblPisces_Forecast_cl1',
       'CMAP_o2_tblPisces_Forecast_cl1', 'CMAP_no3_tblPisces_Forecast_cl1',
       'CMAP_po4_tblPisces_Forecast_cl1', 'CMAP_si_tblPisces_Forecast_cl1',
       'CMAP_talk_tblPisces_Forecast_cl1'],
      dtype='object')

In [5]:
def ChangeObjectTypes(df):
    """
    Cast every column of `df` to the data type appropriate for its content.

    - 'time' becomes a pandas datetime column (unparseable values become NaT)
    - 'PopulationName' and 'cruise' become strings
    - every other column is treated as a continuous measurement and made
      numeric (unparseable values become NaN)

    The frame is modified in place and the same object is returned.
    """
    text_columns = ('PopulationName', 'cruise')

    for name in df.columns:
        values = df[name]
        if name == 'time':
            # timestamps -> datetime
            converted = pd.to_datetime(values, errors='coerce')
        elif name in text_columns:
            # labels -> string
            converted = values.astype(str)
        else:
            # measurements -> numeric
            converted = pd.to_numeric(values, errors='coerce')
        df[name] = converted

    return df
# Convert the column dtypes. The function assigns into the frame it is given and
# returns it, so `predictors` and `predictors_cmap` are the same DataFrame object.
predictors = ChangeObjectTypes(predictors_cmap)

#### Creating hours since sunrise

Since we set the sampling time to an arbitrary 4 hours past sunrise, this step is not strictly needed, but it will be helpful later if you use a less specific sampling time.

In [6]:
import pandas as pd
import ephem

# Re-format the 'time' column as 'YYYY/MM/DD HH:MM:SS' text: the values are parsed
# to datetime and then written back as strings, which is the form later assigned
# to the ephem observer's date in calculate_hours_since_sunrise
predictors['time'] = pd.to_datetime(predictors['time']).dt.strftime('%Y/%m/%d %H:%M:%S')

# function to calculate sunrise and hours since sunrise
def calculate_hours_since_sunrise(row):
    """
    Add the previous sunrise and the hours elapsed since it to one observation.

    Both the dataframe and ephem need to be in the same time zone (in this case UTC).
    UTC is standard for ephem.

    Parameters
    ----------
    row : pandas.Series
        One observation with 'lat', 'lon' and 'time' entries. 'lat' and 'lon'
        are handed to ephem as strings; 'time' must be accepted by both
        ephem and pd.to_datetime (e.g. a 'YYYY/MM/DD HH:MM:SS' string).

    Returns
    -------
    pandas.Series
        A copy of `row` with 'time' converted to a pandas Timestamp and two
        added entries, 'sunrise' and 'hours_since_sunrise'. When the sun
        never rises or never sets at that place and date, 'sunrise' is NaT
        and 'hours_since_sunrise' is NaN instead of raising.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object DataFrame.apply passes in.
    row = row.copy()

    # defining the ephem observer's place and time to find the previous sunrise
    obs = ephem.Observer()
    obs.lat = str(row['lat'])
    obs.long = str(row['lon'])
    obs.date = row['time']

    # changing time entry to a datetime object
    row['time'] = pd.to_datetime(row['time'])

    # determining the previous sunrise for this lat/lon/time
    try:
        sunrise = pd.to_datetime(str(obs.previous_rising(ephem.Sun())))
    except ephem.CircumpolarError:
        # Polar day or polar night: there is no sunrise to measure from, so
        # mark the row as missing rather than aborting the whole apply().
        row['sunrise'] = pd.NaT
        row['hours_since_sunrise'] = float('nan')
        return row

    row['sunrise'] = sunrise

    # calculating hours since sunrise
    row['hours_since_sunrise'] = (row['time'] - sunrise).total_seconds() / 3600

    return row

# Apply the function row by row (axis=1); each returned Series becomes one row of
# `combined`, which therefore gains the 'sunrise' and 'hours_since_sunrise' columns
combined = predictors.apply(calculate_hours_since_sunrise, axis=1)

# Display the resulting DataFrame as the cell output
combined


Unnamed: 0,lat,lon,time,depth,CMAP_sss_smap_tblSSS_NRT_cl1,CMAP_sst_tblSST_AVHRR_OI_NRT,CMAP_ugos_tblAltimetry_REP_Signal,CMAP_vgos_tblAltimetry_REP_Signal,CMAP_fe_tblPisces_Forecast_cl1,CMAP_o2_tblPisces_Forecast_cl1,CMAP_no3_tblPisces_Forecast_cl1,CMAP_po4_tblPisces_Forecast_cl1,CMAP_si_tblPisces_Forecast_cl1,CMAP_talk_tblPisces_Forecast_cl1,sunrise,hours_since_sunrise
0,-73.0,-143.0,2023-04-09 21:04:35,5.0,,-1.668134,-0.005808,-0.016058,0.000230,361.295364,26.627517,1.829983,52.720121,2.345027,2023-04-09 17:04:35,4.0
1,-73.0,-142.0,2023-04-09 21:00:34,5.0,,-1.663964,-0.001410,-0.014590,0.000215,361.356059,26.601347,1.843454,52.879913,2.344948,2023-04-09 17:00:34,4.0
2,-73.0,-141.0,2023-04-09 20:56:34,5.0,,-1.717504,0.001610,-0.002052,0.000185,360.978521,26.655291,1.868111,53.449751,2.345255,2023-04-09 16:56:34,4.0
3,-73.0,-140.0,2023-04-09 20:52:33,5.0,,-1.720005,-0.000979,-0.021442,0.000197,360.647106,26.677789,1.879767,54.381974,2.344987,2023-04-09 16:52:33,4.0
4,-73.0,-88.0,2023-04-09 17:23:49,5.0,37.974533,-1.331951,-0.040600,-0.031567,0.002671,356.679917,31.386101,1.812338,44.570525,2.298122,2023-04-09 13:23:49,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35437,79.0,8.0,2023-04-09 06:14:28,5.0,32.770884,2.207909,-0.035740,0.007669,0.000875,323.980874,11.433719,0.780950,5.315912,2.363638,2023-04-09 02:14:28,4.0
35438,79.0,9.0,2023-04-09 06:10:30,5.0,33.055099,1.981451,-0.017423,0.003802,0.001078,327.836197,11.240273,0.756109,4.995414,2.362805,2023-04-09 02:10:30,4.0
35439,79.0,10.0,2023-04-09 06:06:32,5.0,33.580739,0.706660,0.006590,0.019396,0.001615,337.143905,10.998011,0.720724,4.597876,2.360281,2023-04-09 02:06:32,4.0
35440,80.0,7.0,2023-04-09 05:49:52,5.0,29.426297,-1.168962,0.023835,-0.001790,0.000909,333.798676,11.343304,0.766618,5.354952,2.357392,2023-04-09 01:49:52,4.0


In [7]:
# Count the missing values in each column of `combined`
combined.isna().sum()

lat                                    0
lon                                    0
time                                   0
depth                                  0
CMAP_sss_smap_tblSSS_NRT_cl1         133
CMAP_sst_tblSST_AVHRR_OI_NRT           0
CMAP_ugos_tblAltimetry_REP_Signal      0
CMAP_vgos_tblAltimetry_REP_Signal      0
CMAP_fe_tblPisces_Forecast_cl1         0
CMAP_o2_tblPisces_Forecast_cl1         0
CMAP_no3_tblPisces_Forecast_cl1        0
CMAP_po4_tblPisces_Forecast_cl1        0
CMAP_si_tblPisces_Forecast_cl1         0
CMAP_talk_tblPisces_Forecast_cl1       0
sunrise                                0
hours_since_sunrise                    0
dtype: int64

In [8]:
# NOTE(review): disabled step. The label below does not match any column listed by
# predictors_cmap.columns (the sss column there is 'CMAP_sss_smap_tblSSS_NRT_cl1'),
# so enabling it as written would raise a KeyError.
#predictors = predictors.drop(['CMAP_sss_tblSSS_NRT'], axis=1)

In [9]:
# Drop every row that has a missing value in any column, then remove the
# 'depth' and 'sunrise' columns
predictors = combined.dropna().drop(columns=['depth', 'sunrise'])

# Keep the NaN check as the last expression of the cell so the per-column
# counts are displayed instead of being computed and discarded
predictors.isna().sum()

## Making column names readable

In [10]:
# Regular expression capturing the text between the first two underscores,
# e.g. 'CMAP_sst_tblSST_AVHRR_OI_NRT' -> 'sst'
pattern = r'_(.*?)_'

# Select the CMAP variable columns by their prefix rather than by position, so
# the selection does not depend on which columns were dropped or added earlier
columns_to_rename = predictors.columns[predictors.columns.str.startswith('CMAP_')]

# Extract the short name of every selected column. expand=False returns an Index
# aligned with columns_to_rename, so no squeeze() is needed (squeeze() would
# collapse a single match to a bare string and zip() would then pair the column
# with the first character of its new name).
short_names = columns_to_rename.str.extract(pattern, expand=False)
predictors = predictors.rename(columns=dict(zip(columns_to_rename, short_names)))

# Shorten the population column names; rename() ignores keys that are not
# present, so this is a no-op for frames without abundance/biomass columns
predictors = predictors.rename(columns={
    'abundance_prochloro': 'abundance_pro',
    'abundance_synecho': 'abundance_syn',
    'abundance_picoeuk': 'abundance_pico',
    'biomass_prochloro': 'biomass_pro',
    'biomass_synecho': 'biomass_syn',
    'biomass_picoeuk': 'biomass_pico',
})

# Print the updated column names
print(predictors.columns)

Index(['lat', 'lon', 'time', 'sss', 'sst', 'ugos', 'vgos', 'fe', 'o2', 'no3',
       'po4', 'si', 'talk', 'hours_since_sunrise'],
      dtype='object')


In [11]:
# Rename the 'talk' column to 'ALK'; reassign the result instead of mutating in
# place so re-running the cell cannot leave a half-modified frame behind
predictors = predictors.rename(columns={'talk': 'ALK'})

In [12]:
# Write the cleaned predictors to CSV without the row index; the relative path
# resolves against the repository root set as working directory in the first cell
predictors.to_csv('data_ingest/data/modified/predictors_clean.csv', index=False)