In [76]:
# Set a working directory
#!pip install GitPython
import git
import os

# Find the git repository that contains the current directory (searching
# parent directories as needed) and switch to its working-tree root, so the
# relative 'data/...' paths used in this notebook resolve from there.
repo = git.Repo(path='.', search_parent_directories=True)
os.chdir(repo.working_tree_dir)

In [77]:
#!pip install global_land_mask
import pandas as pd
import numpy as np
from global_land_mask import globe


In [78]:
# Load predictors_cmap.csv, using the first CSV column as the row index
predictors_cmap = pd.read_csv(
    filepath_or_buffer='data/original/predictors_cmap.csv',
    index_col=0,
)

In [79]:
# Display the column labels of the loaded table
predictors_cmap.columns


Index(['lat', 'lon', 'date', 'sunrise', 'time', 'depth',
       'CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology',
       'CMAP_sss_smap_tblSSS_NRT_cl1', 'CMAP_sst_tblSST_AVHRR_OI_NRT',
       'CMAP_ugos_tblAltimetry_REP_Signal',
       'CMAP_vgos_tblAltimetry_REP_Signal', 'CMAP_fe_tblPisces_Forecast_cl1',
       'CMAP_o2_tblPisces_Forecast_cl1', 'CMAP_no3_tblPisces_Forecast_cl1',
       'CMAP_po4_tblPisces_Forecast_cl1', 'CMAP_si_tblPisces_Forecast_cl1'],
      dtype='object')

In [80]:
# Remove the 'date' and 'sunrise' columns from the loaded table
predictors_cmap = predictors_cmap.drop(columns=['date', 'sunrise'])

In [81]:
def ChangeObjectTypes(df):
    """
    Coerce every column of ``df`` to its expected data type, modifying ``df`` in place.

    - 'time' is parsed into pandas datetimes
    - 'PopulationName' and 'cruise' are cast to strings
    - every other column is treated as a continuous measurement and made numeric

    Values that cannot be parsed as a datetime or a number are coerced to
    NaT/NaN instead of raising. The same (mutated) dataframe is returned.
    """
    text_columns = ('PopulationName', 'cruise')

    for name in df.columns:
        values = df[name]
        if name == 'time':
            # parse into datetimes
            converted = pd.to_datetime(values, errors='coerce')
        elif name in text_columns:
            # cast to strings
            converted = values.astype(str)
        else:
            # everything else becomes numeric
            converted = pd.to_numeric(values, errors='coerce')
        df[name] = converted

    return df
# ChangeObjectTypes converts the columns in place and returns the same object,
# so `predictors` and `predictors_cmap` refer to one dataframe from here on.
predictors = predictors_cmap.pipe(ChangeObjectTypes)

In [82]:
import pandas as pd
import ephem

# Re-encode the 'time' column as 'YYYY/MM/DD HH:MM:SS' strings. The column
# ends up holding text, not datetimes; each value is later assigned to the
# ephem observer's date.
time_as_datetime = pd.to_datetime(predictors['time'])
predictors['time'] = time_as_datetime.dt.strftime('%Y/%m/%d %H:%M:%S')

# function to calculate sunrise and hours since sunrise
def calculate_hours_since_sunrise(row):
    """
    Add the previous sunrise and the hours elapsed since it to one observation.

    Both the dataframe and ephem need to be in the same time zone (in this case UTC).
    UTC is standard for ephem.

    Parameters
    ----------
    row : pandas.Series
        One observation with 'lat', 'lon' and 'time' entries. 'time' must be a
        value that both ephem (as an observer date) and pandas.to_datetime accept.

    Returns
    -------
    pandas.Series
        A copy of ``row`` in which 'time' is a pandas Timestamp and two entries
        are added: 'sunrise' (the previous sunrise, to whole seconds) and
        'hours_since_sunrise' (hours between 'sunrise' and 'time').
        When the sun never rises or never sets at that place and date,
        'sunrise' is NaT and 'hours_since_sunrise' is NaN.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    row = row.copy()

    # defining the ephem observer's spatial/temporal values used to find the
    # previous sunrise time (coordinates are passed as strings)
    obs = ephem.Observer()
    obs.lat = str(row['lat'])
    obs.long = str(row['lon'])
    obs.date = row['time']

    # changing time entry to a datetime object
    row['time'] = pd.to_datetime(row['time'])

    try:
        # determining sunrise for each lat/lon/time; going through str() keeps
        # the whole-second resolution of ephem's date formatting
        sunrise = pd.to_datetime(str(obs.previous_rising(ephem.Sun())))
    except ephem.CircumpolarError:
        # Polar day or polar night (AlwaysUpError / NeverUpError): there is no
        # sunrise to measure from, so mark the row as missing instead of
        # aborting the whole apply.
        row['sunrise'] = pd.NaT
        row['hours_since_sunrise'] = float('nan')
        return row

    row['sunrise'] = sunrise

    # calculating hours since sunrise
    row['hours_since_sunrise'] = (row['time'] - sunrise).total_seconds() / 3600

    return row

# Run the sunrise calculation once per row; the result carries the original
# columns plus 'sunrise' and 'hours_since_sunrise'
combined = predictors.apply(calculate_hours_since_sunrise, axis='columns')

# Display the resulting DataFrame
combined


Unnamed: 0,lat,lon,time,depth,CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology,CMAP_sss_smap_tblSSS_NRT_cl1,CMAP_sst_tblSST_AVHRR_OI_NRT,CMAP_ugos_tblAltimetry_REP_Signal,CMAP_vgos_tblAltimetry_REP_Signal,CMAP_fe_tblPisces_Forecast_cl1,CMAP_o2_tblPisces_Forecast_cl1,CMAP_no3_tblPisces_Forecast_cl1,CMAP_po4_tblPisces_Forecast_cl1,CMAP_si_tblPisces_Forecast_cl1,sunrise,hours_since_sunrise
0,-80.0,-180.0,2023-04-10 00:52:39,5.0,2334.317900,,,,,,,,,,2023-04-09 20:52:39,4.0
1,-80.0,-179.0,2023-04-10 00:48:37,5.0,2334.289600,,,,,,,,,,2023-04-09 20:48:37,4.0
2,-80.0,-178.0,2023-04-10 00:44:35,5.0,2334.257350,,,,,,,,,,2023-04-09 20:44:35,4.0
3,-80.0,-177.0,2023-04-10 00:40:33,5.0,2334.183267,,,,,,,,,,2023-04-09 20:40:33,4.0
4,-80.0,-176.0,2023-04-10 00:36:32,5.0,2334.022950,,,,,,,,,,2023-04-09 20:36:32,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39989,80.0,176.0,2023-04-09 18:25:26,5.0,2165.282650,,-1.573960,,,0.000731,382.840740,4.064020,0.938073,13.734312,2023-04-09 14:25:26,4.0
39990,80.0,177.0,2023-04-09 18:21:29,5.0,2165.510700,,-1.561671,,,0.000727,382.952791,4.160100,0.947328,13.848302,2023-04-09 14:21:29,4.0
39991,80.0,178.0,2023-04-09 18:17:31,5.0,2165.757375,,-1.626254,,,0.000723,382.808289,4.258073,0.957771,13.905520,2023-04-09 14:17:31,4.0
39992,80.0,179.0,2023-04-09 18:13:34,5.0,2166.018425,,-1.624379,,,0.000722,382.373660,4.364238,0.967785,14.007137,2023-04-09 14:13:34,4.0


In [83]:
# Count the missing values in each column
combined.isna().sum()

lat                                                       0
lon                                                       0
time                                                      0
depth                                                     0
CMAP_ALK_darwin_clim_tblDarwin_Nutrient_Climatology     505
CMAP_sss_smap_tblSSS_NRT_cl1                           4450
CMAP_sst_tblSST_AVHRR_OI_NRT                            187
CMAP_ugos_tblAltimetry_REP_Signal                      3026
CMAP_vgos_tblAltimetry_REP_Signal                      3025
CMAP_fe_tblPisces_Forecast_cl1                          362
CMAP_o2_tblPisces_Forecast_cl1                          362
CMAP_no3_tblPisces_Forecast_cl1                         362
CMAP_po4_tblPisces_Forecast_cl1                         362
CMAP_si_tblPisces_Forecast_cl1                          362
sunrise                                                   0
hours_since_sunrise                                       0
dtype: int64

In [84]:
#predictors = predictors.drop(['CMAP_sss_tblSSS_NRT'], axis=1)

In [85]:
# Keep only the rows without any missing value, then remove the 'depth' and
# 'sunrise' columns. (The former mid-cell `predictors.isna().sum()` was a
# discarded expression that displayed nothing, so it has been removed.)
predictors = combined.dropna().drop(columns=['depth', 'sunrise'])

In [86]:
# Regular expression capturing the text between the first two underscores,
# e.g. 'CMAP_sst_tblSST_AVHRR_OI_NRT' -> 'sst'
pattern = r'_(.*?)_'

# Select the CMAP variables by their 'CMAP_' prefix rather than by position,
# so the cell does not depend on column order and does nothing when re-run
# (names that were already shortened no longer match).
columns_to_rename = predictors.columns[predictors.columns.str.startswith('CMAP_')]

# expand=False returns an Index aligned with columns_to_rename, which also
# behaves correctly when only a single column is selected
short_names = columns_to_rename.str.extract(pattern, expand=False)
predictors = predictors.rename(columns=dict(zip(columns_to_rename, short_names)))

# Shorten population column names where present; labels that are not in the
# dataframe are ignored by rename
predictors = predictors.rename(columns={'abundance_prochloro': 'abundance_pro',
       'abundance_synecho':'abundance_syn', 'abundance_picoeuk':'abundance_pico',
       'biomass_prochloro':'biomass_pro', 'biomass_synecho':'biomass_syn', 'biomass_picoeuk': 'biomass_pico'})

# Print the updated column names
print(predictors.columns)

Index(['lat', 'lon', 'time', 'ALK', 'sss', 'sst', 'ugos', 'vgos', 'fe', 'o2',
       'no3', 'po4', 'si', 'hours_since_sunrise'],
      dtype='object')


In [None]:
# Rename a 'talk' column to 'ALK' if one is present; otherwise nothing changes
predictors = predictors.rename(columns={'talk': 'ALK'})

In [87]:
# Make sure the output directory exists first: to_csv raises an OSError when
# asked to write into a directory that does not exist.
os.makedirs('data/modified', exist_ok=True)
predictors.to_csv('data/modified/predictors_clean.csv')