#  In this notebook we will clean up the global prediction data so it is ready for the RF model

In [73]:
# Set a working directory
#!pip install GitPython
import git
import os

# Find the git repository that contains the current directory (searching
# upward through parent directories) and make its root the working directory,
# so the relative 'data_ingest/...' paths used below resolve.
repo = git.Repo('.', search_parent_directories=True)
repo_root = repo.working_tree_dir
os.chdir(repo_root)

In [74]:
#!pip install global_land_mask
import pandas as pd
import numpy as np
from global_land_mask import globe
import pyarrow.parquet as pq


In [75]:
# Load the base predictors table; the other datasets are merged onto it later.
predictors = pd.read_csv(
    filepath_or_buffer='data_ingest/data/original/predictors.csv',
)

In [76]:
def _load_without_missing(parquet_path):
    """Read one parquet file into a DataFrame and drop every row containing a missing value."""
    frame = pq.read_pandas(parquet_path).to_pandas()
    frame.dropna(inplace=True)
    return frame


sss1 = _load_without_missing('data_ingest/data/original/sss.parquet')
sst1 = _load_without_missing('data_ingest/data/original/sst.parquet')
alk1 = _load_without_missing('data_ingest/data/original/alk.parquet')
pisces1 = _load_without_missing('data_ingest/data/original/pisces.parquet')

In [77]:
# Give every row of the alk frame the same fixed timestamp (parsed to a pandas
# datetime), then remove its 'depth' column.
fixed_timestamp = pd.Series('2023-04-10T00:00:00', index=alk1.index)
alk1['time'] = pd.to_datetime(fixed_timestamp)
del alk1['depth']

In [78]:
# The four source frames collected in one list.
# NOTE(review): this list is not referenced again in the cells shown here.
CMAP_dfs = [alk1, sss1, sst1, pisces1]

In [79]:
def lat_rounder(df):
    """
    Average a dataset onto a whole-number lat/lon grid.

    Steps:
      1. discard the 'time' column,
      2. move the existing index into a column via ``reset_index()``,
      3. round 'lat' and 'lon' to the nearest whole number into
         'lat_rounded' / 'lon_rounded',
      4. take the mean of every remaining column within each
         (lat_rounded, lon_rounded) cell.

    The input frame is left untouched, so the function can safely be called
    again on the same frame (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least 'time', 'lat' and 'lon' columns.

    Returns
    -------
    pandas.DataFrame
        One row per (lat_rounded, lon_rounded) pair. Besides those two key
        columns it holds the per-cell mean of every other column, which
        includes the original 'lat' and 'lon' and the column produced by
        ``reset_index()`` (named 'index' for an unnamed index).

    Raises
    ------
    KeyError
        If 'time', 'lat' or 'lon' is missing from `df`.
    """
    # Non-mutating drop: the caller's frame keeps its 'time' column.
    gridded = df.drop(columns=['time']).reset_index()

    gridded['lat_rounded'] = gridded['lat'].round()
    gridded['lon_rounded'] = gridded['lon'].round()

    cell_means = gridded.groupby(['lat_rounded', 'lon_rounded']).mean()
    return cell_means.reset_index()

In [80]:
# Grid each source frame with lat_rounder, in the order alk, sss, sst, pisces.
alk2, sss2, sst2, pisces2 = (
    lat_rounder(source_frame)
    for source_frame in (alk1, sss1, sst1, pisces1)
)

In [81]:
# The gridded frames, in the order they will be merged onto the predictors table.
new_dfs = [alk2, sss2, sst2, pisces2]

In [82]:
# Tidy every gridded frame in place: remove the averaged 'lat'/'lon' and the
# 'index' column left by reset_index(), then let the rounded coordinates take
# over the 'lat'/'lon' names so they can serve as merge keys.
for grid_df in new_dfs:
    grid_df.drop(columns=['lat', 'lon', 'index'], inplace=True)
    grid_df.rename(
        columns={'lat_rounded': 'lat', 'lon_rounded': 'lon'},
        inplace=True,
    )

In [83]:
def merge_dfs(predictors, dfs):
    """
    Left-join every frame in `dfs` onto `predictors` on ('lat', 'lon'), in order.

    After each join a progress line is printed. The running count is handed to
    print() as a one-element set, so the line reads e.g. ``merged {1}``.

    Returns the accumulated merge result. When `dfs` is empty the `predictors`
    object itself is returned.
    """
    merged = predictors
    for count, frame in enumerate(dfs, start=1):
        merged = pd.merge(merged, frame, how='left', on=['lat', 'lon'])
        print('merged', {count})
    return merged

In [84]:
# Left-join the four gridded frames onto the predictors table by lat/lon.
predictors_cmap = merge_dfs(predictors, new_dfs)

merged {1}
merged {2}
merged {3}
merged {4}


In [85]:
# Inspect the merged column names before deciding which ones to drop.
predictors_cmap.columns


Index(['lat', 'lon', 'date', 'sunrise', 'time', 'depth', 'month',
       'ALK_darwin_clim', 'sss_smap', 'sst', 'fe', 'o2', 'no3', 'po4', 'si'],
      dtype='object')

In [86]:
# Remove the 'date' and 'month' columns from the merged table.
predictors_cmap = predictors_cmap.drop(columns=['date', 'month'])

In [87]:
def ChangeObjectTypes(df, datetime_columns=('time',), string_columns=('PopulationName', 'cruise')):
    """
    Cast every column of `df` to its intended data type, in place.

    - columns named in `datetime_columns` are parsed with ``pd.to_datetime``
    - columns named in `string_columns` are cast with ``astype(str)``
    - every other column is treated as a continuous measurement and
      converted with ``pd.to_numeric``

    Datetime and numeric conversions use ``errors='coerce'``: values that
    cannot be parsed become NaT/NaN instead of raising. As a consequence, a
    text column that is not listed in either argument (for example a timestamp
    column with a name other than 'time') comes back entirely NaN; add it to
    `datetime_columns` or `string_columns` to keep its contents.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Its columns are overwritten in place.
    datetime_columns : iterable of column labels, default ('time',)
        Columns to parse as datetimes.
    string_columns : iterable of column labels, default ('PopulationName', 'cruise')
        Columns to cast to str. A label present in both arguments is treated
        as a datetime column.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    datetime_names = set(datetime_columns)
    string_names = set(string_columns)

    for column in df:
        if column in datetime_names:
            # parse as datetime; unparseable entries become NaT
            df[column] = pd.to_datetime(df[column], errors='coerce')

        elif column in string_names:
            # cast to string
            df[column] = df[column].astype(str)

        else:
            # convert to numeric; unparseable entries become NaN
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df
# ChangeObjectTypes overwrites the columns of the frame it is given and returns
# that same frame, so `predictors` and `predictors_cmap` now name one object.
# NOTE(review): 'sunrise' is not handled as a datetime column, so it goes through
# to_numeric(errors='coerce'); if it holds timestamp text it becomes NaN here
# (it is recomputed further down in calculate_hours_since_sunrise) - confirm.
predictors = ChangeObjectTypes(predictors_cmap)

#### Creating hours since sunrise

Since we set the arbitrary sampling time to be 4 hours past sunrise, this step is not strictly needed, but it will be helpful later if you use a less specific sampling time.

In [88]:
import pandas as pd
import ephem

# Re-serialise the 'time' column as 'YYYY/MM/DD HH:MM:SS' text; each value is
# later assigned to the ephem observer's date in calculate_hours_since_sunrise.
time_as_datetime = pd.to_datetime(predictors['time'])
predictors['time'] = time_as_datetime.dt.strftime('%Y/%m/%d %H:%M:%S')

# function to calculate sunrise and hours since sunrise
def calculate_hours_since_sunrise(row):
    """
    Add the preceding sunrise, and the hours elapsed since it, to one row.

    An ephem observer is placed at the row's 'lat'/'lon' and dated with the
    row's 'time'; the sun's previous rising before that moment is looked up.
    The dataframe times and ephem must be in the same time zone; UTC is the
    standard for ephem, so 'time' is expected to be UTC.

    The row is modified and returned with:
      'time'                 parsed into a pandas datetime
      'sunrise'              the previous sunrise as a pandas datetime
      'hours_since_sunrise'  ('time' - 'sunrise') expressed in hours
    """
    # Position and date the observer. Coordinates are passed as strings
    # (PyEphem reads string angles as degrees; bare floats would be radians).
    observer = ephem.Observer()
    observer.lat = str(row['lat'])
    observer.long = str(row['lon'])
    observer.date = row['time']

    # Previous sunrise for this lat/lon/time, via ephem's text representation.
    previous_sunrise = pd.to_datetime(str(observer.previous_rising(ephem.Sun())))

    # 'time' arrives as text; store it back as a datetime object.
    sample_time = pd.to_datetime(row['time'])
    row['time'] = sample_time
    row['sunrise'] = previous_sunrise

    # Elapsed time since sunrise, in hours.
    elapsed = sample_time - previous_sunrise
    row['hours_since_sunrise'] = elapsed.total_seconds() / 3600

    return row

# Run the sunrise calculation row by row over the predictors table.
combined = predictors.apply(calculate_hours_since_sunrise, axis='columns')

# Show the resulting DataFrame as the cell output.
combined


Unnamed: 0,lat,lon,sunrise,time,depth,ALK_darwin_clim,sss_smap,sst,fe,o2,no3,po4,si,hours_since_sunrise
0,-80.0,-180.0,2023-04-09 20:52:39,2023-04-10 00:52:39,5.0,2334.21730,,,,,,,,4.0
1,-80.0,-179.0,2023-04-09 20:48:37,2023-04-10 00:48:37,5.0,2334.18530,,,,,,,,4.0
2,-80.0,-178.0,2023-04-09 20:44:35,2023-04-10 00:44:35,5.0,2334.14430,,,,,,,,4.0
3,-80.0,-177.0,2023-04-09 20:40:33,2023-04-10 00:40:33,5.0,2334.08970,,,,,,,,4.0
4,-80.0,-176.0,2023-04-09 20:36:32,2023-04-10 00:36:32,5.0,2334.02295,,,,,,,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39989,80.0,176.0,2023-04-09 14:25:26,2023-04-09 18:25:26,5.0,2164.98450,,-1.583754,0.000740,381.843030,4.101029,0.934664,14.185914,4.0
39990,80.0,177.0,2023-04-09 14:21:29,2023-04-09 18:21:29,5.0,2165.23025,,-1.581255,0.000735,382.153229,4.190627,0.944138,14.234589,4.0
39991,80.0,178.0,2023-04-09 14:17:31,2023-04-09 18:17:31,5.0,2165.49005,,-1.646258,0.000729,382.286873,4.284198,0.955053,14.234134,4.0
39992,80.0,179.0,2023-04-09 14:13:34,2023-04-09 18:13:34,5.0,2165.76830,,-1.636263,0.000727,381.938673,4.403053,0.966554,14.343113,4.0


In [89]:
# Count missing values per column before rows with gaps are dropped.
combined.isna().sum()

lat                       0
lon                       0
sunrise                   0
time                      0
depth                     0
ALK_darwin_clim         506
sss_smap               5147
sst                     187
fe                      392
o2                      392
no3                     392
po4                     392
si                      392
hours_since_sunrise       0
dtype: int64

In [None]:
#predictors = predictors.drop(['CMAP_sss_tblSSS_NRT'], axis=1)

In [90]:
# Keep only the rows in which every column has a value.
predictors = combined.dropna()

# A bare `isna().sum()` in the middle of a cell is evaluated but never shown,
# so the "no missing values remain" check is written as an assertion instead.
assert not predictors.isna().any().any(), "missing values remain after dropna()"

# Remove 'depth' and 'sunrise' before the table is exported.
predictors = predictors.drop(columns=['depth', 'sunrise'])

In [91]:
# Shorten the source-specific column names to 'alk' and 'sss'.
readable_names = {'ALK_darwin_clim': 'alk', 'sss_smap': 'sss'}
predictors = predictors.rename(columns=readable_names)

## Making column names readable

In [92]:
# Write the cleaned predictors table to CSV without the row index.
predictors.to_csv(
    path_or_buf='data_ingest/data/modified/predictors_clean.csv',
    index=False,
)