## This notebook we will be cleaning the data of any NaN values, and getting it ready for machine learning applications.

In [9]:
#Running data ingest python notebook

%run 01_data-ingest-cmap.ipynb

### Need to adjust date, lat and lon columns so that they are in the appropriate formates

In [10]:
print(pd.DataFrame(covari))
print('\n')
print(covari.dtypes)
print('\n')

                    date   PopulationName        lat         lon    biomass  \
0    2016-04-20 07:00:00  Prochlorococcus  21.520326 -158.326984  10.520443   
1    2016-04-20 07:00:00    Synechococcus  21.520326 -158.326984   0.341429   
2    2016-04-20 07:00:00      non_pro_syn  21.520326 -158.326984   3.338212   
3    2016-04-20 07:00:00      non_pro_syn  21.520326 -158.326984   0.701902   
4    2016-04-20 08:00:00  Prochlorococcus  21.662710 -158.323430   9.309387   
...                  ...              ...        ...         ...        ...   
6896 2019-04-29 16:00:00      non_pro_syn  23.491830 -158.000200   1.678707   
6897 2019-04-29 17:00:00  Prochlorococcus  23.295040 -158.001120   6.017184   
6898 2019-04-29 17:00:00    Synechococcus  23.295040 -158.001120   0.558076   
6899 2019-04-29 17:00:00      non_pro_syn  23.295040 -158.001120   2.741156   
6900 2019-04-29 17:00:00      non_pro_syn  23.295040 -158.001120   1.548825   

          salin       temp cruisename        NO3NO2

### Need to ajust columns that contain numbers to floats

In [None]:
def ChangeObjectTypes(df):
    for column in df:
        if column == 'date':
            #changing to datetime
            df[column] = pd.to_datetime(df[column], errors='coerce')
            
        elif column == 'PopulationName' or column == 'cruisename':
            #changing to string
            df[column] = df[column].astype(str)
            
        else:
            #changing to numeric type
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df
        

In [None]:
ChangeObjectTypes(covari)

### Now all of the objects are in the right format

In [None]:
print(pd.DataFrame(covari))
print('\n')
print(covari.dtypes)
print('\n')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_nans(df):
    """
    This function takes a DataFrame and determines if there are NaN values. If there are NaN values, it returns a
    DataFrame that has the ratio of NaN values for each column, along with a bar graph showing the ratio of NaNs.
    """
    nan_counts = df.isnull().sum()
    total_rows = len(df)
    
    if nan_counts.sum() != 0:
        nan_ratio = nan_counts / total_rows * 100
        nan_df = pd.DataFrame({'Variables': nan_ratio.index, 'Ratio of Data Missing': nan_ratio.values})
        nan_df = nan_df[nan_df['Ratio of Data Missing'] > 0].sort_values(by='Ratio of Data Missing', ascending=False)
        
        # Plotting the NaN values
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.bar(nan_df['Variables'], nan_df['Ratio of Data Missing'])
        plt.xticks(rotation=45, ha='right')
        ax.set_title('Missing Data of Covariate DataFrame')
        ax.set_ylabel('NaNs (%)')
        ax.grid(axis='y')
        plt.tight_layout()
        plt.show()
        
        return nan_df
    else:
        print("No NaNs found. Yay!")
        return None


In [None]:
plot_nans(covari)

### Creating seperate dataframes of covar that have Nans and that do not

In [None]:
covari_nans = covari
covari = covari.dropna()

In [None]:
covari

##  Combining picoeuk and nanoeuk to one Population name (non_pro_syn)

In [None]:
covari['PopulationName'] = covari['PopulationName'].replace(
    {'nanoeukaryotes (2-5µm)': 'non_pro_syn', 'picoeukaryotes (< 2µm)': 'non_pro_syn'}
)
        

In [None]:
tn413

## Calculating Biomaass

### Seaflow uses 
1. Cell abundance
2. Cell Diameter

To Calulate Biomass we first need to take the diameter of the plankton and find the volume using:
d = diameter
r = radius
QC = Carbon Quotas


1. $\frac{Diameter}{2}\ = r$

2. $\frac{4}{3}\pi r^3 = biovolume$

3. *Qc = 0.261 * Biovolume <sup>0.86</sup>*

4. $biomass = Qc * abundance$

In [4]:
#first calulating volumn of a sphere given plankton diamter
import math
plankton_volume = (4/3) * math.pi * (realtime_cmap['diameter_micrometer']/2)**3
#calulating carbon quota
carbon_quota = 0.261 * plankton_volume**0.86
# multipling the result by abundance to get total biomass per measurement (multipling by 1000 to get pgC per L)
realtime_cmap['biomass'] = realtime_cmap['abundance_cells_per_microliter'] * carbon_quota

Unnamed: 0,population,time,lat,lon,abundance_cells_per_microliter,diameter_micrometer,depth,CMAP_NO3_tblPisces_Forecast_cl1,CMAP_PO4_tblPisces_Forecast_cl1,CMAP_Fe_tblPisces_Forecast_cl1,CMAP_Si_tblPisces_Forecast_cl1,CMAP_chl_tblPisces_Forecast_cl1,biomass
0,picoeuk,2023-02-25 05:40:00,21.30670,-157.0366,14.303231,1.557860,0.0,0.000562,0.212505,0.000433,2.092356,0.072018,6.716405
1,picoeuk,2023-02-25 05:50:00,21.31420,-157.0321,20.716981,1.576000,0.0,0.000562,0.212505,0.000433,2.092356,0.072018,10.023072
2,picoeuk,2023-02-25 06:00:00,21.31230,-157.0168,22.590023,1.606258,0.0,0.000562,0.212505,0.000433,2.092356,0.072018,11.478879
3,picoeuk,2023-02-25 06:10:00,21.31170,-156.9975,20.915637,1.544307,0.0,0.000534,0.211138,0.000450,2.089565,0.072336,9.602476
4,picoeuk,2023-02-25 06:20:00,21.31105,-156.9805,22.320419,1.549379,0.0,0.000534,0.211138,0.000450,2.089565,0.072336,10.334478
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5886,synecho,2023-03-11 07:50:00,,,,,,,,,,,
5887,synecho,2023-03-11 08:00:00,-19.42280,-181.8752,0.283794,1.008841,0.0,,,,,,0.043436
5888,synecho,2023-03-11 08:10:00,,,,,,,,,,,
5889,synecho,2023-03-11 08:20:00,-19.42080,-181.8129,0.283794,1.296998,0.0,,,,,,0.083055


## Combinging some of the senior cruise data to covariate

In [217]:
import pandas as pd

# Randomly sample 30% of rows from tn413 DataFrame
tn413['cruisename'] = 'TN413'
tn413.drop(['cell_abundance', 'cell_diameter'], axis=1, inplace=True)


stratum_column = 'lat'
sample_proportion = 0.3

# Calculate stratum sizes
stratum_sizes = (tn413.groupby(stratum_column).size() * sample_proportion).astype(int)

# Sample each group
sampled_groups = []
for group_name, group_data in tn413.groupby(stratum_column):
    sampled_group = group_data.sample(n=stratum_sizes[group_name])
    sampled_groups.append(sampled_group)

# concatenate sampled groups
stratified_sample = pd.concat(sampled_groups)

# Concatenate the sampled rows to covari DataFrame
model_data = pd.concat([covari, sampled_rows], ignore_index=True)

In [218]:
model_data

Unnamed: 0,date,PopulationName,lat,lon,biomass,salin,temp,cruisename,NO3NO2,PO4,Fe,SiO4,Satellite_CHL
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
2,2016-04-20 07:00:00,non_pro_syn,21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
3,2016-04-20 07:00:00,non_pro_syn,21.520326,-158.326984,0.701902,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
4,2016-04-20 08:00:00,Prochlorococcus,21.662710,-158.323430,9.309387,34.902376,24.339265,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7986,2023-02-26 11:20:00,prochloro,20.791500,-157.702450,1.378268,34.947910,24.581220,TN413,7.313388e-04,0.198196,0.000347,2.046753,0.069791
7987,2023-03-08 04:10:00,picoeuk,-5.896500,-179.343700,11.849405,35.659010,29.075130,TN413,1.527741e-03,0.519499,0.000009,2.992112,0.088274
7988,2023-02-28 19:50:00,picoeuk,13.914000,-167.738300,4.722555,34.446510,26.227920,TN413,7.691157e-04,0.224454,0.000020,2.254536,0.045945
7989,2023-02-28 18:40:00,synecho,14.046300,-167.553600,0.087319,34.467610,26.286360,TN413,7.480014e-04,0.222924,0.000020,2.246549,0.046282


In [219]:
#renaming these
model_data['PopulationName'] = model_data['PopulationName'].replace(
    {'nanoeukaryotes (2-5µm)': 'non_pro_syn', 'picoeukaryotes (< 2µm)': 'non_pro_syn',
    'picoeuk': 'non_pro_syn', 'synecho': 'non_pro_syn',
     'synecho': 'Synechococcus', 'prochloro':'Prochlorococcus'
    }
)
model_data.to_csv('/Users/cristianswift/Desktop/Spring-Quarter-2022-2023/SeniorThesis/data/model_data.csv', index=False)


In [220]:
model_data

Unnamed: 0,date,PopulationName,lat,lon,biomass,salin,temp,cruisename,NO3NO2,PO4,Fe,SiO4,Satellite_CHL
0,2016-04-20 07:00:00,Prochlorococcus,21.520326,-158.326984,10.520443,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
1,2016-04-20 07:00:00,Synechococcus,21.520326,-158.326984,0.341429,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
2,2016-04-20 07:00:00,non_pro_syn,21.520326,-158.326984,3.338212,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
3,2016-04-20 07:00:00,non_pro_syn,21.520326,-158.326984,0.701902,34.893785,24.351745,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
4,2016-04-20 08:00:00,Prochlorococcus,21.662710,-158.323430,9.309387,34.902376,24.339265,KOK1606,8.033585e-07,0.346238,0.000083,10.145326,0.035360
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7986,2023-02-26 11:20:00,Prochlorococcus,20.791500,-157.702450,1.378268,34.947910,24.581220,TN413,7.313388e-04,0.198196,0.000347,2.046753,0.069791
7987,2023-03-08 04:10:00,non_pro_syn,-5.896500,-179.343700,11.849405,35.659010,29.075130,TN413,1.527741e-03,0.519499,0.000009,2.992112,0.088274
7988,2023-02-28 19:50:00,non_pro_syn,13.914000,-167.738300,4.722555,34.446510,26.227920,TN413,7.691157e-04,0.224454,0.000020,2.254536,0.045945
7989,2023-02-28 18:40:00,Synechococcus,14.046300,-167.553600,0.087319,34.467610,26.286360,TN413,7.480014e-04,0.222924,0.000020,2.246549,0.046282


In [221]:
# # convert the 'time' column to datetime format
# realtime['time'] = realtime['time'].str[:-1]
# realtime['time'] = pd.to_datetime(realtime['time'])

# # set the 'time' column as the index
# realtime.set_index('time', inplace=True)

# # resample the dataframe by hour and take the mean of each group
# realtime = realtime.resample('H').mean()

# # print the result
# realtime