## In this notebook we clean the data of any NaN values and get it ready for the next step towards machine learning applications.

### Loading the covariate dataset that contains the CMAP features

In [1]:
# Set the working directory to the project root so that the relative data
# paths used by the rest of the notebook resolve.
import os

# The root defaults to the original author's checkout. Set the SEAFLOW_ML_DIR
# environment variable to run the notebook on another machine without having
# to edit this cell.
directory_path = os.environ.get(
    'SEAFLOW_ML_DIR',
    '/Users/cristianswift/Desktop/armbrust-lab/Seaflow-Machine-Learning/',
)
# Raises FileNotFoundError if the directory does not exist.
os.chdir(directory_path)


In [2]:
import pandas as pd

# Location of the covariate table, relative to the working directory
covari_path = 'data/modified/Seaflow_covari_CMAP_PAR.csv'
# Load the CSV into a DataFrame, parsing the first column as dates
covari = pd.read_csv(covari_path, parse_dates=[0])
# Take a peek at the first five rows
covari.head()


### Columns that contain numbers need to be converted to a numeric data type

In [3]:
def ChangeObjectTypes(df):
    """
    Convert every column of the dataframe to its appropriate data type, in place.

    - ``time`` is converted to pandas datetimes; values that cannot be parsed
      become ``NaT``.
    - ``PopulationName`` and ``cruisename`` are converted to strings; missing
      entries stay missing instead of becoming the literal text ``'nan'``.
    - All other columns, which are continuous measurements, are converted to a
      numeric type; values that cannot be parsed become ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to convert. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    string_columns = ('PopulationName', 'cruisename')
    for column in df:
        if column == 'time':
            # changing to datetime
            df[column] = pd.to_datetime(df[column], errors='coerce')

        elif column in string_columns:
            # changing to string; a plain astype(str) would turn missing
            # values into the text 'nan' and hide them from isna()/dropna(),
            # so the missing positions are masked back to NaN afterwards
            values = df[column]
            df[column] = values.astype(str).mask(values.isna())

        else:
            # changing to numeric type
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df
# Apply the dtype conversion to the loaded covariate table
covari = covari.pipe(ChangeObjectTypes)

### Now all of the columns have the right data type

In [4]:
# Print the converted table and then the dtype of each column,
# each followed by blank lines as a spacer
for summary in (covari, covari.dtypes):
    print(summary)
    print('\n')

### Renaming the columns so they are easier to read

In [5]:
# Regular expression capturing the text between the first pair of underscores
# in a column name (non-greedy, so it stops at the second underscore)
pattern = r'_(.*?)_'

# Select the columns from the 9th up to, but not including, the last one;
# the final column is left out of the slice and keeps its name
columns_to_rename = covari.columns[8:-1]

# expand=False returns an Index aligned with columns_to_rename, holding NaN
# wherever a name does not match the pattern
extracted_names = columns_to_rename.str.extract(pattern, expand=False)

# Only rename columns that produced a non-empty match. Otherwise a
# non-matching name (including an already-renamed one when this cell is
# re-run) would be renamed to NaN.
rename_map = {
    old_name: new_name
    for old_name, new_name in zip(columns_to_rename, extracted_names)
    if isinstance(new_name, str) and new_name
}
covari.rename(columns=rename_map, inplace=True)

# Give the remaining two columns short names (applied after the pattern-based
# rename, in the same order as before)
covari.rename(columns={"PopulationName": "population", "Biomass_pgC_per_L": "biomass"}, inplace=True)
# Print the updated column names
print(covari.columns)


## Saving the clean dataset

In [6]:
# Write the cleaned table to disk without the row index
covari.to_csv(path_or_buf='data/modified/data_clean_covari.csv', index=False)