### RULES
Requires format of skipped line between different phylums (ex. empty row above Diatom, Dinoflagellate, etc.)

Assumed all {Ochromonas, } are mixotrophs.

1. assume everything after "Unknown flagellates" is irrelevant (to be deleted)
2. diatoms are NOT mixotrophs
3. remove all "[name]-like" (without genus specified)
4. remove all "[genus name] spp." AND "[genus name] sp."
5. check "cysts of"

Status Key--  
Confirmed := explicitly in the Mixotroph Database  
Unsure (sp. in mdb) := genus in Mixotroph Database lists "[genus name] sp." (ex. Ochromonas sp. for Ochromonas danica)  
Unsure (inexact name):= LIS name is in a longer Mixotroph Database name or vice versa (ex. Chattonella marina in Chattonella marina var. ovata)   

In [3]:
import pandas as pd
import numpy as np

from datetime import datetime

pd.set_option("future.no_silent_downcasting", True)

In [4]:
# stores dataframe block
class Block:
    """
    A class to represent a portion of a dataframe.

    Attributes
    ----------
    ind : list/Index object
        indicices of where the block is located
    df : DataFrame
        contents of the block
    """
    def __init__(self, ind, df):
        self.ind = ind
        self.df = df

In [5]:
def classify(csv_name, known_mixo=["Ochromonas"]):
    # import mixotroph database
    mdb = pd.read_csv("MDB - 3Dec2022.csv")
    mdb.columns = mdb.iloc[1]
    mdb = mdb.drop([0, 1]).reset_index(drop=True)
    
    # edit mdb so that species ending in "sp" now end in "sp."
    mdb['Species Name'] = mdb['Species Name'].str.replace(r'sp$', 'sp.', regex=True)

    # import and clean LIS data
    lis = pd.read_csv(f"inputs/{csv_name}.csv")
    original_headers = lis.columns  # save original column headers
    lis.columns = lis.iloc[1]  # reset column headers
    lis = lis.iloc[3:].reset_index(drop=True)  

    # remove rows after unknown flagellates
    unknown_flagellates_ind = lis[lis["Phylum"] == "Unknown flagellates"].index[0] 
    lis = lis.iloc[:unknown_flagellates_ind]
    lis = lis.iloc[:lis.last_valid_index()+1]  # remove trailing nan rows

    # remove rows that contain "TOTAL"
    lis = lis[~lis["Phylum"].str.contains("TOTAL", na=False)].reset_index(drop=True)  

    # construct correct phylum column
    actual_phylum_ind = lis[lis["Species"].isna() & lis["Phylum"].isna()].index + 1
    lis = lis.rename(columns={"Phylum": "Genus"}) # rename phylum column to genus
    lis.insert(0, 'Phylum', lis["Genus"].iloc[actual_phylum_ind])  # reconstruct phylum column
    lis['Phylum'] = lis['Phylum'].ffill()  # forwardfill phylum
    
    lis['Genus'] = lis['Species'].str.split().str[0]  # fill genus using first word of species name
    
    lis = lis.dropna(subset=['Species']).reset_index(drop=True) # delete rows with na in Species column

    # add Status column
    lis.insert(0, 'Status', None)

    # store blocks of known mixotroph genuses 
    known_blocks = []
    for genus in known_mixo:
        ind = lis[lis["Species"].str.contains(genus)].index
        df = lis.iloc[ind]
        known_blocks.append(Block(ind, df))

    # remove based on hard coded rules (NOT RESETTING INDEX IN ORDER TO ADD BLOCKS BACK CORRECTLY)
    lis = lis[lis["Phylum"] != "Diatom"] # remove all diatoms
    lis = lis[~lis["Species"].str.contains("-like")] # remove species ending with "-like"
    lis = lis[~lis["Species"].str.contains("sp.|spp.")]  # remove all sp. / spp.

    # check "cysts of"
    CYSTS_LEN = len("cysts of ")
    cysts_of = lis[lis["Species"].str.contains("cysts of", regex=False)]["Species"].str.slice(CYSTS_LEN)
    filtered = cysts_of.isin(mdb['Species Name'])
    lis.loc[filtered[filtered].index, "Status"] = "Confirmed"

    # add back stored blocks of known mixotrophs and mark as Confirmed
    for known_block in known_blocks:
        lis = pd.concat([lis, known_block.df]).sort_index().drop_duplicates()
        lis.loc[known_block.ind, "Status"] = "Confirmed"

    # check if (in none status) direct match and mark all Trues as "Confirmed"
    filtered = lis[lis['Status'].isnull()]["Species"].isin(mdb['Species Name'])
    lis.loc[filtered[filtered].index, "Status"] = "Confirmed"
    
    # check (in remaining none status) if the genus has sp. and mark all Trues as "Unsure (sp. in mdb)"
    genus_to_check = lis[lis['Status'].isnull()]['Species'].str.split().str[0].drop_duplicates() + " sp."
    filtered = genus_to_check.isin(mdb['Species Name'])
    lis.loc[filtered[filtered].index, "Status"] = "Unsure (sp. mdb)"

    # check (in remaining none status) if the name is contained in the mdb and vice versa and mark all Trues as "Unsure (inexact name)"
    filtered = lis[lis['Status'].isnull()]["Species"].apply(lambda x: mdb["Species Name"].str.contains(x, regex=False).any())
    lis.loc[filtered[filtered].index, "Status"] = "Unsure (inexact name)"
    
    pattern = '|'.join(mdb['Species Name'])
    filtered = lis[lis['Status'].isnull()]["Species"].str.contains(pattern, regex=True)
    lis.loc[filtered[filtered].index, "Status"] = "Unsure (inexact name)"

    # drop all rows with Status = "None"
    lis = lis.dropna(subset=['Status']).reset_index(drop=True)
    
    SPECIES_COL = 3
    lis.iloc[:, SPECIES_COL+1:] = lis.iloc[:, SPECIES_COL+1:].replace(",", "", regex=True).astype(float)  # ensure numerical values are floats

    # replace Nans with zero
    lis = lis.fillna(0)
    
    without_totals = lis.copy()
    
    totals = lis.groupby('Phylum', as_index=False, sort=False).sum()
    
    # empty text-containing columns
    totals = totals.drop("Status", axis=1)
    totals.insert(0, 'Status', "") 
    totals["Genus"] = ""
    totals["Species"] = ""
    
    # add in line skips
    totals = totals.set_index(lis.groupby(['Phylum']).tail(1).index + 0.1)
    empty_df = pd.DataFrame("", index=lis.groupby(['Phylum']).tail(1).index+0.2, columns=totals.columns)
    totals = pd.concat([totals, empty_df]).sort_index()
    
    # rename to TOTAL "   "
    totals["Phylum"] = totals["Phylum"].str.upper().apply(lambda x: "TOTAL " + x + "S" if x != "" else x)

    with_totals = pd.concat([lis, totals]).sort_index().reset_index(drop=True)

    # add back multiheader
    needed_cols = pd.Series(np.full(len(lis.columns) - len(original_headers), None)) 
    original_headers = pd.concat([needed_cols, original_headers.to_series()], ignore_index=True)
    
    with_totals.columns = pd.MultiIndex.from_arrays([original_headers, lis.columns])
    without_totals.columns = pd.MultiIndex.from_arrays([original_headers, lis.columns])

    return with_totals, without_totals

In [6]:
csv_name = "LIS_2019-Phytoplankton_Final Report Data.xlsx - 2019 LIS phytoplankton count"

# classify mixotrophs
classified = classify(csv_name, known_mixo = ["Ochromonas"])
with_totals, without_totals = classified

# save dataframe to excel
with_totals.to_excel(f"outputs/{csv_name}-{str(datetime.now())}.xlsx")

  totals = lis.groupby('Phylum', as_index=False, sort=False).sum()


In [7]:
with_totals

Unnamed: 0_level_0,NaN,NaN,"(Note: S: surface water sample, B: bottom water sample",Unnamed: 1,A4S Note: Skeletonema bloom,B3S,C1S Note: low cell abundance,D3S,E1S Note: low cell abundance,F2S Note: low cell abundance,...,A4B Note: Skeletonema spp. Bloom,B3B Note: Skeletonema spp. Bloom,C1B Note: Low cell abundance,D3B Note: Very low cell abundance,E1B Note: Low cell abundance.2,F2B Note: some debris; very low cell abundance,H4B Note: Large amount of debris; low cell abundance,I2B Note: debris; very low cell abundance,J2B Note: Very low cell abundance .1,K2B Note: Very low cell abundance.4
1,Status,Phylum,Genus,Species,1/3/19,1/3/19,1/3/19,1/7/19,1/7/19,1/7/19,...,12/6/19,12/6/19,12/6/19,12/5/19,12/16/19,12/16/19,12/16/19,12/4/19,12/4/19,12/4/19
0,Confirmed,Dinoflagellate,Akashiwo,Akashiwo sanguinea,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Confirmed,Dinoflagellate,Dinophysis,Dinophysis acuminata,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Confirmed,Dinoflagellate,Dinophysis,Dinophysis miles,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Confirmed,Dinoflagellate,Dinophysis,Dinophysis norvegica,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Confirmed,Dinoflagellate,Gambierdiscus,Gambierdiscus toxicus,352.0,0.0,0.0,0.0,0.0,0.0,...,88.0,0.0,0.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0
5,Confirmed,Dinoflagellate,Gonyaulax,Gonyaulax polygramma,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Confirmed,Dinoflagellate,Heterocapsa,Heterocapsa circularisquama,17600.0,17600.0,30800.0,8800.0,0.0,17600.0,...,13200.0,13200.0,2904.0,352.0,0.0,0.0,0.0,2904.0,704.0,0.0
7,Confirmed,Dinoflagellate,Noctiluca,Noctiluca scintillans,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Confirmed,Dinoflagellate,Prorocentrum,Prorocentrum lima,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Confirmed,Dinoflagellate,Prorocentrum,Prorocentrum micans,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
