# Coral Names Cleanup

I have multiple CSV files, each listing coral names together with the corresponding image file name.  I need to sort these into groups by type of coral so that I can create a directory for each coral type for model training.

In [1]:
import numpy as np
import pandas as pd
import os
import re

In [2]:
# display every row of a frame/series instead of truncating long outputs
pd.set_option('display.max_rows', None)

In [3]:
# directory that holds the '|'-delimited coral name listings
filepath = './coral_names'
# NOTE: os.listdir returns entries in arbitrary order, and the manual corrections
# further down address rows by position (df.iloc[...]). A different listing order
# changes the row order of the combined frame and would misapply those corrections.
filelist = os.listdir(filepath)

In [4]:
# number of directory entries that will be read
len(filelist)

29

In [5]:
# Read every "...files_<id>.csv" listing and stack them into one frame.
# The <id> part of the csv name is kept in the `file` column so each row can be
# traced back to the listing it came from.
source_id = re.compile(r"(?<=files\_)(.*)(?=\.csv)")

dfs = []
for file in filelist:
    id_match = source_id.search(file)
    if id_match is None:
        # os.listdir also returns stray entries (e.g. .DS_Store, .ipynb_checkpoints);
        # indexing a failed search would raise TypeError, so skip them before reading
        continue
    # header=0 together with names= replaces each file's own header row
    names = pd.read_csv(
        os.path.join(filepath, file),
        delimiter='|',
        header=0,
        names=['coral_name', 'filename'],
    )
    names['file'] = id_match[0]
    dfs.append(names)

# ignore_index gives the combined frame one continuous 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

In [6]:
# peek at the last rows of the combined frame
df.tail(5)

Unnamed: 0,coral_name,filename,file
56054,Orange Rainbow Ricordea Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56055,Orange Rainbow Ricordea Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56056,Orange Rainbow Ricordea Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56057,Sour Willy Ricordea Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56058,Lime Willy Ricordea Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13


In [7]:
# keep only the tail of each url: the final "%2F" and everything after it
last_segment = r'(\%2F(?:.(?!\%2F))+$)'
df['shortfilename'] = df['filename'].str.extract(last_segment, expand=False)

In [8]:
# drop the leading "%2F" and keep the text up through ".jpg" -- the image file name;
# rows where nothing matches end up as NaN
jpg_name = r'(?<=\%2F)(.+\.jpg)'
df['shortfilename'] = df['shortfilename'].str.extract(jpg_name, expand=False)

In [9]:
# strip the non-WYSIWYG (what you see is what you get) notations from the names;
# each one is removed as a literal string, in this order
for notation in (' - (NON-WYSIWYG)', ' - MED (NON-WYSIWYG)', ' - SM (NON-WYSIWYG)'):
    df['coral_name'] = df['coral_name'].str.replace(notation, '', regex=False)

Begin mapping to coral types to define directory structure.

For more details on the families of corals listed below, please see https://marinespecies.org/aphia.php?p=search. For a summary of stony corals, please see https://biophysics.sbg.ac.at/coral/family.htm.

Note that some situations will require special attention to order of operations.  For example, I cannot use `contains` with `acropora` without first capturing `anacropora`.

There are also numerous spelling errors that need to be corrected.  For example, `Ausssie Lord` versus `Aussie Lord`.

STILL TO DO
- Multiple corals on one frag, such as `Pulsing Xenia and Favia Combo`


In [10]:
# zoas, palys, and anemones
# Create the coral_type column: names mentioning a zoanthid are labelled, every
# other row starts as None until a later rule claims it.
# na=False matters here: a missing coral_name makes str.contains return NaN, and
# np.where treats NaN as truthy, so nameless rows would be labelled 'Zoanthid'.
df['coral_type'] = np.where(
    df['coral_name'].str.contains('zoanthid', case=False, na=False),
    'Zoanthid', None
)

# The remaining rules only fill rows that are still unlabelled, so list order
# is match priority.
for pattern, label in [
    ('dragon eye combo', 'Zoanthid'),
    ('palythoa', 'Palythoa'),
    ('anemone', 'Anemone'),
]:
    df['coral_type'] = np.where(
        df['coral_type'].isna()
        & df['coral_name'].str.contains(pattern, case=False, na=False),
        label, df['coral_type']
    )

In [11]:
# mushrooms
# (regex, label) pairs tried in order; a rule only fills rows that are still
# unlabelled, so the generic 'mushroom' rule picks up whatever the first three leave
mushroom_rules = (
    ('rhodactis', 'Rhodactis'),
    ('ricordea', 'Ricordea'),
    ('yuma', 'Yuma'),
    ('mushroom', 'Mushroom'),
)
for regex, label in mushroom_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [12]:
# xeniidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
xeniid_rules = (
    ('anthelia|anthellia', 'Anthelia'),  # second alternative is a misspelling
    ('cespitularia', 'Cespitularia'),
    ('efflatounaria', 'Efflatounaria'),
    ('sympodium', 'Sympodium'),
    ('pulsing xenia', 'Pulsing Xenia'),
)
for regex, label in xeniid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [13]:
# other soft corals
# (regex, case_sensitive, label) triples applied in order to the rows that are
# still unlabelled
soft_coral_rules = (
    # alcyoniidae
    ('colt', False, 'Colt'),
    ('leather|lobophytum', False, 'Leather'),
    # tubiporidae
    ('pipe organ', False, 'Pipe Organ'),
    # clavulariidae
    ('star polyp', False, 'Star Polyp'),
    ('GSP', True, 'Star Polyp'),  # the only rule matched case-sensitively
    ('clove (?:combo )?polyp', False, 'Clove Polyp'),
    ('daisy polyp', False, 'Daisy Polyp'),
)
for regex, case_sensitive, label in soft_coral_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=case_sensitive)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [14]:
# euphyllia
# Each entry pairs a precomputed name mask with the label it assigns. Entries are
# applied in order and only to rows that are still unlabelled. The endswith rules
# are case-sensitive; the contains rules ignore case.
names = df['coral_name'].str
euphyllia_rules = [
    (names.contains('frogspawn', case=False), 'Frogspawn'),
    (names.endswith('Frog'), 'Frogspawn'),
    (names.contains('frammer', case=False), 'Frammer'),
    (names.contains('cristata', case=False), 'Cristata'),
    (names.endswith('Torch'), 'Torch'),
    (names.contains('torch xl', case=False), 'Torch'),
    (names.endswith('Hammer'), 'Hammer'),
    (names.endswith('Hammers'), 'Hammer'),
    (names.contains('Hammer - 2 Heads|Hammer - 3 Heads', case=False), 'Hammer'),
]
for name_hit, label in euphyllia_rules:
    unlabelled = df['coral_type'].isna()
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [15]:
# other euphyllidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
euphyllid_rules = (
    ('bubble coral', 'Bubble'),  # plerogyra
    ('elegance', 'Elegance'),
)
for regex, label in euphyllid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [16]:
# acroporidae
# Each entry pairs a precomputed name mask with the label it assigns. Entries are
# applied in order and only to rows that are still unlabelled, which is why
# anacropora is claimed before any of the acropora rules run. The endswith rules
# are case-sensitive; the contains rules ignore case.
names = df['coral_name'].str
acroporid_rules = [
    (names.contains('anacropora', case=False), 'Anacropora'),
    (names.contains('astreopora', case=False), 'Astreopora'),
    (names.contains('montipora|montoipora', case=False), 'Montipora'),
    (names.endswith('Digitata'), 'Montipora'),
    (names.endswith('Acropora'), 'Acropora'),
    (names.endswith('Acrpopora'), 'Acropora'),  # misspelling seen in the listings
    (names.endswith('Acro'), 'Acropora'),
    (names.contains('acroproa', case=False), 'Acropora'),  # misspelling
    (names.endswith('Mirabilis'), 'Acropora'),
    (names.endswith('Acropora - New Release!'), 'Acropora'),
    (names.endswith('Acropora - New Release'), 'Acropora'),
    (names.endswith('Acropora '), 'Acropora'),  # trailing space
    (
        names.contains(
            'Acropora - XL|Acropora - LG|blue stag|Acropora LG|Acropora XL',
            case=False,
        ),
        'Acropora',
    ),
]
for name_hit, label in acroporid_rules:
    unlabelled = df['coral_type'].isna()
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [17]:
# astrocoeniidae
# label the still-unlabelled rows whose name mentions stylocoeniella
unlabelled = df['coral_type'].isna()
name_hit = df['coral_name'].str.contains('stylocoeniella', case=False)
df['coral_type'] = np.where(unlabelled & name_hit, 'Stylocoeniella', df['coral_type'])

In [18]:
# pocilloporidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
pocilloporid_rules = (
    ('pocillopora|pocillipora', 'Pocillopora'),  # second alternative is a misspelling
    ('stylophora', 'Stylophora'),
    ('birdnest|birdsnest', 'Seriatopora'),
)
for regex, label in pocilloporid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [19]:
# oculinidae
# label the still-unlabelled rows whose name mentions galaxea
unlabelled = df['coral_type'].isna()
name_hit = df['coral_name'].str.contains('galaxea', case=False)
df['coral_type'] = np.where(unlabelled & name_hit, 'Galaxea', df['coral_type'])

In [20]:
# meandrinidae


In [21]:
# siderastreidae
# label the still-unlabelled rows whose name mentions psammocora (or the
# 'pssamacora' misspelling)
unlabelled = df['coral_type'].isna()
name_hit = df['coral_name'].str.contains('psammocora|pssamacora', case=False)
df['coral_type'] = np.where(unlabelled & name_hit, 'Psammocora', df['coral_type'])

In [22]:
# agariciidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
agariciid_rules = (
    ('pavona', 'Pavona'),
    ('leptoseris', 'Leptoseris'),
)
for regex, label in agariciid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [23]:
# fungiidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
fungiid_rules = (
    ('plate coral|diaseris', 'Plate'),
    ('lithophyllon', 'Lithophyllon'),
    ('tongue coral', 'Tongue'),
)
for regex, label in fungiid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [24]:
# pectiniidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
pectiniid_rules = (
    ('chalice|mycedium', 'Chalice'),
    ('pectinia', 'Pectinia'),
)
for regex, label in pectiniid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [25]:
# merulinidae
# label the still-unlabelled rows whose name mentions hydnophora
unlabelled = df['coral_type'].isna()
name_hit = df['coral_name'].str.contains('hydnophora', case=False)
df['coral_type'] = np.where(unlabelled & name_hit, 'Hydnophora', df['coral_type'])

In [26]:
# dendrophylliidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
dendrophylliid_rules = (
    ('turbinaria', 'Turbinaria'),
    ('duncan', 'Duncan'),  # duncanopsammia
)
for regex, label in dendrophylliid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [27]:
# caryophylliidae


In [28]:
# mussidae
# Each entry pairs a precomputed name mask with the label it assigns. Entries are
# applied in order and only to rows that are still unlabelled. The endswith rules
# are case-sensitive; the contains rules ignore case.
names = df['coral_name'].str
mussid_rules = [
    (names.contains('blasto', case=False), 'Blastomussa'),
    (names.contains('micromussa', case=False), 'Micromussa'),
    (names.endswith('Echinata'), 'Echinata'),  # acanthastrea echinata
    (names.endswith('Echinata '), 'Echinata'),  # trailing space
    (names.contains('lobophyllia', case=False), 'Lobophyllia'),
    # formerly known as symphyllia
    # now australophyllia wilsoni
    (names.contains('symphyllia|wilsoni', case=False), 'Wilsoni'),
    (names.contains('cynarina', case=False), 'Cynarina'),
    (names.contains('scolymia', case=False), 'Scolymia'),
    (names.contains('acanthophyllia', case=False), 'Acanthophyllia'),
    (names.contains('bowerbanki', case=False), 'Bowerbanki'),
    (names.contains('australomussa', case=False), 'Australomussa'),
    # micromussa lordhowensis aka acan lord aka aussie lord aka indo lord
    # reclassified from acanthastrea to micromussa in 2016
    (names.contains('aussie lord', case=False), 'Lord'),
    (names.contains('aussoe|ausssie', case=False), 'Lord'),  # misspellings of "aussie"
    (names.contains('indo lord|stone lord', case=False), 'Lord'),
    (
        names.contains(
            'bubblegum lord|cheesecake lord|cotton candy lord|solar flare lord',
            case=False,
        ),
        'Lord',
    ),
]
for name_hit, label in mussid_rules:
    unlabelled = df['coral_type'].isna()
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [29]:
# faviidae
# Each entry pairs a precomputed name mask with the label it assigns. Entries are
# applied in order and only to rows that are still unlabelled, so the specific
# genera are claimed before the broad 'favia' rule. The endswith rules are
# case-sensitive; the contains rules ignore case.
names = df['coral_name'].str
faviid_rules = [
    # extratentacular budding
    (names.contains('plesiastrea', case=False), 'Plesiastrea'),
    (names.contains('leptastrea', case=False), 'Leptastrea'),
    (names.contains('cyphastrea', case=False), 'Cyphastrea'),
    (names.endswith('Candy Cane'), 'Caulastrea'),
    (names.endswith('Candy Canes'), 'Caulastrea'),
    # intratentacular budding
    (names.endswith('Trumpet'), 'Caulastrea'),
    (names.contains('favites', case=False), 'Favites'),
    (names.endswith('War Coral'), 'Favites'),
    (names.contains('goniastrea', case=False), 'Goniastrea'),
    (names.contains('platygyra|platygra|playtygra', case=False), 'Platygyra'),
    (names.contains('oulophyllia', case=False), 'Oulophyllia'),
    (names.contains('echinopora', case=False), 'Echinopora'),
    (names.contains('favia', case=False), 'Favia'),
    (names.contains('golden goose favis', case=False), 'Favia'),  # misspelt listing
]
for name_hit, label in faviid_rules:
    unlabelled = df['coral_type'].isna()
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [30]:
# trachyphylliidae
# label the still-unlabelled rows whose name mentions trachyphyllia (or the
# 'trachyphllia' misspelling)
unlabelled = df['coral_type'].isna()
name_hit = df['coral_name'].str.contains('trachyphyllia|trachyphllia', case=False)
df['coral_type'] = np.where(unlabelled & name_hit, 'Trachyphyllia', df['coral_type'])

In [31]:
# poritidae
# (regex, label) pairs applied in order to the rows that are still unlabelled
poritid_rules = (
    ('goniopora', 'Goniopora'),
    ('alveopora', 'Alveopora'),
    ('porites', 'Porites'),
)
for regex, label in poritid_rules:
    unlabelled = df['coral_type'].isna()
    name_hit = df['coral_name'].str.contains(regex, case=False)
    df['coral_type'] = np.where(unlabelled & name_hit, label, df['coral_type'])

In [32]:
# manual cleanup for bad names, such as "$79" or "(CLICK HERE TO BUY!)"
#
# Each entry is (first_row, last_row, label) with both row positions inclusive.
# Positions refer to the combined frame exactly as it was built above; they are
# only valid while the files are read in the same order and the rows are not
# re-sorted or filtered before this cell.
manual_labels = [
    (0, 14, 'Zoanthid'),
    (316, 316, 'Delete'),
    (317, 322, 'Zoanthid'),
    (323, 323, 'Delete'),
    (324, 342, 'Zoanthid'),
    (343, 343, 'Palythoa'),
    (344, 345, 'Zoanthid'),
    (618, 618, 'Delete'),
    (1108, 1108, 'Delete'),
    (1321, 1325, 'Delete'),
    (1326, 1326, 'Platygyra'),
    (1327, 1327, 'Cyphastrea'),
    (1328, 1328, 'Favia'),
    (1329, 1329, 'Alveopora'),
    (1330, 1330, 'Favia'),
    (1637, 1637, 'Delete'),
    (1638, 1640, 'Zoanthid'),
    (1641, 1641, 'Delete'),
    (1642, 1642, 'Zoanthid'),
    (2687, 2687, 'Delete'),
    (2736, 2736, 'Delete'),
    (5495, 5500, 'Delete'),
    (14973, 14973, 'Zoanthid'),
    (14974, 14974, 'Acropora'),
    (14975, 14975, 'Blastomussa'),
    (14976, 14976, 'Pipe Organ'),
    (14977, 14977, 'Lord'),
    (14978, 14978, 'Goniastrea'),
    (14979, 14979, 'Hammer'),
    (14980, 14980, 'Ricordea'),
    (14981, 14981, 'Lord'),
    (22998, 22998, 'Montipora'),
    (22999, 22999, 'Ricordea'),
    (23000, 23000, 'Zoanthid'),
    (23001, 23001, 'Caulastrea'),
    (23002, 23002, 'Platygyra'),
    (24276, 24276, 'Lord'),
    (24277, 24277, 'Duncan'),
    (24278, 24278, 'Chalice'),
    (24279, 24279, 'Zoanthid'),
    (24280, 24280, 'Seriatopora'),
    (26490, 26490, 'Montipora'),
    (26491, 26491, 'Ricordea'),
    (26492, 26492, 'Zoanthid'),
    (26493, 26493, 'Caulastrea'),
    (26494, 26494, 'Platygyra'),
    (26596, 26596, 'Delete'),
    (26810, 26810, 'Delete'),
    (26818, 26818, 'Montipora'),
    (27575, 27575, 'Torch'),
    (27576, 27576, 'Ricordea'),
    (27842, 27842, 'Lord'),
    (27843, 27843, 'Duncan'),
    (27844, 27844, 'Chalice'),
    (27845, 27845, 'Zoanthid'),
    (27846, 27846, 'Seriatopora'),
    (28321, 28321, 'Delete'),
    (28729, 28729, 'Delete'),
    (28730, 28730, 'Goniastrea'),
    (28731, 28732, 'Favia'),
    (28733, 28733, 'Zoanthid'),
    (28734, 28734, 'Lord'),
    (28735, 28735, 'Ricordea'),
    (28743, 28743, 'Delete'),
    (28933, 28933, 'Chalice'),
    (28934, 28935, 'Favia'),
    (28936, 28937, 'Zoanthid'),
    (28938, 28938, 'Echinata'),
    (29063, 29063, 'Acropora'),
    (32554, 32556, 'Favia'),
    (32557, 32557, 'Montipora'),
    (33363, 33367, 'Delete'),
]

# look the column up by name instead of relying on it sitting at position 4
type_col = df.columns.get_loc('coral_type')
for first_row, last_row, label in manual_labels:
    if last_row >= len(df):
        # a slice would silently ignore out-of-range rows; fail loudly like a
        # scalar iloc access does
        raise IndexError(
            f'manual label rows {first_row}-{last_row} are beyond the end of the frame'
        )
    df.iloc[first_row:last_row + 1, type_col] = label

In [33]:
# for all the "Mystery" corals without pictures, correct into the "Delete" category
# except for "Mystery Machine Aussie Lord", which needs to be "Lord"
#
# Both rules overwrite any earlier label. na=False keeps rows with a missing
# coral_name out of both masks: without it str.contains returns NaN for them,
# np.where treats NaN as truthy, and nameless rows would end up labelled 'Lord'.
is_mystery = df['coral_name'].str.contains('mystery', case=False, na=False)
is_mystery_machine = df['coral_name'].str.contains('mystery machine', case=False, na=False)

df['coral_type'] = np.where(is_mystery, 'Delete', df['coral_type'])
# applied second so it wins over the broader "mystery" rule
df['coral_type'] = np.where(is_mystery_machine, 'Lord', df['coral_type'])

In [34]:
# rows per label, rarest first; dropna=False also counts rows that are still unlabelled
df['coral_type'].value_counts(dropna=False, ascending=True).to_frame()

Unnamed: 0,coral_type
Tongue,1
Elegance,2
Australomussa,2
Echinopora,6
Frammer,7
Cynarina,9
Efflatounaria,9
Acanthophyllia,10
Trachyphyllia,11
Cristata,16


In [35]:
# spot-check the first rows, which carry the manually corrected labels
df.head()

Unnamed: 0,coral_name,filename,file,shortfilename,coral_type
0,$79,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-22-13-647.jpg,Zoanthid
1,$59,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-30-64-151.jpg,Zoanthid
2,$29,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-24-68-443.jpg,Zoanthid
3,$69,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-24-84-814.jpg,Zoanthid
4,$119,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-26-82-597.jpg,Zoanthid


In [36]:
# row count before de-duplication
df.shape[0]

56059

In [37]:
# set coral_type to "Delete" if original image file is missing from download
# (rows for which no .jpg name could be extracted from the url)
image_missing = df['shortfilename'].isna()
df['coral_type'] = np.where(image_missing, 'Delete', df['coral_type'])

In [38]:
# collapse rows that repeat the same url, short file name and label, keeping the
# first occurrence, then renumber the index from 0
dedupe_keys = ['filename', 'shortfilename', 'coral_type']
df = df.drop_duplicates(subset=dedupe_keys, keep='first')
df = df.reset_index(drop=True)

In [39]:
# row count after de-duplication
df.shape[0]

37731

In [40]:
#df[df.duplicated(subset=['filename', 'shortfilename'], keep=False)].sort_values(by=['shortfilename']).head(20)

In [41]:
# duplicate files (from filename and shortfilename) listed multiple ways
# set coral_type to "Delete" for incorrect listings
# Row positions refer to the de-duplicated frame built just above. The column is
# looked up by name instead of relying on it sitting at position 4.
incorrect_listing_rows = [5383, 18751, 6457]
df.iloc[incorrect_listing_rows, df.columns.get_loc('coral_type')] = 'Delete'

In [42]:
# final dataframe includes all coral types except "Delete"
# (rows that are still unlabelled are kept as well: a missing label is != 'Delete')
# .copy() makes the result an independent frame, so adding the filepath column
# later does not trigger SettingWithCopyWarning on a filtered slice
df = df[df['coral_type'] != 'Delete'].copy()

In [43]:
# row count once the "Delete" rows are gone
df.shape[0]

37364

In [44]:
# relative image path: "scraped_images_<file id>/<short file name>"
image_dir = 'scraped_images_' + df['file'].astype(str)
df['filepath'] = image_dir + '/' + df['shortfilename']

In [45]:
# confirm the new filepath column looks right
df.head()

Unnamed: 0,coral_name,filename,file,shortfilename,coral_type,filepath
0,$79,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-22-13-647.jpg,Zoanthid,scraped_images_6/LSMAY-22-13-647.jpg
1,$59,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-30-64-151.jpg,Zoanthid,scraped_images_6/LSMAY-30-64-151.jpg
2,$29,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-24-68-443.jpg,Zoanthid,scraped_images_6/LSMAY-24-68-443.jpg
3,$69,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-24-84-814.jpg,Zoanthid,scraped_images_6/LSMAY-24-84-814.jpg
4,$119,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-26-82-597.jpg,Zoanthid,scraped_images_6/LSMAY-26-82-597.jpg


In [46]:
# rows per label in the final frame, rarest first; dropna=False also counts
# rows that are still unlabelled
df['coral_type'].value_counts(dropna=False, ascending=True).to_frame()

Unnamed: 0,coral_type
Australomussa,1
Tongue,1
Elegance,2
Echinopora,3
Cynarina,5
Acanthophyllia,6
Frammer,6
Trachyphyllia,7
Efflatounaria,7
Colt,13


In [48]:
# create files for sorting images into folders based on coral_type:
# one "<coral_type>.txt" per label, listing that label's image paths one per line
output_dir = 'coral_types_filepaths'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

# groupby leaves out rows whose coral_type is still missing; looping over
# .unique() would reach None and fail when building the file name.
# sort=False keeps the labels in order of first appearance.
for coral, members in df.groupby('coral_type', sort=False):
    members['filepath'].to_csv(
        os.path.join(output_dir, coral + '.txt'), header=False, index=False
    )