# Coral Names Cleanup

I have multiple csv files containing the coral name and the corresponding file name.  I need to sort these into groups by type of coral so that I can create directories for each of the coral types for model training.

In [1]:
import pandas as pd
import os
import re

In [2]:
pd.set_option('display.max_rows', None)

In [3]:
filepath = './coral_names'
filelist = os.listdir(filepath)

In [4]:
len(filelist)

29

In [5]:
dfs = []
for file in filelist:
    names = pd.read_csv(os.path.join(filepath, file), delimiter='|', header=0, names=['coral_name', 'filename'])
    names['file'] = re.search(r"(?<=files\_)(.*)(?=\.csv)", file)[0]
    dfs.append(names)
    
df = pd.concat(dfs, ignore_index=True)

In [6]:
df.tail(20)

Unnamed: 0,coral_name,filename,file
56039,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56040,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56041,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56042,Neon Fuzzy Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56043,Neon Fuzzy Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56044,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56045,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56046,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56047,WWC King Tut Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56048,WWC King Tut Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13


In [7]:
# from the filename, grab everything starting with the last %2F
df['shortfilename'] = df['filename'].str.extract(r'(\%2F(?:.(?!\%2F))+$)')

In [8]:
# grab everything after %2F up through .jpg, which is the name of the file I am looking for
df['shortfilename'] = df['shortfilename'].str.extract(r'(?<=\%2F)(.+\.jpg)')

In [9]:
# create mapping from coral name to coral type
coral_mapping = {
    r'.*Acropora.*': 'Acropora',
    r'.*Acro.*': 'Acropora',
    r'.*Acrpopora.*': 'Acropora',
    r'.*Alveopora.*': 'Alveopora',
    r'.*Anacropora.*': 'Anacropora',
    r'.*Anemone.*': 'Anemone',
    r'.*Anthelia.*': 'Anthelia',
    r'.*Anthellia.*': 'Anthelia',
    r'.*Astreopora.*': 'Astreopora',
    r'.*Aussie Lord.*': 'Aussie Lord',
    r'.*Aussoe Lord.*': 'Aussie Lord',
    r'.*Ausssie Lord.*': 'Aussie Lord',
    r'.*Aussie lord.*': 'Aussie Lord',
    r'.*Birdsnest.*': 'Birdsnest',
    r'.*Birdnest.*': 'Birdsnest',
    r'.*Blastomussa.*': 'Blastomussa',
    r'.*Blasto.*': 'Blastomussa',
    r'.*Bowerbanki.*': 'Bowerbanki',
    r'.*Bubble.*': 'Bubble',
    r'.*Candy Cane.*': 'Candy Cane',
    r'.*Chalice.*': 'Chalice',
    r'.*Clove Polyp.*': 'Clove Polyp',
    r'.*Cynarina.*': 'Cynarina',
    r'.*Cyphastrea.*': 'Cyphastrea',
    r'.*Daisy Polyp.*': 'Daisy Polyp',
    r'.*Duncan.*': 'Duncan',
    r'.*Echinata.*': 'Echinata',
    r'.*Elegance.*': 'Elegance',
    r'.*Favia.*': 'Favia',
    r'.*favia.*': 'Favia',
    r'.*Favites.*': 'Favites',
    r'.*favites.*': 'Favites',
    r'.*Frogspawn.*': 'Frogspawn',
    r'.*Galaxea.*': 'Galaxea',
    r'.*Goniastrea.*': 'Goniastrea',
    r'.*Goniopora.*': 'Goniopora',
    r'.*GSP.*': 'Star Polyp',
    r'.*Hammer.*': 'Hammer',
    r'.*Hydnophora.*': 'Hydnophora',
    r'.*Leather.*': 'Leather',
    r'.*Leptastrea.*': 'Leptastrea',
    r'.*Leptoseris.*': 'Leptoseris',
    r'.*Lithophyllon.*': 'Lithophyllon',
    r'.*Lobophyllia.*': 'Lobophyllia',
    r'.*Lobophytum.*': 'Leather',
    r'.*Micromussa.*': 'Micromussa',
    r'.*Montipora.*': 'Montipora',
    r'.*Montoipora.*': 'Montipora',
    r'.*Palythoa.*': 'Palythoa',
    r'.*palythoa.*': 'Palythoa',
    r'.*Pavona.*': 'Pavona',
    r'.*Pectinia.*': 'Pectinia',
    r'.*Pipe Organ.*': 'Pipe Organ',
    r'.*Platygyra.*': 'Platygyra',
    r'.*Platygra.*': 'Platygyra',
    r'.*Playtygra.*': 'Platygyra',
    r'.*Plate.*': 'Plate',
    r'.*Plesiastrea.*': 'Plesiastrea',
    r'.*Pocillopora.*': 'Pocillopora',
    r'.*Porites.*': 'Porites',
    r'.*Psammocora.*': 'Psammocora',
    r'.*Pssamacora.*': 'Psammocora',
    r'.*Rhodactis.*': 'Rhodactis',
    r'.*Ricordea.*': 'Ricordea',
    r'.*Scolymia.*': 'Scolymia',
    r'.*Star Polyp.*': 'Star Polyp',
    r'.*Stylocoeniella.*': 'Stylocoeniella',
    r'.*Stylophora.*': 'Stylophora',
    r'.*Symphyllia.*': 'Symphyllia',
    r'.*Sympodium.*': 'Sympodium',
    r'.*Torch.*': 'Torch',
    r'.*Trachyphyllia.*': 'Trachyphyllia',
    r'.*Turbinaria.*': 'Turbinaria',
    r'.*Wilsoni.*': 'Wilsoni',
    r'.*Xenia.*': 'Xenia',
    r'.*Yuma.*': 'Yuma',
    r'.*Zoanthid.*': 'Zoanthid',
    r'.*zoanthid.*': 'Zoanthid'
}
mushroom_mapping = {
    r'.*Mushroom.*': 'Mushroom',
    r'.*mushroom.*': 'Mushroom'
}

In [10]:
# apply the mapping to get a standardized coral type
df['coral_type'] = df['coral_name'].replace(coral_mapping, regex=True)
df['coral_type'] = df['coral_type'].replace(mushroom_mapping, regex=True)

In [11]:
df.sample(20, random_state=8)

Unnamed: 0,coral_name,filename,file,shortfilename,coral_type
5684,Fairy Tales Aussie Lord,https://www.reef2reef.com/proxy.php?image=http...,9,LSFR-40-54-955.jpg,Aussie Lord
43393,WWC Atomic Green Montipora,https://www.reef2reef.com/proxy.php?image=http...,24,ANR14-20-032.jpg,Montipora
11772,WWC Lemon Tip Clove Polyps,https://www.reef2reef.com/proxy.php?image=http...,25,ANR45-61-223.jpg,Clove Polyp
37390,Mystery Machine Aussie Lord,https://www.reef2reef.com/proxy.php?image=http...,28,LSBFS-08-62-870.jpg,Aussie Lord
46613,Rainbow Montipora,https://www.reef2reef.com/proxy.php?image=http...,2,R04-72-393t.jpg,Montipora
29674,WWC Christmas Mirabilis Acropora,https://www.reef2reef.com/proxy.php?image=http...,5,TSR18-01-223.jpg,Acropora
46541,Neon Blastomussa,https://www.reef2reef.com/proxy.php?image=http...,2,R35-16-721t.jpg,Blastomussa
19165,Pulsing Xenia,https://www.reef2reef.com/proxy.php?image=http...,22,LSFF-R19-67-722.jpg,Xenia
7491,Rastas Zoanthids,https://www.reef2reef.com/proxy.php?image=http...,27,LSCMH-R21-41-481.jpg,Zoanthid
1376,Strawberry Pie Bowerbanki,https://www.reef2reef.com/proxy.php?image=http...,6,LSMAY-02-34-773.jpg,Bowerbanki


In [12]:
df['coral_type'].value_counts(dropna=False, ascending=True).to_frame()

Unnamed: 0,coral_type
$5!,1
$20!,1
$129,1
Dog's Coral Viewing Glasses,1
$25!,1
WWC Orange Diaseris,1
Maxi-Tech Vorjet MP1200 Powerhead,1
WWC PuffCoin Crypto,1
Streaking Orange Tongue Coral,1
CLICK HERE,1
