# Coral Names Cleanup

I have multiple CSV files, each listing coral names and their corresponding file names. I need to group these entries by coral type so that I can create a directory for each type for model training.

In [1]:
import pandas as pd
import os
import re

In [2]:
# directory containing the coral-name CSV files
filepath = './coral_names'
# os.listdir returns entries in arbitrary order; sort them so the files are
# always processed (and their rows concatenated) in the same order on every run
filelist = sorted(os.listdir(filepath))

In [3]:
# number of entries found in the coral_names directory
len(filelist)

29

In [4]:
# Read every coral-name CSV and stack them into one DataFrame, tagging each row
# with the identifier taken from its source file name (the text between
# "files_" and ".csv").
file_id_pattern = re.compile(r"(?<=files_)(.*)(?=\.csv)")

dfs = []
for file in filelist:
    id_match = file_id_pattern.search(file)
    if id_match is None:
        # Entry is not named like "files_<id>.csv" (for example a hidden
        # system file or a checkpoint directory): skip it instead of failing
        # with a TypeError when subscripting a missing match.
        continue
    # header=0 discards each file's own header row in favour of `names`
    names = pd.read_csv(os.path.join(filepath, file), delimiter='|', header=0, names=['coral_name', 'filename'])
    names['file'] = id_match[0]
    dfs.append(names)

# ignore_index=True gives the combined frame one continuous 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

In [5]:
# preview the last rows of the combined DataFrame
df.tail(20)

Unnamed: 0,coral_name,filename,file
56039,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56040,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56041,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56042,Neon Fuzzy Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56043,Neon Fuzzy Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56044,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56045,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56046,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56047,WWC King Tut Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13
56048,WWC King Tut Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13


In [6]:
# isolate the trailing part of each URL, beginning at its final %2F
# (%2F is the URL-encoded form of '/'); rows without a match become NaN
last_segment_regex = r'(\%2F(?:.(?!\%2F))+$)'
url_tail = df['filename'].str.extract(last_segment_regex)
df['shortfilename'] = url_tail

In [7]:
# grab everything after the last %2F up through .jpg, which is the name of the
# file I am looking for.  Both extraction steps start from the untouched
# 'filename' column, so re-running this cell gives the same result instead of
# turning every value into NaN (an already-shortened name contains no %2F).
# NOTE: the pattern only accepts a lowercase '.jpg'; other names come out NaN.
df['shortfilename'] = (
    df['filename']
    .str.extract(r'(\%2F(?:.(?!\%2F))+$)', expand=False)  # tail from the last %2F
    .str.extract(r'(?<=\%2F)(.+\.jpg)', expand=False)     # drop %2F, stop at .jpg
)

In [8]:
# coral types recognised so far
coral_types = ('Mushroom', 'Zoanthid')

# mapping from regex pattern to coral type, e.g. r'.*Mushroom.*' -> 'Mushroom';
# each pattern matches any coral name that contains the type word
coral_mapping = {rf'.*{coral_type}.*': coral_type for coral_type in coral_types}

In [9]:
# derive coral_type from coral_name using the regex patterns in coral_mapping;
# a name that matches none of the patterns is carried over unchanged
coral_names = df['coral_name']
df['coral_type'] = coral_names.replace(to_replace=coral_mapping, regex=True)

In [10]:
# preview the last rows again, now including shortfilename and coral_type
df.tail(20)

Unnamed: 0,coral_name,filename,file,shortfilename,coral_type
56039,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-30-984.jpg,Mushroom
56040,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-31-263.jpg,Mushroom
56041,Tangerine Bliss Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-32-446.jpg,Mushroom
56042,Neon Fuzzy Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-33-228.jpg,Mushroom
56043,Neon Fuzzy Rhodactis Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-34-456.jpg,Mushroom
56044,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-49-221.jpg,Mushroom
56045,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-50-894.jpg,Mushroom
56046,Blue Dot Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-51-678.jpg,Mushroom
56047,WWC King Tut Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-52-024.jpg,Mushroom
56048,WWC King Tut Disco Mushroom,https://www.reef2reef.com/proxy.php?image=http...,13,LSBFS-04-53-224.jpg,Mushroom


In [11]:
# count rows per coral_type value; dropna=False keeps NaN as its own category
df['coral_type'].value_counts(dropna=False)

Zoanthid                             10092
Mushroom                              2584
Pulsing Xenia                          355
ORA Neon Birdsnest                     341
Jason Fox Sector 001 Favites           326
                                     ...  
WWC Ice Block Favia                      1
Seafoam Acropora                         1
WWC Copperhead Bowerbanki                1
LG Tyree Pink Lemonade Acropora          1
Aussie Purple Stalagmite Acropora        1
Name: coral_type, Length: 2778, dtype: int64