In [1]:
import csv
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the annotation index; every later cell works from this frame.
full_df = pd.read_csv(filepath_or_buffer="full-index.csv")

In [3]:
# Tally how often each annotation code occurs across the whole index.
counts = full_df["annotation"].value_counts()
counts

annotation
AA      85273
CA      12658
ISO     12518
R       11893
SC      10898
        ...  
OULA        2
PEC         2
MIL         1
J           1
TA          1
Name: count, Length: 79, dtype: int64

In [4]:
# The year is taken from the first four characters of each file path.
full_df["year"] = full_df.filepath.str[:4]
full_df

Unnamed: 0,filepath,annotation,timestamp,height,width,year
0,2016 (INCOMPLETE)/TRANSECT/IMAGE AND CPCE FILE...,AA,20160419_094715,500,500,2016
1,2016 (INCOMPLETE)/TRANSECT/IMAGE AND CPCE FILE...,RCK,20160419_094715,500,500,2016
2,2016 (INCOMPLETE)/TRANSECT/IMAGE AND CPCE FILE...,ISO,20160419_094715,500,500,2016
3,2016 (INCOMPLETE)/TRANSECT/IMAGE AND CPCE FILE...,RCK,20160419_094715,500,500,2016
4,2016 (INCOMPLETE)/TRANSECT/IMAGE AND CPCE FILE...,ISO,20160419_094715,500,500,2016
...,...,...,...,...,...,...
222840,2021 (COMPLETE)/QUADRAT/IMAGE AND CPCE FILE/SH...,R,20210526_094632,500,500,2021
222841,2021 (COMPLETE)/QUADRAT/IMAGE AND CPCE FILE/SH...,SP,20210526_094632,500,500,2021
222842,2021 (COMPLETE)/QUADRAT/IMAGE AND CPCE FILE/SH...,R,20210526_094632,500,500,2021
222843,2021 (COMPLETE)/QUADRAT/IMAGE AND CPCE FILE/SH...,OT,20210526_094632,500,500,2021


In [5]:
# Number of annotation rows recorded for each year.
full_df.year.value_counts()

year
2024    31800
2021    30500
2009    28715
2019    26800
2016    25000
2020    23490
2011    22810
2010    19440
2018    13390
2022      900
Name: count, dtype: int64

In [6]:
# Count annotations within each year, then pivot so that rows are annotation
# codes and columns are years; combinations that never occur become 0.
counts_per_year = (
    full_df.groupby(["year"])["annotation"]
    .value_counts()
    .reset_index()
    .pivot_table(index="annotation", columns="year", values="count")
    .fillna(0)
    .astype("Int64")
)

In [7]:
# Show the annotation-by-year count table built in the previous cell.
counts_per_year

year,2009,2010,2011,2016,2018,2019,2020,2021,2022,2024
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AA,16486,4034,5317,3604,6258,10342,6913,14201,631,17487
ACAN,0,1,1,0,6,9,4,15,1,16
ACB,887,604,300,1008,44,347,415,398,0,81
ACC,205,380,106,341,88,206,114,289,6,247
ACD,34,21,14,33,28,29,30,59,0,58
...,...,...,...,...,...,...,...,...,...,...
TUBI,0,0,0,0,0,2,1,0,0,0
TURB,0,0,3,1,7,7,1,0,0,7
TWB,550,15,38,59,792,940,1681,1353,24,514
UND,0,22,0,0,0,0,0,0,0,0


In [8]:
# Map each annotation code to every label found for it in the code-definition
# files. A code can occur more than once, so the labels are kept in a list in
# the order they were read.
codes = collections.defaultdict(list)
for filename in ["NACRE_SHINE_40.txt", "OMLC_code-7CAT.txt"]:
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(filename, "r", newline="") as f:
        # skip header
        for _ in range(3):
            f.readline()

        for row in csv.reader(f):
            # Blank or single-field lines hold no (code, label) pair; without
            # this guard row[1] would raise IndexError on them.
            if len(row) < 2:
                continue
            # Drop stray double quotes and surrounding whitespace from the label.
            codes[row[0]].append(row[1].replace("\"", "").strip())

codes

defaultdict(<function __main__.<lambda>()>,
            {'HC': ['coral', 'Hard Coral', 'Hard Coral'],
             'AA': ['algae assemblage',
              'Algal assemblage',
              'Algal assemblage',
              'Algal assemblage'],
             'AB': ['abiotic', 'Abiotic components', 'Abiotic components'],
             'MA': ['macroalgae', 'Macroalgae'],
             'HA': ['halimeda', 'Halimeda'],
             'OB': ['other biota'],
             'TWB': ['Tape', 'Tape', 'Tape', 'Tape'],
             'ACAN': ['Acanthastrea'],
             'ACB': ['Acropora branching'],
             'ACC': ['Acropora corymbose'],
             'ACD': ['Acropora digitate'],
             'ACH': ['Acropora hispidose'],
             'ACR': ['Acropora robusta group'],
             'ISO': ['Isopora'],
             'ACT': ['Acropora plate'],
             'AF': ['Attached fungiids'],
             'AST': ['Astreopora'],
             'BUB': ['Other bubble corals'],
             'CB': ['Other branching 

In [9]:
# Collapse each code's list of labels to a single label: the last one collected.
cleaned_codes = {code: labels[-1] for code, labels in codes.items()}

cleaned_codes

{'HC': 'Hard Coral',
 'AA': 'Algal assemblage',
 'AB': 'Abiotic components',
 'MA': 'Macroalgae',
 'HA': 'Halimeda',
 'OB': 'other biota',
 'TWB': 'Tape',
 'ACAN': 'Acanthastrea',
 'ACB': 'Acropora branching',
 'ACC': 'Acropora corymbose',
 'ACD': 'Acropora digitate',
 'ACH': 'Acropora hispidose',
 'ACR': 'Acropora robusta group',
 'ISO': 'Isopora',
 'ACT': 'Acropora plate',
 'AF': 'Attached fungiids',
 'AST': 'Astreopora',
 'BUB': 'Other bubble corals',
 'CB': 'Other branching corals',
 'CE': 'Other encrusting corals',
 'CF': 'Other foliose corals',
 'CM': 'Other massive corals',
 'CAU': 'Caulastrea',
 'CMR': 'Fungia',
 'COE': 'Coeloseris',
 'COS': 'Coscinarea',
 'CYP': 'Cyphastrea',
 'DIP': 'Diploastrea heliopora',
 'EUP': 'Euphyllia',
 'ECHI': 'Echinopora',
 'ECHY': 'Echinophyllia',
 'FAV': 'Favia',
 'FOT': 'Other free living fungiids',
 'FVI': 'Favites',
 'GAL': 'Galaxea',
 'GONIA': 'Goniastrea',
 'GONIO': 'Goniopora',
 'HEL': 'Heliopora',
 'HYD': 'Hydnophora',
 'LEPA': 'Leptoria',

In [10]:
# Look up a human-readable label for every annotation code in the index;
# codes that are absent from cleaned_codes get None (dict.get's default).
meanings = counts_per_year.index.to_series().map(cleaned_codes.get)

# Put "meaning" first, followed by the existing columns in their current
# order. Selecting by name instead of hard-coded positions keeps this correct
# for any number of year columns and makes the cell safe to re-run (a second
# run simply overwrites "meaning" instead of shuffling a year column forward).
year_columns = [col for col in counts_per_year.columns if col != "meaning"]
counts_per_year = counts_per_year.assign(meaning=meanings)[["meaning", *year_columns]]

counts_per_year

year,meaning,2009,2010,2011,2016,2018,2019,2020,2021,2022,2024
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA,Algal assemblage,16486,4034,5317,3604,6258,10342,6913,14201,631,17487
ACAN,Acanthastrea,0,1,1,0,6,9,4,15,1,16
ACB,Acropora branching,887,604,300,1008,44,347,415,398,0,81
ACC,Acropora corymbose,205,380,106,341,88,206,114,289,6,247
ACD,Acropora digitate,34,21,14,33,28,29,30,59,0,58
...,...,...,...,...,...,...,...,...,...,...,...
TUBI,Tubipora musica,0,0,0,0,0,2,1,0,0,0
TURB,Turbinaria,0,0,3,1,7,7,1,0,0,7
TWB,Tape,550,15,38,59,792,940,1681,1353,24,514
UND,,0,22,0,0,0,0,0,0,0,0


In [11]:
# Write the labelled per-year counts table to disk.
counts_per_year.to_csv(path_or_buf="full-counts-per-year.csv")