#### Load dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load the fully screened literature dataset.
# NOTE(review): this is an absolute, machine-specific location; adjust
# DATA_PATH if the CSV lives somewhere else.
DATA_PATH = r"C:\Users\dolap\OneDrive\Documents\DOLAPO\data-analysis\literature_syn_analysis\fully_screened.csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check: dimensions, column names, then the first few rows.
for summary in (df.shape, df.columns, df.head()):
    print(summary)


(218, 6)
Index(['title', 'Imaging Method', 'Biodiversity Indicators Type', 'keywords',
       'authors', 'year'],
      dtype='object')
                                               title          Imaging Method  \
0  The CARICOMP Network of Caribbean Marine Labor...      photoquadrat, DOV,   
1  Persistent vegetation greening trends across C...       satellite imagery   
2  A new deep-water species of Discias Rathbun, 1...                     ROV   
3  Is reduced benthic flux related to the Diporei...       satellite imagery   
4  Distributional change in seagrass as an ecolog...  AUV, satellite imagery   

                        Biodiversity Indicators Type  \
0  Structural diversity, Species diversity, commu...   
1  spatial distribution, community structure, hab...   
2                                   species richness   
3                              population, abundance   
4        presence and absence, distributional change   

                                            ke

#### Clean and summarise imaging methods

In [2]:
# Both multi-valued columns are split on commas later, so force them to plain
# strings first; missing cells become the empty string.
for text_col in ("Imaging Method", "Biodiversity Indicators Type"):
    df[text_col] = df[text_col].fillna("").astype(str)

def split_clean_methods(s: object) -> list:
    """Split a comma-separated imaging-method cell into a list of labels.

    Parameters
    ----------
    s : str or missing scalar
        Raw cell content, e.g. ``"photoquadrat, DOV, "``. Missing scalars
        (``None``, ``NaN``, ``pd.NA``) are treated as an empty cell; any other
        non-string scalar is converted with ``str`` before splitting.

    Returns
    -------
    list of str
        Labels in their original order with surrounding whitespace removed.
        Empty fragments (for example from a trailing comma) are dropped.
        Letter case is left untouched.
    """
    if not isinstance(s, str):
        # A missing value has no .split(); without this guard the function
        # only works when the column was passed through fillna("") first.
        if pd.isna(s):
            return []
        s = str(s)
    stripped = (piece.strip() for piece in s.split(","))
    return [piece for piece in stripped if piece]

# Tokenise every record's method cell, then flatten to one label per row.
methods_series = df["Imaging Method"].apply(split_clean_methods)
methods_exploded = methods_series.explode()

# Tally each label, most frequent first, and show the 20 leaders.
# NOTE(review): labels are compared verbatim, so spellings that differ only in
# letter case or plural form (e.g. "photoquadrats" / "Photoquadrats" /
# "photoquadrat") are counted as separate methods.
method_counts = (
    methods_exploded
    .value_counts()
    .sort_values(ascending=False)
)
print(method_counts.head(20))


Imaging Method
satellite imagery                   43
BRUV                                36
ROV                                 32
DOV                                 24
AUV                                 18
UAV                                 16
photoquadrats                       14
sfm                                 11
satellite imagery/remote sensing    10
photoquadrat                         5
remote sensing                       5
still images                         5
USV                                  3
Aerial photographs                   3
still Images                         2
Photoquadrats                        2
UAS                                  2
satellite imagery/Remote Sensing     2
satellite images                     2
UVC                                  2
Name: count, dtype: int64


#### Summary of the biodiversity indicator types

In [3]:
def split_clean_inds(s: object) -> list:
    """Split a comma-separated biodiversity-indicator cell into a list of labels.

    Parameters
    ----------
    s : str or missing scalar
        Raw cell content, e.g. ``"population, abundance"``. Missing scalars
        (``None``, ``NaN``, ``pd.NA``) are treated as an empty cell; any other
        non-string scalar is converted with ``str`` before splitting.

    Returns
    -------
    list of str
        Labels in their original order with surrounding whitespace removed.
        Empty fragments (for example from a trailing comma) are dropped.
        Letter case is left untouched.
    """
    if not isinstance(s, str):
        # A missing value has no .split(); without this guard the function
        # only works when the column was passed through fillna("") first.
        if pd.isna(s):
            return []
        s = str(s)
    stripped = (piece.strip() for piece in s.split(","))
    return [piece for piece in stripped if piece]

# Tokenise every record's indicator cell, then flatten to one label per row.
inds_series = df["Biodiversity Indicators Type"].apply(split_clean_inds)
inds_exploded = inds_series.explode()

# Tally each label, most frequent first, and show the 20 leaders.
# NOTE(review): labels are compared verbatim, so case variants such as
# "species richness" / "Species richness" are counted separately.
indicator_counts = (
    inds_exploded
    .value_counts()
    .sort_values(ascending=False)
)
print(indicator_counts.head(20))


Biodiversity Indicators Type
species distribution      16
species richness          13
habitat mapping           11
habitat structure          9
species composition        8
spatial distribution       7
Species richness           7
species coverage           6
species abundance          5
abundance                  5
structural complexity      5
biomass                    5
habitat classification     4
Habitat structure          4
relative abundance         4
community structure        4
Species distribution       4
density                    3
Relative abundance         3
aereal extent              3
Name: count, dtype: int64


#### Publications per year

In [4]:
# Number of records per publication year, in chronological order.
# sort=False: the by-frequency ordering would be discarded by sort_index anyway.
year_counts = df["year"].value_counts(sort=False).sort_index()
print(year_counts)


year
1990     1
1997     1
2002     2
2003     2
2005     1
2006     1
2007     1
2008     8
2010     7
2011     1
2012     9
2013     5
2014     8
2015     3
2016     9
2017     5
2018    11
2019    20
2020    15
2021    23
2022    20
2023    10
2024    16
2025    38
2026     1
Name: count, dtype: int64


In [5]:
# Exclude records dated 2026, renumber the rows, and refresh the per-year tally.
keep_mask = df["year"].ne(2026)
df = df.loc[keep_mask].reset_index(drop=True)

year_counts = df["year"].value_counts().sort_index()
print("New shape:", df.shape)
print(year_counts)

New shape: (217, 6)
year
1990     1
1997     1
2002     2
2003     2
2005     1
2006     1
2007     1
2008     8
2010     7
2011     1
2012     9
2013     5
2014     8
2015     3
2016     9
2017     5
2018    11
2019    20
2020    15
2021    23
2022    20
2023    10
2024    16
2025    38
Name: count, dtype: int64


##### Crosstab: top methods × top indicators

In [6]:
# How many of the most frequent methods / indicators to keep in the crosstab.
N_TOP_METHODS = 5
N_TOP_INDICATORS = 8

# Work on a copy so the helper list columns do not leak into `df`.
df_long = df.copy()
df_long["Imaging Method List"] = df_long["Imaging Method"].apply(split_clean_methods)
df_long["Indicator List"] = df_long["Biodiversity Indicators Type"].apply(split_clean_inds)

# Rank categories on the records that are actually cross-tabulated.
# `method_counts` / `indicator_counts` were tallied before the 2026 record was
# dropped from `df`, so reusing them here could select categories based on data
# that is no longer in the frame.
top_methods = (
    df_long["Imaging Method List"]
    .explode()
    .value_counts()
    .head(N_TOP_METHODS)
    .index.tolist()
)
top_inds = (
    df_long["Indicator List"]
    .explode()
    .value_counts()
    .head(N_TOP_INDICATORS)
    .index.tolist()
)

# One (method, indicator) pair for every combination reported by a record.
# Zipping the two columns avoids the per-row Series construction of iterrows().
rows = [
    (m, ind)
    for methods, inds in zip(df_long["Imaging Method List"], df_long["Indicator List"])
    for m in methods
    for ind in inds
]

mi_df = pd.DataFrame(rows, columns=["Method", "Indicator"])

# Keep only pairs where both sides belong to the top categories, then count.
mi_df_top = mi_df[mi_df["Method"].isin(top_methods) & mi_df["Indicator"].isin(top_inds)]
crosstab_top = pd.crosstab(mi_df_top["Method"], mi_df_top["Indicator"])

print(crosstab_top)


Indicator          Species richness  habitat mapping  habitat structure  \
Method                                                                    
AUV                               0                5                  2   
BRUV                              3                0                  0   
DOV                               2                0                  1   
ROV                               1                2                  2   
satellite imagery                 1                2                  2   

Indicator          spatial distribution  species composition  \
Method                                                         
AUV                                   1                    0   
BRUV                                  0                    1   
DOV                                   0                    1   
ROV                                   1                    3   
satellite imagery                     3                    0   

Indicator          specie