<a href="https://colab.research.google.com/github/BenUCL/Reef-acoustics-and-AI/blob/main/Code/Unsupervised_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Unsupervised clustering with UMAP and affinity propagation**
This script once again calculates UMAP embeddings from each feature set, but this time reduces to 10 dimensions rather than the 2 used for the 2D UMAP plots. Affinity propagation clustering is then performed on these embeddings. The fidelity of clusters to true classes is then quantified with a Chi-squared test.

Bold headings indicate the dataset and task for each block.


In [None]:
# Connect Google Drive
# Mounts the Drive at /content/drive, the root of the csv paths used in the
# loading cells below. google.colab is only available inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install umap-learn
!pip install umap-learn[plot]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 3.6 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.7.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 25.6 MB/s 
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=998e3f1dfa1eac0300e81b41e40b190082458fbc008782a373d419741e83123e
  Stored in directory: /root/.cache/pip/wheels/b3/52/a5/1fd9e3e76a7ab34f134c07469cd6f16e27ef3a37aeff1fe821
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for pynndescent: filename=pynndescent-0.5.7-py3-none-any.whl size=54286 sha256=34c7d09f89c2d3f408a901ba58e83b024378e648056b997504fd1345d4a9e0af
  Stored in directo

In [None]:
import numpy as np
import pandas as pd
import umap
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
from sklearn.datasets import make_blobs
from sklearn.cluster import AffinityPropagation
from scipy import stats

import random
random.seed(123)

# **Indo Site Level Acoustic indices**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
indices_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/compound_index_indo.csv'

# Read the csv, then discard its first stored column: reset_index puts a fresh
# 'index' column in front and the slice drops it together with that column.
data = pd.read_csv(indices_path)
indices_df = data.reset_index().iloc[:, 2:]

# Feature matrix only: strip the file labels before UMAP and clustering
indices_features = indices_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the class character embedded in a recording filename.

    The name is split on '.', and the character at position 4 of the second
    piece is returned, e.g. 'BoF2.0930D.1678278701.180828.NT0930.wav' -> 'D'.
    An empty string is returned when that piece has fewer than five
    characters; IndexError is raised when the name contains no '.'.
    """
    pieces = filename.split(".")
    second_piece = pieces[1]
    return second_piece[4:5]


# Full site name for each two-letter filename prefix.
site_prefixes = {'Bo': 'Bontosua', 'Sa': 'Salisi', 'Ba': 'Badi'}

# Build one label per recording: the site name followed by the class
# character returned by get_class (e.g. 'BontosuaD').
site_name = []
for i in indices_df['minute']:
  prefix = i[0:2]
  if prefix not in site_prefixes:
    # Fail loudly: skipping an unrecognised prefix would leave site_name
    # shorter than the dataframe, and the insert below would then fail with
    # an unrelated length-mismatch error.
    raise ValueError(f"Unrecognised site prefix {prefix!r} in filename {i!r}")
  site_name.append(site_prefixes[prefix] + get_class(i))

indices_df.insert(1, 'Site', site_name)

# Check the class column was added
indices_df.head()

Unnamed: 0,minute,Site,fish_ACI,fish_ACI_std,fish_ADI,fish_ADI_std,fish_H,fish_H_std,fish_Hf,fish_Hf_std,...,full_H,full_H_std,full_Hf,full_Hf_std,full_M,full_M_std,full_BI,full_BI_std,NDSI,NDSI_std
0,BoF2.0930D.1678278701.180828.NT0930.wav,BontosuaD,180.66798,22.555182,2.156149,0.081462,0.999806,0.000139,0.46764,0.038913,...,0.999958,2.6e-05,0.742105,0.022057,0.000786,9.6e-05,3.756085,0.957902,0.628277,0.1772
1,SaF4.0902D.671907872.180830.NT0924.wav,SalisiD,202.084403,21.433853,2.195789,0.292442,0.999908,4.5e-05,0.483962,0.025508,...,0.999942,3.1e-05,0.744351,0.011731,0.001902,0.000176,5.525697,0.881614,0.481185,0.106375
2,SaF3.1355D.805322778.180829.NT1356.wav,SalisiD,181.766749,25.356359,2.19955,0.048918,0.999864,0.000118,0.476466,0.026665,...,0.999902,5.8e-05,0.741129,0.028133,0.001304,9.3e-05,4.907524,1.054116,0.441334,0.181443
3,SaN11.0940D.1678278701.180906.NT0954.wav,SalisiD,212.115207,15.154346,2.198496,0.289967,0.9999,5.2e-05,0.485119,0.020143,...,0.999928,2.8e-05,0.739152,0.019008,0.001867,0.000165,4.53168,1.029127,0.350785,0.11575
4,BoF2.0930D.1678278701.180828.NT0950.wav,BontosuaD,162.062029,15.067175,1.838506,0.212173,0.999364,0.000434,0.38388,0.047324,...,0.999957,2.4e-05,0.738157,0.02152,0.000937,7.6e-05,3.721564,0.882146,0.570787,0.168368


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Reduce the acoustic-index features to a 10-dimensional UMAP embedding
# that the clustering step below operates on.
umap_reducer = umap.UMAP(
    n_components=10,  # dimensionality of the embedding
    n_neighbors=50,
    min_dist=0,
    random_state=42,  # fixed seed for the embedding
)
clusterable_embedding = umap_reducer.fit_transform(indices_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
clustering = af.fit(clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(clustering.cluster_centers_indices_)

22

In [None]:
# True site label of every recording, and the cluster each one was assigned to
class_type = indices_df['Site'].to_numpy()
cluster_labels = clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

6045.891295516464

# **Indo Site Level Pretrained CNN**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
pretrained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/pretrained_CNN_indo.csv'

# Wrangle the VGGish features into the same layout as the acoustic indices:
# one row per recording, with the recording name in a 'minute' column.
data = pd.read_csv(pretrained_path)
temp_df = (
    data.reset_index()   # fresh 'index' column in front
        .iloc[:, 2:]     # drop it together with the first csv column
        .T               # transpose to match the indices format
        .reset_index()   # former column labels become an 'index' column
)
pretrained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: strip the file labels before UMAP and clustering
pretrained_features = pretrained_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the class character embedded in a recording filename.

    The name is split on '.', and the character at position 4 of the second
    piece is returned, e.g. 'BoF2.0930D.1678278701.180828.NT0930.wav' -> 'D'.
    An empty string is returned when that piece has fewer than five
    characters; IndexError is raised when the name contains no '.'.
    """
    pieces = filename.split(".")
    second_piece = pieces[1]
    return second_piece[4:5]


# Full site name for each two-letter filename prefix.
site_prefixes = {'Bo': 'Bontosua', 'Sa': 'Salisi', 'Ba': 'Badi'}

# Build one label per recording: the site name followed by the class
# character returned by get_class (e.g. 'BontosuaD').
site_name = []
for i in pretrained_df['minute']:
  prefix = i[0:2]
  if prefix not in site_prefixes:
    # Fail loudly: skipping an unrecognised prefix would leave site_name
    # shorter than the dataframe, and the insert below would then fail with
    # an unrelated length-mismatch error.
    raise ValueError(f"Unrecognised site prefix {prefix!r} in filename {i!r}")
  site_name.append(site_prefixes[prefix] + get_class(i))

pretrained_df.insert(1, 'Site', site_name)

# Check the class column was added
pretrained_df.head()

Unnamed: 0,minute,Site,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,BoF2.0930D.1678278701.180828.NT0930.wav,BontosuaD,0.002933,0.17724,0.38843,0.12929,0.0,0.0,0.0,0.0,...,0.0,0.041895,0.0,0.0,0.0,0.433386,0.320776,0.0,0.0,0.0
1,SaF4.0902D.671907872.180830.NT0924.wav,SalisiD,0.000917,0.073721,0.515714,0.111717,0.0,0.0,0.0,0.0,...,0.0,0.166053,9.3e-05,0.000208,0.048393,0.167975,0.032886,0.0,0.0,0.0
2,SaF3.1355D.805322778.180829.NT1356.wav,SalisiD,0.023495,0.104395,0.360114,0.143027,0.0,0.0,0.0,0.000855,...,0.0,0.104303,0.036998,0.001894,0.006081,0.209761,0.121807,0.0,0.0,0.0
3,SaN11.0940D.1678278701.180906.NT0954.wav,SalisiD,0.000865,0.126758,0.437884,0.209243,0.0,0.0,0.0,0.0,...,0.0,0.117832,0.001151,0.011284,0.000152,0.328505,0.123348,0.0,0.007058,0.0
4,BoF2.0930D.1678278701.180828.NT0950.wav,BontosuaD,0.019745,0.176513,0.39904,0.040165,2.7e-05,0.0,0.0,0.006952,...,0.0,0.020741,0.020154,0.003591,0.003634,0.350962,0.285468,0.002281,0.000323,0.0


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Reduce the pretrained-CNN features to a 10-dimensional UMAP embedding.
# n_components is passed explicitly: without it UMAP falls back to its
# 2-dimensional default, which disagrees with the 10-dimensional method
# described in the notebook introduction and with the settings used for the
# other two Indo site-level feature sets.
# NOTE(review): the cluster count and chi-squared value stored below this
# section were presumably produced with the 2-D embedding - re-run to refresh.
pretrained_clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=42,
).fit_transform(pretrained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
pretrained_clustering = af.fit(pretrained_clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(pretrained_clustering.cluster_centers_indices_)

27

In [None]:
# True site label of every recording, and the cluster each one was assigned to
class_type = pretrained_df['Site'].to_numpy()
cluster_labels = pretrained_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

8578.883375328458

# **Indo Site Level Trained CNN**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
trained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/trained_CNN_indo.csv'

# Wrangle the CNN features into the same layout as the acoustic indices:
# one row per recording, with the recording name in a 'minute' column.
data = pd.read_csv(trained_path)
temp_df = (
    data.reset_index()   # fresh 'index' column in front
        .iloc[:, 2:]     # drop it together with the first csv column
        .T               # transpose to match the indices format
        .reset_index()   # former column labels become an 'index' column
)
trained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: strip the file labels before UMAP and clustering
trained_features = trained_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the class character embedded in a recording filename.

    The name is split on '.', and the character at position 4 of the second
    piece is returned, e.g. 'BoF2.0930D.1678278701.180828.NT0930.wav' -> 'D'.
    An empty string is returned when that piece has fewer than five
    characters; IndexError is raised when the name contains no '.'.
    """
    pieces = filename.split(".")
    second_piece = pieces[1]
    return second_piece[4:5]


# Full site name for each two-letter filename prefix.
site_prefixes = {'Bo': 'Bontosua', 'Sa': 'Salisi', 'Ba': 'Badi'}

# Build one label per recording: the site name followed by the class
# character returned by get_class (e.g. 'BontosuaD').
site_name = []
for i in trained_df['minute']:
  prefix = i[0:2]
  if prefix not in site_prefixes:
    # Fail loudly: skipping an unrecognised prefix would leave site_name
    # shorter than the dataframe, and the insert below would then fail with
    # an unrelated length-mismatch error.
    raise ValueError(f"Unrecognised site prefix {prefix!r} in filename {i!r}")
  site_name.append(site_prefixes[prefix] + get_class(i))

trained_df.insert(1, 'Site', site_name)

# Check the class column was added
trained_df.head()

Unnamed: 0,minute,Site,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,BoF2.0930D.1678278701.180828.NT0930.wav,BontosuaD,0.47765,5.012879,1.915835,0.0,0.0,0.0,1.062819,0.0,...,0.0,0.0,2.679418,0.000814,2.236241,0.036649,0.0,0.408422,0.602255,0.006626
1,SaF4.0902D.671907872.180830.NT0924.wav,SalisiD,0.0,7.778984,0.0,0.051206,0.0,3.912268,8.325169,0.0,...,0.0,3.696773,1.264271,0.075675,0.183685,4.998559,0.0,0.0,6.332646,8.498202
2,SaF3.1355D.805322778.180829.NT1356.wav,SalisiD,0.0,6.534332,0.0,0.005836,1.756667,5.124594,9.57667,0.0,...,0.0,6.169143,0.528039,0.015261,1.281214,5.205583,0.0,0.0,4.791251,10.420626
3,SaN11.0940D.1678278701.180906.NT0954.wav,SalisiD,0.0,6.228039,0.014625,0.080811,0.0,2.651456,7.318202,0.0,...,0.0,3.214074,0.648099,0.202112,0.153203,4.936759,0.0,0.0,4.411826,7.012685
4,BoF2.0930D.1678278701.180828.NT0950.wav,BontosuaD,0.50047,6.841762,1.05641,0.0,0.0,0.0,1.647999,0.0,...,0.0,0.0,3.390443,0.025399,2.531892,0.0,0.0,0.208863,1.573495,0.001159


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Reduce the trained-CNN features to a 10-dimensional UMAP embedding
# that the clustering step below operates on.
umap_reducer = umap.UMAP(
    n_components=10,  # dimensionality of the embedding
    n_neighbors=50,
    min_dist=0,
    random_state=42,  # fixed seed for the embedding
)
trained_clusterable_embedding = umap_reducer.fit_transform(trained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
trained_clustering = af.fit(trained_clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(trained_clustering.cluster_centers_indices_)

22

In [None]:
# True site label of every recording, and the cluster each one was assigned to
class_type = trained_df['Site'].to_numpy()
cluster_labels = trained_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

9529.10185312842

# **GBR Site Acoustic indices**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
indices_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/compound_index_aus.csv'

# Read the csv, then discard its first stored column: reset_index puts a fresh
# 'index' column in front and the slice drops it together with that column.
data = pd.read_csv(indices_path)
indices_df = data.reset_index().iloc[:, 2:]

# Feature matrix only: strip the file labels before UMAP and clustering
indices_features = indices_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site identifier at the start of a recording filename.

    Takes the text before the first '.' and keeps at most its first five
    characters, e.g. 'SiteJdeployment2.1677983769.181103032034.wav' -> 'SiteJ'.
    """
    stem, _sep, _rest = filename.partition(".")
    return stem[:5]

# One label per recording, taken from its filename by get_class
class_type = [get_class(minute_name) for minute_name in indices_df['minute']]

# NOTE(review): the column is called 'Habitat' although the values are site
# identifiers (e.g. 'SiteJ'); the contingency-table cell reads it by this name.
indices_df.insert(1, 'Habitat', class_type)

# Check the class column was added
indices_df.head()

Unnamed: 0,minute,Habitat,fish_ACI,fish_ACI_std,fish_ADI,fish_ADI_std,fish_H,fish_H_std,fish_Hf,fish_Hf_std,...,full_H,full_H_std,full_Hf,full_Hf_std,full_M,full_M_std,full_BI,full_BI_std,NDSI,NDSI_std
0,SiteJdeployment2.1677983769.181103032034.wav,SiteJ,179.372677,26.483996,2.192782,0.063717,0.999843,0.000144,0.481993,0.030042,...,0.99993,4.1e-05,0.726764,0.019765,0.002191,0.000152,6.620203,1.060968,0.695277,0.157566
1,SiteAdeployment1.805322778.181023172733.wav,SiteA,168.80185,25.504564,1.969023,0.395328,0.999686,0.000299,0.445248,0.056819,...,0.999946,3.4e-05,0.733206,0.039498,0.000549,5.1e-05,3.68473,0.750008,0.547568,0.218633
2,SiteIdeployment2.805322778.181102073302.wav,SiteI,174.98872,24.714481,2.197048,0.295056,0.999771,0.000238,0.497067,0.028419,...,0.999944,3.1e-05,0.731974,0.035967,0.000685,5.1e-05,4.032168,1.002873,0.574645,0.199514
3,SiteDdeployment4.805322778.181123173922.wav,SiteD,165.651071,17.444454,1.960197,0.139087,0.999693,0.000213,0.427126,0.043692,...,0.999937,3.5e-05,0.743762,0.024463,0.000573,3.5e-05,4.024412,0.804414,0.557476,0.186468
4,SiteEdeployment6.1677983769.181209085140.wav,SiteE,180.360927,20.778196,2.119658,0.095857,0.999758,0.000208,0.465938,0.038046,...,0.999944,2.8e-05,0.751055,0.020132,0.000948,7.6e-05,3.399768,0.97676,0.596916,0.143404


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the acoustic-index features with UMAP ahead of clustering.
# NOTE(review): n_components is commented out here, so UMAP uses its default
# of 2 dimensions, while the notebook introduction describes a 10-dimensional
# embedding. Confirm which is intended; if it changes, change it for all three
# GBR feature sets together so their chi-squared values stay comparable.
umap_reducer = umap.UMAP(
    random_state=123,
    min_dist=0,
    n_neighbors=50,
    #n_components=10, #how many dimensions to reduce the data to
)
clusterable_embedding = umap_reducer.fit_transform(indices_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
indices_clustering = af.fit(clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(indices_clustering.cluster_centers_indices_)

63

In [None]:
# True label of every recording, and the cluster each one was assigned to
class_type = indices_df['Habitat'].to_numpy()
cluster_labels = indices_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

In [None]:
# Chi-squared test on the contingency table; the statistic and p-value are
# kept and the statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

14643.438182620306

# **GBR Site Pretrained CNN**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
pretrained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/pretrained_CNN_aus.csv'

# Wrangle the VGGish features into the same layout as the acoustic indices:
# one row per recording, with the recording name in a 'minute' column.
data = pd.read_csv(pretrained_path)
temp_df = (
    data.reset_index()   # fresh 'index' column in front
        .iloc[:, 2:]     # drop it together with the first csv column
        .T               # transpose to match the indices format
        .reset_index()   # former column labels become an 'index' column
)
pretrained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: strip the file labels before UMAP and clustering
pretrained_features = pretrained_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site identifier at the start of a recording filename.

    Takes the text before the first '.' and keeps at most its first five
    characters, e.g. 'SiteJdeployment2.1677983769.181103032034.wav' -> 'SiteJ'.
    """
    stem, _sep, _rest = filename.partition(".")
    return stem[:5]

# One label per recording, taken from its filename by get_class
class_type = [get_class(minute_name) for minute_name in pretrained_df['minute']]

# NOTE(review): the column is called 'Habitat' although the values are site
# identifiers (e.g. 'SiteJ'); the contingency-table cell reads it by this name.
pretrained_df.insert(1, 'Habitat', class_type)

# Check the class column was added
pretrained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteJdeployment2.1677983769.181103032034.wav,SiteJ,0.000929,0.093042,0.28538,0.210208,0.0,0.0,0.0,0.006659,...,0.0,0.023046,0.00022,0.0,0.0,0.2253,0.304314,0.0,0.0,0.0
1,SiteAdeployment1.805322778.181023172733.wav,SiteA,0.03078,0.100007,0.408427,0.050889,0.0,0.0,0.0,0.002414,...,0.0,0.060092,0.010739,0.0,0.028925,0.244283,0.167082,0.0,0.0,0.0
2,SiteIdeployment2.805322778.181102073302.wav,SiteI,0.02387,0.043412,0.331813,0.121506,0.0,0.0,0.0,0.0,...,0.0,0.043277,7.7e-05,0.0,0.016917,0.357364,0.186497,0.0,0.0,0.0
3,SiteDdeployment4.805322778.181123173922.wav,SiteD,0.039953,0.076123,0.30476,0.088957,0.001289,0.0,0.0,0.014608,...,0.0,0.078101,0.053534,0.001675,0.01304,0.177259,0.129732,0.0,0.0,0.0
4,SiteEdeployment6.1677983769.181209085140.wav,SiteE,0.005541,0.105616,0.485045,0.074837,0.0,0.0,0.0,0.0,...,0.0,0.056985,0.008627,0.0,0.020391,0.282245,0.129462,0.0,0.0,0.0


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the pretrained-CNN features with UMAP ahead of clustering.
# NOTE(review): n_components is commented out here, so UMAP uses its default
# of 2 dimensions, while the notebook introduction describes a 10-dimensional
# embedding. Confirm which is intended; if it changes, change it for all three
# GBR feature sets together so their chi-squared values stay comparable.
umap_reducer = umap.UMAP(
    random_state=123,
    min_dist=0,
    n_neighbors=50,
    #n_components=10, #how many dimensions to reduce the data to
)
pretrained_clusterable_embedding = umap_reducer.fit_transform(pretrained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
pretrained_clustering = af.fit(pretrained_clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(pretrained_clustering.cluster_centers_indices_)

64

In [None]:
# True label of every recording, and the cluster each one was assigned to
class_type = pretrained_df['Habitat'].to_numpy()
cluster_labels = pretrained_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

36356.661228557394

# **GBR Site Trained CNN**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
trained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/trained_CNN_aus.csv'

# Wrangle the CNN features into the same layout as the acoustic indices:
# one row per recording, with the recording name in a 'minute' column.
data = pd.read_csv(trained_path)
temp_df = (
    data.reset_index()   # fresh 'index' column in front
        .iloc[:, 2:]     # drop it together with the first csv column
        .T               # transpose to match the indices format
        .reset_index()   # former column labels become an 'index' column
)
trained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: strip the file labels before UMAP and clustering
trained_features = trained_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site identifier at the start of a recording filename.

    Takes the text before the first '.' and keeps at most its first five
    characters, e.g. 'SiteJdeployment2.1677983769.181103032034.wav' -> 'SiteJ'.
    """
    stem, _sep, _rest = filename.partition(".")
    return stem[:5]

# One label per recording, taken from its filename by get_class
class_type = [get_class(minute_name) for minute_name in trained_df['minute']]

# NOTE(review): the column is called 'Habitat' although the values are site
# identifiers (e.g. 'SiteJ'); the contingency-table cell reads it by this name.
trained_df.insert(1, 'Habitat', class_type)

# Check the class column was added
trained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteJdeployment2.1677983769.181103032034.wav,SiteJ,0.011415,10.289283,0.0,7.256893,1.295557,0.0,0.0,18.542894,...,0.42096,0.01748,4.07629,0.0,0.0,0.202933,0.843459,0.0,0.158123,1.249514
1,SiteAdeployment1.805322778.181023172733.wav,SiteA,0.0,15.645812,0.883047,4.936174,0.729615,0.0,0.812924,1.675555,...,0.0,0.323658,0.013336,0.0,0.294935,0.021129,7.113554,0.0,1.237369,0.036465
2,SiteIdeployment2.805322778.181102073302.wav,SiteI,0.3034,10.904243,3.00733,0.043125,7.771619,0.0,0.913815,1.154913,...,0.055531,0.0,0.0,0.027132,3.051994,1.315529,11.504429,0.002753,0.05596,3.387733
3,SiteDdeployment4.805322778.181123173922.wav,SiteD,0.0,17.460434,0.002275,3.685592,4.593746,0.023709,0.074226,5.876309,...,0.0,4.29445,0.918361,0.618396,1.253748,0.623302,5.469021,0.0,4.070124,0.0
4,SiteEdeployment6.1677983769.181209085140.wav,SiteE,0.422065,8.544225,0.009086,2.017138,4.957919,0.084011,6.327863,8.813068,...,0.018185,0.01833,0.066417,0.86262,0.0,3.120564,6.191011,0.031814,1.907597,8.696228


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the trained-CNN features with UMAP ahead of clustering.
# NOTE(review): n_components is commented out here, so UMAP uses its default
# of 2 dimensions, while the notebook introduction describes a 10-dimensional
# embedding. Confirm which is intended; if it changes, change it for all three
# GBR feature sets together so their chi-squared values stay comparable.
umap_reducer = umap.UMAP(
    random_state=123,
    min_dist=0,
    n_neighbors=50,
    #n_components=10, #how many dimensions to reduce the data to
)
trained_clusterable_embedding = umap_reducer.fit_transform(trained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
trained_clustering = af.fit(trained_clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(trained_clustering.cluster_centers_indices_)

44

In [None]:
# True label of every recording, and the cluster each one was assigned to
class_type = trained_df['Habitat'].to_numpy()
cluster_labels = trained_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

72610.07042198256

# **Polynesia Site Acoustic indices**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
indices_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/compound_index_poly.csv'

# Read the csv, then discard its first stored column: reset_index puts a fresh
# 'index' column in front and the slice drops it together with that column.
data = pd.read_csv(indices_path)
indices_df = data.reset_index().iloc[:, 2:]

# Feature matrix only: strip the file labels before UMAP and clustering
indices_features = indices_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site identifier at the start of a recording filename.

    Takes the text before the first '.' and keeps at most its first five
    characters, e.g. 'SiteCDay3.805322778.210303214058.wav' -> 'SiteC'.
    """
    stem, _sep, _rest = filename.partition(".")
    return stem[:5]

# One site label per recording, taken from its filename by get_class
class_type = [get_class(minute_name) for minute_name in indices_df['minute']]

indices_df.insert(1, 'Site', class_type)

# Check the class column was added
indices_df.head()

Unnamed: 0,minute,Site,fish_ACI,fish_ACI_std,fish_ADI,fish_ADI_std,fish_H,fish_H_std,fish_Hf,fish_Hf_std,...,full_H,full_H_std,full_Hf,full_Hf_std,full_M,full_M_std,full_BI,full_BI_std,NDSI,NDSI_std
0,SiteCDay3.805322778.210303214058.wav,SiteC,174.218205,17.12913,1.87549,0.173853,0.999264,0.000535,0.378237,0.043518,...,0.999935,4e-05,0.723675,0.03254,0.001088,9.6e-05,3.503309,0.543755,0.409907,0.203473
1,SiteXDay3.5210.210215103530.wav,SiteX,157.968812,10.113108,1.060467,0.72585,0.99844,0.000723,0.274085,0.024864,...,0.999478,0.000483,0.418846,0.072226,0.000478,7.7e-05,2.933921,0.506337,-0.594888,0.182238
2,SiteYDay2.5210.210302223537.wav,SiteY,167.24063,12.467357,2.092218,0.144701,0.99933,0.000496,0.402144,0.032731,...,0.999927,3.5e-05,0.752729,0.014195,0.000225,1.3e-05,2.624677,0.405407,0.460494,0.091826
3,SiteDDay2.5210.210317091056.wav,SiteD,167.300849,14.807595,2.132824,0.137796,0.999494,0.000356,0.437447,0.04229,...,0.999893,6.6e-05,0.753096,0.02181,0.001418,0.000228,3.267791,0.523877,0.594795,0.117539
4,SiteADay3.805322778.210205190014.wav,SiteA,166.893735,12.189808,2.266322,0.044466,0.999403,0.000294,0.490927,0.020568,...,0.999714,0.00016,0.747237,0.009536,0.005664,0.000574,4.519593,0.552794,0.567966,0.058083


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the acoustic-index features with UMAP ahead of clustering.
# NOTE(review): n_components is commented out here, so UMAP uses its default
# of 2 dimensions, while the notebook introduction describes a 10-dimensional
# embedding. Confirm which is intended; if it changes, change it for all three
# Polynesia feature sets together so their chi-squared values stay comparable.
umap_reducer = umap.UMAP(
    random_state=123,
    min_dist=0,
    n_neighbors=50,
    #n_components=10, #how many dimensions to reduce the data to
)
clusterable_embedding = umap_reducer.fit_transform(indices_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
indices_clustering = af.fit(clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(indices_clustering.cluster_centers_indices_)

35

In [None]:
# True site label of every recording, and the cluster each one was assigned to
class_type = indices_df['Site'].to_numpy()
cluster_labels = indices_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

In [None]:
# Chi-squared test on the contingency table; the statistic and p-value are
# kept and the statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

45001.05190675995

# **Polynesia Site Pretrained CNN**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
pretrained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/pretrained_CNN_poly.csv'

# Wrangle the VGGish features into the same layout as the acoustic indices:
# one row per recording, with the recording name in a 'minute' column.
data = pd.read_csv(pretrained_path)
temp_df = (
    data.reset_index()   # fresh 'index' column in front
        .iloc[:, 2:]     # drop it together with the first csv column
        .T               # transpose to match the indices format
        .reset_index()   # former column labels become an 'index' column
)
pretrained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: strip the file labels before UMAP and clustering
pretrained_features = pretrained_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site identifier at the start of a recording filename.

    Takes the text before the first '.' and keeps at most its first five
    characters, e.g. 'SiteCDay3.805322778.210303214058.wav' -> 'SiteC'.
    """
    stem, _sep, _rest = filename.partition(".")
    return stem[:5]

# One site label per recording, taken from its filename by get_class
class_type = [get_class(minute_name) for minute_name in pretrained_df['minute']]

pretrained_df.insert(1, 'Site', class_type)

# Check the class column was added
pretrained_df.head()

Unnamed: 0,minute,Site,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteCDay3.805322778.210303214058.wav,SiteC,0.024271,0.015441,0.673868,0.01581,0.0,0.0,0.0,0.013191,...,0.0,0.075812,0.079045,0.0,0.164519,0.16055,0.02702,0.0,0.000133,0.001901
1,SiteXDay3.5210.210215103530.wav,SiteX,0.024545,0.001385,0.885764,0.007736,0.0,0.0,0.0,0.003682,...,0.0,0.054833,0.031658,0.0,0.210327,0.050721,0.043802,0.0,0.0,0.0
2,SiteYDay2.5210.210302223537.wav,SiteY,0.038698,0.0,0.621418,0.04574,0.0,0.0,0.0,0.029869,...,0.0,0.042964,0.068022,0.0,0.006417,0.156122,0.001582,0.0,0.0,0.0
3,SiteDDay2.5210.210317091056.wav,SiteD,0.011494,0.099208,0.532689,0.007028,0.0,0.0,0.0,0.0,...,0.0,0.006671,0.003732,0.0,0.039698,0.226014,0.066506,0.0,0.001176,0.0
4,SiteADay3.805322778.210205190014.wav,SiteA,0.0,0.084872,0.07253,0.075937,0.0,0.0,0.0,0.0,...,0.0,0.023012,0.0,0.0,0.237829,0.050223,0.057576,0.0,0.0,0.008781


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the pretrained-CNN features with UMAP ahead of clustering.
# NOTE(review): n_components is commented out here, so UMAP uses its default
# of 2 dimensions, while the notebook introduction describes a 10-dimensional
# embedding. Confirm which is intended; if it changes, change it for all three
# Polynesia feature sets together so their chi-squared values stay comparable.
umap_reducer = umap.UMAP(
    random_state=123,
    min_dist=0,
    n_neighbors=50,
    #n_components=10, #how many dimensions to reduce the data to
)
pretrained_clusterable_embedding = umap_reducer.fit_transform(pretrained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
pretrained_clustering = af.fit(pretrained_clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(pretrained_clustering.cluster_centers_indices_)

24

In [None]:
# True site label of every recording, and the cluster each one was assigned to
class_type = pretrained_df['Site'].to_numpy()
cluster_labels = pretrained_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

60507.802966013885

# **Polynesia Site Trained CNN**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
trained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/trained_CNN_poly.csv'

# Wrangle the CNN features into the same layout as the acoustic indices:
# one row per recording, with the recording name in a 'minute' column.
data = pd.read_csv(trained_path)
temp_df = (
    data.reset_index()   # fresh 'index' column in front
        .iloc[:, 2:]     # drop it together with the first csv column
        .T               # transpose to match the indices format
        .reset_index()   # former column labels become an 'index' column
)
trained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: strip the file labels before UMAP and clustering
trained_features = trained_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site identifier at the start of a recording filename.

    Takes the text before the first '.' and keeps at most its first five
    characters, e.g. 'SiteCDay3.805322778.210303214058.wav' -> 'SiteC'.
    """
    stem, _sep, _rest = filename.partition(".")
    return stem[:5]

# One site label per recording, taken from its filename by get_class
class_type = [get_class(minute_name) for minute_name in trained_df['minute']]

trained_df.insert(1, 'Site', class_type)

# Check the class column was added
trained_df.head()

Unnamed: 0,minute,Site,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteCDay3.805322778.210303214058.wav,SiteC,0.0,0.0,12.082821,6.763444,7.179127,0.0,13.351444,20.285992,...,0.0,0.051185,16.882797,0.0,11.385775,4.218807,0.278274,0.0,0.031203,0.0
1,SiteXDay3.5210.210215103530.wav,SiteX,0.247064,0.859065,0.172915,0.105359,4.611157,0.0,0.0,1.952035,...,0.0,28.823147,17.230404,0.0,28.133654,0.948916,6.289915,0.0,0.0,0.0
2,SiteYDay2.5210.210302223537.wav,SiteY,8.51776,0.0,8.564845,6.75922,0.0,0.0,0.0,5.992061,...,0.0,14.142094,10.024358,0.0,0.887872,2.4327,0.0,0.0,0.0,0.0
3,SiteDDay2.5210.210317091056.wav,SiteD,0.0,0.642867,10.952372,1.292887,13.916732,0.0,2.82494,0.004537,...,0.0,0.0,11.003201,15.533544,10.891987,19.48122,0.070055,0.0,0.0,0.0
4,SiteADay3.805322778.210205190014.wav,SiteA,0.0,7.664385,0.153724,5.604998,0.009447,0.0,9.419145,0.0,...,0.0,0.0,0.625055,11.007442,1.294202,7.215663,2.364219,0.0,9.198437,0.0


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the trained-CNN features with UMAP ahead of clustering.
# NOTE(review): n_components is commented out here, so UMAP uses its default
# of 2 dimensions, while the notebook introduction describes a 10-dimensional
# embedding. Confirm which is intended; if it changes, change it for all three
# Polynesia feature sets together so their chi-squared values stay comparable.
umap_reducer = umap.UMAP(
    random_state=123,
    min_dist=0,
    n_neighbors=50,
    #n_components=10, #how many dimensions to reduce the data to
)
trained_clusterable_embedding = umap_reducer.fit_transform(trained_features)

  "Graph is not fully connected, spectral embedding may not work as expected."


In [None]:
# Cluster the UMAP embedding with affinity propagation. fit() hands back the
# fitted estimator, which is what the later cells read labels from.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
trained_clustering = af.fit(trained_clusterable_embedding)

In [None]:
# Number of clusters: one entry of cluster_centers_indices_ per cluster
# exemplar, so its length is the number of clusters found.
len(trained_clustering.cluster_centers_indices_)

23

In [None]:
# True site label of every recording, and the cluster each one was assigned to
class_type = trained_df['Site'].to_numpy()
cluster_labels = trained_clustering.labels_

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic and p-value are kept and the
# statistic is displayed as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(observed=contingency_table)
h_chi2

62831.99999999999

# **Indo Habitat Acoustic indices**

Load and transform data

In [None]:
# Upload the csv of features and copy the path
indices_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/compound_index_indo.csv'

# Read the csv, then discard its first stored column: reset_index puts a fresh
# 'index' column in front and the slice drops it together with that column.
data = pd.read_csv(indices_path)
indices_df = data.reset_index().iloc[:, 2:]

# Feature matrix only: strip the file labels before UMAP and clustering
indices_features = indices_df.drop(columns='minute')

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the one-character class code embedded in a recording file name.

    The name is split on "." and the fifth character of its second field is
    returned, e.g. "BoF2.0930D.1678278701.180828.NT0930.wav" -> "D". If that
    field has fewer than five characters the result is an empty string.
    """
    # Adapted from the notebook's earlier get_identifier helper.
    fields = filename.split(".")
    return fields[1][4:5]

# Translate the single-letter code returned by get_class into a habitat label.
habitat_by_code = {'D': 'Degraded', 'H': 'Healthy'}

class_type = []
for filename in indices_df['minute']:
    code = get_class(filename)
    if code not in habitat_by_code:
        # Previously such files were skipped silently, leaving class_type shorter
        # than the frame so that insert() failed with a length-mismatch error.
        raise ValueError(f"Unrecognised habitat code {code!r} in file name {filename!r}")
    class_type.append(habitat_by_code[code])

indices_df.insert(1, 'Habitat', class_type)

# Check the class column was added
indices_df.head()

Unnamed: 0,minute,Habitat,fish_ACI,fish_ACI_std,fish_ADI,fish_ADI_std,fish_H,fish_H_std,fish_Hf,fish_Hf_std,...,full_H,full_H_std,full_Hf,full_Hf_std,full_M,full_M_std,full_BI,full_BI_std,NDSI,NDSI_std
0,BoF2.0930D.1678278701.180828.NT0930.wav,Degraded,180.66798,22.555182,2.156149,0.081462,0.999806,0.000139,0.46764,0.038913,...,0.999958,2.6e-05,0.742105,0.022057,0.000786,9.6e-05,3.756085,0.957902,0.628277,0.1772
1,SaF4.0902D.671907872.180830.NT0924.wav,Degraded,202.084403,21.433853,2.195789,0.292442,0.999908,4.5e-05,0.483962,0.025508,...,0.999942,3.1e-05,0.744351,0.011731,0.001902,0.000176,5.525697,0.881614,0.481185,0.106375
2,SaF3.1355D.805322778.180829.NT1356.wav,Degraded,181.766749,25.356359,2.19955,0.048918,0.999864,0.000118,0.476466,0.026665,...,0.999902,5.8e-05,0.741129,0.028133,0.001304,9.3e-05,4.907524,1.054116,0.441334,0.181443
3,SaN11.0940D.1678278701.180906.NT0954.wav,Degraded,212.115207,15.154346,2.198496,0.289967,0.9999,5.2e-05,0.485119,0.020143,...,0.999928,2.8e-05,0.739152,0.019008,0.001867,0.000165,4.53168,1.029127,0.350785,0.11575
4,BoF2.0930D.1678278701.180828.NT0950.wav,Degraded,162.062029,15.067175,1.838506,0.212173,0.999364,0.000434,0.38388,0.047324,...,0.999957,2.4e-05,0.738157,0.02152,0.000937,7.6e-05,3.721564,0.882146,0.570787,0.168368


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# 10-dimensional UMAP embedding of the acoustic-index features for clustering
clusterable_embedding = umap.UMAP(
    random_state=42,
    n_components=10,  # number of dimensions to reduce the data to
    min_dist=0,
    n_neighbors=50,
).fit_transform(indices_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
clustering = af.fit(clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
clustering.cluster_centers_indices_.shape[0]

22

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = clustering.labels_
class_type = indices_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

In [None]:
# Chi-squared test on the cluster-by-habitat contingency table; only the
# statistic and p-value are kept, the other two return values are discarded.
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2  # displayed as the cell output

#Old result: 1721.3861512744872

1721.3861512744872

# **Indo Habitat Pretrained CNN**

Load and transform data

In [None]:
# Path to the CSV of pretrained-CNN features (Indo dataset)
pretrained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/pretrained_CNN_indo.csv'

# Wrangle the VGGish feature table into the same layout as the indices table
data = pd.read_csv(pretrained_path)
# Drop the two leading index columns, transpose to match the indices format,
# then turn the resulting index back into a regular column
temp_df = data.reset_index().iloc[:, 2:].T.reset_index()
pretrained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: drop the file-name column before embedding
pretrained_features = pretrained_df.drop(columns=['minute'])

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the one-character class code embedded in a recording file name.

    The name is split on "." and the fifth character of its second field is
    returned, e.g. "BoF2.0930D.1678278701.180828.NT0930.wav" -> "D". If that
    field has fewer than five characters the result is an empty string.
    """
    # Adapted from the notebook's earlier get_identifier helper.
    fields = filename.split(".")
    return fields[1][4:5]

# Translate the single-letter code returned by get_class into a habitat label.
habitat_by_code = {'D': 'Degraded', 'H': 'Healthy'}

class_type = []
for filename in pretrained_df['minute']:
    code = get_class(filename)
    if code not in habitat_by_code:
        # Previously such files were skipped silently, leaving class_type shorter
        # than the frame so that insert() failed with a length-mismatch error.
        raise ValueError(f"Unrecognised habitat code {code!r} in file name {filename!r}")
    class_type.append(habitat_by_code[code])

pretrained_df.insert(1, 'Habitat', class_type)

# Check the class column was added
pretrained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,BoF2.0930D.1678278701.180828.NT0930.wav,Degraded,0.002933,0.17724,0.38843,0.12929,0.0,0.0,0.0,0.0,...,0.0,0.041895,0.0,0.0,0.0,0.433386,0.320776,0.0,0.0,0.0
1,SaF4.0902D.671907872.180830.NT0924.wav,Degraded,0.000917,0.073721,0.515714,0.111717,0.0,0.0,0.0,0.0,...,0.0,0.166053,9.3e-05,0.000208,0.048393,0.167975,0.032886,0.0,0.0,0.0
2,SaF3.1355D.805322778.180829.NT1356.wav,Degraded,0.023495,0.104395,0.360114,0.143027,0.0,0.0,0.0,0.000855,...,0.0,0.104303,0.036998,0.001894,0.006081,0.209761,0.121807,0.0,0.0,0.0
3,SaN11.0940D.1678278701.180906.NT0954.wav,Degraded,0.000865,0.126758,0.437884,0.209243,0.0,0.0,0.0,0.0,...,0.0,0.117832,0.001151,0.011284,0.000152,0.328505,0.123348,0.0,0.007058,0.0
4,BoF2.0930D.1678278701.180828.NT0950.wav,Degraded,0.019745,0.176513,0.39904,0.040165,2.7e-05,0.0,0.0,0.006952,...,0.0,0.020741,0.020154,0.003591,0.003634,0.350962,0.285468,0.002281,0.000323,0.0


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the pretrained-CNN features with UMAP ahead of affinity propagation.
# n_components=10 is enabled to match the 10-dimensional embedding described in
# the notebook introduction; without it UMAP falls back to its default of 2.
# NOTE(review): outputs recorded below were produced before this change and
# need re-running.
pretrained_clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=42,
).fit_transform(pretrained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
pretrained_clustering = af.fit(pretrained_clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
pretrained_clustering.cluster_centers_indices_.shape[0]

27

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = pretrained_clustering.labels_
class_type = pretrained_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic is shown as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2

2656.1877406875706

# **Indo Habitat Trained CNN**

Load and transform data

In [None]:
# Path to the CSV of trained-CNN features (Indo dataset)
trained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/trained_CNN_indo.csv'

# Wrangle the VGGish feature table into the same layout as the indices table
data = pd.read_csv(trained_path)
# Drop the two leading index columns, transpose to match the indices format,
# then turn the resulting index back into a regular column
temp_df = data.reset_index().iloc[:, 2:].T.reset_index()
trained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: drop the file-name column before embedding
trained_features = trained_df.drop(columns=['minute'])

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the one-character class code embedded in a recording file name.

    The name is split on "." and the fifth character of its second field is
    returned, e.g. "BoF2.0930D.1678278701.180828.NT0930.wav" -> "D". If that
    field has fewer than five characters the result is an empty string.
    """
    # Adapted from the notebook's earlier get_identifier helper.
    fields = filename.split(".")
    return fields[1][4:5]

# Translate the single-letter code returned by get_class into a habitat label.
habitat_by_code = {'D': 'Degraded', 'H': 'Healthy'}

class_type = []
for filename in trained_df['minute']:
    code = get_class(filename)
    if code not in habitat_by_code:
        # Previously such files were skipped silently, leaving class_type shorter
        # than the frame so that insert() failed with a length-mismatch error.
        raise ValueError(f"Unrecognised habitat code {code!r} in file name {filename!r}")
    class_type.append(habitat_by_code[code])

trained_df.insert(1, 'Habitat', class_type)

# Check the class column was added
trained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,BoF2.0930D.1678278701.180828.NT0930.wav,Degraded,0.47765,5.012879,1.915835,0.0,0.0,0.0,1.062819,0.0,...,0.0,0.0,2.679418,0.000814,2.236241,0.036649,0.0,0.408422,0.602255,0.006626
1,SaF4.0902D.671907872.180830.NT0924.wav,Degraded,0.0,7.778984,0.0,0.051206,0.0,3.912268,8.325169,0.0,...,0.0,3.696773,1.264271,0.075675,0.183685,4.998559,0.0,0.0,6.332646,8.498202
2,SaF3.1355D.805322778.180829.NT1356.wav,Degraded,0.0,6.534332,0.0,0.005836,1.756667,5.124594,9.57667,0.0,...,0.0,6.169143,0.528039,0.015261,1.281214,5.205583,0.0,0.0,4.791251,10.420626
3,SaN11.0940D.1678278701.180906.NT0954.wav,Degraded,0.0,6.228039,0.014625,0.080811,0.0,2.651456,7.318202,0.0,...,0.0,3.214074,0.648099,0.202112,0.153203,4.936759,0.0,0.0,4.411826,7.012685
4,BoF2.0930D.1678278701.180828.NT0950.wav,Degraded,0.50047,6.841762,1.05641,0.0,0.0,0.0,1.647999,0.0,...,0.0,0.0,3.390443,0.025399,2.531892,0.0,0.0,0.208863,1.573495,0.001159


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# 10-dimensional UMAP embedding of the trained-CNN features for clustering
trained_clusterable_embedding = umap.UMAP(
    random_state=42,
    n_components=10,  # number of dimensions to reduce the data to
    min_dist=0,
    n_neighbors=50,
).fit_transform(trained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
trained_clustering = af.fit(trained_clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
trained_clustering.cluster_centers_indices_.shape[0]

22

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = trained_clustering.labels_
class_type = trained_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic is shown as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2

#old result using the habitat trained CNN: 3129.6856166120774
#new result using the site trained CNN: 3090.5696657197923

3090.5696657197923

# **GBR Habitat Acoustic indices**

Load and transform data

In [None]:
# Path to the CSV of compound acoustic-index features (GBR dataset)
indices_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/compound_index_aus.csv'

data = pd.read_csv(indices_path)
# reset_index() prepends an 'index' column; slicing from position 2 discards it
# together with the first column read from the CSV
indices_df = data.reset_index().iloc[:, 2:]


In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site code at the start of a recording file name.

    Takes the text before the first "." and keeps its first five characters,
    e.g. "SiteJdeployment2.1677983769.181103032034.wav" -> "SiteJ".
    """
    # Adapted from the notebook's earlier get_identifier helper.
    stem = filename.split(".")[0]
    return stem[:5]

# Site code for every recording, inserted as the 'Habitat' column
class_type = [get_class(name) for name in indices_df['minute']]
indices_df.insert(1, 'Habitat', class_type)

# Remove the 4 sites not being used for fish habitat comparison
indices_df.drop(
    indices_df.index[indices_df['Habitat'].isin(['SiteC', 'SiteD', 'SiteE', 'SiteI'])],
    inplace=True,
)

# Rename the remaining sites to High or Low fish
indices_df['Habitat'] = indices_df['Habitat'].replace(['SiteA', 'SiteB', 'SiteG', 'SiteJ'], 'Low fish')
indices_df['Habitat'] = indices_df['Habitat'].replace(['SiteF', 'SiteH', 'SiteK', 'SiteL'], 'High fish')

# Feature matrix only: drop the file-name and label columns before embedding
indices_features = indices_df.drop(columns=['minute', 'Habitat'])

# Check the class column was added
indices_df.head()

Unnamed: 0,minute,Habitat,fish_ACI,fish_ACI_std,fish_ADI,fish_ADI_std,fish_H,fish_H_std,fish_Hf,fish_Hf_std,...,full_H,full_H_std,full_Hf,full_Hf_std,full_M,full_M_std,full_BI,full_BI_std,NDSI,NDSI_std
0,SiteJdeployment2.1677983769.181103032034.wav,Low fish,179.372677,26.483996,2.192782,0.063717,0.999843,0.000144,0.481993,0.030042,...,0.99993,4.1e-05,0.726764,0.019765,0.002191,0.000152,6.620203,1.060968,0.695277,0.157566
1,SiteAdeployment1.805322778.181023172733.wav,Low fish,168.80185,25.504564,1.969023,0.395328,0.999686,0.000299,0.445248,0.056819,...,0.999946,3.4e-05,0.733206,0.039498,0.000549,5.1e-05,3.68473,0.750008,0.547568,0.218633
5,SiteBdeployment5.1677983769.181201071827.wav,Low fish,175.991643,20.887505,2.067315,0.143728,0.999749,0.000232,0.453967,0.057671,...,0.99995,2.9e-05,0.73685,0.0288,0.001101,0.000129,4.27812,0.97824,0.580337,0.142176
7,SiteAdeployment6.805322778.181208200239.wav,Low fish,181.623479,22.365652,2.178056,0.149959,0.999835,0.000163,0.488024,0.038268,...,0.999924,5e-05,0.75036,0.015946,0.001342,0.000162,4.511504,1.068721,0.629611,0.12526
8,SiteJdeployment3.1677983769.181110075705.wav,Low fish,178.154917,24.206607,2.194177,0.075626,0.99979,0.000179,0.464995,0.045202,...,0.999929,6e-05,0.740933,0.022895,0.001517,0.000112,4.166993,1.360326,0.666806,0.12872


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the acoustic-index features with UMAP ahead of affinity propagation.
# n_components=10 is enabled to match the 10-dimensional embedding described in
# the notebook introduction; without it UMAP falls back to its default of 2.
# NOTE(review): outputs recorded below were produced before this change and
# need re-running.
clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=123,
).fit_transform(indices_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
indices_clustering = af.fit(clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
indices_clustering.cluster_centers_indices_.shape[0]

46

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = indices_clustering.labels_
class_type = indices_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

In [None]:
# Chi-squared test on the cluster-by-habitat contingency table; only the
# statistic and p-value are kept, the other two return values are discarded.
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2  # displayed as the cell output

1578.904869668929

# **GBR Habitat Pretrained CNN**

Load and transform data

In [None]:
# Path to the CSV of pretrained-CNN features (GBR dataset)
pretrained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/pretrained_CNN_aus.csv'

# Wrangle the VGGish feature table into the same layout as the indices table
data = pd.read_csv(pretrained_path)
# Drop the two leading index columns, transpose to match the indices format,
# then turn the resulting index back into a regular column
temp_df = data.reset_index().iloc[:, 2:].T.reset_index()
pretrained_df = temp_df.rename(columns={"index": "minute"})

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site code at the start of a recording file name.

    Takes the text before the first "." and keeps its first five characters,
    e.g. "SiteJdeployment2.1677983769.181103032034.wav" -> "SiteJ".
    """
    # Adapted from the notebook's earlier get_identifier helper.
    stem = filename.split(".")[0]
    return stem[:5]

# Site code for every recording, inserted as the 'Habitat' column
class_type = [get_class(name) for name in pretrained_df['minute']]
pretrained_df.insert(1, 'Habitat', class_type)

# Remove the 4 sites not being used for fish habitat comparison
pretrained_df.drop(
    pretrained_df.index[pretrained_df['Habitat'].isin(['SiteC', 'SiteD', 'SiteE', 'SiteI'])],
    inplace=True,
)

# Rename the remaining sites to High or Low fish
pretrained_df['Habitat'] = pretrained_df['Habitat'].replace(['SiteA', 'SiteB', 'SiteG', 'SiteJ'], 'Low fish')
pretrained_df['Habitat'] = pretrained_df['Habitat'].replace(['SiteF', 'SiteH', 'SiteK', 'SiteL'], 'High fish')

# Feature matrix only: drop the file-name and label columns before embedding
pretrained_features = pretrained_df.drop(columns=['minute', 'Habitat'])

# Check the class column was added
pretrained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteJdeployment2.1677983769.181103032034.wav,Low fish,0.000929,0.093042,0.28538,0.210208,0.0,0.0,0.0,0.006659,...,0.0,0.023046,0.00022,0.0,0.0,0.2253,0.304314,0.0,0.0,0.0
1,SiteAdeployment1.805322778.181023172733.wav,Low fish,0.03078,0.100007,0.408427,0.050889,0.0,0.0,0.0,0.002414,...,0.0,0.060092,0.010739,0.0,0.028925,0.244283,0.167082,0.0,0.0,0.0
5,SiteBdeployment5.1677983769.181201071827.wav,Low fish,0.002083,0.104573,0.366874,0.168646,0.0,0.0,0.0,0.0019,...,0.0,0.039408,0.008483,0.002023,0.026743,0.295693,0.211083,0.0,0.0,0.0
7,SiteAdeployment6.805322778.181208200239.wav,Low fish,0.028723,0.063788,0.302572,0.160899,0.0,0.0,0.0,0.003868,...,0.0,0.077238,0.007906,0.000191,0.039737,0.211901,0.197882,0.0,0.0,0.0
8,SiteJdeployment3.1677983769.181110075705.wav,Low fish,0.001223,0.154777,0.341569,0.162932,0.0,0.0,0.0,0.0,...,0.0,0.026811,2e-05,0.0,0.003966,0.317349,0.288543,0.0,0.0,0.0


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the pretrained-CNN features with UMAP ahead of affinity propagation.
# n_components=10 is enabled to match the 10-dimensional embedding described in
# the notebook introduction; without it UMAP falls back to its default of 2.
# NOTE(review): outputs recorded below were produced before this change and
# need re-running.
pretrained_clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=123,
).fit_transform(pretrained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
pretrained_clustering = af.fit(pretrained_clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
pretrained_clustering.cluster_centers_indices_.shape[0]

47

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = pretrained_clustering.labels_
class_type = pretrained_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic is shown as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2

3455.5811643683455

# **GBR Habitat Level Trained CNN**

Load and transform data

In [None]:
# Path to the CSV of trained-CNN features (GBR dataset)
trained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/trained_CNN_aus.csv'

# Wrangle the VGGish feature table into the same layout as the indices table
data = pd.read_csv(trained_path)
# Drop the two leading index columns, transpose to match the indices format,
# then turn the resulting index back into a regular column
temp_df = data.reset_index().iloc[:, 2:].T.reset_index()
trained_df = temp_df.rename(columns={"index": "minute"})

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site code at the start of a recording file name.

    Takes the text before the first "." and keeps its first five characters,
    e.g. "SiteJdeployment2.1677983769.181103032034.wav" -> "SiteJ".
    """
    # Adapted from the notebook's earlier get_identifier helper.
    stem = filename.split(".")[0]
    return stem[:5]

# Site code for every recording, inserted as the 'Habitat' column
class_type = [get_class(name) for name in trained_df['minute']]
trained_df.insert(1, 'Habitat', class_type)

# Remove the 4 sites not being used for fish habitat comparison
trained_df.drop(
    trained_df.index[trained_df['Habitat'].isin(['SiteC', 'SiteD', 'SiteE', 'SiteI'])],
    inplace=True,
)

# Rename the remaining sites to High or Low fish
trained_df['Habitat'] = trained_df['Habitat'].replace(['SiteA', 'SiteB', 'SiteG', 'SiteJ'], 'Low fish')
trained_df['Habitat'] = trained_df['Habitat'].replace(['SiteF', 'SiteH', 'SiteK', 'SiteL'], 'High fish')

# Feature matrix only: drop the file-name and label columns before embedding
trained_features = trained_df.drop(columns=['minute', 'Habitat'])

# Check the class column was added
trained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteJdeployment2.1677983769.181103032034.wav,Low fish,0.011415,10.289283,0.0,7.256893,1.295557,0.0,0.0,18.542894,...,0.42096,0.01748,4.07629,0.0,0.0,0.202933,0.843459,0.0,0.158123,1.249514
1,SiteAdeployment1.805322778.181023172733.wav,Low fish,0.0,15.645812,0.883047,4.936174,0.729615,0.0,0.812924,1.675555,...,0.0,0.323658,0.013336,0.0,0.294935,0.021129,7.113554,0.0,1.237369,0.036465
5,SiteBdeployment5.1677983769.181201071827.wav,Low fish,1.223899,10.373435,0.0,5.740677,6.125507,3.783882,1.65598,2.684944,...,0.050977,0.0,0.217532,0.156455,0.670688,5.363922,5.922828,0.0,2.551788,1.078198
7,SiteAdeployment6.805322778.181208200239.wav,Low fish,0.0,7.985122,0.0,7.463294,1.406634,0.0,2.038883,1.577258,...,0.0,0.0,0.16972,2.870732,0.793656,1.769272,2.468574,0.0,15.004949,5.499966
8,SiteJdeployment3.1677983769.181110075705.wav,Low fish,0.719422,9.616189,0.0,8.666222,5.063313,0.117243,0.016884,17.091463,...,0.883278,0.011976,3.400477,0.0,0.0,2.11325,3.245694,0.0,0.747485,0.399263


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the trained-CNN features with UMAP ahead of affinity propagation.
# n_components=10 is enabled to match the 10-dimensional embedding described in
# the notebook introduction; without it UMAP falls back to its default of 2.
# NOTE(review): outputs recorded below were produced before this change and
# need re-running.
trained_clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=123,
).fit_transform(trained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
trained_clustering = af.fit(trained_clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
trained_clustering.cluster_centers_indices_.shape[0]

35

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = trained_clustering.labels_
class_type = trained_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic is shown as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2

#old result using the habitat trained CNN: 5062.040421006918
#new result using the site trained CNN: 5062.040421006918

5062.040421006918

# **Polynesia Habitat Level Acoustic indices**

Load and transform data

In [None]:
# Path to the CSV of compound acoustic-index features (Polynesia dataset)
indices_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/compound_index_poly.csv'

data = pd.read_csv(indices_path)
# reset_index() prepends an 'index' column; slicing from position 2 discards it
# together with the first column read from the CSV
indices_df = data.reset_index().iloc[:, 2:]

# Feature matrix only: drop the file-name column before embedding
indices_features = indices_df.drop(columns=['minute'])

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site code at the start of a recording file name.

    Takes the text before the first "." and keeps its first five characters,
    e.g. "SiteCDay3.805322778.210303214058.wav" -> "SiteC".
    """
    # Adapted from the notebook's earlier get_identifier helper.
    stem = filename.split(".")[0]
    return stem[:5]

# Site code for every recording, inserted as the 'Habitat' column
class_type = [get_class(name) for name in indices_df['minute']]
indices_df.insert(1, 'Habitat', class_type)

# Relabel sites A-D as 'Photic' and sites W-Z as 'Mesophotic'
indices_df['Habitat'] = indices_df['Habitat'].replace(['SiteA', 'SiteB', 'SiteC', 'SiteD'], 'Photic')
indices_df['Habitat'] = indices_df['Habitat'].replace(['SiteW', 'SiteX', 'SiteY', 'SiteZ'], 'Mesophotic')

# Check the class column was added
indices_df.head()

Unnamed: 0,minute,Habitat,fish_ACI,fish_ACI_std,fish_ADI,fish_ADI_std,fish_H,fish_H_std,fish_Hf,fish_Hf_std,...,full_H,full_H_std,full_Hf,full_Hf_std,full_M,full_M_std,full_BI,full_BI_std,NDSI,NDSI_std
0,SiteCDay3.805322778.210303214058.wav,Photic,174.218205,17.12913,1.87549,0.173853,0.999264,0.000535,0.378237,0.043518,...,0.999935,4e-05,0.723675,0.03254,0.001088,9.6e-05,3.503309,0.543755,0.409907,0.203473
1,SiteXDay3.5210.210215103530.wav,Mesophotic,157.968812,10.113108,1.060467,0.72585,0.99844,0.000723,0.274085,0.024864,...,0.999478,0.000483,0.418846,0.072226,0.000478,7.7e-05,2.933921,0.506337,-0.594888,0.182238
2,SiteYDay2.5210.210302223537.wav,Mesophotic,167.24063,12.467357,2.092218,0.144701,0.99933,0.000496,0.402144,0.032731,...,0.999927,3.5e-05,0.752729,0.014195,0.000225,1.3e-05,2.624677,0.405407,0.460494,0.091826
3,SiteDDay2.5210.210317091056.wav,Photic,167.300849,14.807595,2.132824,0.137796,0.999494,0.000356,0.437447,0.04229,...,0.999893,6.6e-05,0.753096,0.02181,0.001418,0.000228,3.267791,0.523877,0.594795,0.117539
4,SiteADay3.805322778.210205190014.wav,Photic,166.893735,12.189808,2.266322,0.044466,0.999403,0.000294,0.490927,0.020568,...,0.999714,0.00016,0.747237,0.009536,0.005664,0.000574,4.519593,0.552794,0.567966,0.058083


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the acoustic-index features with UMAP ahead of affinity propagation.
# n_components=10 is enabled to match the 10-dimensional embedding described in
# the notebook introduction; without it UMAP falls back to its default of 2.
# NOTE(review): outputs recorded below were produced before this change and
# need re-running.
clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=123,
).fit_transform(indices_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
indices_clustering = af.fit(clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
indices_clustering.cluster_centers_indices_.shape[0]

35

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = indices_clustering.labels_
class_type = indices_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

In [None]:
# Chi-squared test on the cluster-by-habitat contingency table; only the
# statistic and p-value are kept, the other two return values are discarded.
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2  # displayed as the cell output

6971.938944848403

# **Polynesia Habitat Level Pretrained CNN**

Load and transform data

In [None]:
# Path to the CSV of pretrained-CNN features (Polynesia dataset)
pretrained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/pretrained_CNN_poly.csv'

# Wrangle the VGGish feature table into the same layout as the indices table
data = pd.read_csv(pretrained_path)
# Drop the two leading index columns, transpose to match the indices format,
# then turn the resulting index back into a regular column
temp_df = data.reset_index().iloc[:, 2:].T.reset_index()
pretrained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: drop the file-name column before embedding
pretrained_features = pretrained_df.drop(columns=['minute'])

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the site code at the start of a recording file name.

    Takes the text before the first "." and keeps its first five characters,
    e.g. "SiteCDay3.805322778.210303214058.wav" -> "SiteC".
    """
    # Adapted from the notebook's earlier get_identifier helper.
    stem = filename.split(".")[0]
    return stem[:5]

# Site code for every recording, inserted as the 'Habitat' column
class_type = [get_class(name) for name in pretrained_df['minute']]
pretrained_df.insert(1, 'Habitat', class_type)

# Relabel sites A-D as 'Photic' and sites W-Z as 'Mesophotic'
pretrained_df['Habitat'] = pretrained_df['Habitat'].replace(['SiteA', 'SiteB', 'SiteC', 'SiteD'], 'Photic')
pretrained_df['Habitat'] = pretrained_df['Habitat'].replace(['SiteW', 'SiteX', 'SiteY', 'SiteZ'], 'Mesophotic')

# Check the class column was added
pretrained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteCDay3.805322778.210303214058.wav,Photic,0.024271,0.015441,0.673868,0.01581,0.0,0.0,0.0,0.013191,...,0.0,0.075812,0.079045,0.0,0.164519,0.16055,0.02702,0.0,0.000133,0.001901
1,SiteXDay3.5210.210215103530.wav,Mesophotic,0.024545,0.001385,0.885764,0.007736,0.0,0.0,0.0,0.003682,...,0.0,0.054833,0.031658,0.0,0.210327,0.050721,0.043802,0.0,0.0,0.0
2,SiteYDay2.5210.210302223537.wav,Mesophotic,0.038698,0.0,0.621418,0.04574,0.0,0.0,0.0,0.029869,...,0.0,0.042964,0.068022,0.0,0.006417,0.156122,0.001582,0.0,0.0,0.0
3,SiteDDay2.5210.210317091056.wav,Photic,0.011494,0.099208,0.532689,0.007028,0.0,0.0,0.0,0.0,...,0.0,0.006671,0.003732,0.0,0.039698,0.226014,0.066506,0.0,0.001176,0.0
4,SiteADay3.805322778.210205190014.wav,Photic,0.0,0.084872,0.07253,0.075937,0.0,0.0,0.0,0.0,...,0.0,0.023012,0.0,0.0,0.237829,0.050223,0.057576,0.0,0.0,0.008781


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Embed the pretrained-CNN features with UMAP ahead of affinity propagation.
# n_components=10 is enabled to match the 10-dimensional embedding described in
# the notebook introduction; without it UMAP falls back to its default of 2.
# NOTE(review): outputs recorded below were produced before this change and
# need re-running.
pretrained_clusterable_embedding = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    n_components=10, #how many dimensions to reduce the data to
    random_state=123,
).fit_transform(pretrained_features)

In [None]:
# Cluster the UMAP embedding with affinity propagation
af = AffinityPropagation(damping=0.9, random_state=123)
pretrained_clustering = af.fit(pretrained_clusterable_embedding)  # input is the UMAP embedding

In [None]:
# Number of clusters = number of cluster-centre indices found
pretrained_clustering.cluster_centers_indices_.shape[0]

24

In [None]:
# Pair each recording's cluster assignment with its true habitat label
cluster_labels = pretrained_clustering.labels_
class_type = pretrained_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against habitats (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the table; the statistic is shown as the cell output
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2

8696.977623628116

# **Polynesia Habitat Level Trained CNN**

Load and transform data

In [None]:
# Path to the CSV of trained-CNN features (Polynesia dataset)
trained_path = r'/content/drive/MyDrive/Reef soundscapes with AI/Results/full_dataset_features/trained_CNN_poly.csv'

# Wrangle the VGGish feature table into the same layout as the indices table
data = pd.read_csv(trained_path)
# Drop the two leading index columns, transpose to match the indices format,
# then turn the resulting index back into a regular column
temp_df = data.reset_index().iloc[:, 2:].T.reset_index()
trained_df = temp_df.rename(columns={"index": "minute"})

# Feature matrix only: drop the file-name column before embedding
trained_features = trained_df.drop(columns=['minute'])

In [None]:
# Add a column that denotes class
def get_class(filename):
    """Return the first five characters of `filename`'s text before its first '.'.

    For a name such as 'SiteCDay3.805322778.210303214058.wav' this yields the
    site code 'SiteC'.
    """
    prefix, _sep, _rest = filename.partition(".")
    return prefix[:5]

# Derive each recording's five-character site code (e.g. 'SiteC') from the
# filename stored in the 'minute' column.
class_type = [get_class(filename) for filename in trained_df['minute']]

# Place the codes in a 'Habitat' column directly after 'minute'.
# DataFrame.insert raises ValueError if the column already exists, so when this
# cell is re-run the existing column is overwritten in place instead.
if 'Habitat' in trained_df.columns:
    trained_df['Habitat'] = class_type
else:
    trained_df.insert(1, 'Habitat', class_type)

# Translate site codes into habitat labels: sites A-D are Photic, sites W-Z are
# Mesophotic. Values that are not keys of the mapping are left unchanged.
site_to_habitat = {
    'SiteA': 'Photic',
    'SiteB': 'Photic',
    'SiteC': 'Photic',
    'SiteD': 'Photic',
    'SiteW': 'Mesophotic',
    'SiteX': 'Mesophotic',
    'SiteY': 'Mesophotic',
    'SiteZ': 'Mesophotic',
}
trained_df['Habitat'] = trained_df['Habitat'].replace(site_to_habitat)

trained_df.head()

Unnamed: 0,minute,Habitat,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,SiteCDay3.805322778.210303214058.wav,Photic,0.0,0.0,12.082821,6.763444,7.179127,0.0,13.351444,20.285992,...,0.0,0.051185,16.882797,0.0,11.385775,4.218807,0.278274,0.0,0.031203,0.0
1,SiteXDay3.5210.210215103530.wav,Mesophotic,0.247064,0.859065,0.172915,0.105359,4.611157,0.0,0.0,1.952035,...,0.0,28.823147,17.230404,0.0,28.133654,0.948916,6.289915,0.0,0.0,0.0
2,SiteYDay2.5210.210302223537.wav,Mesophotic,8.51776,0.0,8.564845,6.75922,0.0,0.0,0.0,5.992061,...,0.0,14.142094,10.024358,0.0,0.887872,2.4327,0.0,0.0,0.0,0.0
3,SiteDDay2.5210.210317091056.wav,Photic,0.0,0.642867,10.952372,1.292887,13.916732,0.0,2.82494,0.004537,...,0.0,0.0,11.003201,15.533544,10.891987,19.48122,0.070055,0.0,0.0,0.0
4,SiteADay3.805322778.210205190014.wav,Photic,0.0,7.664385,0.153724,5.604998,0.009447,0.0,9.419145,0.0,...,0.0,0.0,0.625055,11.007442,1.294202,7.215663,2.364219,0.0,9.198437,0.0


Create the UMAP embedding, run affinity propagation and compute the chi-squared statistic

In [None]:
# Fit UMAP on the trained-CNN feature table and keep the resulting embedding
# as the input for clustering.
# NOTE(review): the `n_components=10` override is commented out in this cell, so
# UMAP runs with its default output dimensionality (2 according to the
# umap-learn docs), whereas the notebook introduction describes a
# 10-dimensional embedding -- confirm which one is intended.
trained_reducer = umap.UMAP(
    n_neighbors=50,
    min_dist=0,
    random_state=123,  # fixed seed so the embedding is reproducible
)
trained_clusterable_embedding = trained_reducer.fit_transform(trained_features)

  "Graph is not fully connected, spectral embedding may not work as expected."


In [None]:
# Cluster the UMAP embedding (not the raw features) with affinity propagation.
af = AffinityPropagation(
    damping=0.9,
    random_state=123,
)
trained_clustering = af.fit(trained_clusterable_embedding)

In [None]:
# How many clusters affinity propagation found (one exemplar index per cluster)
n_trained_clusters = len(trained_clustering.cluster_centers_indices_)
n_trained_clusters

23

In [None]:
# Cluster assignment and true habitat label for every sample
cluster_labels = trained_clustering.labels_
class_type = trained_df['Habitat'].to_numpy()

# Cross-tabulate clusters (rows) against true classes (columns)
contingency_table = pd.crosstab(index=cluster_labels, columns=class_type)

# Chi-squared test on the contingency table; only the statistic and p-value
# are kept, the remaining two return values are discarded.
h_chi2, h_p, _, _ = stats.chi2_contingency(contingency_table)
h_chi2

# Recorded results for reference:
#   old result using the habitat trained CNN: 8976.000000000002
#   new result using the site trained CNN:    8975.999999999998

8975.999999999998