## Bat Detection/Classification

In [None]:
!pip install geojson

In [1]:
import base64
import itertools
import json
import os
import geojson
import  matplotlib.pyplot as plt
import multiprocessing 
import requests
import shapely.wkt
import shutil
import time

from datetime import datetime
from dotenv import load_dotenv

from dask_cuda import LocalCUDACluster
from dask.distributed import Client

from yuntu.core.geometry.intervals import TimeInterval
from yuntu.collection.methods import collection
from BATMX_full.probe import BATMX_probe
from BATMX_full.utils import LABELS

from termcolor import cprint

1 Physical GPUs, 1 Logical GPUs
1 Physical GPUs, 1 Logical GPUs
1 Physical GPUs, 1 Logical GPUs
1 Physical GPUs, 1 Logical GPUs


In [2]:
# Maximum number of recordings to fetch; a falsy value makes the query run
# without a limit.
LIMIT = 1000
# alfresco query
CUMULUS = 13 # cumulus identifier (int); interpolated into the sipecam:CumulusName query
PAGESIZE = 5000  # passed as 'page_size' in the alfresco DB config

# probe config
# NOTE(review): two similarly named thresholds coexist. The misspelled
# DETECTION_THRESOLHOLD is handed to the probe's annotate_args, while
# DETECTION_THRESHOLD is applied afterwards to each annotation's mean
# detection score. Both names are referenced by later cells.
DETECTION_THRESOLHOLD = 0.5
# NOTE(review): MIN_ANN_DURATION and BATCH_SIZE are not referenced in the
# visible cells; PROBE_CONFIG hard-codes its own values - confirm intent.
MIN_ANN_DURATION = 0.05
BATCH_SIZE = 50
DETECTION_THRESHOLD = 0.9

# Dask
# NOTE(review): not referenced in the visible cells; the CUDA cluster is
# created without arguments - confirm intent.
N_WORKERS = 2

# results directory
RESULTS_DIR = "/shared_volume_efs/audio/bat_detection_classification/bat-detection"

# upload to alfresco
ALFRESCO_NODE_ID = "031a7669-335a-4c9d-bb36-89d595201309"
BASE_ENDPOINT = "alfresco/api/-default-/public/alfresco/versions/1"
# Appended to the ALFRESCO_URL environment variable by login() to request a ticket.
AUTH_ENDPOINT = "alfresco/api/-default-/public/authentication/versions/1"

In [3]:
def convert2geoJson(wkt_string):
    """Turn a WKT geometry string into the geometry of a GeoJSON feature.

    The string is parsed with ``shapely.wkt.loads``, wrapped in a
    ``geojson.Feature`` with empty properties, and that feature's
    ``geometry`` attribute is returned.
    """
    parsed_shape = shapely.wkt.loads(wkt_string)
    return geojson.Feature(geometry=parsed_shape, properties={}).geometry

def create_results_folder_str(results_dir, cumulo, nodes_list, rec_list, dep_list):
    """Create the tree <results_dir>/<cumulus>/<node>/<recorder>/<deployment>.

    Every node gets a sub-folder for every recorder, and every recorder
    sub-folder gets one for every deployment (i.e. all combinations are
    created). Folders that already exist are left untouched.

    Args:
        results_dir: root folder for the results.
        cumulo: cumulus identifier; converted with ``str`` for the folder name.
        nodes_list: iterable of node folder names.
        rec_list: iterable of recorder folder names.
        dep_list: iterable of deployment folder names.
    """
    def _ensure_dir(parent, child):
        # Create parent/child when missing and hand back the joined path.
        target = os.path.join(parent, child)
        os.makedirs(target, exist_ok=True)
        return target

    os.makedirs(results_dir, exist_ok=True)
    cumulus_dir = _ensure_dir(results_dir, str(cumulo))
    for node_name in nodes_list:
        node_dir = _ensure_dir(cumulus_dir, node_name)
        for recorder_name in rec_list:
            recorder_dir = _ensure_dir(node_dir, recorder_name)
            for deployment_name in dep_list:
                _ensure_dir(recorder_dir, deployment_name)
                
def filter_pred(entry, detection_threshold=0.8):
    """Return ``entry`` if its mean detection score reaches the threshold, else ``None``.

    The score is read from ``entry['metadata']['score']['detection']['mean']``.
    """
    mean_score = entry['metadata']['score']['detection']['mean']
    return entry if mean_score >= detection_threshold else None
    
def filter_pred_list(pred_list, detection_threshold):
    """Keep only the predictions accepted by ``filter_pred``.

    Args:
        pred_list: iterable of prediction entries.
        detection_threshold: minimum mean detection score, forwarded to
            ``filter_pred``.

    Returns:
        list: the entries for which ``filter_pred`` did not return ``None``,
        in their original order.
    """
    candidates = (filter_pred(entry, detection_threshold) for entry in pred_list)
    # Identity check: ``!= None`` would invoke the entry's own ``__ne__``.
    return [kept for kept in candidates if kept is not None]
    
def login():
    """
    Try to log in to the Alfresco API and return a session carrying the credentials.

    The user, password and base URL are read from the ALFRESCO_USER,
    ALFRESCO_PASSWORD and ALFRESCO_URL environment variables. A ticket is
    requested from the authentication endpoint and its id is sent,
    base64-encoded, as a Basic ``Authorization`` header on every request made
    through the returned session.

    Returns:
        session (Session):  A session object to make requests to Alfresco,
                            or None when the login fails (the error is
                            printed instead of raised).
    """
    try:
        auth = {
            "userId": os.getenv("ALFRESCO_USER"),
            "password": os.getenv("ALFRESCO_PASSWORD"),
        }

        response = requests.post(
            os.getenv("ALFRESCO_URL") + AUTH_ENDPOINT + "/tickets",
            data=json.dumps(auth),
        )
        # Report HTTP failures (e.g. bad credentials) as such, rather than as
        # a KeyError on the missing "entry" key of the error payload.
        response.raise_for_status()

        ticket_id = response.json()["entry"]["id"]
        base64_login = base64.b64encode(ticket_id.encode('utf-8')).decode()

        # Create a Session object to make the requests with
        session = requests.Session()
        # Send the encoded ticket as a Basic authorization header
        session.headers.update({'Authorization': 'Basic ' + base64_login})

        return session
    except Exception as e:
        # Best-effort boundary: callers receive None when the login fails.
        print("Login failed: ", e)
        return None
                
def remove_empty_folders(path_abs):
    """Delete every empty directory at or below ``path_abs``.

    The tree is listed first and then visited in reverse order, so children
    are handled before their parents and a folder that becomes empty once its
    sub-folders are removed is deleted as well.
    """
    visited = [dirpath for dirpath, _dirs, _files in os.walk(path_abs)]
    for dirpath in reversed(visited):
        if not os.listdir(dirpath):
            os.rmdir(dirpath)

def get_annotation_list(audio_id, annotations_df):
    """Build one annotation record (a plain dict) per row of ``annotations_df``.

    Each record carries the audio id as ``file_id``, the row geometry
    converted through ``convert2geoJson``, the frequency and time bounds as
    floats, and the row's ``metadata`` unchanged. ``video_frame_num`` is the
    row's index label plus one.

    Returns:
        list: the records, in row order.
    """
    def _as_record(row_label, row):
        # No "updatedAt"/"createdAt" timestamps are attached to the record.
        return {
            "observation_type" : "animal",
            "file_id" : audio_id,
            "geometry" : convert2geoJson(str(row["geometry"])),
            "video_frame_num" : int(row_label + 1),
            "frequency_min" : float(row["min_freq"]),
            "frequency_max" : float(row["max_freq"]),
            "time_min" : float(row["start_time"]),
            "time_max" : float(row["end_time"]),
            "metadata" : row["metadata"],
        }

    return [_as_record(label, row) for label, row in annotations_df.iterrows()]
    
def plot_annotated_audio(audio_obj, audio_id, annotations_df, cumulus, node, recorder, deployment,  save_path_folder=False, figsize=(20,10)):
    """Plot the dB spectrogram of an audio with its annotations overlaid.

    Each row of ``annotations_df`` is drawn as a ``TimeInterval`` spanning
    ``start_time``..``end_time``. When ``save_path_folder`` is given, the
    figure is written to ``<save_path_folder>/<audio_id>_annSpectrogram.png``,
    the matching metadata JSON is written next to it, and the figure is
    closed. Without a folder nothing is written and the figure is left open.

    Args:
        audio_obj: object exposing ``features.db_spectrogram().plot(figsize=...)``
            (presumably a yuntu audio object - confirm against caller).
        audio_id: identifier used in the output file names and metadata.
        annotations_df: DataFrame with ``start_time`` and ``end_time`` columns.
        cumulus, node, recorder, deployment: forwarded to
            ``save_metadata_annotated_spectrogram``.
        save_path_folder: output folder, or a falsy value to skip saving.
        figsize: figure size forwarded to the spectrogram plot.
    """
    ax = audio_obj.features.db_spectrogram().plot(figsize=figsize)
    ax.set_ylabel('F (KHz)')
    for _, ann in annotations_df.iterrows():
        geom = TimeInterval(start_time=ann["start_time"],
                            end_time=ann["end_time"])
        geom.plot(ax=ax)

    plt.tight_layout()

    if not save_path_folder:
        # Nothing to persist: the metadata describes the saved PNG, so it is
        # only written together with the image.
        return

    file_path = os.path.join(save_path_folder, f"{audio_id}_annSpectrogram.png")
    plt.savefig(file_path)
    # pyplot keeps figures alive until they are closed; release the one just
    # saved so repeated calls in a loop do not accumulate open figures.
    plt.close()

    save_metadata_annotated_spectrogram(audio_id, save_path_folder,
                              cumulus, node, recorder, deployment, parent="Null")
            
def remove_empty_folders(path_abs):
    """Remove all empty folders found under ``path_abs``, itself included.

    NOTE: a function with this same name and behaviour is defined earlier in
    this notebook; this later definition is the one in effect afterwards.
    """
    # Walk the snapshot backwards so nested folders are handled before the
    # folders that contain them.
    for folder, _, _ in reversed(list(os.walk(path_abs))):
        if not os.listdir(folder):
            os.rmdir(folder)
            
def save_metadata_annotated_spectrogram(audio_id,
                  path, cumulus, node, recorder, deployment, parent="Null"):
    """Write the metadata JSON describing an annotated spectrogram image.

    The file is saved as ``<path>/<audio_id>_annSpectrogram_metadata.json``
    and references the image ``<path>/<audio_id>_annSpectrogram.png``.

    Args:
        audio_id: identifier used in both file names and stored as "AudioID".
        path: folder where the metadata file is written.
        cumulus: stored as "CumulusName".
        node: node nomenclature string; its third underscore-separated field
            selects the integrity category (0 -> "Degradado", 1 -> "Integro").
        recorder: stored as "SerialNumber".
        deployment: stored as "DateDeployment".
        parent: stored as "product_parent".

    Raises:
        ValueError: if the third field of ``node`` is not an integer, or is
            an integer other than 0 or 1.
        IndexError: if ``node`` has fewer than three underscore-separated fields.
    """
    product_name = "Spectrogram - Bat detection and classification"
    file_path = os.path.join(path, f"{audio_id}_annSpectrogram.png")
    metadata_filename = os.path.join(path, f"{audio_id}_annSpectrogram_metadata.json")

    integrity_labels = {0: "Degradado", 1: "Integro"}
    integrity_flag = int(node.split("_")[2])
    if integrity_flag not in integrity_labels:
        # Fail with an explicit message before anything is written, instead
        # of hitting an unbound ``node_category`` further down.
        raise ValueError(
            f"Unknown integrity flag {integrity_flag!r} in node {node!r}; expected 0 or 1"
        )
    node_category = integrity_labels[integrity_flag]

    metadata = {
        "product_parent": parent,
        "product_name": product_name,
        "product_path": file_path,
        "product_spectrum": "Ultrasonic",
        "CumulusName": cumulus,
        "NodeCategoryIntegrity": node_category,
        "NomenclatureNode": node,
        "SerialNumber": recorder,
        "DateDeployment": deployment,
        "AudioID": audio_id
    }
    with open(metadata_filename, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)

    print(f"{metadata_filename} saved.\n")

In [None]:
# Load variables from a .env file into the environment; X_API_KEY is read
# from there for the collection config below.
load_dotenv()
DB_CONFIG = {
    'provider': 'alfresco',
    'config': {
        'api_url': 'https://api.conabio.gob.mx/alfresco/',
        'page_size': PAGESIZE,
        'api_key': os.getenv("X_API_KEY"),
        'base_filter': "+TYPE: \"sipecam:Audio\" AND -TYPE:\"dummy\"",
        'recording_parser': {"path": "/shared_volume_efs/audio/bat_detection_classification/utils.py",
                             "object_name": "parser"}

    }
}
COL_CONFIG = {
    "col_type": "alfresco",
    "db_config": DB_CONFIG
}

col = collection(**COL_CONFIG)
query = f"(sipecam:CumulusName:\"{CUMULUS}\")"  # AND (sipecam:SampleRate:{SAMPLERATE})

# Only pass `limit` when LIMIT is set, so an unset LIMIT fetches everything.
fetch_kwargs = {"with_metadata": True, "with_geometry": False}
if LIMIT:
    fetch_kwargs["limit"] = LIMIT
recs = col.get_recording_dataframe(query, **fetch_kwargs)

# include filtering columns for processing units
# Take an explicit copy of the ultrasonic subset: the new columns are then
# added to an independent frame instead of to a slice of the query result.
recs = recs[recs["spectrum"]=="ultrasonic"].copy()
recs.loc[:, "node"] = recs.metadata.apply(lambda x: x["entry"]["properties"]["sipecam:NomenclatureNode"])
recs.loc[:, "recorder"] = recs.metadata.apply(lambda x: x["entry"]["properties"]["sipecam:SerialNumber"])
# The deployment is the last path component before "/audio" in the entry path.
recs.loc[:, "deployment"] = recs.metadata.apply(lambda x: x["entry"]["path"]["name"].split("/audio")[0].split("/")[-1])
# A processing unit is the (node, recorder, deployment) triple.
recs.loc[:,"proc_unit"] = recs.apply(lambda x: (x["node"], x["recorder"], x["deployment"]), axis=1)

## Folder structure

In [None]:
# create results folder structure
# create results folder structure
# Distinct nodes, recorders and deployments present in the recordings frame.
nodes_list = recs.node.unique()
recorders_list = recs.recorder.unique()
deployments_list = recs.deployment.unique()
# WARNING: any existing results directory is deleted, so output from a
# previous run is lost before the new tree is built.
if os.path.isdir(RESULTS_DIR):
    shutil.rmtree(RESULTS_DIR)
# Builds a folder for every node x recorder x deployment combination, not only
# the ones that actually occur; the empty ones are removed by the last cell.
create_results_folder_str(RESULTS_DIR, CUMULUS, nodes_list, recorders_list, deployments_list)

## Probe processing

In [None]:
# Probe configuration handed to apply_probe for every processing unit.
# NOTE(review): min_ann_duration and batch_size are literals here although
# MIN_ANN_DURATION / BATCH_SIZE constants exist (BATCH_SIZE differs: 50) -
# confirm which values are intended.
PROBE_CONFIG = {
    "module": {
        "path": "/shared_volume_efs/audio/bat_detection_classification/BATMX_full/probe.py",
        "object_name": "BATMX_probe"
    },
    "kwargs": {},
    "annotate_args": {
        "detection_threshold": DETECTION_THRESOLHOLD,
        "min_ann_duration": 0.05,
        "batch_size": 200
    },
    "use_metadata": False,
    "use_annotations": False
}


cluster = LocalCUDACluster()
client = Client(cluster)
npartitions=len(client.ncores())

# process audio for each processing-unit
start_time = time.time()
proc_units = recs.proc_unit.unique()
not_processed_units = []
try:
    for proc_unit in proc_units:
        node, recorder, deployment = proc_unit
        print(f"* Processing: node {node} | recorder {recorder} | deployment date {deployment}")
        file_path = os.path.join(RESULTS_DIR, str(CUMULUS), str(node), recorder, deployment)
        unit_audio_col_df = recs[recs.proc_unit == proc_unit]
        audio_ids_list = unit_audio_col_df.id.unique().tolist() # list of audios
        print(f"\tNumber of audios in df: {unit_audio_col_df.shape[0]}")
        # apply probe to each audio
        annotations_df = unit_audio_col_df.audio.apply_probe(PROBE_CONFIG, client=client, npartitions=npartitions,
                                                             work_dir="/shared_volume_efs/audio/bat_detection_classification/")
        elapsed = time.time() - start_time
        print(f"Elapsed: {(elapsed)/60} mins")

        # for each audio plot annotations
        # Iterate over the ids themselves; wrapping the list in another list
        # would run the body once with the whole list as the "id".
        for audio_id in audio_ids_list:
            sub_ann_df = annotations_df.query(f"recording=='{audio_id}'").reset_index(drop=True)
            sub_ann_df.loc[:, "detection_score"] = sub_ann_df.metadata.apply(lambda x: x["score"]["detection"]["mean"])
            # Keep only annotations whose mean detection score passes the
            # post-filter threshold.
            sub_ann_df = sub_ann_df.query(f"detection_score >= {DETECTION_THRESHOLD}")
            print(f"# Annotations {sub_ann_df.shape[0]}")
            audio_obj = unit_audio_col_df.query(f"id=='{audio_id}'")
            alfresco_id = audio_obj.iloc[0]["metadata"]["entry"]['id']

            if sub_ann_df.shape[0] > 0: # has at least one annotation
                print(f"\t-Processing audio: {audio_id}")
                plot_annotated_audio(audio_obj.audio[0],
                                     alfresco_id, sub_ann_df,
                                     CUMULUS, node, recorder, deployment,
                                     save_path_folder=file_path)

                # save metadata from annotations list, keyed by position
                annotations_list = get_annotation_list(audio_id, sub_ann_df)
                annotations_dict = {
                    str(position): annotation
                    for position, annotation in enumerate(annotations_list)
                }

                file_path_ann_metadata = os.path.join(file_path, f"{audio_id}_annotations_metadata.json")
                with open(file_path_ann_metadata, 'w', encoding='utf-8') as f:
                    json.dump(annotations_dict, f, ensure_ascii=False, indent=4)
            else:
                print(f"\t-Skipping audio: {audio_id}")

    end_time = time.time()

    # Elapsed minutes: end minus start, so the value is positive.
    print(f"Total time {(end_time - start_time)/60}")
finally:
    # Release the Dask client and CUDA cluster even if processing failed.
    client.close()
    cluster.close()

In [None]:
# for each audio plot annotations
# Per-audio pass over the variables left by the previous cell (annotations_df,
# unit_audio_col_df, audio_ids_list, node, recorder, deployment, file_path):
# filter annotations by score, then plot and dump them for each audio.
for audio_id in audio_ids_list:
    sub_ann_df = annotations_df.query(f"recording=='{audio_id}'").reset_index(drop=True)
    sub_ann_df.loc[:, "detection_score"] = sub_ann_df.metadata.apply(
        lambda meta: meta["score"]["detection"]["mean"]
    )
    sub_ann_df = sub_ann_df.query(f"detection_score >= {DETECTION_THRESHOLD}")
    print(f"# Annotations {sub_ann_df.shape[0]}")
    audio_obj = unit_audio_col_df.query(f"id=='{audio_id}'")
    alfresco_id = audio_obj.iloc[0]["metadata"]["entry"]['id']

    # Nothing survived the score filter: report and move on.
    if sub_ann_df.shape[0] == 0:
        print(f"\t-Skipping audio: {audio_id}")
        continue

    print(f"\t-Processing audio: {audio_id}")
    plot_annotated_audio(
        audio_obj.audio[0],
        alfresco_id,
        sub_ann_df,
        CUMULUS,
        node,
        recorder,
        deployment,
        save_path_folder=file_path,
    )

    # Dump the annotation records to JSON, keyed by their position as a string.
    annotations_list = get_annotation_list(audio_id, sub_ann_df)
    annotations_dict = {}
    for idx, ann in enumerate(annotations_list):
        annotations_dict[str(idx)] = ann

    file_path_ann_metadata = os.path.join(file_path, f"{audio_id}_annotations_metadata.json")
    with open(file_path_ann_metadata, 'w', encoding='utf-8') as f:
        json.dump(annotations_dict, f, ensure_ascii=False, indent=4)

## Write metadata to Zendro

In [None]:
###


## Remove empty subdirectories

In [None]:
# Prune the folders of the results tree that ended up with no files in them.
remove_empty_folders(RESULTS_DIR)