# **Complete MSDS pipeline breakdown**

## 1) Start with our imports:

### The cell below contains the imports needed for `src/pipeline/pipeline.py`

In [1]:
import numpy as np
import pandas as pd
import io

from pathlib import Path

import torch
from torch import multiprocessing
from tqdm import tqdm

### The cell below contains the imports needed for `src/pipeline/audio_segmentor.py`

In [2]:
import os
import soundfile as sf

### The cell below contains the imports needed for `src/cli.py`

In [3]:
import sys

# # append the path of the
# # parent directory
sys.path.append('..')
sys.path.append('../src/')
sys.path.append('../src/models/bat_call_detector/batdetect2/')

from src.cfg import get_config
from src.pipeline import pipeline
import bat_detect.utils.detector_utils as du
from bat_detect.detector import models
import bat_detect.detector.compute_features as feats
import bat_detect.detector.post_process as pp
import bat_detect.utils.audio_utils as au

## 2) Write any custom methods below:

### The below method is a modified version of the generate_segments() method found in `src/pipeline/audio_segmentor.py`

The modifications are:
- Needed to use soundfile to read in only desired frames from audio file, so changed the dependency of librosa in `src/pipeline/audio_segmentor.py` to soundfile.
- The naming convention of the segments did not contain start_time. They were 0.00_(endtime) for all segments. Changed this to (starttime)_(endtime).

In [4]:
def generate_segments(audio_file: Path, output_dir: Path, start_time: float, duration: float):
    """
    Segments audio_file into clips of duration length and saves them to output_dir.

    Only the frames belonging to a clip are read from disk (seek + read through
    soundfile), and a clip whose output file already exists is not rewritten.

    Parameters:
        audio_file: path of the input recording (its ``.name`` is used to build clip names).
        output_dir: directory the clips are written to; it is not created here.
        start_time: seconds; position in the recording where the first clip starts.
        duration: seconds; length of every clip (the last clip may be shorter).

    Returns:
        A list with one dict per clip: "audio_file" is the clip path and "offset"
        is the clip's start position in the original recording, in seconds.

    Raises:
        ValueError: if duration is shorter than one sample (zero or negative).
    """
    output_files = []

    # The context manager closes the input file even if a read or write fails.
    with sf.SoundFile(audio_file) as ip_audio:
        sampling_rate = ip_audio.samplerate
        # Convert to sampled units
        ip_start = int(start_time * sampling_rate)
        ip_duration = int(duration * sampling_rate)
        ip_end = ip_audio.frames

        if ip_duration <= 0:
            raise ValueError(
                f"duration must cover at least one sample, got {duration!r} seconds"
            )

        # Strip the extension whatever its length (".wav", ".flac", ...).
        base_name = os.path.splitext(audio_file.name.replace(" ", "_"))[0]

        # for the length of the duration, process the audio into duration length clips
        for sub_start in range(ip_start, ip_end, ip_duration):
            sub_end = min(sub_start + ip_duration, ip_end)

            # For file names, convert back to seconds
            start_seconds = sub_start / sampling_rate
            end_seconds = sub_end / sampling_rate
            op_file = f"{base_name}__{start_seconds:.2f}_{end_seconds:.2f}.wav"
            op_path = os.path.join(output_dir, op_file)

            output_files.append({
                "audio_file": op_path,
                # sub_start is measured from the beginning of the recording and
                # therefore already contains start_time; adding it again would
                # shift every detection by start_time.
                "offset": start_seconds,
            })

            if not os.path.exists(op_path):
                ip_audio.seek(sub_start)
                op_audio = ip_audio.read(sub_end - sub_start)
                sf.write(op_path, op_audio, sampling_rate, subtype='PCM_16')

    return output_files

## 3) Now we begin our actual dissection of the MSDS pipeline:

### Let's start with the audio file that the MSDS team used for testing:

In [5]:
# Recording the MSDS team used for testing; it is looked up below the
# current user's home directory.
filename = "20210910_030000.WAV"
filepath = "{}/Documents/Research/Lab_related".format(Path.home())

### The below command is the command line invocation of the MSDS pipeline.

Command: `python src/cli.py --csv --num_processes=4 audio.wav output_dir/`

- `--csv` is optional; without it the detections file is written as a tab-separated `.txt` file (with RavenPro column names) instead of a `.csv`
- `--num_processes=4` is optional; default num_processes = 4
- `audio.wav` is required; this is the input audio file
- `output_dir/` is required; this is the directory where the detections.csv/tsv will be stored

### Calling the above command runs the following code:

- `args = parse_args()`: **Takes in the command line positional arguments**

- `cfg = get_config()`: **Sets default model configuration found in `src/cfg.py`**

- `cfg["should_csv"] = args["csv"]`

- `cfg["output_dir"] = Path(args["output_directory"])`

- `cfg["tmp_dir"] = Path(args["tmp_directory"])`

- `cfg["audio_file"] = Path(args["input_audio"])`

- `cfg["num_processes"] = args["num_processes"]`

- `_ = pipeline.run(cfg)`: **Begins the pipeline by passing the model configurations, input audio, and output directory**

### Below cell contains hard-coded arguments for the args dictionary

Note: args["output_directory"] and args["tmp_directory"] have default values if arguments not provided

In [6]:
# Hard-coded stand-ins for the values cli.py would obtain from parse_args().
args = {
    "input_audio": f'{filepath}/{filename}',
    "output_directory": '../output_dir/',
    "tmp_directory": '../output/tmp',
    "csv": True,
    "num_processes": 4,
}

### Below cell contains the cfg model parameters just as they are called in original code

In [7]:
# Mirror src/cli.py: start from the defaults returned by get_config(), then
# overlay the command-line values held in `args`.
cfg = get_config()
cfg["should_csv"] = args["csv"]
cfg["output_dir"] = Path(args["output_directory"])
cfg["tmp_dir"] = Path(args["tmp_directory"])
# Take the input path from `args`, exactly as cli.py does, so that editing
# `args` alone is enough to point the walkthrough at another recording.
cfg["audio_file"] = Path(args["input_audio"])
cfg["num_processes"] = args["num_processes"]
cfg

{'time_expansion_factor': 1.0,
 'start_time': 0.0,
 'segment_duration': 30.0,
 'models': [<models.bat_call_detector.model_detector.BatCallDetector at 0x28a6cb340>],
 'should_csv': True,
 'output_dir': PosixPath('../output_dir'),
 'tmp_dir': PosixPath('../output/tmp'),
 'audio_file': PosixPath('/Users/adityakrishna/Documents/Research/Lab_related/20210910_030000.WAV'),
 'num_processes': 4}

### When pipeline.run(cfg) is called, the following code is run:

- `_prepare_output_dirs(cfg)` 
   - **Make output and temp directories if they don't already exist**
- `segmented_file_paths = _segment_input_audio(cfg)` 
   - **Generate segments of [segment_duration] length from input_audio for model**
- `return _apply_models(cfg, segmented_file_paths)`
   - **Apply model on each audio segment and concatenate results into a final detections dataframe that is also saved in the output directory**

#### **Let's skip `_prepare_output_dirs(cfg)` since we already have those directories**

### Below is the code contained in `_segment_input_audio(cfg)`

- Here I am calling my custom `generate_segments()` function defined above
- `pipeline.generate_segments()` can also be used here

In [8]:
# Cut the recording into clips of cfg['segment_duration'] seconds stored in
# cfg['tmp_dir']; each returned entry holds a clip path and its offset.
segmented_file_paths = generate_segments(
    cfg['audio_file'],        # audio_file
    cfg['tmp_dir'],           # output_dir
    cfg['start_time'],        # start_time
    cfg['segment_duration'],  # duration
)

### Below is the relevant code for `_apply_models(cfg, segmented_file_paths)`

In [9]:
def _apply_models(cfg, segmented_file_paths):
    """
    Runs every model in cfg['models'] over all audio segments and writes one
    detections file per model.

    Parameters:
        cfg: pipeline configuration; the keys read here are 'audio_file',
            'num_processes', 'models', 'output_dir' and 'should_csv'.
        segmented_file_paths: the segment entries to process; each one is
            forwarded to _apply_model under the 'audio_seg' key.

    Returns:
        A list with the path of the detections file written for each model.
    """
    csv_names = []
    audio_file_path = cfg['audio_file']

    # The context manager shuts the worker processes down when we are done,
    # including when a model raises part-way through.
    with multiprocessing.Pool(cfg['num_processes']) as process_pool:
        for model in cfg['models']:

            l_for_mapping = [{
                'audio_seg': audio_seg,
                'model': model,
                'original_file_name': audio_file_path,
                } for audio_seg in segmented_file_paths]

            # imap is lazy: drain it into a list while the pool is still alive.
            pred_dfs = list(tqdm(
                process_pool.imap(_apply_model, l_for_mapping, chunksize=1),
                desc=f"Applying {model.get_name()}",
                total=len(l_for_mapping),
            ))

            # pd.concat raises on an empty list, so fall back to an empty frame
            # with the expected columns when there were no segments.
            agg_df = pd.concat(pred_dfs, ignore_index=True) if pred_dfs else gen_empty_df()

            csv_name = _generate_csv(agg_df, model.get_name(),
                audio_file_path.name,
                cfg['output_dir'],
                cfg['should_csv']
            )
            csv_names.append(csv_name)

    return csv_names

def _apply_model(item):
    """
    Runs one model on one audio segment.

    item is a mapping with the keys 'model', 'audio_seg' (itself holding
    'audio_file' and 'offset') and 'original_file_name'. The model output is
    passed through pipeline._correct_annotation_offsets together with the
    original file name and the segment offset, and that result is returned.
    """
    segment = item['audio_seg']
    detections = item['model'].run(segment['audio_file'])
    return pipeline._correct_annotation_offsets(
        detections, item['original_file_name'], segment['offset']
    )

def _generate_csv(annotation_df, model_name, audio_file_name, output_path, should_csv):
    """
    Writes annotation_df to "<model_name>-<audio_file_name><extension>" inside
    output_path and returns the path of the written file.

    With should_csv the file is a comma-separated ".csv". Otherwise the frame
    is first converted with convert_df_ravenpro and written tab-separated
    with a ".txt" extension.
    """
    if should_csv:
        extension, sep = ".csv", ","
    else:
        extension, sep = ".txt", "\t"
        annotation_df = convert_df_ravenpro(annotation_df)

    csv_path = output_path / f"{model_name}-{audio_file_name}{extension}"
    annotation_df.to_csv(csv_path, sep=sep, index=False)
    return csv_path

def gen_empty_df():
    """
    Builds a dataframe with no rows that carries the columns of the output csv.
    """
    column_names = (
        "start_time",
        "end_time",
        "low_freq",
        "high_freq",
        "detection_confidence",
        "event",
    )
    return pd.DataFrame({name: [] for name in column_names})

def convert_df_ravenpro(df: pd.DataFrame):
    """
    Returns a RavenPro-formatted copy of df; df itself is not modified.

    The time, frequency and event columns are renamed to the RavenPro headers
    and the constant "Selection", "View" and "Channel" columns are appended.
    """
    raven_headers = {
        "start_time": "Begin Time (s)",
        "end_time": "End Time (s)",
        "low_freq": "Low Freq (Hz)",
        "high_freq": "High Freq (Hz)",
        "event": "Annotation",
    }

    # rename() hands back a new frame, so the caller's df is left untouched.
    return df.rename(columns=raven_headers).assign(
        Selection="Waveform 1",
        View="1",
        Channel="1",
    )

1) We only have 1 model in `cfg['models']`:
   - So we can remove the for loop and rewrite `_apply_models()` by replacing every instance of `model` with `cfg['models'][0]`:


In [10]:
def _apply_models(cfg, segmented_file_paths):
    """
    Single-model version of the pipeline step: runs cfg['models'][0] over all
    audio segments and writes its detections file.

    Parameters:
        cfg: pipeline configuration; the keys read here are 'audio_file',
            'num_processes', 'models', 'output_dir' and 'should_csv'.
        segmented_file_paths: the segment entries to process; each one is
            forwarded to _apply_model under the 'audio_seg' key.

    Returns:
        A one-element list holding the path of the written detections file.
    """
    csv_names = []
    audio_file_path = cfg['audio_file']

    l_for_mapping = [{
        'audio_seg': audio_seg,
        'model': cfg['models'][0],
        'original_file_name': audio_file_path,
        } for audio_seg in segmented_file_paths]

    # The context manager shuts the worker processes down afterwards; imap is
    # lazy, so its results are drained into a list while the pool is alive.
    with multiprocessing.Pool(cfg['num_processes']) as process_pool:
        pred_dfs = list(tqdm(
            process_pool.imap(_apply_model, l_for_mapping, chunksize=1),
            desc=f"Applying {cfg['models'][0].get_name()}",
            total=len(l_for_mapping),
        ))

    # pd.concat raises on an empty list, so fall back to an empty frame with
    # the expected columns when there were no segments.
    agg_df = pd.concat(pred_dfs, ignore_index=True) if pred_dfs else gen_empty_df()

    csv_name = _generate_csv(agg_df, cfg['models'][0].get_name(),
        audio_file_path.name,
        cfg['output_dir'],
        cfg['should_csv']
    )
    csv_names.append(csv_name)

    return csv_names

2) We are going to run code after `segmented_file_paths` has been generated, so let's also remove the method header and return.

3) We see the top portion of this code generates mappings called `l_for_mapping`

4) The middle portion calls `_apply_model()` to run the model on the mappings.

5) The bottom portion concatenates the per-segment DataFrames in `pred_dfs` into a single DataFrame and saves it as a csv

### Let's break it up just like that:

Let's also remove defining csv_names as there will only be 1 csv_name because of 1 model:

#### Top portion:

In [11]:
audio_file_path = cfg['audio_file']
process_pool = multiprocessing.Pool(cfg['num_processes'])

# One work item per audio segment: the segment entry, the (only) model and
# the path of the recording the segment was cut from.
l_for_mapping = [
    dict(
        audio_seg=audio_seg,
        model=cfg['models'][0],
        original_file_name=audio_file_path,
    )
    for audio_seg in segmented_file_paths
]

l_for_mapping

[{'audio_seg': {'audio_file': '../output/tmp/20210910_030000__0.00_30.00.wav',
   'offset': 0.0},
  'model': <models.bat_call_detector.model_detector.BatCallDetector at 0x28a6cb340>,
  'original_file_name': PosixPath('/Users/adityakrishna/Documents/Research/Lab_related/20210910_030000.WAV')},
 {'audio_seg': {'audio_file': '../output/tmp/20210910_030000__30.00_60.00.wav',
   'offset': 30.0},
  'model': <models.bat_call_detector.model_detector.BatCallDetector at 0x28a6cb340>,
  'original_file_name': PosixPath('/Users/adityakrishna/Documents/Research/Lab_related/20210910_030000.WAV')},
 {'audio_seg': {'audio_file': '../output/tmp/20210910_030000__60.00_90.00.wav',
   'offset': 60.0},
  'model': <models.bat_call_detector.model_detector.BatCallDetector at 0x28a6cb340>,
  'original_file_name': PosixPath('/Users/adityakrishna/Documents/Research/Lab_related/20210910_030000.WAV')},
 {'audio_seg': {'audio_file': '../output/tmp/20210910_030000__90.00_120.00.wav',
   'offset': 90.0},
  'model': <m

#### Middle portion:

In [12]:
def middle_portion():
    """
    Applies the model to every entry of l_for_mapping through process_pool.

    Returns the tqdm-wrapped imap iterator; the per-segment results are only
    produced as the caller iterates over it.
    """
    # Return the iterator instead of dropping it, so the caller can consume it.
    return tqdm(
        process_pool.imap(_apply_model, l_for_mapping, chunksize=1),
        desc=f"Applying {cfg['models'][0].get_name()}",
        total=len(l_for_mapping),
    )

#### Bottom portion:

In [13]:
def bottom_portion(pred_dfs):
    """
    Concatenates the per-segment DataFrames in pred_dfs and saves the result
    through _generate_csv.

    Parameters:
        pred_dfs: iterable of DataFrames, one per audio segment.

    Returns:
        The path of the written detections file.
    """
    # pred_dfs may be a lazy iterator, so materialise it exactly once.
    frames = list(pred_dfs)
    # pd.concat raises on an empty list; fall back to an empty frame with the
    # expected columns instead.
    agg_df = pd.concat(frames, ignore_index=True) if frames else gen_empty_df()

    return _generate_csv(agg_df, cfg['models'][0].get_name(),
        audio_file_path.name,
        cfg['output_dir'],
        cfg['should_csv']
    )

### To focus on the detection pipeline, let's dissect the middle portion now

#### Below is the relevant code with the middle portion packaged in a method to not interfere with workflow

In [14]:
def middle_portion():
    """
    Applies the model to every entry of l_for_mapping through process_pool.

    Returns the tqdm-wrapped imap iterator; each element is whatever
    _apply_model returns for one segment, produced as the caller iterates.
    """
    # Return the iterator instead of dropping it, so the caller can consume it.
    return tqdm(
        process_pool.imap(_apply_model, l_for_mapping, chunksize=1),
        desc=f"Applying {cfg['models'][0].get_name()}",
        total=len(l_for_mapping),
    )

def _apply_model(item):
    """
    Runs item['model'] on the segment file of item['audio_seg'] and returns the
    result of pipeline._correct_annotation_offsets for those annotations, the
    original file name and the segment offset.
    """
    segment = item['audio_seg']
    raw_annotations = item['model'].run(segment['audio_file'])
    return pipeline._correct_annotation_offsets(
        raw_annotations, item['original_file_name'], segment['offset']
    )

### The middle portion can be written as:

In [16]:
# Run the model on the segments one after another (no worker pool). The
# per-segment frames are collected first and concatenated once, because
# growing a DataFrame inside the loop copies all earlier rows every time.
seg_preds = [_apply_model(cur_seg) for cur_seg in tqdm(l_for_mapping)]
all_preds = pd.concat(seg_preds) if seg_preds else pd.DataFrame()

100%|██████████| 60/60 [08:49<00:00,  8.82s/it]


### Seeing this, we can decide to explore `_apply_model(item)`

In [17]:
def _apply_model(item):
    """
    Applies the model stored in item to the audio segment stored in item.

    Reads item['model'], item['audio_seg']['audio_file'],
    item['audio_seg']['offset'] and item['original_file_name'], and returns
    what pipeline._correct_annotation_offsets produces for the model output.
    """
    audio_seg = item['audio_seg']
    model_output = item['model'].run(audio_seg['audio_file'])
    corrected = pipeline._correct_annotation_offsets(
        model_output,
        item['original_file_name'],
        audio_seg['offset'],
    )
    return corrected

### To go deeper, we need to explore `item['model'].run(item['audio_seg']['audio_file'])`

- `item['model']` is a `models.bat_call_detector.model_detector.BatCallDetector` object
- `.run()` can be found in `src/models/bat_call_detector/model_detector.py`
- `item['audio_seg']['audio_file']` is a path to an audio segment.

### This is the `.run()` method in `src/models/bat_call_detector/model_detector.py`

In [None]:
def run(self, audio_file):
    """
    Runs both detectors on audio_file and uses the bat call detections to
    filter the feeding buzz detections.

    Parameters::
        audio_file: the audio file handed to both detectors.

    Intermediate results::
        bat calls: pd.DataFrame returned by self._run_batdetect.
        feeding buzz candidates: pd.DataFrame returned by self._run_feedbuzz.
        kept feeding buzzes: what self._buzzfeed_fp_removal returns for the
            two frames above.

    Return: pd.DataFrame
        The bat calls followed by the kept feeding buzzes.
    """
    bat_calls = self._run_batdetect(audio_file)
    buzz_candidates = self._run_feedbuzz(audio_file)
    kept_buzzes = self._buzzfeed_fp_removal(bat_calls, buzz_candidates)

    return pd.concat([bat_calls, kept_buzzes])

Here:
- `_run_batdetect(audio_file)` will provide the detections generated by batdetect2
- `_run_feedbuzz(audio_file)` will provide the detections generated by scikit-MAAD template matching
- Feedbuzzes are also compared against search-phase calls with `_buzzfeed_fp_removal(bd_output, fb_output)` for false positive elimination

### Below is the middle portion referred to above, this time broken up to show each detection pipeline:

In [None]:
# Same sequential loop as before, with model.run() unrolled into its steps so
# that the batdetect2 and feeding-buzz detections of each segment are visible.
seg_preds = []
for cur_seg in tqdm(l_for_mapping):
    detector = cur_seg['model']
    seg_file = cur_seg['audio_seg']['audio_file']
    seg_offset = cur_seg['audio_seg']['offset']
    original_file_name = cur_seg['original_file_name']

    # Run both detectors and the false-positive removal on the raw,
    # segment-relative annotations first, in the same order as run().
    # NOTE(review): offset correction is deliberately done afterwards, in case
    # _correct_annotation_offsets modifies its input frame - confirm.
    bd_annotations_df = detector._run_batdetect(seg_file)
    fb_annotations_df = detector._run_feedbuzz(seg_file)
    fb_final_output = detector._buzzfeed_fp_removal(bd_annotations_df, fb_annotations_df)

    bd_preds = pipeline._correct_annotation_offsets(
            bd_annotations_df,
            original_file_name,
            seg_offset
        )
    fb_preds = pipeline._correct_annotation_offsets(
            fb_final_output,
            original_file_name,
            seg_offset
        )

    seg_preds.extend([bd_preds, fb_preds])

# Concatenate once at the end; growing a DataFrame inside the loop copies all
# earlier rows on every iteration.
all_preds = pd.concat(seg_preds) if seg_preds else pd.DataFrame()

In [18]:
# Display the detections gathered over all segments (index values restart
# for every concatenated frame because no ignore_index was requested).
all_preds

Unnamed: 0,start_time,end_time,low_freq,high_freq,detection_confidence,event,input_file
0,161.411866,161.637555,17670.76,49011.86,0.266925,Feeding Buzz,/Users/adityakrishna/Documents/Research/Lab_re...
0,246.932500,246.941800,28046.00,37602.00,0.549000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
1,247.216500,247.226400,28046.00,38133.00,0.573000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
2,247.352500,247.362700,28046.00,38892.00,0.555000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
3,247.471500,247.479400,27187.00,42404.00,0.763000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
...,...,...,...,...,...,...,...
49,1781.446500,1781.456500,25468.00,32664.00,0.584000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
50,1781.578500,1781.589100,26328.00,32298.00,0.577000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
51,1781.938500,1781.947700,26328.00,34069.00,0.614000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...
52,1782.061500,1782.070200,25468.00,34393.00,0.600000,Echolocation,/Users/adityakrishna/Documents/Research/Lab_re...


In [None]:
def _run_batdetect(self, audio_file)-> pd.DataFrame:
    """
    Runs the batdetect2 detector on one audio file.

    Parameters::
        audio_file: a path containing the post-processed wav file.

    Returns:: a pd.Dataframe containing the bat calls detections. When there
        are no annotations this is the empty frame from gen_empty_df();
        otherwise 'det_prob' is exposed as 'detection_confidence' and the
        'class', 'class_prob', 'det_prob' and 'individual' columns are dropped.
    """
    model, params = du.load_model(self.model_path)

    # Suppress output from this call. Remember the stream that is installed
    # right now and always put that one back: inside a notebook sys.stdout is
    # not sys.__stdout__, so restoring sys.__stdout__ would detach every later
    # print from the notebook, and without try/finally an exception would
    # leave stdout swallowed.
    previous_stdout = sys.stdout
    sys.stdout = io.StringIO()
    try:
        model_output = du.process_file(
            audio_file=audio_file,
            model=model,
            params=params,
            args= {
                'detection_threshold': self.detection_threshold,
                'spec_slices': self.spec_slices,
                'chunk_size': self.chunk_size,
                'quiet': self.quiet,
                'spec_features' : False,
                'cnn_features': self.cnn_features,
            },
            time_exp=self.time_expansion_factor,
        )
    finally:
        # Restore stdout
        sys.stdout = previous_stdout

    annotations = model_output['pred_dict']['annotation']

    out_df = gen_empty_df()
    if annotations:
        out_df = pd.DataFrame.from_records(annotations)
        out_df['detection_confidence'] = out_df['det_prob']
        out_df.drop(columns = ['class', 'class_prob', 'det_prob','individual'], inplace=True) # <--- These are the batdetect2 variables being dropped!
    return out_df

In [None]:
def process_file(audio_file, model, params, args, time_exp=None, top_n=5, return_raw_preds=False, max_duration=False): # <--- This method is from batdetect (not batdetect2)
    """
    Runs the detection model over one audio file, chunk by chunk.

    The audio is loaded, optionally truncated to max_duration, and split into
    chunks of args['chunk_size'] seconds. For every chunk a spectrogram is
    computed, the model is evaluated without gradients, non-max suppression is
    applied and the predicted start/end times are shifted by the chunk's start
    time. The per-chunk results are then merged and converted into the output
    dictionary.

    Parameters:
        audio_file: audio file to process; its basename is used as the file id.
        model: callable invoked as model(spec, return_feats=...).
        params: model parameters; 'target_samp_rate', 'scale_raw_audio' and
            'class_names' are read here, and 'detection_threshold' is
            OVERWRITTEN with args['detection_threshold'] (the caller's dict is
            modified).
        args: run options; the keys read here are 'detection_threshold',
            'spec_features', 'spec_slices', 'chunk_size', 'cnn_features',
            'quiet', and 'time_expansion_factor' (only when time_exp is None).
        time_exp: time expansion factor; taken from args when None.
        top_n: number of classes listed in the printed summary.
        return_raw_preds: when true, return the merged predictions instead of
            the converted results.
        max_duration: False for no limit, otherwise the number of seconds of
            audio to keep from the start of the file.

    Returns:
        The merged predictions when return_raw_preds is true, otherwise the
        value of du.convert_results (read by callers via
        ['pred_dict']['annotation']).
    """

    # store temporary results here
    predictions = []
    spec_feats  = []
    cnn_feats   = []
    spec_slices = []

    # get time expansion  factor
    if time_exp is None:
        time_exp = args['time_expansion_factor']

    # NOTE: this writes into the params dict supplied by the caller
    params['detection_threshold'] = args['detection_threshold']

    # load audio file
    sampling_rate, audio_full = au.load_audio_file(audio_file, time_exp,
                                   params['target_samp_rate'], params['scale_raw_audio'])

    # clipping maximum duration
    if max_duration is not False:
        # convert seconds to samples, never exceeding the available audio
        max_duration = np.minimum(int(sampling_rate*max_duration), audio_full.shape[0])
        audio_full = audio_full[:max_duration]
    
    duration_full = audio_full.shape[0] / float(sampling_rate)

    # the numpy spectrogram is only requested when a feature option needs it
    return_np_spec = args['spec_features'] or args['spec_slices']

    # loop through larger file and split into chunks
    # TODO fix so that it overlaps correctly and takes care of duplicate detections at borders
    num_chunks = int(np.ceil(duration_full/args['chunk_size']))
    for chunk_id in range(num_chunks):

        # chunk
        chunk_time   = args['chunk_size']*chunk_id            # chunk start, in seconds
        chunk_length = int(sampling_rate*args['chunk_size'])  # chunk size, in samples
        start_sample = chunk_id*chunk_length
        end_sample   = np.minimum((chunk_id+1)*chunk_length, audio_full.shape[0])
        audio = audio_full[start_sample:end_sample]

        # load audio file and compute spectrogram
        duration, spec, spec_np = du.compute_spectrogram(audio, sampling_rate, params, return_np_spec)

        # evaluate model
        with torch.no_grad():
            outputs = model(spec, return_feats=args['cnn_features'])

        # run non-max suppression
        pred_nms, features = pp.run_nms(outputs, params, np.array([float(sampling_rate)]))
        pred_nms = pred_nms[0]
        # times come back relative to the chunk; make them relative to the file
        pred_nms['start_times'] += chunk_time
        pred_nms['end_times'] += chunk_time

        # if we have a background class
        # (more class_probs rows than class names: the last row is dropped)
        if pred_nms['class_probs'].shape[0] > len(params['class_names']):
            pred_nms['class_probs'] = pred_nms['class_probs'][:-1, :]

        predictions.append(pred_nms)

        # extract features - if there are any calls detected
        if (pred_nms['det_probs'].shape[0] > 0):
            if args['spec_features']:
                spec_feats.append(feats.get_feats(spec_np, pred_nms, params))

            if args['cnn_features']:
                cnn_feats.append(features[0])

            if args['spec_slices']:
                spec_slices.extend(feats.extract_spec_slices(spec_np, pred_nms, params))

    # convert the predictions into output dictionary
    file_id = os.path.basename(audio_file)
    predictions, spec_feats, cnn_feats, spec_slices =\
              du.merge_results(predictions, spec_feats, cnn_feats, spec_slices)
    results = du.convert_results(file_id, time_exp, duration_full, params,
                              predictions, spec_feats, cnn_feats, spec_slices)

    # summarize results
    if not args['quiet']:
        num_detections = len(results['pred_dict']['annotation'])
        print('{}'.format(num_detections) + ' call(s) detected above the threshold.')

    # print results for top n classes
    # (num_detections only exists when not quiet; the `and` short-circuits first)
    if not args['quiet'] and (num_detections > 0):
        class_overall = pp.overall_class_pred(predictions['det_probs'], predictions['class_probs'])
        print('species name'.ljust(30) + 'probablity present')
        for cc in np.argsort(class_overall)[::-1][:top_n]:
            print(params['class_names'][cc].ljust(30) + str(round(class_overall[cc], 3)))

    if return_raw_preds:
        return predictions
    else:
        return results