Install Librosa

In [2]:
# Shell escape: install librosa from the conda-forge channel; -y answers the confirmation prompt automatically.
!conda install -y -c conda-forge librosa

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - librosa


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _openmp_mutex-4.5          |            1_gnu          22 KB
    appdirs-1.4.4              |     pyh9f0ad1d_0          13 KB  conda-forge
    audioread-2.1.9            |   py37h89c1867_1          33 KB  conda-forge
    conda-4.11.0               |   py37h89c1867_0        16.9 MB  conda-forge
    ffmpeg-4.2.2               |       h20bf706_0        59.6 MB
    gettext-0.19.8.1           |    h0b5b191_1005         3.6 MB  conda-forge
    gnutls-3.6.13              |       h85f3911_1         2.0 MB  conda-forge
    lame-3.100                 |    h7f98852_1001         496 KB  conda-forge
    libflac-1.3.3              |       h9c3ff4c_1         486 KB  conda-forg

Import Audio modules

In [3]:
from audio_features import AudioFeatures, calc_log_mel_energy_features
from audio_signal import AudioSignal
from audio_labels import load_labels, clean_overlapping_labels
from noise_reducer import NoiseReducer

Import general libraries

In [4]:
import os
import librosa
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import tempfile

from time import gmtime, strftime 
from datetime import datetime
from IPython.display import display

Import SKLearn

In [22]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

Import Sagemaker

In [6]:
import sagemaker 
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
import boto3
import s3fs

Check Pandas version and update pandas

In [7]:
# Show the pandas version currently installed in the kernel (checked before the update below).
pd.__version__

'1.0.1'

In [7]:
# Update pandas through conda. There is no leading "!" or "%", so this relies on IPython automagic
# resolving the line to the %conda magic. The kernel must be restarted for the new version to load.
conda update pandas

Collecting package metadata (current_repodata.json): done
Solving environment: - 
  - defaults/linux-64::six-1.15.0-py37h06a4308_0, defaults/linux-64::tenacity-8.0.1-py37h06a4308_0, defaults/noarch::plotly-5.1.0-pyhd3eb1b0_0
  - defaults/linux-64::plotly-3.6.1-py37_0, defaults/linux-64::retrying-1.3.3-py37_2, defaults/noarch::six-1.16.0-pyhd3eb1b0done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - pandas


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    appdirs-1.4.4              |     pyhd3eb1b0_0          12 KB
    autovizwidget-0.18.0       |     pyhd3eb1b0_0          14 KB
    ca-certificates-2021.10.26 |       h06a4308_2         115 KB
    certifi-2021.10.8          |   py37h06a4308_0         151 KB
    cryptography-35.0.0        |   py37hd23ed53_0         1.3 MB
    gnutls-3.6.15              |       he1e5248_0         1.0 MB
    hdijupyteru

Function to get file keys from S3 bucket

In [9]:
def get_file_keys(bucket, prefix, client=None):
    """Return every object key stored under ``prefix`` in an S3 bucket.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix to list (e.g. ``'Training_Data/Labels/'``).
        client: Optional boto3 S3 client; a new one is created when omitted.

    Returns:
        A list of key strings in the order S3 returns them. The list is
        empty when nothing matches the prefix.
    """
    if client is None:
        client = boto3.client('s3')

    # A single list call returns at most 1000 keys, so walk all result pages.
    paginator = client.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is absent from the response when no object matches.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys

Create client and get key lists

In [14]:
# One S3 client is shared by both listings.
client = boto3.client('s3')

# [1:] drops the first key returned for each prefix — presumably the folder placeholder object; verify against the bucket.
audio_keys = get_file_keys('demo-bucket', 'Training_Data/Original_Audio/', client)[1:]
label_keys = get_file_keys('demo-bucket', 'Training_Data/Labels/', client)[1:]

# Sanity check: dataset_from_keys asserts that audio and label files pair up one-to-one.
print(len(audio_keys))
print(len(label_keys))

154
154


Get audio file information from file key

In [15]:
def parse_file_key_info(key):
    """Extract the timestamp encoded in an audio file key.

    Only the file name (the part of ``key`` after the last '/') is used. It
    is split on '_'; the third field must read ``YYYY-MM-DD`` and the fourth
    must have exactly four dot-separated parts, ``HH.MM.SS.<suffix>``.

    Returns:
        dict with integer values under 'year', 'month', 'day', 'hour',
        'minute' and 'second'.

    Raises:
        IndexError / ValueError: if the file name does not follow that layout.
    """
    fields = key.split('/')[-1].split('_')
    year, month, day = fields[2].split('-')
    hour, minute, second, _suffix = fields[3].split('.')

    names = ('year', 'month', 'day', 'hour', 'minute', 'second')
    raw_values = (year, month, day, hour, minute, second)
    return {name: int(raw) for name, raw in zip(names, raw_values)}

#print(parse_file_key_info(audio_keys[0]))

Calculate age and day/night variables

In [16]:
def calc_age_DN(key, nrow, start_date=datetime(2021,5,11)):
    """Build constant age and day/night columns for one audio file.

    Args:
        key: S3 key of the audio file; its name is parsed with
            ``parse_file_key_info`` to get the recording date and hour.
        nrow: Number of rows (feature frames) the columns must have.
        start_date: Reference datetime that age is measured from.

    Returns:
        ``(age, dn)``, both arrays of shape ``(nrow, 1)``:
        * ``age`` - seconds between ``start_date`` and midnight of the
          recording date (the time of day is ignored).
        * ``dn`` - 1 for daytime (hour from 8 up to, but not including, 22),
          0 for nighttime (before 8am, or 10pm and later).
    """
    stamp = parse_file_key_info(key)

    # Daytime is 08:00 <= hour < 22:00; everything else counts as night.
    is_daytime = 1 if 8 <= stamp['hour'] < 22 else 0
    dn = np.full((nrow, 1), is_daytime)

    # Only the calendar date is used, so age changes in whole-day steps.
    recorded_on = datetime(stamp['year'], stamp['month'], stamp['day'])
    age_seconds = (recorded_on - start_date).total_seconds()
    age = np.full((nrow, 1), age_seconds)

    return age, dn

Declare audio labels for classification

In [17]:
# Call types to classify. dataset_from_keys assigns these to features.event_names and encodes the
# label columns as 1..n (0 = no call) — presumably in this list order; confirm against AudioFeatures.
label_list = ['cluck', 'trill', 'squawk']

Helper function to load an audio feature from S3, optional arguments to run noise reduction algorithm and calculate deltas

In [18]:
def load_audio_features(bucket, tempfile, file_key, deltas=False, noise_reducer=False):
    """Download one audio file from S3 and compute its log-mel energy features.

    Args:
        bucket: boto3 ``Bucket`` resource holding the audio file.
        tempfile: Scratch-file object exposing ``.name`` (e.g. a
            ``NamedTemporaryFile``); the download is written to that path.
            Note the parameter shadows the ``tempfile`` module inside this
            function.
        file_key: S3 key of the audio file.
        deltas: When true, stack the first- and second-order deltas onto the
            base features.
        noise_reducer: When true, pass the signal through
            ``NoiseReducer.reduce_noise`` before computing features.

    Returns:
        The feature object returned by ``calc_log_mel_energy_features``
        (stacked with its deltas when ``deltas`` is true).
    """
    audio_obj = bucket.Object(file_key)
    with open(tempfile.name, 'wb') as f:
        audio_obj.download_fileobj(f)

    # Read the file only after the write handle is closed: while it is open,
    # the tail of the download may still sit in the write buffer, so a reader
    # opening the path by name could see a truncated file.
    signal = AudioSignal.from_file(tempfile.name)
    if noise_reducer:
        signal = NoiseReducer.reduce_noise(signal)

    features = calc_log_mel_energy_features(signal)
    if deltas:
        # Separate names keep the boolean ``deltas`` flag from being rebound.
        first_deltas = features.deltas()
        second_deltas = first_deltas.deltas()
        features = AudioFeatures.stack(features, first_deltas, second_deltas)
    return features

Function to generate numpy data arrays from list of audio and label file keys. Boolean arguments to add deltas, run through noise reduction algo, and calculate age/day and night variables.

In [19]:
def dataset_from_keys(bucket_name, audio_keys, label_keys, label_list=None, deltas=False, noise_reducer=False, calc_age_dn=False):
    """Build feature and label arrays from matching audio and label files in S3.

    Args:
        bucket_name: Name of the S3 bucket holding both kinds of file.
        audio_keys: S3 keys of the audio files.
        label_keys: S3 keys of the label files; after sorting, each must pair
            with the audio file at the same position.
        label_list: Event names assigned to ``features.event_names`` before
            labels are matched.
        deltas: Forwarded to ``load_audio_features``.
        noise_reducer: Forwarded to ``load_audio_features``.
        calc_age_dn: When true, prepend the age and day/night columns from
            ``calc_age_DN`` to the feature matrix.

    Returns:
        ``(x, y)``: ``x`` is the feature matrix; ``y`` holds one class id per
        row, where 0 means no event and ``i + 1`` refers to column ``i`` of
        the combined ``true_events`` array. When several events are active in
        one row, the highest id wins.

    Raises:
        AssertionError: if audio and label file names do not pair up.
        ValueError: if none of the files could be loaded.
    """
    s3 = boto3.resource('s3', region_name='us-east-1')
    bucket = s3.Bucket(bucket_name)

    # Sort copies so the caller's lists are not reordered as a side effect.
    audio_keys = sorted(audio_keys)
    label_keys = sorted(label_keys)
    audio_names = [''.join(fname.split('/')[-1].split('_')) for fname in audio_keys] #NEED TO CHANGE TO .split('_')[:-1] IF NOT USING DENOISED
    label_names = [''.join(fname.split('/')[-1].split('_')[:-1])+'.flac' for fname in label_keys]

    assert audio_names == label_names, 'Files do not match' #check if audio files match label files

    feature_files = []
    day_night = []
    ages = []

    # The context manager guarantees the scratch file is closed and removed.
    with tempfile.NamedTemporaryFile() as tmp:
        for ak, lk in zip(audio_keys, label_keys):
            try:
                features = load_audio_features(bucket, tmp, ak, deltas=deltas, noise_reducer=noise_reducer)
                labels = load_labels(bucket.Object(lk).get()['Body'])
            except RuntimeError:
                # Best effort: report the file that failed to load and move on.
                print(ak.split('/')[-1])
                continue

            if calc_age_dn:
                nrow = features.features.shape[0]
                age, dn = calc_age_DN(ak, nrow)
                day_night.append(dn)
                ages.append(age)

            features.event_names = label_list
            features.match_labels(labels)
            feature_files.append(features)

    if not feature_files:
        raise ValueError('No audio files could be loaded; cannot build a dataset')

    combined_features = AudioFeatures.concatenate(*feature_files)

    if calc_age_dn:
        age_vector = np.vstack(ages)
        dn_vector = np.vstack(day_night)
        x = np.hstack((age_vector, dn_vector, combined_features.features))
    else:
        x = combined_features.features

    # Replace each positive entry in column i with the class id i + 1, then
    # collapse the columns to a single label per row.
    combined_labels = combined_features.true_events
    for i in range(combined_labels.shape[1]):
        combined_labels[combined_labels[:,i] > 0, i] = i+1

    y = np.max(combined_labels, axis = -1)
    return (x, y)

Create training dataset

In [20]:
bucket_name = 'demo-bucket'
# Build the full dataset with deltas, noise reduction and the age / day-night columns all enabled.
x, y = dataset_from_keys(bucket_name, audio_keys, label_keys, label_list, deltas=True, noise_reducer=True, calc_age_dn=True)
# Quick look at the result: overall shape, then the first feature row.
print(x.shape)
print(x[0,:])





Split data into train/validate/test

In [24]:
# Hold out 15% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
# Carve the validation set out of the remaining 85%, giving roughly a 70/15/15 train/validate/test split.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1765, random_state=1) # 0.1765 x 0.85 = 0.15

Function to export data to S3

In [16]:
from io import StringIO
def write_data_to_s3(x, y, bucket_name, filename, prefix='Demo/Demo_Model_Data'):
    """Upload a dataset to S3 as a header-less CSV with the label first.

    Args:
        x: 2-D feature matrix, one sample per row.
        y: Labels, one per row of ``x``; reshaped to a single column, so both
            shape ``(n,)`` and ``(n, 1)`` are accepted.
        bucket_name: Destination S3 bucket.
        filename: File name of the object; the key is ``prefix + '/' + filename``.
        prefix: Key prefix ("folder") inside the bucket.

    The CSV holds ``y`` in its first column followed by the columns of ``x``.
    """
    label_column = y.reshape(y.shape[0], 1)
    table = np.hstack((label_column, x))

    text_buffer = StringIO()
    np.savetxt(text_buffer, table, delimiter=',')

    object_key = prefix + '/' + filename
    boto3.resource('s3').Object(bucket_name, object_key).put(Body=text_buffer.getvalue())

Export training and validation sets

In [17]:
# Upload each split under the default prefix of write_data_to_s3 ('Demo/Demo_Model_Data').
write_data_to_s3(x_train, y_train, bucket_name, 'train.csv')
write_data_to_s3(x_val, y_val, bucket_name, 'validate.csv')
write_data_to_s3(x_test, y_test, bucket_name, 'test.csv')

Helper function to upsample minority classes to the same size as the majority class.

In [33]:
def upsample(x, y, seed=47):
    """Balance classes by resampling minority classes with replacement.

    Every class other than the most frequent one gets extra rows, drawn at
    random with replacement from its own samples, until it has as many rows
    as the majority class.

    Args:
        x: 2-D feature matrix, one sample per row.
        y: 1-D array of class labels aligned with the rows of ``x``.
        seed: Seed for the private random generator; the same seed always
            yields the same resampled rows.

    Returns:
        ``(x_up, y_up)``: the original rows followed by the extra rows
        (grouped by class in ascending label order), and the matching labels
        as a column vector of shape ``(n, 1)``.
    """
    # A private generator gives the same draws as seeding the global one,
    # without resetting numpy's global random state for the rest of the
    # notebook.
    rng = np.random.RandomState(seed)

    classes, counts = np.unique(y, return_counts=True)
    majority_count = np.max(counts)
    majority_class = classes[np.argmax(counts)]

    resamps = [x]
    ys = [y[:, None]]
    for clss, cnt in zip(classes, counts):
        if clss == majority_class:
            continue
        n_extra = majority_count - cnt
        sub_x = x[np.where(y == clss)]
        picks = rng.choice(np.arange(cnt), size=n_extra, replace=True)
        resamps.append(sub_x[picks])
        ys.append(np.full(shape=(n_extra, 1), fill_value=clss))

    return (np.vstack(resamps), np.vstack(ys))

Create training set with balanced classes and write to s3

In [16]:
# Balance only the training split; validation and test keep their natural class distribution.
upsampled_x_train, upsampled_y_train = upsample(x_train, y_train)
# upsample returns labels as an (n, 1) column, which write_data_to_s3 accepts via its reshape.
write_data_to_s3(upsampled_x_train, upsampled_y_train, bucket_name, 'upsampled_train.csv')

Initiate XGBoost

In [25]:
# Execution role of this notebook; it is passed to the estimator/model created below.
role = sagemaker.get_execution_role()
# XGBoost container image URI for the current region.
# NOTE(review): version='latest' may not resolve to the newest XGBoost release — confirm which version
# the saved model needs and consider pinning it explicitly.
container = sagemaker.image_uris.retrieve(region=boto3.Session().region_name, framework='xgboost', version='latest')
sess = sagemaker.Session()

Uncomment to use with hyperparameter tuning or to fit model.

In [45]:
# Disabled cell: the estimator setup is wrapped in a string literal, so nothing below runs.
# Remove the surrounding triple quotes to create the estimator before fitting or tuning.
'''
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://demo-bucket/Howell_Models/Demo_Model',
                                    sagemaker_session=sess)
xgb.set_hyperparameters(objective = 'multi:softmax',
                        num_class=(len(label_list)+1))
'''



Read in S3 data

In [46]:
# CSV training/validation channels for the estimator or tuner.
# NOTE(review): these URIs point at Howell_Models/Data/Demo_Model_Data, while write_data_to_s3 uploads to
# the 'Demo/Demo_Model_Data' prefix by default — confirm the files exist at this location.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data="s3://demo-bucket/Howell_Models/Data/Demo_Model_Data/train.csv", content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data="s3://demo-bucket/Howell_Models/Data/Demo_Model_Data/validate.csv", content_type='csv')

Fit model. Uncomment to fit model.

In [48]:
#xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

XGBoost Hyperparameter Tuning. Uncomment to tune model.

In [47]:
# Disabled cell: the tuning job is wrapped in a string literal, so nothing below runs.
# Remove the surrounding triple quotes to launch it. It passes `xgb` to HyperparameterTuner, so the
# estimator cell above must be enabled first; `xgb` is rebound to a Model by the pre-trained-model cell.
'''
#from https://towardsdatascience.com/xgboost-in-amazon-sagemaker-28e5e354dbcd
# Define exploration boundaries (default suggested values from Amazon SageMaker Documentation)
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0, 1000, scaling_type="Auto"),
    'colsample_bylevel': ContinuousParameter(0.1, 1,scaling_type="Logarithmic"),
    'colsample_bytree': ContinuousParameter(0.5, 1, scaling_type='Logarithmic'),
    'eta': ContinuousParameter(0.1, 0.5, scaling_type='Logarithmic'),
    'gamma':ContinuousParameter(0, 5, scaling_type='Auto'),
    'lambda': ContinuousParameter(0,100,scaling_type='Auto'),
    'max_delta_step': IntegerParameter(0,10,scaling_type='Auto'),
    'max_depth': IntegerParameter(0,10,scaling_type='Auto'),
    'min_child_weight': ContinuousParameter(0,10,scaling_type='Auto'),
    'num_round': IntegerParameter(1,4000,scaling_type='Auto'),
    'subsample': ContinuousParameter(0.5,1,scaling_type='Logarithmic')}

objective_metric_name = 'validation:mlogloss'

tuner_log = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
    strategy='Bayesian',
    objective_type = 'Minimize'
)

## Starts the hyperparameter tuning job
tuner_log.fit({'train': s3_input_train, 'validation': s3_input_validation}, include_cls_metadata=False)

## Prints the status of the latest hyperparameter tuning job
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']
'''

.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


'Completed'

Uncomment to deploy best model once tuning job is completed.

In [49]:
#from sagemaker.serializers import CSVSerializer
#xgb_predictor = tuner_log.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=CSVSerializer())
#xgb_predictor.endpoint_name

Load pretrained model. Run this cell to use the saved pre-trained model instead of fitting or tuning a new one above.

In [26]:
#sess = sagemaker.Session()
# Read in the pre-trained model
print("Reading in pre-trained model")
# S3 location of the saved model artifact.
model_data = 's3://demo-bucket/final_trained_model/model.tar.gz'
from sagemaker.predictor import Predictor
    
# Wrap the saved artifact in a deployable Model that uses the same container image, role and session
# created in the "Initiate XGBoost" cell. This binds the name `xgb` to a Model, not an Estimator.
xgb = sagemaker.model.Model(image_uri=container, 
                            model_data=model_data, 
                            role=role,
                            predictor_cls=Predictor,
                            sagemaker_session=sess)

Reading in pre-trained model


Deploy prediction endpoint

In [28]:
from sagemaker.serializers import CSVSerializer
# Deploy the loaded model to a single-instance endpoint; CSVSerializer encodes the arrays passed to
# predictor.predict as CSV.
xgb_predictor=xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=CSVSerializer())

# Keep the endpoint name at hand (e.g. to locate or delete the endpoint later).
name = xgb_predictor.endpoint_name

--------!



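Delete endpoint. RUN THIS CELL ONLY AFTER ALL INFERENCE IS COMPLETED: the prediction cells below still need the endpoint, so skip this cell when running the notebook from top to bottom.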

In [36]:
# Deletes the endpoint together with its endpoint config. Cells further down (predict, predict_sounds)
# still call xgb_predictor, so run this last.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

Function to predict labels.

In [29]:
#https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-test-model.html
def predict(data, predictor, rows=1000):
    """Predict labels for ``data`` by calling the endpoint in row batches.

    Args:
        data: 2-D array, one sample per row.
        predictor: Object whose ``predict(batch)`` method returns the
            predictions as UTF-8 encoded bytes.
        rows: Maximum number of rows sent per request.

    Returns:
        1-D float array with one prediction per input row, in input order.
        Empty input returns an empty array without calling the endpoint.

    Raises:
        ValueError: if ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValueError('rows must be a positive number')

    n_rows = data.shape[0]
    if n_rows == 0:
        return np.array([], dtype=float)

    # Smallest number of batches that keeps every batch at or below ``rows``;
    # it never exceeds n_rows, so no empty batch is sent to the endpoint.
    n_batches = int(np.ceil(n_rows / float(rows)))
    responses = [predictor.predict(batch).decode('utf-8')
                 for batch in np.array_split(data, n_batches)]

    # Accept both comma- and newline-separated responses (assumption: some
    # container versions use newlines — verify for the deployed image).
    tokens = ','.join(responses).replace('\n', ',').split(',')
    return np.array([float(tok) for tok in tokens if tok.strip()], dtype=float)

Predict labels on test set.

In [30]:
# Requires the deployed endpoint (xgb_predictor) to still exist.
preds=predict(x_test, xgb_predictor)

Display confusion matrix and classification report.

In [31]:
# Fix the class order once and use it for both metrics. This keeps the matrix 4x4 even if a class is
# missing from y_test/preds, and keeps its rows/columns in the same order as the report.
class_ids = [0., 1., 2., 3.]
class_names = ['No Call', 'Cluck', 'Trill', 'Squawk']

# Rows are true classes, columns are predicted classes.
confusion_matrix_ = confusion_matrix(y_test, preds, labels=class_ids)
print("The confusion matrix of the model is:")
conf_mat = pd.DataFrame(confusion_matrix_)
display(conf_mat)

# classification_report comes from sklearn.metrics (see the SKLearn import cell).
report = classification_report(y_test, preds, labels=class_ids, target_names=class_names, digits=3)
print(report)

The confusion matrix of the model is:


Unnamed: 0,0,1,2,3
0,53095,703,37,91
1,271,251,4,15
2,43,3,21,9
3,61,33,7,79


              precision    recall  f1-score   support

     No Call      0.993     0.985     0.989     53926
       Cluck      0.254     0.464     0.328       541
       Trill      0.304     0.276     0.290        76
      Squawk      0.407     0.439     0.422       180

    accuracy                          0.977     54723
   macro avg      0.490     0.541     0.507     54723
weighted avg      0.983     0.977     0.979     54723



Helper function to save dataframe

In [32]:
def save_dataframe(df, filename, prefix = 'Demo/Demo_metrics', bucket_name = 'demo-bucket'):
    """Upload a DataFrame to S3 as a CSV file.

    Args:
        df: DataFrame to save; written with ``DataFrame.to_csv`` defaults, so
            the index is included.
        filename: File name of the object; the key is ``prefix + '/' + filename``.
        prefix: Key prefix ("folder") inside the bucket.
        bucket_name: Destination bucket. Defaults to the bucket that used to
            be hard-coded here, so existing calls behave as before.
    """
    csv_buffer = StringIO()
    df.to_csv(csv_buffer)
    s3_resource = boto3.resource('s3')
    s3_resource.Object(bucket_name, prefix+'/'+filename).put(Body=csv_buffer.getvalue())

In [35]:
from io import StringIO
# Store the confusion matrix under the default metrics prefix.
save_dataframe(conf_mat, 'confusion_matrix.csv')
#save_dataframe(report, 'report_dataframe.csv')
# Two columns per test row: predicted label first, true label second.
save_dataframe(pd.DataFrame(np.hstack((preds[:,None], y_test[:,None]))), 'y_pred_y_true.csv')

In [75]:
def count_sounds(y):
    """Count how many entries of ``y`` fall into each of the classes 0-3.

    Args:
        y: 1-D array of class labels.

    Returns:
        Array ``[n_0, n_1, n_2, n_3]`` holding the number of entries equal
        to 0, 1, 2 and 3.

    Raises:
        AssertionError: if ``y`` contains a value other than 0, 1, 2 or 3
            (the counts would not add up to its length).
    """
    tallies = np.array([np.count_nonzero(y == label) for label in (0, 1, 2, 3)])

    # Every entry must belong to one of the four known classes.
    assert tallies.sum() == y.shape[0]

    return tallies

Predict sounds from another bucket

In [73]:
import time
from io import StringIO
bucket_name='demo-bucket'
# Scratch file reused for every audio download.
tmp = tempfile.NamedTemporaryFile()

# Per-day S3 prefixes: 2021-08-24 .. 2021-08-31, then 2021-09-01 .. 2021-09-17.
days_list = ["2021-08-{:02d}/".format(day) for day in range(24, 32)] \
    + ["2021-09-{:02d}/".format(day) for day in range(1, 18)]

# hours_list is passed to predict_sounds but was never defined in this notebook, which raises a
# NameError on a fresh kernel. The logged run prints hours "00", "01", ... so one zero-padded
# sub-prefix per hour of the day is assumed here — TODO confirm against the bucket layout.
hours_list = ["{:02d}/".format(hour) for hour in range(24)]

def predict_sounds(bucket_name, days_list, hours_list, label_list, start_date, deltas=True, noise_reducer=True, calc_age_dn=True, predictor=None, max_rows_per_file=2400):
    """Predict call counts per hour and save one CSV of counts per day.

    For every ``day + hour`` prefix, the audio files under it are loaded,
    classified through the endpoint, and summarised as the number of rows
    predicted for each class. Each day's hourly counts are saved with
    ``save_dataframe`` as ``<day>_labels.csv`` under 'Demo/Predicted_Labels'.

    Args:
        bucket_name: S3 bucket holding the audio files.
        days_list: Day prefixes ending in '/' (e.g. ``'2021-08-24/'``).
        hours_list: Hour sub-prefixes appended to each day prefix.
        label_list: Class names used as count column headers after
            'date' and 'no_sound'.
        start_date: Reference datetime passed to ``calc_age_DN``.
        deltas: Forwarded to ``load_audio_features``.
        noise_reducer: Forwarded to ``load_audio_features``.
        calc_age_dn: When true, prepend age and day/night columns to the
            features.
        predictor: Endpoint predictor; defaults to the notebook-level
            ``xgb_predictor``.
        max_rows_per_file: Keep at most this many feature rows per file
            (``None`` keeps all rows).

    Hours with no files, or where no file could be loaded, are reported and
    skipped; a day with no usable hour produces no CSV.
    """
    if predictor is None:
        predictor = xgb_predictor  # fall back to the endpoint deployed in the notebook

    s3 = boto3.resource('s3', region_name='us-east-1')
    bucket = s3.Bucket(bucket_name)
    tic = time.time()
    client = boto3.client('s3')

    # Own scratch file instead of relying on a notebook-level global; the
    # context manager closes and removes it when the run ends.
    with tempfile.NamedTemporaryFile() as scratch:
        for day in days_list:
            class_by_file = []
            day_name = day.split("/")[-2]
            print("day ",day_name,"\n")
            for hour in hours_list:
                features_by_file = []
                hour_name = hour.split("/")[0]
                print("hour ",hour_name,"\n")
                S3_location = day + hour

                try:
                    # [1:] drops the first listed key, as in the other listings in this notebook.
                    keys = get_file_keys(bucket_name, S3_location, client)[1:]
                except KeyError:
                    # Listing helpers that index 'Contents' directly raise when the prefix is empty.
                    keys = []
                if not keys:
                    print("no files found, skipping hour \n")
                    continue

                # The timestamp is taken from the second file as before; fall
                # back to the only file when the hour holds just one.
                info = parse_file_key_info(keys[1] if len(keys) > 1 else keys[0])

                for key in keys:
                    try:
                        features = load_audio_features(bucket, scratch, key, deltas=deltas, noise_reducer=noise_reducer)
                        x = features.features
                        nrow = x.shape[0]
                        if calc_age_dn:
                            age, dn = calc_age_DN(key, nrow, start_date=start_date)
                            x = np.hstack((age, dn, x))
                        if max_rows_per_file is not None and nrow > max_rows_per_file:
                            x = x[0:max_rows_per_file]
                        features_by_file.append(x)
                    except RuntimeError:
                        # Best effort: mark the unreadable file and keep going.
                        print("@", end="")
                        continue

                if not features_by_file:
                    print("no readable files, skipping hour \n")
                    continue

                x_pred = np.vstack(features_by_file)
                y_pred = predict(x_pred, predictor)
                counts = count_sounds(y_pred)
                date = [datetime(info['year'],info['month'],info['day'], info['hour'])]
                class_by_file.append(np.hstack((date, counts)))
                toc = time.time()
                print('Elapsed time is %f seconds \n' % float(toc - tic))

            if not class_by_file:
                print("no predictions for day ", day_name, "\n")
                continue
            pred_labels_df = pd.DataFrame(np.vstack(class_by_file), columns = ['date', 'no_sound']+label_list)
            save_dataframe(pred_labels_df, f'{day_name}_labels.csv', 'Demo/Predicted_Labels')

In [None]:
# NOTE(review): datetime(xxxx,x,xx) is a placeholder, not runnable code. Replace it with the real
# start date (calc_age_DN measures age in seconds from it) before running this cell.
predict_sounds(bucket_name, days_list, hours_list, label_list, start_date = datetime(xxxx,x,xx))

day  2021-08-24 

hour  00 

Elapsed time is 134.768576 seconds 

hour  01 

@Elapsed time is 269.752064 seconds 

hour  02 

@Elapsed time is 400.064224 seconds 

hour  03 

Elapsed time is 528.698914 seconds 

hour  04 

Elapsed time is 655.766397 seconds 

hour  05 

@@@@@@Elapsed time is 770.634146 seconds 

hour  06 

Elapsed time is 900.429334 seconds 

hour  07 

@Elapsed time is 1029.437265 seconds 

hour  08 

@Elapsed time is 1159.065686 seconds 

hour  09 

Elapsed time is 1291.503906 seconds 

hour  10 

Elapsed time is 1423.128326 seconds 

hour  11 

Elapsed time is 1555.938940 seconds 

hour  12 

@Elapsed time is 1685.190258 seconds 

hour  13 

Elapsed time is 1814.570829 seconds 

hour  14 

@@@Elapsed time is 1938.424582 seconds 

hour  15 

@@@Elapsed time is 2062.678583 seconds 

hour  16 

@@Elapsed time is 2191.086858 seconds 

hour  17 

@@Elapsed time is 2320.910332 seconds 

hour  18 

@@@Elapsed time is 2449.836339 seconds 

hour  19 

Elapsed time is 2587.08