## Update the GUANO metadata (MD) inside each individual wav file to contain all of the NABat site/deployment-level MD that we have

In [1]:
import time
from pathlib import Path
import pandas as pd

from datetime import datetime, timedelta

from guano import GuanoFile

from osgeo import gdal
import nabatpy
from nabatpy.utils import parse_nabat_fname, row_lookup_v2

### We have a csv of the site/deployment level MD for this project (CO_NABAT_2016_Stationary_Acoustic_deployment_md_2019Apr19_final.csv)
#### We just need a way to crosswalk these rows to our files. Since we cleaned up the file names in the last notebook, this should be easy.

In [2]:
# Site/deployment metadata for the project, indexed by (grts_cell_id, location_name)
# so that a row can be looked up from the parts of a recording's file name.
all_site_md_fname = r"Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO_NABAT_2016_Stationary_Acoustic_deployment_md_2019Apr19_final.csv"
all_site_md = (
    nabatpy.utils.bulkupload_to_df(all_site_md_fname)
    .set_index(['grts_cell_id', 'location_name'])
)

all_site_md.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,start_time,end_time,detector,microphone,microphone_orientation,microphone_height,distance_to_nearest_clutter,clutter_type,distance_to_nearest_water,water_type,percent_clutter,broad_habitat_type,audio_recording_name,software_type
grts_cell_id,location_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2,NW,39.290652,-108.821024,2016-06-30,2016-07-03,WILDLIFE ACOUSTICS SM2Bat+,Wildlife Acoustics SMX-US,,2.0,,vegetation,,,,shrubland,FS,Sonobat 3.x
2,SW,39.279238,-108.775622,2016-06-30,2016-07-03,WILDLIFE ACOUSTICS SM2Bat+,Wildlife Acoustics SMX-US,,2.0,,vegetation,,,,shrubland,FS,Sonobat 3.x
2,NE,39.332177,-108.748456,2016-06-30,2016-07-03,WILDLIFE ACOUSTICS SM2Bat+,Wildlife Acoustics SMX-US,,2.0,,vegetation,,,,shrubland,FS,Sonobat 3.x
2,SE,39.271585,-108.72806,2016-06-30,2016-07-03,WILDLIFE ACOUSTICS SM2Bat+,Wildlife Acoustics SMX-US,,2.0,,vegetation,,,,shrubland,FS,Sonobat 3.x
61,NE,38.052719,-104.448246,2016-07-11,2016-07-14,WILDLIFE ACOUSTICS SM2Bat-192,Wildlife Acoustics SMX-US,,3.0,,vegetation,,,,prairie,FS,Sonobat 3.x


### Since these data were processed using SonoBat 3.x, there is no GuanoMD in the files.
### We'll need to read and clean up the csv outputs from SonoBat 3.x

In [3]:
def excel_to_df(fname):
    """Read one SonoBat output spreadsheet and tidy its species-ID columns.

    Steps, in order:
      1. Read the workbook and strip whitespace / remove spaces from column names.
      2. Run ``nabatpy.utils.parse_nabat_fname`` on every value of the ``Path``
         column and append the results as extra columns (presumably one dict of
         file-name parts per row -- confirm against nabatpy).
      3. Rename ``Consensus`` -> ``Species_Auto_ID`` and
         ``EVALUATION`` -> ``Species_Manual_ID``.
      4. Upper-case and strip both ID columns, turning missing values into ''.
      5. Collapse the various "unconfirmed" spellings of the manual ID into the
         single value 'UNCONFIRMED'.

    Parameters
    ----------
    fname : str or Path
        Spreadsheet to load.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    AttributeError
        If the sheet has no ``Path`` column.
    KeyError
        If either species-ID column is absent after renaming.
    """
    sheet = pd.read_excel(fname)
    sheet = sheet.rename(columns=lambda col: col.strip().replace(' ', ''))

    parsed_names = sheet.Path.apply(nabatpy.utils.parse_nabat_fname)
    sheet = pd.concat([sheet, pd.DataFrame(list(parsed_names))], axis=1)

    sheet = sheet.rename(columns={"Consensus": "Species_Auto_ID", "EVALUATION": "Species_Manual_ID"})

    # astype(str) turns NaN into 'nan', which becomes 'NAN' after upper();
    # the whole-value replace then maps it to an empty string.
    for id_col in ('Species_Manual_ID', 'Species_Auto_ID'):
        sheet[id_col] = sheet[id_col].astype(str).str.upper().str.strip().replace('NAN', '')

    # Every manual ID starting with one of these prefixes is treated as unconfirmed.
    for prefix in ('UNCONFIRM', 'UNCONFI', 'UNCOMFIRM', 'UNSURE', '0'):
        is_unconfirmed = sheet.Species_Manual_ID.str.startswith(prefix, na=False)
        sheet.loc[is_unconfirmed, 'Species_Manual_ID'] = 'UNCONFIRMED'

    return sheet

#### We'll load all of the individual SonoBat output spreadsheets (.xlsx) into a single data frame for quick access when attributing the individual files

In [4]:
# Root folder that is searched recursively for SonoBat output spreadsheets.
data_dname = r"Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic"
data_dir = Path(data_dname)

# Use '/' in the glob pattern: pathlib accepts it on every platform, whereas the
# previous '**\*.xlsx' only matched on Windows and contained an invalid '\*' escape.
excel_fnames = data_dir.glob('**/*.xlsx')
# Skip any path containing '~' (presumably Excel's '~$' lock/temp files).
excel_fnames = [f for f in excel_fnames if '~' not in str(f)]

dfs = []
errors = []  # spreadsheets that could not be parsed, kept for later inspection

for excel_fname in excel_fnames:
    try:
        dfs.append(excel_to_df(excel_fname))
    except Exception as e:
        # Best effort: report the failing workbook and carry on with the rest.
        errors.append(excel_fname)
        print(excel_fname, e)

# sort=False opts in to the future pandas default and silences the FutureWarning.
# Column order may differ from the implicitly sorted result, but every later
# lookup is by column name.
df = pd.concat(dfs, sort=False)
df = df.rename(columns={"Path": "path",
                        "Filename": "filename"})

Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic\221\BLM SonoBat analysis\NABAT221E Data.xlsx 'DataFrame' object has no attribute 'Path'
Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic\221\BLM SonoBat analysis\NABAT221NE Data.xlsx 'DataFrame' object has no attribute 'Path'
Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic\221\BLM SonoBat analysis\NABAT221NE-2 Data.xlsx 'DataFrame' object has no attribute 'Path'
Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic\2717\BLM classifications\NABAT2717NW-BatchData1.xlsx 'DataFrame' object has no attribute 'Path'
Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic\2717\BLM classifications\NABAT2717SE-BatchData1.xlsx 'DataFrame' object has no attribute 'Path'
Z:\TSH\DD274_NABat\CNHP_data_processing\Source\CO_NABat\CO 2016 acoustic\2717\BLM classifications\NABAT2717SW-BatchData1.xlsx 'DataFrame' object has no attribute 'Path'
Z:\TSH

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [5]:
def pull_id_from_sb(fname):
    """Look up the SonoBat species IDs recorded for one wav file.

    The notebook-level frame ``df`` (all SonoBat spreadsheets concatenated) is
    filtered on ``correct_fname == Path(fname).name``; the first matching row
    supplies both IDs.

    Parameters
    ----------
    fname : str or Path
        Path of the wav file; only its final component is used for matching.

    Returns
    -------
    tuple
        ``(Species_Auto_ID, Species_Manual_ID)`` from the first matching row.

    Raises
    ------
    IndexError
        If no row of ``df`` matches the file name.
    """
    wav_name = Path(fname).name

    # Filter once and reuse the result for both columns.
    rows = df[df.correct_fname == wav_name]
    if rows.empty:
        # Same exception type as the bare .iloc[0] raised, with a usable message.
        raise IndexError(f"No SonoBat classification row found for {wav_name}")

    auto = rows['Species_Auto_ID'].iloc[0]
    manual = rows['Species_Manual_ID'].iloc[0]

    return (auto, manual)

    
# Project-level metadata merged into every file's row by get_row_from_fname.
project_md = dict(
    sample_frame='conus',
    project_name='Colorado NABat Monitoring',
    project_id=33,
    land_unit_code='',
    contact_info='Jeremy Siemers, Colorado Natural Heritage Program',
    species_list='south Great Basin',
)

# Starts empty; not referenced by any of the visible cells.
to_remove = list()


def pull_site_md(fname):
    """Return the site/deployment metadata associated with a single file name.

    The file name is parsed with ``nabatpy.utils.parse_nabat_fname``; its
    ``GrtsId`` (as an int) and ``SiteName`` select one row of the notebook-level
    ``all_site_md`` table, from which the deployment fields are copied.

    Parameters
    ----------
    fname : str
        Path or name of a wav file following the cleaned-up naming scheme.

    Returns
    -------
    dict
        Keys: latitude, longitude, grts_cell_id, location_name, detector,
        microphone, microphone_height, distance_to_nearest_clutter,
        clutter_type, broad_habitat_type, start_time, end_time.

    Raises
    ------
    KeyError
        If the (GrtsId, SiteName) pair is not present in ``all_site_md``.
    """
    parts = nabatpy.utils.parse_nabat_fname(fname)
    site_md_row = all_site_md.loc[int(parts['GrtsId'])].loc[parts['SiteName']]

    # Normalise capitalisation only when a value is present; an empty cell is
    # read as NaN (a float) and has no .replace method.
    clutter_type = site_md_row['clutter_type']
    if isinstance(clutter_type, str):
        clutter_type = clutter_type.replace('vegetation', 'Vegetation')

    site_md = {}
    site_md['latitude'] = site_md_row['latitude']
    site_md['longitude'] = site_md_row['longitude']
    site_md['grts_cell_id'] = parts['GrtsId']
    site_md['location_name'] = parts['SiteName']
    site_md['detector'] = site_md_row['detector']
    site_md['microphone'] = site_md_row['microphone']
    site_md['microphone_height'] = site_md_row['microphone_height']
    # Read the clutter distance from its own column (this previously copied
    # 'distance_to_nearest_water' by mistake).
    site_md['distance_to_nearest_clutter'] = site_md_row['distance_to_nearest_clutter']
    site_md['clutter_type'] = clutter_type
    site_md['broad_habitat_type'] = site_md_row['broad_habitat_type']

    # Start/end come from nabatpy helpers rather than the CSV columns
    # (presumably derived from the recordings themselves -- confirm in nabatpy).
    start, stop = nabatpy.utils.get_auto_times(fname)
    site_md['start_time'] = nabatpy.utils.time_to_timestr(start)
    site_md['end_time'] = nabatpy.utils.time_to_timestr(stop)

    return site_md

# SonoBat software details; not referenced by any of the visible cells.
sb_md = dict(Version='3.x', Classifier='Great Basin')

In [6]:
def get_row_from_fname(fname):
    """Build the full metadata row for one wav file.

    Starts from ``nabatpy.utils.get_empty_row(version=2)``, overlays the
    site/deployment metadata from ``pull_site_md`` and then the notebook-level
    ``project_md``, and finally sets the recording name and the GRTS cell /
    location name taken from the parsed file name.

    Parameters
    ----------
    fname : str
        Path of the wav file.

    Returns
    -------
    The row object created by ``get_empty_row`` (displayed as an OrderedDict
    in this notebook), filled in as described above.
    """
    row = nabatpy.utils.get_empty_row(version=2)

    # Site metadata first, project metadata second, matching the original order.
    for extra_md in (pull_site_md(fname), project_md):
        row.update(extra_md)

    row['audio_recording_name'] = Path(fname).name

    name_parts = nabatpy.utils.parse_nabat_fname(fname)
    row.update(
        grts_cell_id=name_parts['GrtsId'],
        location_name=name_parts['SiteName'],
    )

    return row

# Smoke test: build the metadata row for one known file and display it as the cell output.
get_row_from_fname(r"D:\CNHP_Output\2016\1005\NE\1005_NE_20160722_205030.wav")

OrderedDict([('grts_cell_id', '1005'),
             ('location_name', 'NE'),
             ('latitude', 37.71987167),
             ('longitude', -106.52473159999998),
             ('start_time', '2016-07-22T20:41:22'),
             ('end_time', '2016-07-23T05:32:17'),
             ('detector', 'WILDLIFE ACOUSTICS SM4Bat-FS'),
             ('microphone', 'Wildlife Acoustics SMM-U1'),
             ('microphone_orientation', ''),
             ('microphone_height', 2.0),
             ('distance_to_nearest_clutter', nan),
             ('clutter_type', 'Vegetation'),
             ('distance_to_nearest_water', ''),
             ('water_type', ''),
             ('percent_clutter', ''),
             ('broad_habitat_type', 'forest-conifer'),
             ('audio_recording_name', '1005_NE_20160722_205030.wav'),
             ('software_type', ''),
             ('auto_id', ''),
             ('manual_id', ''),
             ('project_name', 'Colorado NABat Monitoring'),
             ('project_id', 33)

In [7]:


def update_single_md(fname, to_delete=()):
    """Update the GUANO metadata of a single wav file in place.

    Parameters
    ----------
    fname : str
        Path of the wav file to update.
    to_delete : iterable of str, optional
        GUANO tags to remove before writing. Empty unless a previous run added
        something unwanted to the metadata. The default is an immutable tuple
        rather than a shared mutable list.

    Notes
    -----
    The file is rewritten with ``make_backup=False``, so no backup copy is kept.
    """
    g = GuanoFile(fname)

    for thing in to_delete:
        try:
            del g[thing]
        except KeyError:
            # Tag is not present in this file; nothing to remove.
            pass

    row = get_row_from_fname(fname)
    for k, v in row.items():
        # Missing values are written as empty strings.
        if pd.isna(v):
            v = ''

        # Translate the row's column name into its GUANO tag; only tags in the
        # 'NABat|' namespace are written.
        nabat_tag = row_lookup_v2[row_lookup_v2.df_columns == k]['nabat_tag'].iloc[0]
        if nabat_tag.startswith('NABat|'):
            g[nabat_tag] = v

    g['Timestamp'] = nabatpy.utils.parse_nabat_fname(fname)['datetime']

    auto, manual = pull_id_from_sb(fname)
    g['Species Auto ID'] = auto
    g['Species Manual ID'] = manual

    g.write(make_backup=False)
    
    
# Smoke test on one known file.
# NOTE: this rewrites that file's GUANO metadata in place (update_single_md writes with make_backup=False).
update_single_md(r"D:\CNHP_Output\2016\1005\NE\1005_NE_20160722_205030.wav")

In [8]:
# Root folder of the wav files; passed to update_all_md, which searches it recursively.
input_dname = r"D:\CNHP_Output\2016"

In [9]:
def update_all_md(dname, project_md, redo_all=False):
    """Write NABat GUANO metadata into every wav file found under ``dname``.

    Progress is shown with an ipywidgets progress bar and a label that is
    refreshed every 100 files.

    Parameters
    ----------
    dname : str or Path
        Directory searched recursively for ``*.wav`` files.
    project_md : dict
        Kept for interface compatibility; this function does not read it
        (get_row_from_fname uses the notebook-level ``project_md``).
    redo_all : bool, default False
        If True every file is rewritten. Otherwise a file is rewritten only
        when its GUANO metadata cannot be read or has no 'NABat' namespace.

    Notes
    -----
    Failures for individual files are printed and skipped. If the file's parent
    folder has disappeared (e.g. a dropped drive), the function waits for it to
    come back and retries that file once.
    """
    # Function-scope imports: only this progress-reporting loop needs them.
    import traceback

    from ipywidgets import FloatProgress, Button
    from IPython.display import display

    input_dir = Path(dname)
    wavs = list(input_dir.glob("**/*.wav"))

    fp = FloatProgress(min=0, max=len(wavs))
    label = Button(description='...')
    label.style.button_color = '#ffffcc'
    display(fp, label)

    for i, wav in enumerate(wavs):
        if i % 100 == 0:
            parts = nabatpy.utils.parse_nabat_fname(str(wav))
            label.description = f"{parts['GrtsId']} {parts['SiteName']} {i}"

        try:
            g = GuanoFile(wav)
        except Exception:
            print(f"Problem with guanoMD in {wav}")
            # This must be an assignment. The previous `g is None` was a no-op
            # comparison: it raised UnboundLocalError on a first-file failure and
            # otherwise left `g` pointing at the previous file's metadata.
            g = None

        if redo_all or g is None or 'NABat' not in g.get_namespaces():
            label.style.button_color = 'lightgreen'
            try:
                update_single_md(str(wav))
                # Re-open the file; a failure here is handled like a failed update.
                g = GuanoFile(wav)
            except Exception:
                # `except Exception` (not bare) so Ctrl-C can still stop a long run.
                print(f"There was a problem with:\n\t{wav}")
                traceback.print_exc()

                dropped_drive = False
                while not wav.parent.exists():
                    dropped_drive = True
                    print('waiting for drive to reconect ...')
                    time.sleep(30)

                # Retry once, but only if the failure coincided with a missing folder.
                if dropped_drive:
                    try:
                        update_single_md(str(wav))
                    except Exception:
                        print("still didn't work")
        else:
            label.style.button_color = '#ffffcc'

        fp.value += 1
        
# Run over the whole output tree. redo_all=True is passed, which per update_all_md's
# condition makes every wav file be rewritten, not just those lacking a 'NABat' namespace.
update_all_md(input_dname, project_md, redo_all=True)

FloatProgress(value=0.0, max=152055.0)

Button(description='...', style=ButtonStyle(button_color='#ffffcc'))