# Adding absence records to PISCO transect data

In [2]:
## Imports

import pandas as pd
import numpy as np
import random
import math

from datetime import datetime # for handling dates

In [3]:
## Ensure my general functions for the MPA data integration project can be imported, and import them

import sys
sys.path.insert(0, "C:\\Users\\dianalg\\PycharmProjects\\PythonScripts\\MPA data integration")

import WoRMS # functions for querying WoRMS REST API

In [34]:
## Load data

# path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\PISCO\\'
filename = 'MLPA_kelpforest_fish.1.csv'
fish = pd.read_csv(filename, encoding='ANSI', dtype={'transect':str, 'sex':str, 'site_name_old':str})

print(fish.shape)
fish.head()

(381693, 24)


Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,max_tl,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old
0,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
1,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
2,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
3,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
4,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,8.0,,MARK CARR,6.1,2.4,,HIGH,1.0,,


In [35]:
## Load species table

filename = 'MLPA_kelpforest_taxon_table.1.csv'
species = pd.read_csv(filename)

print(species.shape)
species.head()

(1336, 38)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,...,LOOKED2009,LOOKED2010,LOOKED2011,LOOKED2012,LOOKED2013,LOOKED2014,LOOKED2015,LOOKED2016,LOOKED2017,LOOKED2018
0,HSU,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,...,no,no,no,no,no,yes,yes,no,yes,yes
1,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
2,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
3,HSU,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,...,no,no,no,no,no,no,no,no,no,no
4,UCSB,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,...,no,no,no,no,no,no,no,no,no,no


In [36]:
## Limit to fish species

species = species[species['sample_type'] == 'FISH']
print(species.shape)

(523, 38)


In [37]:
## Melt species table

long = pd.melt(species, id_vars=species.columns[0:18], var_name='year', value_name='looked')
print(long.shape)
long.head()

(10460, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
0,HSU,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,LOOKED1999,no
1,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,LOOKED1999,yes
2,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,LOOKED1999,no
3,HSU,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,LOOKED1999,no
4,UCSB,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,LOOKED1999,no


In [38]:
## Replace 

s = 'LOOKED2011'
s.split('D')[1]

long['year'] = long['year'].str.split('D').str[1].astype(int)
long['year'].unique()

array([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

## Toy example

2 campuses = UCSC, VRG <br>
2 years = 2003, 2004 <br>
2 surveys per year = UCSC: 6/1/03, 7/1/03, 6/1/04, 7/1/04; VRG: 6/2/03, 7/2/03, 6/2/04, 7/2/04 <br>
1 transect per survey <br>
2 species = SP1, SP2 <br>

VRG did not look for SP1 in 2003 <br>
UCSC did not look for SP2 in 2004

For the UCSC survey on 7/1/04, no organisms were observed.

In [8]:
# Toy fish table

toy_fish = pd.DataFrame({'campus':['UCSC']*3 + ['VRG']*2 + ['UCSC']*2 + ['VRG']*4,
                        'day':[1]*3 + [2]*2 + [1]*2 + [2]*4,
                        'month':[6, 6, 7, 6, 7, 6, 7, 6, 6, 7, 7],
                        'year':[2003]*5 + [2004]*6,
                        'species':['SP1'] + ['SP2']*4 + ['SP1', 'NO_ORG', 'SP1', 'SP2', 'SP1', 'SP2'],
                        'count':[2, 3, 1, 2, 7, 1, 0, 3, 4, 1, 2],
                        'depth':list(range(11))})
toy_fish

Unnamed: 0,campus,day,month,year,species,count,depth
0,UCSC,1,6,2003,SP1,2,0
1,UCSC,1,6,2003,SP2,3,1
2,UCSC,1,7,2003,SP2,1,2
3,VRG,2,6,2003,SP2,2,3
4,VRG,2,7,2003,SP2,7,4
5,UCSC,1,6,2004,SP1,1,5
6,UCSC,1,7,2004,NO_ORG,0,6
7,VRG,2,6,2004,SP1,3,7
8,VRG,2,6,2004,SP2,4,8
9,VRG,2,7,2004,SP1,1,9


In [9]:
# Toy species table

toy_sp = pd.DataFrame({'campus':['UCSC', 'VRG']*6,
                      'year':[2003, 2003, 2004, 2004]*3,
                      'species':['SP1']*4 + ['SP2']*4 + ['NO_ORG']*4,
                      'looked':['Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y']})
toy_sp

Unnamed: 0,campus,year,species,looked
0,UCSC,2003,SP1,Y
1,VRG,2003,SP1,N
2,UCSC,2004,SP1,Y
3,VRG,2004,SP1,Y
4,UCSC,2003,SP2,Y
5,VRG,2003,SP2,Y
6,UCSC,2004,SP2,N
7,VRG,2004,SP2,Y
8,UCSC,2003,NO_ORG,Y
9,VRG,2003,NO_ORG,Y


In [10]:
## Get a table telling whether each fish was looked for during each specific survey

survey_table = toy_fish[['campus', 'day', 'month', 'year']].merge(toy_sp, how='left', on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table

Unnamed: 0,campus,day,month,year,species,looked
0,UCSC,1,6,2003,SP1,Y
1,UCSC,1,6,2003,SP2,Y
2,UCSC,1,6,2003,NO_ORG,Y
6,UCSC,1,7,2003,SP1,Y
7,UCSC,1,7,2003,SP2,Y
8,UCSC,1,7,2003,NO_ORG,Y
9,VRG,2,6,2003,SP1,N
10,VRG,2,6,2003,SP2,Y
11,VRG,2,6,2003,NO_ORG,Y
12,VRG,2,7,2003,SP1,N


In [11]:
## Merge with fish data to get final outcome

full_toy_fish = toy_fish.merge(survey_table, how='right', on=['campus', 'day', 'month', 'year', 'species'])
full_toy_fish

Unnamed: 0,campus,day,month,year,species,count,depth,looked
0,UCSC,1,6,2003,SP1,2.0,0.0,Y
1,UCSC,1,6,2003,SP2,3.0,1.0,Y
2,UCSC,1,6,2003,NO_ORG,,,Y
3,UCSC,1,7,2003,SP1,,,Y
4,UCSC,1,7,2003,SP2,1.0,2.0,Y
5,UCSC,1,7,2003,NO_ORG,,,Y
6,VRG,2,6,2003,SP1,,,N
7,VRG,2,6,2003,SP2,2.0,3.0,Y
8,VRG,2,6,2003,NO_ORG,,,Y
9,VRG,2,7,2003,SP1,,,N


This has to be cleaned according to the following rules:
- If species = NO_ORG, drop the record --> NO_ORG is necessary to populate absence records, but is uninformative afterward
- If looked = Y and count = NaN, count should = 0 --> absence record
- If looked = N and count = NaN, count should = NaN --> true missing observation, these can be dropped

In [12]:
## Clean

full_toy_fish = full_toy_fish[full_toy_fish['species'] != 'NO_ORG'].copy()
full_toy_fish.loc[(full_toy_fish['looked'] == 'Y') & (full_toy_fish['count'].isna() == True), 'count'] = 0
full_toy_fish.dropna(subset=['count'], inplace=True)
full_toy_fish

Unnamed: 0,campus,day,month,year,species,count,depth,looked
0,UCSC,1,6,2003,SP1,2.0,0.0,Y
1,UCSC,1,6,2003,SP2,3.0,1.0,Y
3,UCSC,1,7,2003,SP1,0.0,,Y
4,UCSC,1,7,2003,SP2,1.0,2.0,Y
7,VRG,2,6,2003,SP2,2.0,3.0,Y
10,VRG,2,7,2003,SP2,7.0,4.0,Y
12,UCSC,1,6,2004,SP1,1.0,5.0,Y
15,UCSC,1,7,2004,SP1,0.0,,Y
18,VRG,2,6,2004,SP1,3.0,7.0,Y
19,VRG,2,6,2004,SP2,4.0,8.0,Y


One lingering question is how to fill transect-level data (like depth).

## Simple example with real data

In [13]:
## Small fish table

fish_ex = fish[(fish['survey_year'].isin([2003, 2004])) & (fish['classcode'].isin(['ACOR', 'ADAV', 'TSYM']))].iloc[:, 0:19]
fish_ex

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis
19349,UCSC,SBTL_FISH_PISCO,2003,2003,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,ACOR,1.0,10.0,,,,MIKE MOSS,15.2,4.6
149147,UCSB,SBTL_FISH_PISCO,2003,2003,9,9,SCI_YELLOWBANKS_W,INMID,BOT,3,TSYM,100.0,16.5,15.0,18.0,,JENN CASELLE,8.2,4.6
149353,UCSB,SBTL_FISH_PISCO,2003,2003,9,10,SCI_GULL_ISLE_E,OUTER,BOT,3,TSYM,300.0,17.0,16.0,18.0,,JESSE PATTERSON,18.3,12.2
149364,UCSB,SBTL_FISH_PISCO,2003,2003,9,10,SCI_GULL_ISLE_E,OUTER,MID,3,TSYM,400.0,14.0,,,,CHAD BURT,10.1,10.7
149812,UCSB,SBTL_FISH_PISCO,2003,2003,9,11,SCI_FORNEY_W,OUTER,MID,2,TSYM,20.0,21.5,20.0,23.0,,JESSE PATTERSON,7.6,6.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368548,VRG,SBTL_FISH_CRANE,2004,2005,1,15,SCAI - Pin Rock,OUTER,BOT,11 CEN,ADAV,7.0,25.0,,,,VRG,15.2,5.0
368576,VRG,SBTL_FISH_CRANE,2004,2005,1,15,SCAI - Pin Rock,OUTER,MID,10 CEN,ADAV,1.0,25.0,,,,VRG,11.1,5.0
368710,VRG,SBTL_FISH_CRANE,2004,2004,7,30,SCAI - Ripper's Cove,MID,BOT,5 E,TSYM,1.0,4.0,,,,VRG,7.6,20.0
368716,VRG,SBTL_FISH_CRANE,2004,2004,7,30,SCAI - Ripper's Cove,MID,MID,1 E,TSYM,50.0,7.0,,,,VRG,7.1,20.0


In [14]:
## Small species table

sp_ex = long[long['year'].isin([2003, 2004])]
sp_ex.head()

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
2092,HSU,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,2003,no
2093,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,2003,yes
2094,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,2003,no
2095,HSU,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,2003,no
2096,UCSB,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,2003,no


In [15]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = fish_ex[['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect']].merge(sp_ex[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table.drop(columns='year', inplace=True)
survey_table

Unnamed: 0,campus,day,month,survey_year,site,zone,level,transect,classcode,looked
0,UCSC,6,8,2003,WHITE_ROCK_DC,OUTMID,BOT,2,ACOR,no
1,UCSC,6,8,2003,WHITE_ROCK_DC,OUTMID,BOT,2,ADAV,yes
2,UCSC,6,8,2003,WHITE_ROCK_DC,OUTMID,BOT,2,AFLA,yes
3,UCSC,6,8,2003,WHITE_ROCK_DC,OUTMID,BOT,2,AHOL,no
4,UCSC,6,8,2003,WHITE_ROCK_DC,OUTMID,BOT,2,AOCE,yes
...,...,...,...,...,...,...,...,...,...,...
8944,VRG,30,7,2004,SCAI - Ripper's Cove,MID,MID,5 E,TSYM,yes
8945,VRG,30,7,2004,SCAI - Ripper's Cove,MID,MID,5 E,UHAL,yes
8946,VRG,30,7,2004,SCAI - Ripper's Cove,MID,MID,5 E,URON,yes
8947,VRG,30,7,2004,SCAI - Ripper's Cove,MID,MID,5 E,ZEXA,yes


In [16]:
## Merge with fish data to get final outcome

full_fish_ex = fish_ex.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_fish_ex

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,looked
0,UCSC,SBTL_FISH_PISCO,2003,2003.0,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,ACOR,1.0,10.0,,,,MIKE MOSS,15.2,4.6,no
1,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,ADAV,,,,,,,,,yes
2,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,AFLA,,,,,,,,,yes
3,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,AHOL,,,,,,,,,no
4,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,AOCE,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7234,VRG,SBTL_FISH_CRANE,2004,2004.0,7,30,SCAI - Ripper's Cove,MID,MID,5 E,TSYM,20.0,11.0,,,,VRG,2.6,20.0,yes
7235,VRG,,2004,,7,30,SCAI - Ripper's Cove,MID,MID,5 E,UHAL,,,,,,,,,yes
7236,VRG,,2004,,7,30,SCAI - Ripper's Cove,MID,MID,5 E,URON,,,,,,,,,yes
7237,VRG,,2004,,7,30,SCAI - Ripper's Cove,MID,MID,5 E,ZEXA,,,,,,,,,yes


In [17]:
## Clean

full_fish_ex = full_fish_ex[full_fish_ex['classcode'] != 'NO_ORG'].copy()
full_fish_ex.loc[(full_fish_ex['looked'] == 'yes') & (full_fish_ex['count'].isna() == True), 'count'] = 0
full_fish_ex.dropna(subset=['count'], inplace=True)
full_fish_ex

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,looked
0,UCSC,SBTL_FISH_PISCO,2003,2003.0,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,ACOR,1.0,10.0,,,,MIKE MOSS,15.2,4.6,no
1,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,ADAV,0.0,,,,,,,,yes
2,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,AFLA,0.0,,,,,,,,yes
4,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,AOCE,0.0,,,,,,,,yes
5,UCSC,,2003,,8,6,WHITE_ROCK_DC,OUTMID,BOT,2,APFL,0.0,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7234,VRG,SBTL_FISH_CRANE,2004,2004.0,7,30,SCAI - Ripper's Cove,MID,MID,5 E,TSYM,20.0,11.0,,,,VRG,2.6,20.0,yes
7235,VRG,,2004,,7,30,SCAI - Ripper's Cove,MID,MID,5 E,UHAL,0.0,,,,,,,,yes
7236,VRG,,2004,,7,30,SCAI - Ripper's Cove,MID,MID,5 E,URON,0.0,,,,,,,,yes
7237,VRG,,2004,,7,30,SCAI - Ripper's Cove,MID,MID,5 E,ZEXA,0.0,,,,,,,,yes


**Note** that there are some records where count != 0 but looked = no. This shouldn't happen.

Example: UCSC, 8/6/2003, WHITE_ROCK_DC, OUTMID, BOT, 2, classcode = ACOR

With respect to transect-level data: I think it's best to leave the NaNs rather than trying to fill. I *think* it will pan out when I drop NaN records in the MoF table.

## All data

In [18]:
# Fish

fish.head()

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,max_tl,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old
0,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
1,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
2,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
3,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
4,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,8.0,,MARK CARR,6.1,2.4,,HIGH,1.0,,


In [19]:
# Species

long.head()

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
0,HSU,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,no
1,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,yes
2,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,no
3,HSU,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,1999,no
4,UCSB,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,1999,no


In [20]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = fish[['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect']].merge(long[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table

Unnamed: 0,campus,day,month,survey_year,site,zone,level,transect,classcode,year,looked
0,UCSC,7,9,1999,HOPKINS_DC,INNER,BOT,1,ACOR,1999,no
1,UCSC,7,9,1999,HOPKINS_DC,INNER,BOT,1,ADAV,1999,yes
2,UCSC,7,9,1999,HOPKINS_DC,INNER,BOT,1,AFLA,1999,yes
3,UCSC,7,9,1999,HOPKINS_DC,INNER,BOT,1,AHOL,1999,no
4,UCSC,7,9,1999,HOPKINS_DC,INNER,BOT,1,AOCE,1999,yes
...,...,...,...,...,...,...,...,...,...,...,...
54035835,VRG,12,8,2011,Long Point East,DEEP,MID,2,TSYM,2011,yes
54035836,VRG,12,8,2011,Long Point East,DEEP,MID,2,UHAL,2011,yes
54035837,VRG,12,8,2011,Long Point East,DEEP,MID,2,URON,2011,yes
54035838,VRG,12,8,2011,Long Point East,DEEP,MID,2,ZEXA,2011,yes


In [21]:
## Merge with fish data to get final outcome

full_fish = fish.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'year', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_fish

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
0,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,no
1,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
2,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
3,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,no
4,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8869007,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8869008,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8869009,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8869010,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes


In [22]:
## Clean

full_fish = full_fish[full_fish['classcode'] != 'NO_ORG'].copy()
full_fish.loc[(full_fish['looked'] == 'yes') & (full_fish['count'].isna() == True), 'count'] = 0
full_fish.dropna(subset=['count'], inplace=True)
full_fish

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
1,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
2,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
4,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
5,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
6,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8869007,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8869008,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8869009,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8869010,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes


Let's take a quick look for records that don't make sense (i.e. count > 0 but looked = no).

In [23]:
## Find weird records

weird = full_fish[(full_fish['count'] > 0) & (full_fish['looked'] == 'no')]
weird

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
1016,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,OUTER,BOT,2,...,,JEFF HARDING,13.1,3.0,,HIGH,2.0,,,no
13967,UCSC,SBTL_FISH_PISCO,1999,1999,9,15,STILLWATER_DC,INMID,BOT,2,...,,JEFF HARDING,6.7,6.1,,MODERATE,3.0,,,no
16542,UCSC,SBTL_FISH_PISCO,1999,1999,9,15,STILLWATER_DC,OUTMID,BOT,2,...,,JEFF HARDING,8.5,7.6,,MODERATE,3.0,,,no
18106,UCSC,SBTL_FISH_PISCO,1999,1999,9,15,STILLWATER_UC,INNER,BOT,1,...,,MARK CARR,6.1,4.6,,MODERATE,2.0,,,no
26407,UCSC,SBTL_FISH_PISCO,1999,1999,9,17,SANDHILL_UC,OUTER,BOT,2,...,,JEFF HARDING,9.1,3.0,,HIGH,2.0,,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8864810,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,3,...,,VRG,,8.0,17.0,MODERATE,,,,no
8864811,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,3,...,,VRG,,8.0,17.0,MODERATE,,,,no
8864902,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,4,...,,VRG,,8.0,17.0,MODERATE,,,,no
8864903,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,4,...,,VRG,,8.0,17.0,MODERATE,,,,no


So there are 2669 of these. (Very few in a 7 million record data set). But still. I'm not seeing any trends with respect to species, campus, etc.

In [24]:
## Get table of campuses and years where there were observations for classcodes that were not looked for according to the species table

obs_exist = weird[['campus', 'survey_year', 'classcode']].copy()
obs_exist.drop_duplicates(inplace=True)
obs_exist.head()

Unnamed: 0,campus,survey_year,classcode
1016,UCSC,1999,COTT
13967,UCSC,1999,CLIN
30253,UCSC,1999,SYRI
60011,UCSC,2000,ACOR
62056,UCSC,2000,CLIN


## Convince myself that this worked

To do this, I'll pick a set of random transects and make sure everything makes sense. I've chosen UCSB, 11/7/2014, SRI_CLUSTER_POINT_N, INMID, MID, which only has 3 transects and 10 records total.

In [25]:
## Get transect data

test_fish = fish[(fish['campus'] == 'UCSB') & 
                 (fish['year'] == 2014) & 
                 (fish['month'] == 11) & 
                 (fish['day'] == 7) & 
                 (fish['site'] == 'SRI_CLUSTER_POINT_N') & 
                 (fish['zone'] == 'INMID') & 
                 (fish['level'] == 'MID')]
test_fish.iloc[:, 0:15]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl
295030,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,BFRE,3.0,7.0,,
295031,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,BFRE,2.0,8.0,,
295032,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,BFRE,1.0,9.0,,
295033,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,OCAL,2.0,2.0,,
295034,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,PCLA,1.0,10.0,,
295035,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,BFRE,1.0,7.0,,
295036,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,BFRE,1.0,8.0,,
295037,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,BFRE,1.0,9.0,,
295038,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,OCAL,1.0,14.0,,
295039,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,3,NO_ORG,0.0,,,


In [26]:
# Get which species were looked for

test_long = long[(long['campus'] == 'UCSB') & (long['year'] == 2014)]
looked_for = test_long[test_long['looked'] == 'yes']
looked_for

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
7846,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,2014,yes
7853,UCSB,FISH,FISH,ADAV,ADAV,Animalia,Chordata,Actinopterygii,Perciformes,Haemulidae,Anisotremus,davidsonii,Anisotremus davidsonii,WoRMS,279617,Sargo,,,2014,yes
7857,UCSB,FISH,FISH,AFLA,AFLA,Animalia,Chordata,Actinopterygii,Gasterosteiformes,Aulorhynchidae,Aulorhynchus,flavidus,Aulorhynchus flavidus,WoRMS,279839,Tubesnout,,,2014,yes
7860,UCSB,FISH,FISH,AGUA,AGUA,Animalia,Chordata,Actinopterygii,Perciformes,Apogonidae,Apogon,guadalupensis,Apogon guadalupensis,WoRMS,273016,Guadalupe Cardinalfish,,,2014,yes
7868,UCSB,FISH,FISH,AINE,AINE,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Agonidae,Anoplagonus,inermis,Anoplagonus inermis,WoRMS,279630,Smooth Alligatorfish,,,2014,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8353,UCSB,FISH,FISH,UHAL,UHAL,Animalia,Chordata,Elasmobranchii,Myliobatiformes,Urotrygonidae,Urolophus,halleri,Urolophus halleri,WoRMS,315896,Haller's Round Ray,,,2014,yes
8356,UCSB,FISH,FISH,UNID,UNID,Animalia,Chordata,,,,,,Unidentified Fish,MLPA_kelpforest,,Unidentified Fish,,,2014,yes
8359,UCSB,FISH,FISH,USAN,USAN,Animalia,Chordata,Actinopterygii,Perciformes,Pholidae,Ulvicola,sanctaerosae,Ulvicola sanctaerosae,WoRMS,322270,Kelp Gunnel,,,2014,yes
8362,UCSB,FISH,FISH,ZEXA,ZEXA,Animalia,Chordata,Elasmobranchii,Rajiformes,Rhinobatidae,Zapteryx,exasperata,Zapteryx exasperata,WoRMS,283213,Banded Guitarfish,,,2014,yes


In [27]:
## Check that NO_ORG is on this list

looked_for['classcode'].unique()

array(['AARG', 'ADAV', 'AFLA', 'AGUA', 'AINE', 'ANOB', 'AOCE', 'ATHE',
       'AVUL', 'BAITBALL', 'BATH', 'BFRE', 'BOTH', 'BPOL', 'BRAY', 'CAGG',
       'CITH', 'CLUP', 'CPRI', 'CPUN', 'CSAT', 'CSOR', 'CSTI', 'CVEN',
       'CVIO', 'EJAC', 'ELAT', 'EMBI', 'EMOR', 'GBY', 'GGAL', 'GMOR',
       'GNIG', 'HANA', 'HARG', 'HAZU', 'HCAL', 'HCAR', 'HDEC', 'HELL',
       'HFRA', 'HGRI', 'HLAG', 'HROS', 'HRUB', 'HSEM', 'KGB', 'LCON',
       'LHIR', 'MCAL', 'MCEP', 'MMIN', 'MMOL', 'MXEN', 'NBLA', 'NCEP',
       'NO_ORG', 'OCAL', 'OELO', 'OPIC', 'OYB', 'OYT', 'PATR', 'PCAL',
       'PCLA', 'PCOE', 'PFAL', 'PFUR', 'PGLA', 'PHOL', 'PLEU', 'PMAC',
       'PNEB', 'PNOT', 'PPRO', 'PTRI', 'RALL', 'RBIN', 'RHYP', 'RJOR',
       'RSTE', 'RTOX', 'RVAC', 'SACA', 'SARG', 'SATR', 'SAUR', 'SCAL',
       'SCAR', 'SCARSCAU', 'SCAU', 'SCHI', 'SCHR', 'SDAL', 'SDIP',
       'SEBSPP', 'SENT', 'SGIG', 'SGUT', 'SHOP', 'SJAP', 'SLAL', 'SLUC',
       'SMAR', 'SMEL', 'SMIN', 'SMYS', 'SNEB', 'SPAU', 'SPIN', 'SPUL',
      

In [28]:
## Check that there are no duplicate classcodes (I encountered this doing the small test with real data above; duplicates are dropped making survey_table)

check_duplicates = looked_for.groupby('classcode')['classcode'].count()
check_duplicates[check_duplicates > 1]

Series([], Name: classcode, dtype: int64)

So, without the NO_ORG classcode, we would expect there to be 128 records per transect, or 384 records total

In [29]:
## Check number of records in full_fish

test_full_fish = full_fish[(full_fish['campus'] == 'UCSB') & 
                 (full_fish['year'] == 2014) & 
                 (full_fish['month'] == 11) & 
                 (full_fish['day'] == 7) & 
                 (full_fish['site'] == 'SRI_CLUSTER_POINT_N') & 
                 (full_fish['zone'] == 'INMID') & 
                 (full_fish['level'] == 'MID')]
test_full_fish.iloc[:, 0:15]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl
7258006,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,AARG,0.0,,,
7258008,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,ADAV,0.0,,,
7258009,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,AFLA,0.0,,,
7258010,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,AGUA,0.0,,,
7258012,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,AINE,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7258452,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,3,UHAL,0.0,,,
7258453,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,3,UNID,0.0,,,
7258454,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,3,USAN,0.0,,,
7258455,UCSB,,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,3,ZEXA,0.0,,,


There are a few extra, where are they coming from?

In [30]:
## Make sure the original records are still there and looking good (recall NO_ORG records will be dropped)

test_full_fish[test_full_fish['count'] > 0]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
7258019,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258020,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258021,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258082,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258090,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258170,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258171,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258172,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes
7258233,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,...,,KATIE DAVIS,4.6,7.0,17.2,LIGHT,2.0,,,yes


In [31]:
## Going transect by transect

out1 = test_full_fish[test_full_fish['transect'] == '1']
print(out1.shape)
duplicates = out1.groupby('classcode')['campus'].count()
duplicates[duplicates > 1]

(130, 25)


classcode
BFRE    3
Name: campus, dtype: int64

In [32]:
out1[out1['classcode'] == 'BFRE'].iloc[:, 0:15]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl
7258019,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,BFRE,3.0,7.0,,
7258020,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,BFRE,2.0,8.0,,
7258021,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,1,BFRE,1.0,9.0,,


Ah, that makes sense. There are rows with duplicate classcodes because multiple individuals were observed of different sizes (3x7 cm BFRE, 2x8 cm BFRE, and 1x9cm BFRE).

In [33]:
out2 = test_full_fish[test_full_fish['transect'] == '2']
print(out2.shape)
duplicates = out2.groupby('classcode')['campus'].count()
duplicates[duplicates > 1]

(130, 25)


classcode
BFRE    3
Name: campus, dtype: int64

In [34]:
out2[out2['classcode'] == 'BFRE'].iloc[:, 0:15]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl
7258170,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,BFRE,1.0,7.0,,
7258171,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,BFRE,1.0,8.0,,
7258172,UCSB,SBTL_FISH_PISCO,2014,2014,11,7,SRI_CLUSTER_POINT_N,INMID,MID,2,BFRE,1.0,9.0,,


Same thing in transect 2, accounting for my 4 extra records. Finally, make sure transect 3 has 0 for all 128 classcodes:

In [35]:
out3 = test_full_fish[test_full_fish['transect'] == '3']
out3.shape

(128, 25)

In [36]:
out3[out3['count'] > 0]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked


Everything looks good.

### Scenario 2 - troubleshooting sex

I'm having some discrepencies come up where some sex data has been lost. First, let's validate that there are records missing.

In [37]:
fish[fish['sex'] == 'FEMALE'].shape[0] # 16528
full_fish[full_fish['sex'] == 'FEMALE'].shape[0] # 16295

16295

So, 233 records where sex data was noted are missing, or appear to be (i.e. the sex information was lost but the record remains, or something). 

Let's have a look at them.

In [38]:
fish_diff = fish[fish['sex'] == 'FEMALE'].copy()
full_fish_diff = full_fish[full_fish['sex'] == 'FEMALE'].copy()
full_fish_diff = full_fish_diff.iloc[:, 0:24]

diff = fish_diff.merge(full_fish_diff, indicator=True, how='outer')

pd.set_option('display.max_rows', 60)
diff[diff['_merge'] != 'both'].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
14748,VRG,SBTL_FISH_VRG,2007,2008,3,21,Long Point East,INNER,BOT,1,SPUL,2.0,25.0,,,FEMALE,VRG,5.7,8.0,11.6
14749,VRG,SBTL_FISH_VRG,2007,2008,3,21,Long Point East,INNER,BOT,2,SPUL,1.0,25.0,,,FEMALE,VRG,5.5,8.0,11.6
14750,VRG,SBTL_FISH_VRG,2007,2008,3,21,Long Point East,INNER,BOT,3,SPUL,1.0,20.0,,,FEMALE,VRG,5.2,8.0,11.6
14751,VRG,SBTL_FISH_VRG,2007,2008,3,21,Long Point East,INNER,BOT,3,SPUL,1.0,25.0,,,FEMALE,VRG,5.2,8.0,11.6
14752,VRG,SBTL_FISH_VRG,2007,2008,3,21,Long Point East,INNER,BOT,4,SPUL,1.0,30.0,,,FEMALE,VRG,4.0,8.0,11.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16371,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,MID,BOT,1,SPUL,1.0,40.0,,,FEMALE,VRG,9.8,25.0,12.0
16372,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,MID,BOT,2,SPUL,1.0,40.0,,,FEMALE,VRG,10.2,25.0,12.0
16373,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,MID,BOT,3,SPUL,1.0,40.0,,,FEMALE,VRG,10.1,25.0,12.0
16374,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,MID,BOT,4,SPUL,1.0,40.0,,,FEMALE,VRG,9.7,25.0,12.0


The only thing I see in common here is that all these records are from VRG, and have classcode SPUL. Let's pick an example and walk it through the workflow to see what happens. I'm choosing VRG, 1/16/2013, SNI - Boilers, OUTER, BOT, which only has 4 transects and 13 records.

**Update** - I got it! All of these records are from surveys that were conducted late. I.e., surveys where the survey_year = year - 1. I'll manage that in the following workflow.

In [39]:
## Get transect data

test_fish = fish[(fish['campus'] == 'VRG') & 
                 (fish['year'] == 2013) & 
                 (fish['month'] == 1) & 
                 (fish['day'] == 16) & 
                 (fish['site'] == 'SNI - Boilers') & 
                 (fish['zone'] == 'OUTER') & 
                 (fish['level'] == 'BOT')]
test_fish.iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
379477,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,1,SPUL,1.0,45.0,,,MALE,VRG,16.4,25.0,12.0
379478,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,CPUN,20.0,15.0,,,,VRG,15.4,25.0,12.0
379479,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,CPUN,30.0,20.0,,,,VRG,15.4,25.0,12.0
379480,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,3.0,9.0,,,,VRG,15.4,25.0,12.0
379481,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,13.0,,,,VRG,15.4,25.0,12.0
379482,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,15.0,,,,VRG,15.4,25.0,12.0
379483,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,7.0,20.0,,,,VRG,15.4,25.0,12.0
379484,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,RVAC,1.0,20.0,,,,VRG,15.4,25.0,12.0
379485,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,SMYS,20.0,15.0,,,,VRG,15.4,25.0,12.0
379486,VRG,SBTL_FISH_VRG,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,SPUL,4.0,35.0,,,FEMALE,VRG,15.4,25.0,12.0


In [40]:
# Get which species were looked for

test_long = long[(long['campus'] == 'VRG') & (long['year'] == 2012)]
looked_for = test_long[test_long['looked'] == 'yes']
looked_for

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
6801,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,2012,yes
6809,VRG,FISH,FISH,ADAV,Anisotremus davidsonii,Animalia,Chordata,Actinopterygii,Perciformes,Haemulidae,Anisotremus,davidsonii,Anisotremus davidsonii,WoRMS,279617,Sargo,,,2012,yes
6813,VRG,FISH,FISH,AFLA,Aulorhynchus flavidus,Animalia,Chordata,Actinopterygii,Gasterosteiformes,Aulorhynchidae,Aulorhynchus,flavidus,Aulorhynchus flavidus,WoRMS,279839,Tubesnout,,,2012,yes
6815,VRG,FISH,FISH,AGUA,Apogon guadalupensis,Animalia,Chordata,Actinopterygii,Perciformes,Apogonidae,Apogon,guadalupensis,Apogon guadalupensis,WoRMS,273016,Guadalupe Cardinalfish,,,2012,yes
6825,VRG,FISH,FISH,ANOB,Atractoscion nobilis,Animalia,Chordata,Actinopterygii,Perciformes,Sciaenidae,Atractoscion,nobilis,Atractoscion nobilis,WoRMS,278535,White Seabass,,,2012,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7305,VRG,FISH,FISH,TSYM,Trachurus symmetricus,Animalia,Chordata,Actinopterygii,Perciformes,Carangidae,Trachurus,symmetricus,Trachurus symmetricus,WoRMS,273305,Jack Mackerel,,,2012,yes
7308,VRG,FISH,FISH,UHAL,Urobatis halleri,Animalia,Chordata,Elasmobranchii,Myliobatiformes,Urotrygonidae,Urolophus,halleri,Urolophus halleri,WoRMS,315896,Haller's Round Ray,,,2012,yes
7312,VRG,FISH,FISH,URON,Umbrina roncador,Animalia,Chordata,Actinopterygii,Perciformes,Sciaenidae,Umbrina,roncador,Umbrina roncador,WoRMS,273802,Yellowfin drum,,,2012,yes
7317,VRG,FISH,FISH,ZEXA,Zapteryx exasperata,Animalia,Chordata,Elasmobranchii,Rajiformes,Rhinobatidae,Zapteryx,exasperata,Zapteryx exasperata,WoRMS,283213,Banded Guitarfish,,,2012,yes


**Note** that now I've adjusted it so that I'm selecting data from species based on survey_year (2012) instead of year (2013). I hope this will be easy to implement on the whole data set...

In [41]:
## Check that NO_ORG is on this list

looked_for['classcode'].unique()

array(['AARG', 'ADAV', 'AFLA', 'AGUA', 'ANOB', 'ATHE', 'BFRE', 'BRAY',
       'CAGG', 'COBS', 'CPRI', 'CPUN', 'CSAT', 'DMAC', 'EJAC', 'ELAT',
       'EMOR', 'GGAL', 'GMOR', 'GNIG', 'HARG', 'HAZU', 'HCAL', 'HCAR',
       'HFRA', 'HROS', 'HRUB', 'HSEM', 'MCAL', 'MMIN', 'NCEP', 'NO_ORG',
       'OCAL', 'OELO', 'OPIC', 'OYT', 'PATR', 'PCAL', 'PCLA', 'PCOE',
       'PFAL', 'PFUR', 'PMAC', 'PNEB', 'PPRO', 'RALL', 'RHYP', 'RTOX',
       'RVAC', 'SARG', 'SATR', 'SAUR', 'SCAL', 'SCAR', 'SCAU', 'SCHR',
       'SDAL', 'SGIG', 'SGUT', 'SHOP', 'SJAP', 'SLAL', 'SMAR', 'SMIN',
       'SMYS', 'SPAU', 'SPUL', 'SRAS', 'SROS', 'SSAG', 'SSAX', 'SSEM',
       'STRE', 'SUMB', 'SXYR', 'SYNG', 'TCAL', 'TSEM', 'TSYM', 'UHAL',
       'URON', 'ZEXA', 'ZROS'], dtype=object)

In [42]:
## Check that there are no duplicate classcodes (I encountered this doing the small test with real data above; duplicates are dropped making survey_table)

check_duplicates = looked_for.groupby('classcode')['classcode'].count()
check_duplicates[check_duplicates > 1]

classcode
ATHE    2
Name: classcode, dtype: int64

In [43]:
looked_for[looked_for['classcode'] == 'ATHE']

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
6833,VRG,FISH,FISH,ATHE,Atherinops affinis,Animalia,Chordata,Actinopterygii,Atheriniformes,Atherinopsidae,Atherinopsidae,spp,Atherinopsidae,WoRMS,266995,"Grunion, Topsmelt Or Jacksmelt",,,2012,yes
6834,VRG,FISH,FISH,ATHE,Atherinopsis californiensis,Animalia,Chordata,Actinopterygii,Atheriniformes,Atherinopsidae,Atherinopsidae,spp,Atherinopsidae,WoRMS,266995,"Grunion, Topsmelt Or Jacksmelt",,,2012,yes


So, without the NO_ORG classcode and bearing in mind the duplicate ATHE classcode, 82 fish were looked for during each transect by VRG in 2012 (the survey_year). So we would expect after populating absence records to have 328 records, 13 of which are presence records.

In [44]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = test_fish[['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect']].merge(test_long[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table.drop(columns='year', inplace=True)
survey_table

Unnamed: 0,campus,day,month,survey_year,site,zone,level,transect,classcode,looked
0,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,1,AARG,yes
1,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,1,ACOR,no
2,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,1,ADAV,yes
3,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,1,AFLA,yes
4,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,1,AGUA,yes
...,...,...,...,...,...,...,...,...,...,...
1191,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,4,TSYM,yes
1192,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,4,UHAL,yes
1193,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,4,URON,yes
1194,VRG,16,1,2012,SNI - Boilers,OUTER,BOT,4,ZEXA,yes


In [45]:
## Merge with fish data to get final outcome

full_test_fish = test_fish.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_test_fish.iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
0,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,AARG,,,,,,,,,
1,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,ACOR,,,,,,,,,
2,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,ADAV,,,,,,,,,
3,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,AFLA,,,,,,,,,
4,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,AGUA,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,TSYM,,,,,,,,,
365,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,UHAL,,,,,,,,,
366,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,URON,,,,,,,,,
367,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,ZEXA,,,,,,,,,


In [46]:
## Clean

full_test_fish = full_test_fish[full_test_fish['classcode'] != 'NO_ORG'].copy()
full_test_fish.loc[(full_test_fish['looked'] == 'yes') & (full_test_fish['count'].isna() == True), 'count'] = 0
full_test_fish.dropna(subset=['count'], inplace=True)
full_test_fish.iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
0,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,AARG,0.0,,,,,,,,
2,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,ADAV,0.0,,,,,,,,
3,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,AFLA,0.0,,,,,,,,
4,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,AGUA,0.0,,,,,,,,
6,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,1,ANOB,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,TSYM,0.0,,,,,,,,
365,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,UHAL,0.0,,,,,,,,
366,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,URON,0.0,,,,,,,,
367,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,4,ZEXA,0.0,,,,,,,,


Ok, now that I've fixed the survey_year issue, this is working better. However, we have 6 unexpected rows. I'm guessing they probably come from the same place as they did in the previous example...i.e. there are rows with duplicate classcodes because individuals of multiple sizes were observed.

In [47]:
## Make sure the original records are still there and looking good (recall NO_ORG records will be dropped)

full_test_fish[full_test_fish['count'] > 0].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
74,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,1,SPUL,1.0,45.0,,,MALE,VRG,16.4,25.0,12.0
104,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,CPUN,20.0,15.0,,,,VRG,15.4,25.0,12.0
105,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,CPUN,30.0,20.0,,,,VRG,15.4,25.0,12.0
109,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,3.0,9.0,,,,VRG,15.4,25.0,12.0
110,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,13.0,,,,VRG,15.4,25.0,12.0
111,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,15.0,,,,VRG,15.4,25.0,12.0
112,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,7.0,20.0,,,,VRG,15.4,25.0,12.0
151,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,RVAC,1.0,20.0,,,,VRG,15.4,25.0,12.0
167,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,SMYS,20.0,15.0,,,,VRG,15.4,25.0,12.0
169,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,SPUL,4.0,35.0,,,FEMALE,VRG,15.4,25.0,12.0


In [48]:
## Identify records with duplicate classcodes

full_test_fish[full_test_fish[['campus', 'survey_year', 'month', 'day', 'site', 'zone', 'level', 'transect', 'classcode']].duplicated() == True].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
105,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,CPUN,30.0,20.0,,,,VRG,15.4,25.0,12.0
110,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,13.0,,,,VRG,15.4,25.0,12.0
111,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,15.0,,,,VRG,15.4,25.0,12.0
112,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,7.0,20.0,,,,VRG,15.4,25.0,12.0
170,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,SPUL,2.0,45.0,,,MALE,VRG,15.4,25.0,12.0


In [49]:
## Look at example of duplicates

full_test_fish[(full_test_fish['transect'] == '2') & (full_test_fish['classcode'] == 'ELAT')].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
109,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,3.0,9.0,,,,VRG,15.4,25.0,12.0
110,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,13.0,,,,VRG,15.4,25.0,12.0
111,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,1.0,15.0,,,,VRG,15.4,25.0,12.0
112,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,2,ELAT,7.0,20.0,,,,VRG,15.4,25.0,12.0


Ok, that makes sense. What happened with the single remaining record, then?

In [50]:
## Go transect by transect

print(full_test_fish[full_test_fish['transect'] == '1'].shape)
print(full_test_fish[full_test_fish['transect'] == '2'].shape) # Contains 5 duplicates already identified
print(full_test_fish[full_test_fish['transect'] == '3'].shape)
print(full_test_fish[full_test_fish['transect'] == '4'].shape)

(82, 25)
(87, 25)
(83, 25)
(82, 25)


So the final issue is in transect 3.

In [51]:
## Let's have a closer look

pd.set_option('display.max_rows', 60)
full_test_fish[full_test_fish['transect'] == '3'].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
187,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,AARG,0.0,,,,,,,,
188,VRG,SBTL_FISH_VRG,2012,2013.0,1,16,SNI - Boilers,OUTER,BOT,3,ACOR,2.0,10.0,,,,VRG,15.4,25.0,12.0
189,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,ADAV,0.0,,,,,,,,
190,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,AFLA,0.0,,,,,,,,
191,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,AGUA,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,TSYM,0.0,,,,,,,,
274,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,UHAL,0.0,,,,,,,,
275,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,URON,0.0,,,,,,,,
276,VRG,,2012,,1,16,SNI - Boilers,OUTER,BOT,3,ZEXA,0.0,,,,,,,,


In [52]:
## Check for classcodes that were observed but not looked for

for code in full_test_fish.loc[full_test_fish['transect'] == '3', 'classcode'].unique():
    if code not in looked_for['classcode'].unique():
        print(code)

ACOR


In [53]:
species.loc[(species['campus'] == 'VRG') & (species['classcode'] == 'ACOR'), 'LOOKED2012']

6    no
Name: LOOKED2012, dtype: object

Got it. So, according to the original species table, ACOR was not looked for in 2012. But there is a record showing that one was seen. So this is one of the "weird" observations I identified earlier.

It's not in the "weird" table, though, because that was made before I fixed the problem I just identified with respect to survey_year.

## All data - revised

Running through the workflow again, with edits to account for surveys that occurred in a different year than their survey_year.

In [160]:
## Fish

fish.head()

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,max_tl,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old
0,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
1,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
2,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
3,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
4,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,8.0,,MARK CARR,6.1,2.4,,HIGH,1.0,,


In [161]:
## Species

long.head()

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
0,HSU,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,no
1,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,yes
2,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,no
3,HSU,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,1999,no
4,UCSB,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,1999,no


In [162]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = fish[['campus', 'day', 'month', 'survey_year', 'year', 'site', 'zone', 'level', 'transect']].merge(long[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table.rename(columns={'year_x':'year'}, inplace=True) # year_x retains actual year when survey took place
survey_table.drop(columns=['year_y'], inplace=True) # year_y == survey_year because of the merge
survey_table

Unnamed: 0,campus,day,month,survey_year,year,site,zone,level,transect,classcode,looked
0,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,ACOR,no
1,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,ADAV,yes
2,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,AFLA,yes
3,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,AHOL,no
4,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,AOCE,yes
...,...,...,...,...,...,...,...,...,...,...,...
54035835,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,TSYM,yes
54035836,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,UHAL,yes
54035837,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,URON,yes
54035838,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,ZEXA,yes


In [163]:
## Merge with fish data to get final outcome

full_fish = fish.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'year', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_fish

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
0,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,no
1,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
2,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
3,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,no
4,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8870071,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8870072,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8870073,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8870074,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes


In [164]:
## Clean

full_fish = full_fish[full_fish['classcode'] != 'NO_ORG'].copy()
full_fish.loc[(full_fish['looked'] == 'yes') & (full_fish['count'].isna() == True), 'count'] = 0
full_fish.dropna(subset=['count'], inplace=True)
full_fish

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
1,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
2,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
4,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
5,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
6,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8870071,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8870072,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8870073,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8870074,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes


In [167]:
## Verify records are now present

full_fish[(full_fish['campus'] == 'VRG') &
    (full_fish['year'] == 2013) &
    (full_fish['month'] == 1) &
    (full_fish['day'] == 16) &
    (full_fish['site'] == 'SNI - Boilers') &
    (full_fish['zone'] == 'OUTER') &
    (full_fish['level'] == 'BOT') &
    (full_fish['transect'] == '2')].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
8817952,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,AARG,0.0,,,,,,,,
8817954,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ADAV,0.0,,,,,,,,
8817955,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,AFLA,0.0,,,,,,,,
8817956,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,AGUA,0.0,,,,,,,,
8817958,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ANOB,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8818043,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,TSYM,0.0,,,,,,,,
8818044,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,UHAL,0.0,,,,,,,,
8818045,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,URON,0.0,,,,,,,,
8818046,VRG,,2012,2013,1,16,SNI - Boilers,OUTER,BOT,2,ZEXA,0.0,,,,,,,,


In [168]:
## Check that this resolved the sex issue

print(fish[fish['sex'] == 'FEMALE'].shape[0]) # 16528
full_fish[full_fish['sex'] == 'FEMALE'].shape[0] # 16528

16528


16528

In [169]:
## Finally, reformulate the "weird" dataframe

weird = full_fish[(full_fish['count'] > 0) & (full_fish['looked'] == 'no')]
weird

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
1016,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,OUTER,BOT,2,...,,JEFF HARDING,13.1,3.0,,HIGH,2.0,,,no
13967,UCSC,SBTL_FISH_PISCO,1999,1999,9,15,STILLWATER_DC,INMID,BOT,2,...,,JEFF HARDING,6.7,6.1,,MODERATE,3.0,,,no
16542,UCSC,SBTL_FISH_PISCO,1999,1999,9,15,STILLWATER_DC,OUTMID,BOT,2,...,,JEFF HARDING,8.5,7.6,,MODERATE,3.0,,,no
18106,UCSC,SBTL_FISH_PISCO,1999,1999,9,15,STILLWATER_UC,INNER,BOT,1,...,,MARK CARR,6.1,4.6,,MODERATE,2.0,,,no
26407,UCSC,SBTL_FISH_PISCO,1999,1999,9,17,SANDHILL_UC,OUTER,BOT,2,...,,JEFF HARDING,9.1,3.0,,HIGH,2.0,,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8865874,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,3,...,,VRG,,8.0,17.0,MODERATE,,,,no
8865875,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,3,...,,VRG,,8.0,17.0,MODERATE,,,,no
8865966,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,4,...,,VRG,,8.0,17.0,MODERATE,,,,no
8865967,VRG,SBTL_FISH_VRG,2012,2012,5,29,Lunada Bay,INNER,CAN,4,...,,VRG,,8.0,17.0,MODERATE,,,,no


In [170]:
## Get table of campuses and years where there were observations for classcodes that were not looked for according to the species table

obs_exist = weird[['campus', 'survey_year', 'classcode']].copy()
obs_exist.drop_duplicates(inplace=True)
obs_exist.head()

Unnamed: 0,campus,survey_year,classcode
1016,UCSC,1999,COTT
13967,UCSC,1999,CLIN
30253,UCSC,1999,SYRI
60011,UCSC,2000,ACOR
62056,UCSC,2000,CLIN


## Troubleshooting fish size

I'm now coming up short on the number of sizes in full_fish relative to fish.

In [171]:
## Show missing records

print(fish[fish['fish_tl'].isna() == False].shape[0])
print(full_fish[full_fish['fish_tl'].isna() == False].shape[0])

print(fish[fish['min_tl'].isna() == False].shape[0])
print(full_fish[full_fish['min_tl'].isna() == False].shape[0])

print(fish[fish['max_tl'].isna() == False].shape[0])
print(full_fish[full_fish['max_tl'].isna() == False].shape[0])

369147
369127
32824
32823
28195
28194


So it looks like I've lost 20 total length records and 1 min/max length record while populating absence records. Let's have a look at them.

In [172]:
## 20 records missing fish_tl

fish_diff = fish[fish['fish_tl'].isna() == False].copy()
full_fish_diff = full_fish[full_fish['fish_tl'].isna() == False].copy()
full_fish_diff = full_fish_diff.iloc[:, 0:24]

diff = fish_diff.merge(full_fish_diff, indicator=True, how='outer')

pd.set_option('display.max_rows', 60)
diff[diff['_merge'] != 'both'].iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24]]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,_merge
2153,UCSC,SBTL_FISH_PISCO,2000,2000,8,15,TERRACE_UC,OUTER,BOT,2,RFYOY,1.0,6.0,,,,left_only
3333,UCSC,SBTL_FISH_PISCO,2000,2000,9,7,MONASTERY_UC,OUTER,CAN,1,RFYOY,1.0,4.0,,,,left_only
20235,UCSC,SBTL_FISH_PISCO,2003,2003,8,29,SANDHILL_DC,OUTMID,BOT,1,RFYOY,10.0,4.0,,,,left_only
24615,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SCAL,1.0,150.0,,,,left_only
56142,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,SACA,,80.0,60.0,100.0,,left_only
79465,UCSC,SBTL_FISH_PISCO,2011,2011,7,21,STILLWATER_UC,OUTER,BOT,2,RFYOY,1.0,6.0,,,,left_only
94872,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,BOT,6,RFYOY,3.0,8.0,,,,left_only
94881,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,RFYOY,10.0,8.0,,,,left_only
94889,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,5,RFYOY,1.0,8.0,,,,left_only
94898,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,6,RFYOY,5.0,8.5,8.0,9.0,,left_only


Uh-oh. So this is quite a bit more complicated than I thought. I'm not only failing to pull some sizes into full_fish, I'm also accidentally creating one false size.

**Note** that the record where count=NaN is the only one in the data.

In [173]:
## 1 record missing min/max_tl

fish_diff = fish[fish['min_tl'].isna() == False].copy()
full_fish_diff = full_fish[full_fish['min_tl'].isna() == False].copy()
full_fish_diff = full_fish_diff.iloc[:, 0:24]

diff = fish_diff.merge(full_fish_diff, indicator=True, how='outer')

pd.set_option('display.max_rows', 60)
diff[diff['_merge'] != 'both'].iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24]]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,_merge
1722,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,SACA,,80.0,60.0,100.0,,left_only
4389,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,6,RFYOY,5.0,8.5,8.0,9.0,,left_only
32826,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,SACA,0.0,80.0,60.0,100.0,,right_only


Here, I've missed two records and created one. Note that all three of these appear in the fish_tl table above, since an average has been entered for size ranges. So I can just deal with the fish_tl records.

I'm not sure what any of these records have in common. Let's pick an example and walk it through the workflow to see what happens. I'm choosing UCSC, 8/15/2013, LOVERS_UC, OUTER, MID, which has 3 transects (4, 5 and 6) and 25 records. I'm expecting to lose 3 of them, one from each transect.

In [174]:
## Get transect data

test_fish = fish[(fish['campus'] == 'UCSC') & 
                 (fish['year'] == 2013) & 
                 (fish['month'] == 8) & 
                 (fish['day'] == 15) & 
                 (fish['site'] == 'LOVERS_UC') & 
                 (fish['zone'] == 'OUTER') & 
                 (fish['level'] == 'MID')]
print(test_fish.shape)
test_fish.iloc[:, 0:20]

(25, 24)


Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
99196,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,OYT,45.0,8.0,,,,COLIN GAYLORD,7.0,5.0,12.0
99197,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,RFYOY,10.0,8.0,,,,COLIN GAYLORD,7.0,5.0,12.0
99198,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,SMYS,83.0,7.0,,,,COLIN GAYLORD,7.0,5.0,12.0
99199,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,SMYS,106.0,8.0,,,,COLIN GAYLORD,7.0,5.0,12.0
99200,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,SMYS,1.0,12.0,,,,COLIN GAYLORD,7.0,5.0,12.0
99201,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,4,SMYS,3.0,16.0,,,,COLIN GAYLORD,7.0,5.0,12.0
99202,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,5,OYT,96.0,8.0,,,,COLIN GAYLORD,7.0,5.0,11.0
99203,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,5,OYT,2.0,20.0,,,,COLIN GAYLORD,7.0,5.0,11.0
99204,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,5,OYT,2.0,26.0,,,,COLIN GAYLORD,7.0,5.0,11.0
99205,UCSC,SBTL_FISH_PISCO,2013,2013,8,15,LOVERS_UC,OUTER,MID,5,RFYOY,1.0,8.0,,,,COLIN GAYLORD,7.0,5.0,11.0


In [175]:
# Get which species were looked for

test_long = long[(long['campus'] == 'UCSC') & (long['year'] == 2013)]
looked_for = test_long[test_long['looked'] == 'yes']
looked_for

Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
7331,UCSC,FISH,FISH,ADAV,ADAV,Animalia,Chordata,Actinopterygii,Perciformes,Haemulidae,Anisotremus,davidsonii,Anisotremus davidsonii,WoRMS,279617,Sargo,,,2013,yes
7335,UCSC,FISH,FISH,AFLA,AFLA,Animalia,Chordata,Actinopterygii,Gasterosteiformes,Aulorhynchidae,Aulorhynchus,flavidus,Aulorhynchus flavidus,WoRMS,279839,Tubesnout,,,2013,yes
7351,UCSC,FISH,FISH,AOCE,AOCE,Animalia,Chordata,Actinopterygii,Perciformes,Anarhichadidae,Anarrhichthys,ocellatus,Anarrhichthys ocellatus,WoRMS,279605,Wolf Eel,,,2013,yes
7352,UCSC,FISH,FISH,APFL,APFL,Animalia,Chordata,Actinopterygii,Perciformes,Pholidae,Apodichthys,flavidus,Apodichthys flavidus,WoRMS,279664,Penpiont Gunnel,,,2013,yes
7355,UCSC,FISH,FISH,ATHE,ATHE,Animalia,Chordata,Actinopterygii,Atheriniformes,Atherinopsidae,Atherinopsidae,spp,Atherinopsidae,WoRMS,266995,"Grunion, Topsmelt Or Jacksmelt",,,2013,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7823,UCSC,FISH,FISH,TSEM,TSEM,Animalia,Chordata,Elasmobranchii,Carcharhiniformes,Triakidae,Triakis,semifasciata,Triakis semifasciata,WoRMS,279060,Leopard Shark,,,2013,yes
7827,UCSC,FISH,FISH,TSYM,TSYM,Animalia,Chordata,Actinopterygii,Perciformes,Carangidae,Trachurus,symmetricus,Trachurus symmetricus,WoRMS,273305,Jack Mackerel,,,2013,yes
7834,UCSC,FISH,FISH,UNID,UNID,Animalia,Chordata,,,,,,Unidentified Fish,MLPA_kelpforest,,Unidentified Fish,,,2013,yes
7837,UCSC,FISH,FISH,USAN,USAN,Animalia,Chordata,Actinopterygii,Perciformes,Pholidae,Ulvicola,sanctaerosae,Ulvicola sanctaerosae,WoRMS,322270,Kelp Gunnel,,,2013,yes


In [176]:
## Check that NO_ORG is on this list

looked_for['classcode'].unique()

array(['ADAV', 'AFLA', 'AOCE', 'APFL', 'ATHE', 'AVUL', 'BAITBALL', 'BATH',
       'BFRE', 'BOTH', 'BPOL', 'BRAY', 'CAGG', 'CGIG', 'CITH', 'CLUP',
       'CPRI', 'CPUN', 'CSAT', 'CSOR', 'CSTI', 'CVEN', 'CVIO', 'EBIS',
       'EJAC', 'ELAT', 'EMBI', 'EMOR', 'EWAL', 'GBY', 'GGAL', 'GMAE',
       'GMOR', 'GNIG', 'HANA', 'HARG', 'HCAR', 'HDEC', 'HELL', 'HEXA',
       'HFRA', 'HHEM', 'HLAG', 'HROS', 'HRUB', 'HSEM', 'HSTE', 'KGB',
       'LCON', 'LHIR', 'MCAL', 'MCEP', 'MMOL', 'NO_ORG', 'OCAL', 'OELO',
       'OPIC', 'OYB', 'OYT', 'PATR', 'PCAL', 'PCLA', 'PCOE', 'PFAL',
       'PFUR', 'PGLA', 'PHOL', 'PLEU', 'PMAC', 'PNEB', 'PNOT', 'PTRI',
       'RALL', 'RBIN', 'RHYP', 'RJOR', 'RNIC', 'RRIC', 'RSTE', 'RTOX',
       'RVAC', 'SACA', 'SARG', 'SATR', 'SAUR', 'SCAR', 'SCAU', 'SCHI',
       'SCHR', 'SDAL', 'SDIP', 'SEBSPP', 'SENT', 'SGIG', 'SGUT', 'SHOP',
       'SJAP', 'SMAL', 'SMAR', 'SMEL', 'SMIN', 'SMYS', 'SNEB', 'SPAU',
       'SPIN', 'SPUL', 'SRAS', 'SROS', 'SRUB', 'SSAG', 'SSAX', 'SSEM',
  

In [177]:
## Check that there are no duplicate classcodes (I encountered this doing the small test with real data above; duplicates are dropped making survey_table)

check_duplicates = looked_for.groupby('classcode')['classcode'].count()
check_duplicates[check_duplicates > 1]

Series([], Name: classcode, dtype: int64)

So, without the NO_ORG classcode, 120 fish were looked for during each transect by UCSC in 2013 (the survey_year). So we would expect after populating absence records to have 360 records, 25 of which are presence records.

In [178]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = test_fish[['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect']].merge(test_long[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table.drop(columns='year', inplace=True)
survey_table

Unnamed: 0,campus,day,month,survey_year,site,zone,level,transect,classcode,looked
0,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,4,ACOR,no
1,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,4,ADAV,yes
2,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,4,AFLA,yes
3,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,4,AHOL,no
4,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,4,AOCE,yes
...,...,...,...,...,...,...,...,...,...,...
2251,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,6,TSEM,yes
2252,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,6,TSYM,yes
2253,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,6,UNID,yes
2254,UCSC,15,8,2013,LOVERS_UC,OUTER,MID,6,USAN,yes


In [179]:
## Merge with fish data to get final outcome

full_test_fish = test_fish.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_test_fish.iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
0,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,4,ACOR,,,,,,,,,
1,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,4,ADAV,,,,,,,,,
2,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,4,AFLA,,,,,,,,,
3,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,4,AHOL,,,,,,,,,
4,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,4,AOCE,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,6,TSEM,,,,,,,,,
434,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,6,TSYM,,,,,,,,,
435,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,6,UNID,,,,,,,,,
436,UCSC,,2013,,8,15,LOVERS_UC,OUTER,MID,6,USAN,,,,,,,,,


In [180]:
## Are my records of interest still there?

full_test_fish[(full_test_fish['campus'] == 'UCSC') & 
               (full_test_fish['year'] == 2013) & 
               (full_test_fish['month'] == 8) & 
               (full_test_fish['day'] == 15) & 
               (full_test_fish['site'] == 'LOVERS_UC') & 
               (full_test_fish['zone'] == 'OUTER') & 
               (full_test_fish['level'] == 'MID')].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
77,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,4,OYT,45.0,8.0,,,,COLIN GAYLORD,7.0,5.0,12.0
120,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,4,SMYS,83.0,7.0,,,,COLIN GAYLORD,7.0,5.0,12.0
121,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,4,SMYS,106.0,8.0,,,,COLIN GAYLORD,7.0,5.0,12.0
122,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,4,SMYS,1.0,12.0,,,,COLIN GAYLORD,7.0,5.0,12.0
123,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,4,SMYS,3.0,16.0,,,,COLIN GAYLORD,7.0,5.0,12.0
221,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,5,OYT,96.0,8.0,,,,COLIN GAYLORD,7.0,5.0,11.0
222,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,5,OYT,2.0,20.0,,,,COLIN GAYLORD,7.0,5.0,11.0
223,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,5,OYT,2.0,26.0,,,,COLIN GAYLORD,7.0,5.0,11.0
266,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,5,SMYS,311.0,8.0,,,,COLIN GAYLORD,7.0,5.0,11.0
267,UCSC,SBTL_FISH_PISCO,2013,2013.0,8,15,LOVERS_UC,OUTER,MID,5,SMYS,4.0,12.0,,,,COLIN GAYLORD,7.0,5.0,11.0


Ok, so the records were lost during the merge. It looks like all three records had classcode = 'RFYOY'

In [181]:
## Is this classcode missing from the species table?

print(survey_table[survey_table['classcode'] == 'RFYOY'].shape)
print(test_long[test_long['classcode'] == 'RFYOY'].shape)
species[species['classcode'] == 'RFYOY'].iloc[:, 0:20]

(0, 10)
(0, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,LOOKED1999,LOOKED2000
329,HSU,FISH,FISH,RFYOY,RFYOY,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Sebastidae,Sebastes,spp,Sebastes,WoRMS,126175,"Rockfish Young Of The Year, Unidentified Sp.",Any unidentified YOY rockfish.,,no,no


It looks like the only time RFYOY was used as either a classcode or original classcode was by HSU. So that's why the records are disappearing - they don't appear in the species table.

**So RFYOY does not appear in the species table ever for UCSC, but it should, at least for 2013. The best way to handle this is for Dan to update the species table.**

This problem explains all but 3 of the missing/messed up size records. So does it generalize to the last 3?

In [182]:
## Get transect data

test_fish = fish[(fish['campus'] == 'UCSC') & 
                 (fish['year'] == 2003) & 
                 (fish['month'] == 9) & 
                 (fish['day'] == 21) & 
                 (fish['site'] == 'LUCIA_UC') & 
                 (fish['zone'] == 'OUTER') & 
                 (fish['level'] == 'MID') &
                 (fish['transect'] == '2')]
print(test_fish.shape)
test_fish.iloc[:, 0:20]

(14, 24)


Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
24880,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,OYT,4.0,20.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24881,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,OYT,4.0,25.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24882,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,OYT,1.0,30.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24883,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,OYT,1.0,38.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24884,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SCAL,1.0,150.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24885,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SMEL,1.0,20.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24886,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SMEL,1.0,25.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24887,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SMEL,1.0,30.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24888,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SMEL,3.0,35.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0
24889,UCSC,SBTL_FISH_PISCO,2003,2003,9,21,LUCIA_UC,OUTER,MID,2,SMYS,1.0,13.0,,,,RANDOLPH SKROVAN,11.0,3.0,12.0


In [183]:
## The classcode=SCAL record is the one we lose. Is it in the species table?

print(survey_table[survey_table['classcode'] == 'SCAL'].shape)
print(test_long[test_long['classcode'] == 'SCAL'].shape)
long[(long['classcode'] == 'SCAL') & (long['year'] == 2003)]

(0, 10)
(0, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
2461,HSU,FISH,FISH,SCAL,SCAL,Animalia,Chordata,Elasmobranchii,Squatiniformes,Squatinidae,Squatina,californica,Squatina californica,WoRMS,271667,Pacific Angel Shark,,,2003,no
2462,UCSB,FISH,FISH,SCAL,SCAL,Animalia,Chordata,Elasmobranchii,Squatiniformes,Squatinidae,Squatina,californica,Squatina californica,WoRMS,271667,Pacific Angel Shark,,,2003,yes
2463,VRG,FISH,FISH,SCAL,Squatina californica,Animalia,Chordata,Elasmobranchii,Squatiniformes,Squatinidae,Squatina,californica,Squatina californica,WoRMS,271667,Pacific Angel Shark,,,2003,no


Yes, so SCAL never appears as a classcode for UCSC, although HSU, UCSB, and VRG used it some years. **SCAL should be added to the species table for UCSC, at least for 2003.**

Finally...

In [184]:
## Get transect data

test_fish = fish[(fish['campus'] == 'UCSC') & 
                 (fish['year'] == 2008) & 
                 (fish['month'] == 9) & 
                 (fish['day'] == 4) & 
                 (fish['site'] == 'PALO_COLORADO') & 
                 (fish['zone'] == 'OUTMID') & 
                 (fish['level'] == 'BOT') &
                 (fish['transect'] == '2')]
print(test_fish.shape)
test_fish.iloc[:, 0:20]

(5, 24)


Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
57951,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,ELAT,1.0,7.0,,,,SCOTT GABARA,15.2,7.0,14.4
57952,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,OYT,1.0,30.0,,,,SCOTT GABARA,15.2,7.0,14.4
57953,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,SACA,,80.0,60.0,100.0,,SCOTT GABARA,15.2,7.0,14.4
57954,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,SMYS,1.0,22.0,,,,SCOTT GABARA,15.2,7.0,14.4
57955,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,SMYS,1.0,23.0,,,,SCOTT GABARA,15.2,7.0,14.4


In [185]:
## The classcode=SACA record is the one we lose. We also gain a record where classcode=SACA and count=0 instead of NaN. Is SACA in the species table?

print(survey_table[survey_table['classcode'] == 'SACA'].shape)
print(test_long[test_long['classcode'] == 'SACA'].shape)
long[(long['classcode'] == 'SACA') & (long['year'] == 2008)]

(3, 10)
(1, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
5061,HSU,FISH,FISH,SACA,SACA,Animalia,Chordata,Elasmobranchii,Squaliformes,Squalidae,Squalus,acanthias,Squalus acanthias,WoRMS,105923,Spiny Dogfish,,,2008,no
5062,UCSB,FISH,FISH,SACA,SACA,Animalia,Chordata,Elasmobranchii,Squaliformes,Squalidae,Squalus,acanthias,Squalus acanthias,WoRMS,105923,Spiny Dogfish,,,2008,yes
5063,UCSC,FISH,FISH,SACA,SACA,Animalia,Chordata,Elasmobranchii,Squaliformes,Squalidae,Squalus,acanthias,Squalus acanthias,WoRMS,105923,Spiny Dogfish,,,2008,yes


So SACA is in the species table, and was looked for by UCSC every year, including 2008. So this is a separate issue from what was up with the previous two classcodes. My suspicion is that the merge has an issue because the count value is missing.

In [186]:
# Get which species were looked for

test_long = long[(long['campus'] == 'UCSC') & (long['year'] == 2008)]
looked_for = test_long[test_long['looked'] == 'yes']
print(looked_for.shape)
looked_for.head()

(120, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
4716,UCSC,FISH,FISH,ADAV,ADAV,Animalia,Chordata,Actinopterygii,Perciformes,Haemulidae,Anisotremus,davidsonii,Anisotremus davidsonii,WoRMS,279617,Sargo,,,2008,yes
4720,UCSC,FISH,FISH,AFLA,AFLA,Animalia,Chordata,Actinopterygii,Gasterosteiformes,Aulorhynchidae,Aulorhynchus,flavidus,Aulorhynchus flavidus,WoRMS,279839,Tubesnout,,,2008,yes
4736,UCSC,FISH,FISH,AOCE,AOCE,Animalia,Chordata,Actinopterygii,Perciformes,Anarhichadidae,Anarrhichthys,ocellatus,Anarrhichthys ocellatus,WoRMS,279605,Wolf Eel,,,2008,yes
4737,UCSC,FISH,FISH,APFL,APFL,Animalia,Chordata,Actinopterygii,Perciformes,Pholidae,Apodichthys,flavidus,Apodichthys flavidus,WoRMS,279664,Penpiont Gunnel,,,2008,yes
4740,UCSC,FISH,FISH,ATHE,ATHE,Animalia,Chordata,Actinopterygii,Atheriniformes,Atherinopsidae,Atherinopsidae,spp,Atherinopsidae,WoRMS,266995,"Grunion, Topsmelt Or Jacksmelt",,,2008,yes


In [187]:
## Check that NO_ORG is on this list

looked_for['classcode'].unique()

array(['ADAV', 'AFLA', 'AOCE', 'APFL', 'ATHE', 'AVUL', 'BAITBALL', 'BATH',
       'BFRE', 'BOTH', 'BPOL', 'BRAY', 'CAGG', 'CGIG', 'CITH', 'CLUP',
       'CPRI', 'CPUN', 'CSAT', 'CSOR', 'CSTI', 'CVEN', 'CVIO', 'EBIS',
       'EJAC', 'ELAT', 'EMBI', 'EMOR', 'EWAL', 'GBY', 'GGAL', 'GMAE',
       'GMOR', 'GNIG', 'HANA', 'HARG', 'HCAR', 'HDEC', 'HELL', 'HFRA',
       'HHEM', 'HLAG', 'HROS', 'HRUB', 'HSEM', 'HSTE', 'KGB', 'LCON',
       'LHIR', 'MCAL', 'MCEP', 'MMOL', 'NO_ORG', 'OCAL', 'OELO', 'OPIC',
       'OYB', 'OYT', 'PATR', 'PCAL', 'PCLA', 'PCOE', 'PFAL', 'PFUR',
       'PGLA', 'PHOL', 'PLEU', 'PMAC', 'PNEB', 'PNOT', 'PTRI', 'RALL',
       'RBIN', 'RHYP', 'RJOR', 'RNIC', 'RRIC', 'RSTE', 'RTOX', 'RVAC',
       'SACA', 'SARG', 'SATR', 'SAUR', 'SCAR', 'SCAU', 'SCHI', 'SCHR',
       'SDAL', 'SDIP', 'SEBSPP', 'SENT', 'SGIG', 'SGUT', 'SHOP', 'SJAP',
       'SMAL', 'SMAR', 'SMEL', 'SMIN', 'SMYS', 'SNEB', 'SPAU', 'SPIN',
       'SPUL', 'SRAS', 'SROS', 'SRUB', 'SSAG', 'SSAX', 'SSEM', 'STICH',
 

In [188]:
## Check that there are no duplicate classcodes (I encountered this doing the small test with real data above; duplicates are dropped making survey_table)

check_duplicates = looked_for.groupby('classcode')['classcode'].count()
check_duplicates[check_duplicates > 1]

Series([], Name: classcode, dtype: int64)

So, we'd expect 119 species to be looked for during this transect, with 5 presence records.

In [189]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = test_fish[['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect']].merge(test_long[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table.drop(columns='year', inplace=True)
print(survey_table[survey_table['looked'] == 'yes'].shape)
survey_table.head()

(120, 10)


Unnamed: 0,campus,day,month,survey_year,site,zone,level,transect,classcode,looked
0,UCSC,4,9,2008,PALO_COLORADO,OUTMID,BOT,2,ACOR,no
1,UCSC,4,9,2008,PALO_COLORADO,OUTMID,BOT,2,ADAV,yes
2,UCSC,4,9,2008,PALO_COLORADO,OUTMID,BOT,2,AFLA,yes
3,UCSC,4,9,2008,PALO_COLORADO,OUTMID,BOT,2,AHOL,no
4,UCSC,4,9,2008,PALO_COLORADO,OUTMID,BOT,2,AOCE,yes


In [190]:
## Merge with fish data to get final outcome

full_test_fish = test_fish.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_test_fish.iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
0,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,ACOR,,,,,,,,,
1,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,ADAV,,,,,,,,,
2,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,AFLA,,,,,,,,,
3,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,AHOL,,,,,,,,,
4,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,AOCE,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,TSEM,,,,,,,,,
138,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,TSYM,,,,,,,,,
139,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,UNID,,,,,,,,,
140,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,USAN,,,,,,,,,


In [191]:
## Check if the record of interest is still there

full_test_fish[full_test_fish['classcode'] == 'SACA'].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
100,UCSC,SBTL_FISH_PISCO,2008,2008.0,9,4,PALO_COLORADO,OUTMID,BOT,2,SACA,,80.0,60.0,100.0,,SCOTT GABARA,15.2,7.0,14.4


Hmmm, that's interesting. Doesn't seem to be any problems with the merge step.

In [192]:
## Clean

full_test_fish = full_test_fish[full_test_fish['classcode'] != 'NO_ORG'].copy()
full_test_fish.loc[(full_test_fish['looked'] == 'yes') & (full_test_fish['count'].isna() == True), 'count'] = 0
full_test_fish.dropna(subset=['count'], inplace=True)
full_test_fish.iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
1,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,ADAV,0.0,,,,,,,,
2,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,AFLA,0.0,,,,,,,,
4,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,AOCE,0.0,,,,,,,,
5,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,APFL,0.0,,,,,,,,
6,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,ATHE,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,TSEM,0.0,,,,,,,,
138,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,TSYM,0.0,,,,,,,,
139,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,UNID,0.0,,,,,,,,
140,UCSC,,2008,,9,4,PALO_COLORADO,OUTMID,BOT,2,USAN,0.0,,,,,,,,


In [193]:
## Check what happened to the record of interest

full_test_fish[full_test_fish['classcode'] == 'SACA'].iloc[:, 0:20]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,observer,depth,vis,temp
100,UCSC,SBTL_FISH_PISCO,2008,2008.0,9,4,PALO_COLORADO,OUTMID,BOT,2,SACA,0.0,80.0,60.0,100.0,,SCOTT GABARA,15.2,7.0,14.4


Ok, that's very interesting. I think I misinterpreted the outcome of the original merge. This record appeared once with count=NaN only in fish, and once with count=0 only in full_fish. Now I see that the second record simply replaced the first during cleaning; it's not a new record that was accidentally created. Looking at the cleaning code, I changed NaNs to zeros in the count column in the second line. **This problem can be solved by dropping any records where count=NaN before populating absence records. This will not be a big deal in this context, because only one record has count=NaN. But this will be a worthwhile quality checking step as the data are updated.**

## All data - revised - draft 2

In [39]:
## Fish

print(fish.shape)
fish.head()

(381693, 24)


Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,max_tl,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old
0,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
1,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
2,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
3,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,MARK CARR,6.1,2.4,,HIGH,1.0,,
4,UCSC,SBTL_FISH_PISCO,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,8.0,,MARK CARR,6.1,2.4,,HIGH,1.0,,


In [40]:
## Species

print(long.shape)
long.head()

(10460, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
0,HSU,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,no
1,UCSB,FISH,FISH,AARG,AARG,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,yes
2,VRG,FISH,FISH,AARG,Amphistichus argenteus,Animalia,Chordata,Actinopterygii,Perciformes,Embiotocidae,Amphistichus,argenteus,Amphistichus argenteus,WoRMS,279594,Barred Surfperch,,,1999,no
3,HSU,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,1999,no
4,UCSB,FISH,FISH,ACOR,ACOR,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Cottidae,Artedius,corallinus,Artedius corallinus,WoRMS,279699,Coralline Sculpin,"Cryptic, not sampled effectiively",,1999,no


In [41]:
## NEW STEP - check for records in fish where count=NaN

fish[fish['count'].isna() == True]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,max_tl,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old
57953,UCSC,SBTL_FISH_PISCO,2008,2008,9,4,PALO_COLORADO,OUTMID,BOT,2,...,100.0,,SCOTT GABARA,15.2,7.0,14.4,LIGHT,,"DOGFISH (LOTS), INTERFERANCE FROM MIDWATER DIV...",


In [42]:
## NEW STEP - drop any records where count=NaN

print(fish.shape)
fish.dropna(subset=['count'], inplace=True)
fish.shape

(381693, 24)


(381692, 24)

In [43]:
## NEW STEP - add missing entries to long

to_add = pd.DataFrame({'campus':['UCSC']*7 + ['UCSB']*3,
                      'sample_type':'FISH',
                      'sample_subtype':'FISH',
                      'classcode':['RFYOY']*6 + ['SCAL'] + ['RFYOY']*3,
                      'orig_classcode':['RFYOY']*6 + ['SCAL'] + ['RFYOY']*3,
                      'Kingdom':'Animalia',
                      'Phylum':'Chordata',
                      'Class':['Actinopterygii']*6 + ['Elasmobranchii'] + ['Actinopterygii']*3,
                      'Order':['Scorpaeniformes']*6 + ['Squatiniformes'] + ['Scorpaeniformes']*3,
                      'Family':['Sebastidae']*6 + ['Squatinidae'] + ['Sebastidae']*3,
                      'Genus':['Sebastes']*6 + ['Squatina'] + ['Sebastes']*3,
                      'Species':['spp']*6 + ['californica'] + ['spp']*3,
                      'species_definition':['Sebastes']*6 + ['Squatina californica'] + ['Sebastes']*3,
                      'taxonomic_source':'WoRMS',
                      'taxonomic_id':[126175]*6 + [271667] + [126175]*3,
                      'common_name':['Rockfish Young Of The Year, Unidentified Sp.']*6 + ['Pacific Angel Shark'] + ['Rockfish Young Of The Year, Unidentified Sp.']*3,
                      'notes':['Any unidentified YOY rockfish']*6 + [np.nan] + ['Any unidentified YOY rockfish']*3,
                      'size_cutoff':np.nan,
                      'year':[2000, 2003, 2011, 2013, 2014, 2017, 2003, 2001, 2003, 2005],
                      'looked':'yes'})

print(long.shape)
long = pd.concat([long, to_add])
print(long.shape)
long.reset_index(drop=True, inplace=True)
long.tail()

(10460, 20)
(10470, 20)


Unnamed: 0,campus,sample_type,sample_subtype,classcode,orig_classcode,Kingdom,Phylum,Class,Order,Family,Genus,Species,species_definition,taxonomic_source,taxonomic_id,common_name,notes,size_cutoff,year,looked
10465,UCSC,FISH,FISH,RFYOY,RFYOY,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Sebastidae,Sebastes,spp,Sebastes,WoRMS,126175,"Rockfish Young Of The Year, Unidentified Sp.",Any unidentified YOY rockfish,,2017,yes
10466,UCSC,FISH,FISH,SCAL,SCAL,Animalia,Chordata,Elasmobranchii,Squatiniformes,Squatinidae,Squatina,californica,Squatina californica,WoRMS,271667,Pacific Angel Shark,,,2003,yes
10467,UCSB,FISH,FISH,RFYOY,RFYOY,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Sebastidae,Sebastes,spp,Sebastes,WoRMS,126175,"Rockfish Young Of The Year, Unidentified Sp.",Any unidentified YOY rockfish,,2001,yes
10468,UCSB,FISH,FISH,RFYOY,RFYOY,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Sebastidae,Sebastes,spp,Sebastes,WoRMS,126175,"Rockfish Young Of The Year, Unidentified Sp.",Any unidentified YOY rockfish,,2003,yes
10469,UCSB,FISH,FISH,RFYOY,RFYOY,Animalia,Chordata,Actinopterygii,Scorpaeniformes,Sebastidae,Sebastes,spp,Sebastes,WoRMS,126175,"Rockfish Young Of The Year, Unidentified Sp.",Any unidentified YOY rockfish,,2005,yes


In [44]:
## Get a table telling whether each fish was looked for during each specific transect

survey_table = fish[['campus', 'day', 'month', 'survey_year', 'year', 'site', 'zone', 'level', 'transect']].merge(long[['campus', 'classcode', 'year', 'looked']], 
                                                                                                             how='left', 
                                                                                                             left_on=['campus', 'survey_year'],
                                                                                                             right_on=['campus', 'year'])
survey_table.drop_duplicates(inplace=True)
survey_table.rename(columns={'year_x':'year'}, inplace=True) # year_x retains actual year when survey took place
survey_table.drop(columns=['year_y'], inplace=True) # year_y == survey_year because of the merge
survey_table

Unnamed: 0,campus,day,month,survey_year,year,site,zone,level,transect,classcode,looked
0,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,ACOR,no
1,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,ADAV,yes
2,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,AFLA,yes
3,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,AHOL,no
4,UCSC,7,9,1999,1999,HOPKINS_DC,INNER,BOT,1,AOCE,yes
...,...,...,...,...,...,...,...,...,...,...,...
54117169,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,TSYM,yes
54117170,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,UHAL,yes
54117171,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,URON,yes
54117172,VRG,12,8,2011,2011,Long Point East,DEEP,MID,2,ZEXA,yes


In [45]:
## Merge with fish data to get final outcome

full_fish = fish.merge(survey_table, 
                             how='right', 
                             on=['campus', 'day', 'month', 'year', 'survey_year', 'site', 'zone', 'level', 'transect', 'classcode'])
full_fish

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
0,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,no
1,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
2,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
3,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,no
4,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8883628,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8883629,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8883630,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8883631,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes


In [46]:
## Clean

full_fish = full_fish[full_fish['classcode'] != 'NO_ORG'].copy()
full_fish.loc[(full_fish['looked'] == 'yes') & (full_fish['count'].isna() == True), 'count'] = 0
full_fish.dropna(subset=['count'], inplace=True)
full_fish

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,...,sex,observer,depth,vis,temp,surge,pctcnpy,notes,site_name_old,looked
1,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
2,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
4,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
5,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
6,UCSC,,1999,1999,9,7,HOPKINS_DC,INNER,BOT,1,...,,,,,,,,,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8883628,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8883629,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8883630,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes
8883631,VRG,,2011,2011,8,12,Long Point East,DEEP,MID,2,...,,,,,,,,,,yes


In [47]:
## Check for missing records in size data

print(fish[fish['fish_tl'].isna() == False].shape[0])
print(full_fish[full_fish['fish_tl'].isna() == False].shape[0])

print(fish[fish['min_tl'].isna() == False].shape[0])
print(full_fish[full_fish['min_tl'].isna() == False].shape[0])

print(fish[fish['max_tl'].isna() == False].shape[0])
print(full_fish[full_fish['max_tl'].isna() == False].shape[0])

369146
369146
32823
32823
28194
28194


In [48]:
## Double check

fish_diff = fish[fish['fish_tl'].isna() == False].copy()
full_fish_diff = full_fish[full_fish['fish_tl'].isna() == False].copy()
full_fish_diff = full_fish_diff.iloc[:, 0:24]

diff = fish_diff.merge(full_fish_diff, indicator=True, how='outer')

diff[diff['_merge'] != 'both'].iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24]]

Unnamed: 0,campus,method,survey_year,year,month,day,site,zone,level,transect,classcode,count,fish_tl,min_tl,max_tl,sex,_merge


## Save

In [49]:
## Save data with absence records populated

full_fish.to_csv('MLPA_kelpforest_fish_full.csv', index=False, na_rep='NaN')