# Analysis and data mining of snow geese
Author: Yihe Zhao

In [11]:
import numpy as np
import pandas as pd
import warnings
import GooseUtils
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress pandas DtypeWarning messages (per the original note, these appear on the first import of the csv file)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

Import and trim relevant data. This notebook focuses only on data related to *Anser caerulescens caerulescens* (species id: 1690) and *Anser caerulescens atlantica* (species id: 1699). This corresponds to $898183$ rows of data.

In [12]:
# Load the raw banding records and keep only the two species of interest.

# Read the group 1 data from its CSV file
unfiltered_goose_data = pd.read_csv('NABBP_2023_grp_01.csv')

# Keep only rows whose SPECIES_ID is one of the two targeted ids (1690, 1699)
is_target_species = unfiltered_goose_data['SPECIES_ID'].isin([1690, 1699])
filtered_species = unfiltered_goose_data[is_target_species]

In [13]:
# Discard the columns that are not needed.

# Columns to keep: band identifiers, event date fields, and coordinates with their precision code
relevant_columns = [
    'BAND',
    'ORIGINAL_BAND',
    'OTHER_BANDS',
    'EVENT_DATE',
    'EVENT_DAY',
    'EVENT_MONTH',
    'EVENT_YEAR',
    'LAT_DD',
    'LON_DD',
    'COORD_PREC',
]
goose_data = filtered_species[relevant_columns]

# Show how many non-null entries each kept column contains
display(goose_data.count())

BAND             898183
ORIGINAL_BAND    898183
OTHER_BANDS        1551
EVENT_DATE       898183
EVENT_DAY        898183
EVENT_MONTH      898183
EVENT_YEAR       898183
LAT_DD           897329
LON_DD           897329
COORD_PREC       898064
dtype: int64

A large number of the date cells cannot be parsed by the `pd.to_datetime()` function. Since dates are vital information for the analysis, the cell below aims specifically to clean the dates and then remove any unnecessary columns. Dates are chosen by the following process.

1. If the `'EVENT_DATE'` column already has a valid date that works with `pd.to_datetime()`, it will be the date used.
2. Otherwise, if the `'EVENT_DAY'`, `'EVENT_MONTH'`, and `'EVENT_YEAR'` columns together form a date that works with `pd.to_datetime()`, it will be the date used.
3. If neither of the above work, `NaT` will be assigned and the row will be dropped.

In [14]:
# Clean time-related columns as described above.
#
# The cleaned frame is built with assign()/dropna()/drop() rather than by writing
# columns into goose_data directly: goose_data was produced by slicing another
# DataFrame, and item assignment on such a slice can raise SettingWithCopyWarning.

# Parse EVENT_DATE as MM/DD/YYYY; values that do not match become NaT.
parsed_event_dates = pd.to_datetime(goose_data['EVENT_DATE'], format='%m/%d/%Y', errors='coerce')

# Assemble date guesses from the EVENT_MONTH, EVENT_DAY, and EVENT_YEAR columns BEFORE DROPPING THEM
dates_from_columns = pd.to_datetime(
    goose_data['EVENT_MONTH'].astype(str) + '/' +
    goose_data['EVENT_DAY'].astype(str) + '/' +
    goose_data['EVENT_YEAR'].astype(str),
    format='%m/%d/%Y',
    errors='coerce'
)

goose_data = (
    goose_data
    # Prefer the parsed EVENT_DATE; fall back to the guess wherever it is NaT.
    .assign(EVENT_DATE=parsed_event_dates.fillna(dates_from_columns))
    # Remove all rows where EVENT_DATE is still NaT after the fallback.
    .dropna(subset=['EVENT_DATE'])
    # The separate month/day/year columns are no longer needed.
    .drop(columns=['EVENT_MONTH', 'EVENT_DAY', 'EVENT_YEAR'])
)

Location data is also vital for analysis, so a bit of cleaning will have to be done.

First, rows fitting any of the following conditions will be excluded:

1. Rows that do not have values for either `LAT_DD` or `LON_DD` because this issue cannot be rectified.
2. Rows whose `COORD_PREC` values are `8`, `12`, `18`, `28`, `33`, `38`, or `72`, because the uncertainty for these codes either cannot be determined or is too big to be useful.

In [15]:
# Clean the coordinates columns as described above.

# Rows missing LAT_DD or LON_DD cannot be rectified, so discard them.
goose_data = goose_data.dropna(subset=['LAT_DD', 'LON_DD'])

# COORD_PREC codes that are unusable or useless, as outlined above.
unusable_coord_prec = [8, 12, 18, 28, 33, 38, 72]

# Keep only the rows whose COORD_PREC is not one of the unusable codes.
has_unusable_precision = goose_data['COORD_PREC'].isin(unusable_coord_prec)
goose_data = goose_data[~has_unusable_precision]

goose_data

Unnamed: 0,BAND,ORIGINAL_BAND,OTHER_BANDS,EVENT_DATE,LAT_DD,LON_DD,COORD_PREC
0,B99285787525,B99285787525,,1974-07-16,71.50000,-179.50000,60.0
1,B99055198081,B99055198081,,1975-07-23,71.50000,-179.50000,60.0
2,B89055198948,B89055198948,,1975-07-23,71.50000,-179.50000,60.0
3,B59055198985,B59055198985,,1975-07-23,71.50000,-179.50000,60.0
4,B59055198835,B59055198835,,1975-07-23,71.50000,-179.50000,60.0
...,...,...,...,...,...,...,...
1235635,B57965382915,B57965382915,,2020-11-12,43.99020,-73.33690,0.0
1235636,B57865508750,B57865508750,,2020-11-21,44.16255,-73.31707,11.0
1235637,B47855457805,B47855457805,,2021-11-21,44.08487,-73.33633,0.0
1235638,B58715537317,B58715537317,,2022-09-02,44.97504,-73.31033,11.0


Additionally, a new column with latitude and longitude uncertainties will be made whose values obey the following rules:

1. If the `COORD_PREC` corresponds to an exact location (is `0`), then the uncertainty is $5 \times 10^{-6}$ degrees to account for limits in the number of significant digits given by the data.
2. If the `COORD_PREC` corresponds to a 1-minute block (is `1`), then the uncertainty is $\frac{1}{120} \approx 0.01$ degrees (rounded up) since the coordinates are in the centroid of the block.
3. If the `COORD_PREC` corresponds to a 10-minute block (is `10`), then the uncertainty is $\frac{1}{12} \approx 0.1$ degrees (rounded up) since the coordinates are in the centroid of the block.
4. If the `COORD_PREC` corresponds to a 1-degree block (is `60`), then the uncertainty is $0.5$ degrees since the coordinates are in the centroid of the block.
5. If the `COORD_PREC` corresponds to a county (is `7`), then the uncertainty will be $0.25$ degrees by estimate (since the average county land area is 1090.69 square miles and a square of that size is around $0.5$ degrees in latitude and longitude).
6. If the `COORD_PREC` corresponds to a town/area (is `11`), then the uncertainty will be $0.25$ degrees by estimate (since each town should be smaller than a county and thus have less uncertainty associated with it).

In [16]:
def get_coord_precision(COORD_PREC):
    """Translate a COORD_PREC code into a coordinate uncertainty value.

    Parameters
    ----------
    COORD_PREC
        A single coordinate-precision code. Recognized codes are
        0, 1, 7, 10, 11 and 60.

    Returns
    -------
    float or None
        5e-6 for code 0, 0.01 for code 1, 0.1 for code 10, 0.5 for code 60,
        and 0.25 for codes 7 and 11. None when the input is missing
        according to ``pd.isna``.

    Raises
    ------
    ValueError
        If the code is present but is not one of the recognized values.
    """
    # A missing code has no uncertainty to report.
    if pd.isna(COORD_PREC):
        return None

    # (code, uncertainty) pairs, compared with == in the listed order.
    uncertainty_by_code = (
        (0, 5e-6),
        (1, 0.01),
        (10, 0.1),
        (60, 0.5),
        (7, 0.25),
        (11, 0.25),
    )
    for code, uncertainty in uncertainty_by_code:
        if COORD_PREC == code:
            return uncertainty

    raise ValueError(f"Unrecognized COORD_PREC value: {COORD_PREC}")


In [17]:
# Perform the coordinate precision conversion as described above.

# Compute the coordinate uncertainty for every row from its COORD_PREC code and
# then drop the old code column. assign() builds a new frame instead of writing a
# column into goose_data in place, which can raise SettingWithCopyWarning when
# goose_data is itself a filtered slice of another frame.
goose_data = (
    goose_data
    .assign(COORD_UNC=goose_data['COORD_PREC'].apply(get_coord_precision))
    .drop(columns=['COORD_PREC'])
)