In [1]:
#
# Import the necessary modules
#

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

import scipy.stats as stats
import matplotlib.dates as mdates
from datetime import datetime

import calendar
from scipy.stats import f_oneway


import warnings

import GooseUtils

# Suppress pandas warnings that would otherwise clutter the notebook output:
#   - DtypeWarning (will get on first import of the csv file)
#   - SettingWithCopyWarning
# NOTE(review): silencing SettingWithCopyWarning can hide in-place writes to
# frames that were derived by slicing another frame -- confirm it is still needed.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [None]:
#
# Retrieve the data from the csv files and keep only the relevant columns
#

# Columns retained from every raw dataframe; everything else is discarded.
relevant_cols = [
    'BAND',
    'ORIGINAL_BAND',
    'OTHER_BANDS',
    'EVENT_DATE',
    'EVENT_DAY',
    'EVENT_MONTH',
    'EVENT_YEAR',
    'LAT_DD',
    'LON_DD',
    'COORD_PREC',
    'SPECIES_ID',
]

# Read the group 1-3 CSV files and immediately narrow each one to the
# relevant columns. Only the group 2 file is read with malformed lines skipped.
grp1_goose_data_raw = pd.read_csv(
    'Bird_Banding_Data/NABBP_2023_grp_01.csv'
)[relevant_cols]
grp2_goose_data_raw = pd.read_csv(
    'Bird_Banding_Data/NABBP_2023_grp_02.csv',
    on_bad_lines='skip',
)[relevant_cols]
grp3_goose_data_raw = pd.read_csv(
    'Bird_Banding_Data/NABBP_2023_grp_03.csv'
)[relevant_cols]

In [None]:
# 
# Split the raw banding records into one dataframe per goose group by matching
# SPECIES_ID against the codes listed for that group. SPECIES_ID is dropped
# afterwards because each resulting frame only holds one group.
white_fronted_goose_data_raw = (
    grp1_goose_data_raw
    .loc[grp1_goose_data_raw['SPECIES_ID'].isin([1710, 1719])]
    .drop(columns='SPECIES_ID')
)
snow_goose_data_raw = (
    grp1_goose_data_raw
    .loc[grp1_goose_data_raw['SPECIES_ID'].isin([1690, 1699, 1691, 1698])]
    .drop(columns='SPECIES_ID')
)
# Unlike the two frames above, this selection reads from the group 3 file.
cackling_goose_data_raw = (
    grp3_goose_data_raw
    .loc[grp3_goose_data_raw['SPECIES_ID'].isin([1721, 1722])]
    .drop(columns='SPECIES_ID')
)
# TODO(review): canada_goose_data_raw has no selection yet.
#canada_goose_data_raw = 

#
# Get all relevant columns and display basic information about the data
#

In [None]:
#
# Clean the datetime columns
#

def clean_datetime(goose_data):
    """Return a cleaned copy of ``goose_data`` with a parsed ``EVENT_DATE``.

    ``EVENT_DATE`` is parsed with the ``%m/%d/%Y`` format. Entries that fail
    to parse are filled with a date assembled from the ``EVENT_MONTH``,
    ``EVENT_DAY`` and ``EVENT_YEAR`` columns. Rows whose date is still missing
    after that are removed, and the three component columns are dropped.

    The frame passed in is left unmodified; all work happens on a copy.

    Parameters
    ----------
    goose_data : pandas.DataFrame
        Frame containing the ``EVENT_DATE``, ``EVENT_MONTH``, ``EVENT_DAY``
        and ``EVENT_YEAR`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with a datetime ``EVENT_DATE`` column, without the rows that
        have no usable date and without the three component columns.
    """
    date_part_cols = ['EVENT_MONTH', 'EVENT_DAY', 'EVENT_YEAR']

    # Parse the recorded date; anything that does not match the format becomes NaT.
    event_dates = pd.to_datetime(goose_data['EVENT_DATE'], format='%m/%d/%Y', errors='coerce')

    # Assemble date guesses from the component columns BEFORE DROPPING THEM.
    # A numeric column that contains NaN is stored as float, so its values
    # stringify as e.g. "5.0"; strip that trailing ".0" or the assembled text
    # can never match the format and the fallback would silently do nothing.
    month, day, year = (
        goose_data[col].astype(str).str.replace(r'\.0$', '', regex=True)
        for col in date_part_cols
    )
    dates_from_columns = pd.to_datetime(
        month + '/' + day + '/' + year,
        format='%m/%d/%Y',
        errors='coerce'
    )

    # Fill in all NaT values that can be filled with the guesses above.
    event_dates = event_dates.fillna(dates_from_columns)

    # assign() builds a new frame, so the caller's dataframe is never written to.
    cleaned = goose_data.assign(EVENT_DATE=event_dates)

    # Remove all rows where EVENT_DATE is still NaT after the above operations.
    cleaned = cleaned[event_dates.notna()]

    # The component columns are redundant once EVENT_DATE is resolved.
    return cleaned.drop(columns=date_part_cols)

# Run the datetime cleaning on each species frame, rebinding each name to the result.
white_fronted_goose_data_raw = white_fronted_goose_data_raw.pipe(clean_datetime)
snow_goose_data_raw = snow_goose_data_raw.pipe(clean_datetime)
cackling_goose_data_raw = cackling_goose_data_raw.pipe(clean_datetime)
# canada_goose_data_raw = canada_goose_data_raw.pipe(clean_datetime)

In [None]:
#
# Clean the coordinates columns as described above.
#

def clean_location(goose_data):
    """Return the rows of ``goose_data`` that have usable coordinates.

    A row is kept only when all of the following hold:

    * both ``LAT_DD`` and ``LON_DD`` are present,
    * ``COORD_PREC`` is present and is not one of the rejected precision codes,
    * the coordinates are not exactly (0.0, 0.0).

    Parameters
    ----------
    goose_data : pandas.DataFrame
        Frame containing the ``LAT_DD``, ``LON_DD`` and ``COORD_PREC`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; columns are unchanged.
    """
    # COORD_PREC codes this notebook treats as unusable or useless.
    rejected_precisions = [8, 12, 18, 28, 33, 38, 72]

    # Rows with a missing latitude or longitude cannot be rectified.
    has_coords = goose_data['LAT_DD'].notna() & goose_data['LON_DD'].notna()
    located = goose_data[has_coords]

    # Discard rows whose precision code is rejected or missing.
    precision = located['COORD_PREC']
    unusable_precision = precision.isin(rejected_precisions) | precision.isna()
    located = located[~unusable_precision]

    # Only rows where BOTH coordinates are zero are discarded; a row with a
    # single zero coordinate is kept.
    at_origin = (located['LAT_DD'] == 0.0) & (located['LON_DD'] == 0.0)
    return located[~at_origin]

# Run the coordinate cleaning on each species frame, rebinding each name to the result.
white_fronted_goose_data_raw = white_fronted_goose_data_raw.pipe(clean_location)
snow_goose_data_raw = snow_goose_data_raw.pipe(clean_location)
cackling_goose_data_raw = cackling_goose_data_raw.pipe(clean_location)
# canada_goose_data_raw = canada_goose_data_raw.pipe(clean_location)