# Analysis Notebook

In [1]:
import time
import utils
import requests
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

## 1) Data Sampling and Collection

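We download two plant–pollinator diversity CSV files from Google Drive and cache them locally in the `data/` directory, so that later runs reuse the downloaded copies instead of fetching them again.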

In [2]:
# By Ned Antell
def fetch_and_cache(data_url, file, data_dir="data", force=False):
    """
    Download a url into a local cache directory and return the cached path.

    If the target file already exists and `force` is false, nothing is
    downloaded and the existing file is reused.

    data_url: the web address to download
    file: the file name (relative to data_dir) in which to save the results.
    data_dir: (default="data") the directory to save the data in; it is
        created, including any missing parent directories, if needed
    force: if true the file is always re-downloaded

    return: The pathlib.Path object representing the file.
    raises: requests.HTTPError if the server answers with an error status;
        in that case any previously cached copy is left untouched.
    """

    data_dir = Path(data_dir)
    # parents=True so that nested cache locations such as "data/raw" work.
    data_dir.mkdir(parents=True, exist_ok=True)
    file_path = data_dir / Path(file)

    if force or not file_path.exists():
        print('Downloading...', end=' ')
        resp = requests.get(data_url)
        # Fail loudly instead of caching an HTTP error page as if it were data.
        resp.raise_for_status()
        # Only discard the old copy once the new content is in hand; removing
        # it first means the rewritten file gets fresh timestamps.
        if file_path.exists():
            file_path.unlink()
        with file_path.open('wb') as f:
            f.write(resp.content)
        print('Done!')
    else:
        # time.ctime() would format local time; use gmtime so the value
        # really is UTC, as the message says.
        last_modified_time = time.asctime(time.gmtime(file_path.stat().st_mtime))
        print("Using cached version that was downloaded (UTC):", last_modified_time)
    return file_path

def fetch_and_cache_gdrive(gdrive_id, file, data_dir="data", force=False):
    """
    Download a Google Drive file into a local cache directory and return the
    cached path.

    If the target file already exists and `force` is false, nothing is
    downloaded and the existing file is reused.

    gdrive_id: the Google Drive file id to download
    file: the file name (relative to data_dir) in which to save the results.
    data_dir: (default="data") the directory to save the data in; it is
        created, including any missing parent directories, if needed
    force: if true the file is always re-downloaded

    return: The pathlib.Path object representing the file.
    """

    data_dir = Path(data_dir)
    # parents=True so that nested cache locations such as "data/raw" work.
    data_dir.mkdir(parents=True, exist_ok=True)
    file_path = data_dir / Path(file)

    if force or not file_path.exists():
        print('Downloading...', end=' ')
        # Download to a temporary sibling first: an interrupted transfer must
        # not leave a truncated file that later runs mistake for a valid
        # cache, and a failed forced refresh must not destroy the old copy.
        partial_path = file_path.with_name(file_path.name + ".part")
        try:
            download_file_from_google_drive(gdrive_id, partial_path)
            # Replaces any existing file, so the cache gets fresh timestamps.
            partial_path.replace(file_path)
        finally:
            if partial_path.exists():
                partial_path.unlink()
        print('Done!')
    else:
        # time.ctime() would format local time; use gmtime so the value
        # really is UTC, as the message says.
        last_modified_time = time.asctime(time.gmtime(file_path.stat().st_mtime))
        print("Using cached version that was downloaded (UTC):", last_modified_time)
    return file_path



# https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url

def download_file_from_google_drive(id, destination):
    """
    Download a Google Drive file by id and write it to `destination`.

    A first request is made for the file. If the response carries a
    'download_warning' cookie, its value is sent back as the `confirm`
    parameter in a second request and that second response is saved instead.

    id: the Google Drive file id (the name shadows the builtin `id`, but is
        kept so existing keyword callers keep working)
    destination: path of the local file to write

    raises: requests.HTTPError if the response being saved has an error status.
    """
    URL = "https://docs.google.com/uc?export=download"

    # Use the session as a context manager so its pooled connections are
    # released even if the download fails part-way through.
    with requests.Session() as session:
        response = session.get(URL, params = { 'id' : id }, stream = True)
        token = get_confirm_token(response)

        if token:
            params = { 'id' : id, 'confirm' : token }
            response = session.get(URL, params = params, stream = True)

        # Do not write an HTTP error page to disk as if it were the file.
        response.raise_for_status()
        # Must run inside the `with` block: the body is streamed lazily.
        save_response_content(response, destination)

def get_confirm_token(response):
    """
    Return the value of the first cookie on `response` whose name starts
    with 'download_warning', or None when there is no such cookie.
    """
    warning_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(warning_values, None)

def save_response_content(response, destination):
    """
    Write the body of `response` to the file at `destination`.

    The body is read through response.iter_content() in pieces of 32768
    bytes; empty pieces are skipped.
    """
    chunk_bytes = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops empty (falsy) chunks such as keep-alives.
        payload_chunks = filter(None, response.iter_content(chunk_bytes))
        out_file.writelines(payload_chunks)

In [3]:
# By Ned Antell
# Pull the datasets from Google Drive, caching them locally under data/.
# Each call returns the pathlib.Path of the local file; only the value of the
# last call is shown as the cell output.

# bioCON plant diversity (currently disabled)
#utils.fetch_and_cache_gdrive('1WIIljYUz3B45K9DOA1M3cfGOedtgxLFL', 'bioCON_plant_diversity.txt', data_dir = '')

# Pollinators, set 1
fetch_and_cache_gdrive('1JCynlMV0Um07MUa3E0qSqdDYOGfy9Eyr', 'plant_pollinator_diversity_set1.csv', data_dir = 'data/')

# Pollinators, set 2
fetch_and_cache_gdrive('1IjzzaYep_BXIzMVjIBmh8AiXbydwmYk_', 'plant_pollinator_diversity_set2.csv', data_dir = 'data/')

Downloading... Done!
Downloading... Done!


WindowsPath('data/plant_pollinator_diversity_set2.csv')

In [5]:
# Relative paths of the two cached plant-pollinator CSV files.
file1 = 'data/plant_pollinator_diversity_set1.csv'
file2 = 'data/plant_pollinator_diversity_set2.csv'

In [9]:
# Load both CSV files into DataFrames and preview the first rows of the second.
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df2.head()

Unnamed: 0,DBCODE,ENTITY,COMPLEX,MEADOW,PLOT_ID,YEAR,SAMPLEDATE,WATCH,OBSERVER,PLOT,FLW_STATUS,PLTSP_CODE,PLTSP_NAME,NO_STALK,NO_FLWS,QC_NOTES
0,SA026,2,Bunchgrass,BD,BGD01,2011,2011-07-20,1.0,ND,1,,,,,,
1,SA026,2,Bunchgrass,BD,BGD01,2011,2011-07-25,2.0,Andy,1,FLW,ACHIMILL,Achillea millefolium,4.0,36.0,
2,SA026,2,Bunchgrass,BD,BGD01,2011,2011-07-25,2.0,Andy,1,FLW,ERIGFOLI,Erigeron foliosus,14.0,1.0,
3,SA026,2,Bunchgrass,BD,BGD01,2011,2011-07-25,2.0,Andy,1,FLW,VICIAMER,Vicia americana,3.0,2.3,
4,SA026,2,Bunchgrass,BD,BGD01,2011,2011-08-02,3.0,"Andy, Tim",1,FLW,ACHIMILL,Achillea millefolium,2.0,26.0,


In [10]:
# Inspect the column labels of the first dataset.
df1.columns

Index(['DBCODE', 'ENTITY', 'COMPLEX', 'MEADOW', 'PLOT_ID', 'YEAR',
       'SAMPLEDATE', 'WATCH', 'OBSERVER', 'PLOT', 'START_TIME', 'END_TIME',
       'MINUTE', 'CLOUDS', 'WIND', 'TEMP', 'PPI_STATUS', 'NO_INT',
       'PLTSP_CODE', 'PLTSP_NAME', 'VISSP_CODE', 'VISSP_NAME', 'VISSP_TYPE',
       'REF_NO', 'VISSP_NO', 'QC_NOTES'],
      dtype='object')

## 2) Data Cleaning

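We drop columns that are unnecessary or redundant for our analysis from both datasets. Dropping rows with missing values (NaNs) is the next planned step.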

In [13]:
# Drop unnecessary/redundant columns for our data analysis

def drop_columns(dataframe, todrop):
    """
    Return a new DataFrame without the given columns.

    dataframe: pandas DataFrame to drop columns from (it is not modified)
    todrop: list of column labels to drop
    """
    return dataframe.drop(columns=todrop)

In [14]:
# Columns to remove from each dataset.
drop_cols_1 = ['DBCODE', 'ENTITY', 'COMPLEX', 'START_TIME', 'END_TIME', 'MINUTE',  'YEAR', 'OBSERVER', 'PLTSP_NAME', 'VISSP_NAME']
drop_cols_2 = ['DBCODE', 'ENTITY', 'COMPLEX', 'YEAR', 'OBSERVER', 'PLTSP_NAME']

# NOTE(review): df1/df2 are overwritten here, so re-running this cell on the
# already-trimmed frames raises KeyError (the columns are gone). Re-run the
# read_csv cell first, or assign the results to new names.
df1 = drop_columns(df1, drop_cols_1)
df2 = drop_columns(df2, drop_cols_2)
df2.head()

Unnamed: 0,MEADOW,PLOT_ID,SAMPLEDATE,WATCH,PLOT,FLW_STATUS,PLTSP_CODE,NO_STALK,NO_FLWS,QC_NOTES
0,BD,BGD01,2011-07-20,1.0,1,,,,,
1,BD,BGD01,2011-07-25,2.0,1,FLW,ACHIMILL,4.0,36.0,
2,BD,BGD01,2011-07-25,2.0,1,FLW,ERIGFOLI,14.0,1.0,
3,BD,BGD01,2011-07-25,2.0,1,FLW,VICIAMER,3.0,2.3,
4,BD,BGD01,2011-08-02,3.0,1,FLW,ACHIMILL,2.0,26.0,


In [15]:
# Next: Drop NaNs

## 3) Exploratory Data Analysis

**TODO:** Add a short description of the process.

## 4) Data Modeling and Inferences

**TODO:** Add a short description of the process.