## Assigning circles to weather stations
A custom table, created by uploading the CSV to BigQuery (this table is called `cleaned_bird_counts_gstorage`), is joined with the view that contains the flattened weather data.

In [2]:
import os
from datetime import datetime

# Location of the JSON credentials file read by the Google client libraries.
# Replace with the path to your own file.
credentials_path = "/home/fjvr/Downloads/birdproject-2020-3cdcf7c1792d.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# Imported after the environment variable is set, as in the original cell order.
from google.cloud import bigquery

# Timestamp (YYYYmmddHHMMSS) captured when this cell runs; it is appended to
# the exported file name further down so each export gets a distinct name.
time_now = datetime.today().strftime('%Y%m%d%H%M%S')

client = bigquery.Client()
project = 'birdproject-2020'
source_dataset_id = 'audubon_cdc'
# Built directly instead of through the deprecated `client.dataset()` helper;
# passing `client.project` keeps the resulting reference the same.
shared_dataset_ref = bigquery.DatasetReference(client.project, source_dataset_id)

# How the query works:
#   1. circles_hash: every bird-count row gets a 4-character geohash
#      (geohash_circle) and a 7-character geohash (circle_id) of its lon/lat.
#   2. stations_hash: every GHCN-D station row gets a 4-character geohash
#      (geohash_station) of its longitude/latitude.
#   3. circle_with_matched_stations: each circle row is paired with every
#      station whose 4-character geohash is equal (one circle -> many stations).
#   4. The final LEFT JOIN attaches the weather values for the matched station
#      id on the count date; pairs without a weather row keep NULL values.
# The dataset name is interpolated from `source_dataset_id` so the constant
# defined above is the single place to change it.
query = f"""
WITH circles_hash as (SELECT x.*, ST_GEOHASH(ST_GEOGPOINT(x.lon,x.lat), 4) as geohash_circle, ST_GEOHASH(ST_GEOGPOINT(x.lon,x.lat), 7) as circle_id

FROM `{project}.{source_dataset_id}.cleaned_bird_counts_gstorage` x),

stations_hash as (SELECT y.*, ST_GEOHASH(ST_GEOGPOINT(y.longitude,y.latitude),4) as geohash_station FROM `bigquery-public-data`.ghcn_d.ghcnd_stations y),

circle_with_matched_stations as (SELECT * FROM circles_hash x INNER JOIN stations_hash y ON x.geohash_circle = y.geohash_station)

SELECT x.*, y.temp_min_value,y.temp_max_value,y.precipitation_value,y.temp_avg,y.snow,y.snwd

FROM circle_with_matched_stations x
LEFT JOIN `{project}.{source_dataset_id}.flatten_noaa_from_1900_to_present` y ON x.id = y.id AND x.count_date = y.date

ORDER BY circle_id DESC,count_date ASC """

# Run the query on BigQuery and load the full result set into a pandas DataFrame.
df_circles_to_stations_weather_data = client.query(query).to_dataframe()

## Top 5 records
The first 5 records of the data returned by the query above.

In [3]:
# Preview the first five joined rows (5 is also the pandas default for head()).
df_circles_to_stations_weather_data.head(5)

Unnamed: 0,Unnamed__0,circle_name,country_state,lat,lon,count_year,count_date,n_field_counters,n_feeder_counters,min_field_parties,...,gsn_flag,hcn_crn_flag,wmoid,geohash_station,temp_min_value,temp_max_value,precipitation_value,temp_avg,snow,snwd
0,28806,Amchitka Island,US-AK,51.409713,179.284881,1977,1977-01-01,4.0,,,...,,,,zcpk,,,,,,
1,28806,Amchitka Island,US-AK,51.409713,179.284881,1977,1977-01-01,4.0,,,...,,,,zcpk,,,,,,
2,30036,Amchitka Island,US-AK,51.409713,179.284881,1978,1977-12-29,5.0,,,...,,,,zcpk,,,,,,
3,30036,Amchitka Island,US-AK,51.409713,179.284881,1978,1977-12-29,5.0,,,...,,,,zcpk,,,,,,
4,31321,Amchitka Island,US-AK,51.409713,179.284881,1979,1978-12-30,2.0,,,...,,,,zcpk,,,,,,


## Statistics on dataset
Count how many records are missing each weather measurement (minimum, maximum and average temperature, and snow).

In [7]:
import numpy as np

# Short alias for the joined frame; this is the same object, not a copy.
weather_df = df_circles_to_stations_weather_data

# Total number of rows in the joined frame. The printed label mentions
# "missing vals", but the figure is the overall row count that the
# per-column missing counts below can be compared against.
record_count = weather_df.shape[0]
print('How many rows in dataset with missing vals: ', record_count)

# Number of rows with no value in each weather column.
temp_min_nas = weather_df["temp_min_value"].isna().sum()
print(f"Missing min temperature: {temp_min_nas}")

temp_max_nas = weather_df["temp_max_value"].isna().sum()
print(f"Missing max temperature: {temp_max_nas}")

temp_avg_nas = weather_df["temp_avg"].isna().sum()
print(f"Missing avg temperature: {temp_avg_nas}")

# NOTE(review): the label says "snow temperature" but the count is taken
# from the `snow` column; the label text is left unchanged here.
snow = weather_df["snow"].isna().sum()
print(f"Missing snow temperature: {snow}")

How many rows in dataset with missing vals:  1151128
Missing min temperature: 1055467
Missing max temperature: 1055462
Missing avg temperature: 1141109
Missing snow temperature: 1026148


## Remove empty min/max temperature
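Create a new data frame that keeps only the rows where `temp_min_value` is present (this step does not filter on the maximum temperature).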

In [10]:
# Column that decides whether a row is kept.
# NOTE(review): only the minimum temperature is checked; rows missing just the
# maximum temperature are not removed here — confirm this is intended.
ref = df_circles_to_stations_weather_data["temp_min_value"]

# Keep the rows whose minimum temperature is not missing.
has_min_temperature = ref.notna()
paired_data = df_circles_to_stations_weather_data.loc[has_min_temperature]

paired_data.head()

Unnamed: 0,Unnamed__0,circle_name,country_state,lat,lon,count_year,count_date,n_field_counters,n_feeder_counters,min_field_parties,...,gsn_flag,hcn_crn_flag,wmoid,geohash_station,temp_min_value,temp_max_value,precipitation_value,temp_avg,snow,snwd
7,32617,Amchitka Island,US-AK,51.409713,179.284881,1980,1979-12-18,4.0,,,...,,,,zcpk,-17.0,17.0,5.0,,3.0,0.0
78,57796,"Saipan, C.N.M.I.",MP-,15.199118,145.750642,1996,1995-12-16,11.0,0.0,1.0,...,,,,x4xw,239.0,283.0,81.0,,0.0,0.0
82,59507,"Saipan, C.N.M.I.",MP-,15.199118,145.750642,1997,1997-01-04,13.0,0.0,1.0,...,,,,x4xw,239.0,283.0,8.0,,0.0,0.0
87,61281,"Saipan, C.N.M.I.",MP-,15.199118,145.750642,1998,1998-01-03,20.0,1.0,1.0,...,,,,x4xw,233.0,283.0,5.0,,0.0,0.0
95,64937,"Saipan, C.N.M.I.",MP-,15.199118,145.750642,2000,1999-12-18,14.0,0.0,1.0,...,,,,x4xw,222.0,289.0,41.0,,0.0,0.0


## Size of dataframe

In [14]:
# Row count of the filtered frame, measured on the `circle_name` column.
n_paired_records = len(paired_data["circle_name"])
print("The total number of records in this data set is: ", n_paired_records)

The total number of records in this data set is:  95661


## Only Data in the USA
Create a new data frame containing only the stations located in the USA (station `id` starting with `US`).

In [35]:
# Keep the rows whose `id` begins with the two characters "US".
# NOTE(review): presumably `id` is the weather-station identifier and its
# first two characters are a country code — confirm against the station table.
id_prefix = paired_data["id"].str[:2]
paired_data_usa = paired_data[id_prefix == "US"]

usa_row_count = len(paired_data_usa)
print("The number of rows station usa weather data: ", usa_row_count)

The number of rows station usa weather data:  81352


In [34]:
# Export the joined frame to disk. The content is gzip-compressed even though
# the file name ends in ".csv", so readers must pass compression="gzip".
# NOTE(review): this writes the full, unfiltered `df_circles_to_stations_weather_data`,
# not `paired_data_usa`, although the file name mentions "usa" — confirm which
# frame is meant to be exported.
output_file_name = '1.0-circles_to_many_stations_usa_weather_data_' + str(time_now) + '.csv'
df_circles_to_stations_weather_data.to_csv(output_file_name, compression="gzip")