## Assigning circles to weather stations
A join is done between a custom table, created by uploading the CSV to BigQuery (this table is called `cleaned_bird_counts_gstorage`), and the view that contains the flattened data.

In [2]:
from datetime import datetime
from google.cloud import bigquery

# Run timestamp (YYYYMMDDHHMMSS); it is embedded in the output file name below.
time_now = datetime.today().strftime('%Y%m%d%H%M%S')

client = bigquery.Client()
project = 'fjvr-testing'
source_dataset_id = 'audubon_cdc'
# source_table_id = 'us_states'
shared_dataset_ref = client.dataset(source_dataset_id)

# Query outline:
#   circles_hash                 - every bird-count row plus two geohashes of its
#                                  lon/lat: length 4 (geohash_circle) and
#                                  length 7 (circle_id).
#   stations_hash                - every GHCN-D station row plus a length-4
#                                  geohash of its longitude/latitude.
#   circle_with_matched_stations - circles inner-joined to stations whose
#                                  length-4 geohashes are equal.
#   final SELECT                 - left-joins the flattened NOAA view on id and
#                                  count date (presumably the station id - confirm
#                                  against the table schemas), ordered by
#                                  circle_id descending, then count_date ascending.
query = """
WITH circles_hash as (SELECT x.*, ST_GEOHASH(ST_GEOGPOINT(x.lon,x.lat), 4) as geohash_circle, ST_GEOHASH(ST_GEOGPOINT(x.lon,x.lat), 7) as circle_id

FROM `fjvr-testing.audubon_cdc.cleaned_bird_counts_gstorage` x),

stations_hash as (SELECT y.*, ST_GEOHASH(ST_GEOGPOINT(y.longitude,y.latitude),4) as geohash_station FROM `bigquery-public-data`.ghcn_d.ghcnd_stations y),

circle_with_matched_stations as (SELECT * FROM circles_hash x INNER JOIN stations_hash y ON x.geohash_circle = y.geohash_station)

SELECT x.*, y.temp_min_value,y.temp_max_value,y.precipitation_value,y.temp_avg,y.snow,y.snwd

FROM circle_with_matched_stations x
LEFT JOIN `fjvr-testing.audubon_cdc.flatten_noaa_from_1900_to_present` y ON x.id = y.id AND x.count_date = y.date

ORDER BY circle_id DESC,count_date ASC """

# Submit the query to BigQuery and load the full result set into a DataFrame.
query_job = client.query(query)
df_circles_to_stations_weather_data = query_job.to_dataframe()

# Persist the result. NOTE: the content is gzip-compressed even though the
# file name ends in ".csv", so readers must pass compression="gzip" explicitly.
output_path = f'circles_to_stations_weather_data_{time_now}.csv'
df_circles_to_stations_weather_data.to_csv(output_path, compression="gzip")

ModuleNotFoundError: No module named 'google.cloud'

## Top 5 records
Showing the top 5 records of the data returned by the query above.

In [None]:
# Preview the first five rows of the query result.
df_circles_to_stations_weather_data.head(n=5)

## Statistics on dataset
How many records are missing values for the minimum temperature, average temperature, and snow measurements.

In [None]:
import numpy as np

# Total number of rows in the query result; used as the denominator below.
record_count = len(df_circles_to_stations_weather_data.index)
print(record_count)

# Minimum temperature: number of rows with no value, and their share of all rows.
temp_min_nas = df_circles_to_stations_weather_data.temp_min_value.isna().sum()
print("Missing min temperature: " + str(temp_min_nas))
# The percentage is derived from the min-temperature count. The previous code
# used temp_avg_nas here, which is the wrong measurement and was not yet
# defined at this point (NameError on a fresh kernel).
# An empty result set reports 0.0 instead of dividing by zero.
temp_min_missing_pct = (
    round(temp_min_nas / float(record_count) * 100, 2) if record_count else 0.0
)
print('Missing min temperature(%): ', temp_min_missing_pct, '%')

# Average temperature: number of rows with no value.
temp_avg_nas = df_circles_to_stations_weather_data.temp_avg.isna().sum()
print("Missing avg temperature: " + str(temp_avg_nas))

# Snow: number of rows with no value, cast to a 32-bit integer before printing.
snow = df_circles_to_stations_weather_data.snow.isna().sum()
snow = snow.astype(np.int32)
print(snow)