# Notebook Unknown
## Creating a DB View to query ALL NOAA stations for various years
The below notebook creates a DB View in your Google BigQuery project. A view is a prepackaged query that does not take up space in your DB. The query is a UNION ALL over the years selected as a parameter.

In [None]:
from google.cloud import bigquery
client = bigquery.Client()
project = 'fjvr-testing' # Change this project to your 
source_dataset_id = 'audubon_cdc'
# source_table_id = 'us_states'
shared_dataset_ref = client.dataset(source_dataset_id)

# This is the parameter used to create the years that will go into the view
year = 1900

# The name of the view in our DB
view_ref = shared_dataset_ref.table("noaa_from_" + str(year) + "_to_present")
view = bigquery.Table(view_ref)

# The string that will contain the SQL we will use to create our view
sql_statement_accumulated = ""
tables_used = []
tables = client.list_tables("bigquery-public-data.ghcn_d")

# Iterate over all tables in the schema and store the tables we will use in the tables_used array
for table in tables:
    try:
        if int(table.table_id[-4:]) >= year:
            tables_used.append(table.table_id)
    except:
        pass
    
# Iterate over all values in an array and annex to the sql_statement_accumulated
for index in range(0,len(tables_used)):
    if index == len(tables_used) - 1:
        sql_template = 'SELECT * FROM `{}`'
        sql_statement_accumulated = sql_statement_accumulated + sql_template.format("bigquery-public-data.ghcn_d." + str(tables_used[index]))
    else:
        sql_template = 'SELECT * FROM `{}` UNION ALL '
        sql_statement_accumulated = sql_statement_accumulated + sql_template.format("bigquery-public-data.ghcn_d." + str(tables_used[index]))
        

# Assiging the SQL to the associated view
view.view_query = sql_statement_accumulated
view = client.create_table(view)  # API request

print("Successfully created view at {}".format(view.full_table_id))

## Creating the DB View to flatten the data
The below notebook creates a DB View in your Google BigQuery project consisting of the flatten data from the union of all data from NOAA stations given the specified year.

In [6]:
import pandas as pd

# Query for flattening the data
query = """
SELECT DISTINCT
  base.id, 
  base.date,
  stations.name,
  stations.state,
  temp_min.value as temp_min_value,
  temp_max.value as temp_max_value,
  precipitation.value as precipitation_value,
  temp_avg.value as temp_avg,
  snow.value as snow,
  snwd.value as snwd

FROM {} base
LEFT JOIN {} temp_min ON base.id = temp_min.id AND base.date = temp_min.date AND temp_min.element = 'TMIN'
LEFT JOIN {} temp_max ON base.id = temp_max.id AND base.date = temp_max.date AND temp_max.element = 'TMAX'
LEFT JOIN {} precipitation ON base.id = precipitation.id AND base.date = precipitation.date AND precipitation.element = 'PRCP'
LEFT JOIN {} temp_avg ON base.id = temp_avg.id AND base.date = temp_avg.date AND temp_avg.element = 'TAVG'
LEFT JOIN {} snow ON base.id = snow.id AND base.date = snow.date AND snow.element = 'SNOW'
LEFT JOIN {} snwd ON base.id = snwd.id AND base.date = snwd.date AND snwd.element = 'SNWD'

INNER JOIN `bigquery-public-data`.ghcn_d.ghcnd_stations stations ON base.id = stations.id

ORDER BY base.id, base.date
"""

# This is the from clause table we will use in our project. 
# Please replace `fjvr-testing` with your PROJECT NAME
parameter = "`fjvr-testing`.audubon_cdc.noaa_from_1900_to_present"
# In this statement we replace the {} with the corresponding table parameter
query = query.format(parameter,parameter,parameter,parameter,parameter,parameter,parameter)

# Name of the view in our DB
view_ref = shared_dataset_ref.table("flatten_noaa_from_" + str(year) + "_to_present")
view = bigquery.Table(view_ref)

# Assigning the SQL string to the view query
view.view_query = query

# Creating the view associated with the query
view = client.create_table(view)  # API request

print("Successfully created view at {}".format(view.full_table_id))

Conflict: 409 POST https://bigquery.googleapis.com/bigquery/v2/projects/fjvr-testing/datasets/audubon_cdc/tables: Already Exists: Table fjvr-testing:audubon_cdc.flatten_noaa_from_1900_to_present

## Assigning circles to weather stations
Using a custom table created from uploading the CSV to Big Query (this table is called `cleaned_bird_counts_gstorage`) a join is done with the view that contains the flatten data.

In [7]:
from datetime import datetime
from google.cloud import bigquery

# Used to classify the name 
time_now = datetime.today().strftime('%Y%m%d%H%M%S')

client = bigquery.Client()
project = 'fjvr-testing'
source_dataset_id = 'audubon_cdc'
# source_table_id = 'us_states'
shared_dataset_ref = client.dataset(source_dataset_id)

query = """
WITH circles_hash as (SELECT x.*, ST_GEOHASH(ST_GEOGPOINT(x.lon,x.lat), 4) as geohash_circle, ST_GEOHASH(ST_GEOGPOINT(x.lon,x.lat), 7) as circle_id

FROM `fjvr-testing.audubon_cdc.cleaned_bird_counts_gstorage` x),

stations_hash as (SELECT y.*, ST_GEOHASH(ST_GEOGPOINT(y.longitude,y.latitude),4) as geohash_station FROM `bigquery-public-data`.ghcn_d.ghcnd_stations y),

circle_with_matched_stations as (SELECT * FROM circles_hash x INNER JOIN stations_hash y ON x.geohash_circle = y.geohash_station)

SELECT x.*, y.temp_min_value,y.temp_max_value,y.precipitation_value,y.temp_avg,y.snow,y.snwd

FROM circle_with_matched_stations x
LEFT JOIN `fjvr-testing.audubon_cdc.flatten_noaa_from_1900_to_present` y ON x.id = y.id AND x.count_date = y.date

ORDER BY circle_id DESC,count_date ASC """

# Queries BigQuery public data set and creates a new dataframe object
df_circles_to_stations_weather_data = client.query(query).to_dataframe()

# Saving stations in csv COMPRESSED IN GZIP!!!
df_circles_to_stations_weather_data.to_csv(r'circles_to_stations_weather_data_' + str(time_now) +  '.csv', compression = "gzip")

In [9]:
df_circles_to_stations_weather_data.head()

Unnamed: 0,Unnamed__0,circle_name,country_state,lat,lon,count_year,count_date,n_field_counters,n_feeder_counters,min_field_parties,...,gsn_flag,hcn_crn_flag,wmoid,geohash_station,temp_min_value,temp_max_value,precipitation_value,temp_avg,snow,snwd
0,28806,Amchitka Island,US-AK,51.409713,179.284881,1977,1977-01-01,4.0,,,...,,,,zcpk,,,,,,
1,28806,Amchitka Island,US-AK,51.409713,179.284881,1977,1977-01-01,4.0,,,...,,,,zcpk,,,,,,
2,30036,Amchitka Island,US-AK,51.409713,179.284881,1978,1977-12-29,5.0,,,...,,,,zcpk,,,,,,
3,30036,Amchitka Island,US-AK,51.409713,179.284881,1978,1977-12-29,5.0,,,...,,,,zcpk,,,,,,
4,31321,Amchitka Island,US-AK,51.409713,179.284881,1979,1978-12-30,2.0,,,...,,,,zcpk,,,,,,


In [3]:
# Install a pip package in the current Jupyter kernel
#!pip install python-geohash

import Geohash as ph

print(ph.encode(42.6,-5.6, precision=5))

ezs42


In [None]:
import numpy as np

record_count = len(df_circles_to_stations_weather_data.index)
print(record_count)

# record_count = record_count.astype(np.int32)

# print("Total number of records: " + record_count)

temp_min_nas = df_circles_to_stations_weather_data.temp_min_value.isna().sum()
print("Missing min temperature: " + str(temp_min_nas))
print('Missing min temperature(%): ', round(temp_avg_nas/float(len(df_circles_to_stations_weather_data))*100,2),'%')

temp_avg_nas = df_circles_to_stations_weather_data.temp_avg.isna().sum()
print("Missing avg temperature: " + str(temp_avg_nas))
temp_avg_nas = df_circles_to_stations_weather_data.temp_avg.isna().sum()
snow = df_circles_to_stations_weather_data.snow.isna().sum()
snow = snow.astype(np.int32)
print(snow)
# len(df_circles_to_stations_weather_data)
# record_length = len(df_circles_to_stations_weather_data)
# record_length = record_length.astype(np.int32)
# print("Total number of records: " + str(record_length))