In [None]:
import pandas as pd
import sqlalchemy as sq
import geopandas as gpd
from matplotlib import pyplot as plt
from IPython.display import clear_output
from dotenv import load_dotenv
from DataService import DataService
from utils import *


In [6]:
# Notebook-wide pandas display settings: cap printed frames at 100 rows,
# never truncate columns, and render floats with a single decimal place.
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:.1f}".format)
pd.set_option("display.max_columns", None)

In [11]:
# Read every row of the daily-stations table into a GeoDataFrame, using the
# table's "geometry" column as the geometry.
tableName = 'public."StationsDly"'
query = f"SELECT * FROM {tableName};"

# The connection object is kept in `db_con` because later cells reuse it.
dataService = DataService(PGDB, PGUSER, PGPW)
db_con = dataService.connect()
dfStations = gpd.GeoDataFrame.from_postgis(sql=query, con=db_con, geom_col="geometry")

In [12]:
# Saskatchewan stations that have a 'DLY First Year' value, renumbered from 0
# so the positional index can double as a progress counter later on.
skStations = (
    dfStations
    .loc[lambda stations: (stations['Province'] == 'SASKATCHEWAN')
         & stations['DLY First Year'].notnull()]
    .reset_index(drop=True)
)

# Non-null count per column for the retained stations.
skStations.count()

Station Name      91
Province          91
Latitude          91
Longitude         91
Elevation         91
Climate ID        91
WMO Identifier    71
TC Identifier     81
First Year        91
Last Year         91
HLY First Year    91
HLY Last Year     91
DLY First Year    91
DLY Last Year     91
MLY First Year    63
MLY Last Year     63
geometry          91
CRnum             91
scraped           91
dtype: int64

In [9]:
# Scrape daily weather data for every Saskatchewan station that is not yet
# flagged as scraped, and hand each non-empty result to dataProcessA().
# A station whose request fails or returns no rows gets "scraped" = False
# written back to public."StationsDly".

START_YEAR = 1995  # earliest year ever requested
END_YEAR = 2022    # latest year requested when the station's last year is unusable

requester = cdr()
df = pd.DataFrame()

# One parameterised statement reused for every station: the Climate ID is
# passed as a bound value instead of being formatted into the SQL text.
markUnscraped = sq.text(
    'UPDATE public."StationsDly" SET "scraped" = False '
    'WHERE "Climate ID" like CAST(:station_id AS TEXT);'
)

for index, row in skStations.iterrows():

    stationID = str(row['Climate ID'])
    errQuery = markUnscraped.bindparams(station_id=stationID)

    # A missing flag (None / NaN / NA) is treated the same as False: the
    # station still needs scraping.
    scraped = row['scraped']
    if pd.notna(scraped) and scraped:
        print("Data for station " + str(stationID) + " already exists.")
    else:
        try:
            firstYr = row['DLY First Year']
            lastYr = row['DLY Last Year']

            # The previous gate compared against NaN with `!=`, which is True
            # for every value, so nothing was ever filtered. Use pd.notna()
            # and skip only stations whose daily record is known to end
            # before the requested window.
            if pd.notna(lastYr) and lastYr < START_YEAR:
                print("Skipping station " + stationID
                      + ": daily record ends before " + str(START_YEAR))
            else:
                # Clamp the request to the station's own years where they are
                # known and later than START_YEAR; otherwise use the defaults.
                startYr = firstYr if pd.notna(firstYr) and firstYr > START_YEAR else START_YEAR
                endYr = lastYr if pd.notna(lastYr) and lastYr > START_YEAR else END_YEAR

                df = requester.get_data("SK", stationID, startYr, endYr)
                clear_output(wait=False)

                print(df.head())

                if not df.empty:
                    dataProcessA(df, stationID)
                else:
                    db_con.execute(errQuery)
        except Exception as e:
            # Best effort: report the failure, flag the station, and move on
            # to the next one instead of aborting the whole run.
            print("Failed to scrape " + stationID)
            print(e)
            db_con.execute(errQuery)

    # skStations was re-indexed from 0, so index + 1 is the row ordinal.
    print("Processed row " + str(index + 1) + " of " + str(len(skStations)))

   Longitude (x)  Latitude (y) Station Name  Climate ID   Date/Time  Year   
0         -102.5          51.3    YORKTON A     4019080  2005-10-01  2005  \
1         -102.5          51.3    YORKTON A     4019080  2005-10-02  2005   
2         -102.5          51.3    YORKTON A     4019080  2005-10-03  2005   
3         -102.5          51.3    YORKTON A     4019080  2005-10-04  2005   
4         -102.5          51.3    YORKTON A     4019080  2005-10-05  2005   

   Month  Day  Data Quality  Max Temp (°C) Max Temp Flag  Min Temp (°C)   
0     10    1           NaN           14.7           NaN            6.2  \
1     10    2           NaN            9.8           NaN            0.6   
2     10    3           NaN            4.2           NaN           -3.4   
3     10    4           NaN            7.5           NaN           -5.6   
4     10    5           NaN            NaN             M            NaN   

  Min Temp Flag  Mean Temp (°C) Mean Temp Flag  Heat Deg Days (°C)   
0           NaN 

In [10]:
# Build (but do not run) a select-everything query for the weather table.
# Note: this rebinds `tableName` and `query` from the stations cell above.
tableName = 'public."WeatherData"'
query = f"SELECT * FROM {tableName};"


In [None]:
# Last cell: call DataService.cleanup() on the service created earlier.
# NOTE(review): presumably this releases the connection returned by
# dataService.connect() — confirm against the DataService implementation.
dataService.cleanup()