In [1]:
#@title Import relevant modules
import os
import pandas as pd
import numpy as np
import geopandas as gpd
import sqlalchemy as sq
from matplotlib import pyplot as plt
from IPython.display import clear_output
from dotenv import load_dotenv
from DataService import DataService
from ClimateDataRequester import ClimateDataRequester as cdr
from utils import *


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
# Display settings: show up to 100 rows, every column, and floats with one decimal place.
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', "{:.1f}".format)
pd.set_option('display.max_columns', None)


In [5]:
def push_data(df: pd.DataFrame) -> None:
    """Append the rows of ``df`` to the ``WeatherData`` table.

    A dedicated ``DataService`` connection is opened with the module-level
    ``PGDB``/``PGUSER``/``PGPW`` settings. The DataFrame index is not written.

    Args:
        df: Rows to append.

    Raises:
        Any exception raised while connecting or inserting is propagated to
        the caller after ``DataService.cleanup()`` has been called.
    """
    dataService = DataService(PGDB, PGUSER, PGPW)
    db_con = dataService.connect()
    try:
        df.to_sql("WeatherData", db_con, if_exists="append", index=False)
    finally:
        # Always run cleanup, even when the insert fails, so the connection
        # is not left dangling (presumably cleanup() closes it — confirm in DataService).
        dataService.cleanup()


In [6]:
def dataProcessA(df: pd.DataFrame, stationID: str) -> None:
    """Clean one station's daily weather download, store it, and flag the station as scraped.

    Steps:
      1. Drop the columns that are not stored (flags, degree days, gust
         data, coordinates, station name).
      2. Rename the remaining leading columns by position to the database
         column names.
      3. Drop rows without a mean temperature, fill missing rain / snow /
         precipitation / snow-on-ground values with 0, and fill missing
         max / min temperatures with the mean temperature.
      4. Cast the columns to str / int / float, append them through
         ``push_data`` and set ``scraped = True`` for the station in
         ``public."StationsDly"``.

    Note: ``df`` is modified in place.

    Args:
        df: Raw daily data for a single station.
        stationID: Climate ID of the station the data belongs to.

    Raises:
        ValueError: If the download does not have the expected columns. The
            raw data is first saved to
            ``data/failed/<stationID>_unexpected_column_names.csv``.
        Any exception raised by the database calls is propagated after
        ``DataService.cleanup()`` has been called.
    """
    # Columns of the raw download that are not stored in the database.
    droppedColumns = [
        'Data Quality', 'Max Temp Flag', 'Mean Temp Flag', 'Min Temp Flag',
        'Heat Deg Days Flag', 'Cool Deg Days Flag', 'Spd of Max Gust (km/h)',
        'Total Rain Flag', 'Total Snow Flag', 'Total Precip Flag',
        'Snow on Grnd Flag', 'Dir of Max Gust Flag', 'Spd of Max Gust Flag',
        'Heat Deg Days (°C)', 'Cool Deg Days (°C)', 'Longitude (x)',
        'Latitude (y)', 'Station Name', 'Dir of Max Gust (10s deg)',
    ]
    # Database names for the columns that remain after the drop, in order.
    # Expected raw order: Climate ID, Date/Time, Year, Month, Day, Max Temp (°C),
    # Min Temp (°C), Mean Temp (°C), Total Rain (mm), Total Snow (cm),
    # Total Precip (mm), Snow on Grnd (cm).
    columnNames = [
        "ClimateID", "Date", "Year", "Month", "Day", "MaxTemp", "MinTemp",
        "MeanTemp", "TotalRain", "TotalSnow", "TotalPrecip", "SnowOnGrnd",
    ]
    zeroFilledColumns = ['SnowOnGrnd', 'TotalRain', 'TotalSnow', 'TotalPrecip']
    floatColumns = ['MaxTemp', 'MinTemp', 'MeanTemp', 'TotalRain',
                    'TotalSnow', 'TotalPrecip', 'SnowOnGrnd']

    dataService = DataService(PGDB, PGUSER, PGPW)
    db_con = dataService.connect()
    try:
        try:
            df.drop(columns=droppedColumns, inplace=True)
        except KeyError as err:
            # The columns are renamed by position below, so carrying on with
            # an unexpected layout would mislabel the data. Keep the raw
            # download for inspection and stop.
            os.makedirs("data/failed", exist_ok=True)
            failedPath = "data/failed/" + str(stationID) + "_unexpected_column_names.csv"
            df.to_csv(failedPath, index=False)
            raise ValueError(
                "Unexpected column names for station " + str(stationID)
                + "; raw data saved to " + failedPath
            ) from err

        if len(df.columns) < len(columnNames):
            raise ValueError(
                "Expected at least " + str(len(columnNames))
                + " columns for station " + str(stationID)
                + " but found " + str(len(df.columns))
            )

        # Rename by position rather than by name (the raw temperature headers
        # contain a degree sign, which has shown up mis-encoded).
        df.rename(columns=dict(zip(df.columns[:len(columnNames)], columnNames)), inplace=True)

        # Rows without a mean temperature are unusable; the other gaps get defaults.
        df.dropna(subset=['MeanTemp'], inplace=True)
        df[zeroFilledColumns] = df[zeroFilledColumns].fillna(0)
        df['MaxTemp'] = df['MaxTemp'].fillna(df['MeanTemp'])
        df['MinTemp'] = df['MinTemp'].fillna(df['MeanTemp'])

        df[['ClimateID', 'Date']] = df[['ClimateID', 'Date']].astype(str)
        df[['Year', 'Month', 'Day']] = df[['Year', 'Month', 'Day']].astype(int)
        df[floatColumns] = df[floatColumns].astype(float)

        push_data(df)

        # Bind the station ID as a parameter instead of formatting it into the SQL text.
        query = sq.text(
            "UPDATE public.\"StationsDly\" SET \"scraped\" = True "
            "WHERE \"Climate ID\" like CAST(:station_id AS TEXT);"
        ).bindparams(station_id=stationID)
        db_con.execute(query)
    finally:
        # Run cleanup on every exit path, including failures.
        dataService.cleanup()


In [7]:
# Load the whole daily-stations table, geometry column included, into a GeoDataFrame.
tableName = 'public."StationsDly"'
query = f"SELECT * FROM {tableName};"

# This connection is reused by the scraping loop further down and is cleaned
# up in the last cell of the notebook.
dataService = DataService(PGDB, PGUSER, PGPW)
db_con = dataService.connect()
dfStations = gpd.GeoDataFrame.from_postgis(query, db_con, geom_col='geometry')


In [8]:
# Keep the MANITOBA stations that have a 'DLY First Year' value and
# renumber the surviving rows from zero.
mbStations = (
    dfStations
    .loc[lambda stations: stations['Province'] == 'MANITOBA']
    .loc[lambda stations: stations['DLY First Year'].notnull()]
    .reset_index(drop=True)
)

In [9]:
# try first 3 stations
# mbStations = mbStations.iloc[0:3, :]

In [10]:
# Show the number of non-null values in each column of the filtered station table.
mbStations.count()

Station Name      80
Province          80
Latitude          80
Longitude         80
Elevation         80
Climate ID        80
WMO Identifier    61
TC Identifier     75
First Year        80
Last Year         80
HLY First Year    80
HLY Last Year     80
DLY First Year    80
DLY Last Year     80
MLY First Year    54
MLY Last Year     54
geometry          80
CRnum             80
scraped           80
dtype: int64

In [11]:
# Scrape daily weather data for every Manitoba station that has not been
# scraped yet, store it, and record the outcome in public."StationsDly".

# Earliest year that is requested; stations whose daily record ended before
# this year are skipped.
START_YEAR = 1995

requester = cdr()
df = pd.DataFrame()

# Marks a station as not scraped. The station ID is bound as a parameter for
# each row instead of being formatted into the SQL text.
markNotScraped = sq.text(
    "UPDATE public.\"StationsDly\" SET \"scraped\" = False "
    "WHERE \"Climate ID\" like CAST(:station_id AS TEXT);"
)

for index, row in mbStations.iterrows():
    stationID = str(row['Climate ID'])
    errQuery = markNotScraped.bindparams(station_id=stationID)
    scraped = row['scraped']

    # A missing flag (None/NaN) is treated the same as False.
    if pd.isna(scraped) or not scraped:
        try:
            firstYr = row['DLY First Year']
            lastYr = row['DLY Last Year']

            # NaN compares unequal to everything, so missing years have to be
            # detected with pd.notna rather than `!= NaN`.
            if pd.notna(firstYr) and pd.notna(lastYr) and lastYr >= START_YEAR:
                startYr = firstYr if firstYr > START_YEAR else START_YEAR
                endYr = lastYr
                df = requester.get_data("MB", stationID, startYr, endYr)
                clear_output(wait=False)

                print(df.head())

                if not df.empty:
                    dataProcessA(df, stationID)
                else:
                    db_con.execute(errQuery)
            else:
                print("Skipping station " + stationID + ": no daily data from "
                      + str(START_YEAR) + " onward.")
        except Exception as e:
            # Keep going with the next station, but record the failure.
            print("Failed to scrape " + stationID)
            print(e)
            db_con.execute(errQuery)
    else:
        print("Data for station " + str(stationID) + " already exists.")

    print("Processed row " + str(index + 1) + " of " + str(len(mbStations)))


   Longitude (x)  Latitude (y)        Station Name  Climate ID   Date/Time   
0          -97.1          49.9  WINNIPEG THE FORKS     5023262  2022-12-01  \
1          -97.1          49.9  WINNIPEG THE FORKS     5023262  2022-12-02   
2          -97.1          49.9  WINNIPEG THE FORKS     5023262  2022-12-03   
3          -97.1          49.9  WINNIPEG THE FORKS     5023262  2022-12-04   
4          -97.1          49.9  WINNIPEG THE FORKS     5023262  2022-12-05   

   Year  Month  Day  Data Quality  Max Temp (°C) Max Temp Flag  Min Temp (°C)   
0  2022     12    1           NaN           -6.9           NaN          -16.2  \
1  2022     12    2           NaN           -7.1           NaN          -19.5   
2  2022     12    3           NaN          -13.4           NaN          -21.1   
3  2022     12    4           NaN           -3.4           NaN          -15.4   
4  2022     12    5           NaN           -5.4           NaN          -23.9   

  Min Temp Flag  Mean Temp (°C) Mean Temp Fl

In [12]:

# Clean up the DataService object created for the station-table query
# (presumably this closes its connection — confirm in DataService).
dataService.cleanup()