In [10]:
import os
import sys
import pandas as pd
import geopandas as gpd
import pygeos as pg
import numpy as np
import calendar
from IPython.display import clear_output
from matplotlib import pyplot as plt

In [11]:
# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
pd.set_option("display.max_columns", None)
NULLFLAG = -9999

In [12]:
def summaryDaily(df: pd.DataFrame) -> pd.DataFrame:
    """Returns new df with min max mean for each numeric column"""

    result = df.groupby(
        ["ClimateID", "ProvinceCode", "Year", "Month", "Day"], as_index=False
    ).agg(
        {
            "Temp": [("MeanTemp", "mean"), ("MinTemp", "min"), ("MaxTemp", "max")],
            "DewPointTemp": [
                ("MeanDewPoint", "mean"),
                ("MinDewPoint", "min"),
                ("MaxDewPoint", "max"),
            ],
            "RelativeHumidity": [
                ("MeanHumidity", "mean"),
                ("MinHumidity", "min"),
                ("MaxHumidity", "max"),
            ],
            "StationPressure": [
                ("MeanPressure", "mean"),
                ("MinPressure", "min"),
                ("MaxPressure", "max"),
            ],
            "WindSpeed": [
                ("MeanWindSpeed", "mean"),
                ("MinWindSpeed", "min"),
                ("MaxWindSpeed", "max"),
            ],
            "WindChill": [
                ("MeanWindChill", "mean"),
                ("MinWindChill", "min"),
                ("MaxWindChill", "max"),
            ],
            "PrecipAmount": [("TotalPrecip", "sum")],
            "WindDirection": [("MeanWindDirection", "mean")],
        }
    )
    # result.drop(columns=['Hour'], inplace=True)
    return result

In [13]:
# def cleanup(id: str, year: int, month: int, day: int, srcTable: str, destTable: str) -> None:
def cleanup(df: pd.DataFrame) -> pd.DataFrame:
    df.astype(
        {
            "ClimateID": "str",
            "ProvinceCode": "str",
            "Year": "int",
            "Month": "int",
            "Day": "int",
            "Hour": "int",
            "Temp": "float",
            "DewPointTemp": "float",
            "PrecipAmount": "float",
            "RelativeHumidity": "float",
            "StationPressure": "float",
            "WindChill": "float",
            "WindDirection": "float",
            "WindSpeed": "float",
        },
        copy=False,
    )

    # Replace NULLFLAG values with mean for each column
    df = df.replace(NULLFLAG, np.nan)
    df = df.fillna(df.mean())

    return df

In [14]:
dfWeatherHourly = pd.read_csv("Data/WeatherData2020_2022_headers.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
dfIDs = dfWeatherHourly[["ClimateID"]].drop_duplicates()
# Replace NULLFLAG values with mean for each column
for id in dfIDs["ClimateID"]:
    clear_output(wait=False)
    print("Processing ClimateID: {}".format(id))

    # Iterate through days from 2009 to 2022
    year = 2010
    month = 1
    for year in range(2020, 2023):
        # first we check if the db has data for the year
        dfYear = dfWeatherHourly[
            (dfWeatherHourly["ClimateID"] == id) & (dfWeatherHourly["Year"] == year)
        ]
        if dfYear.empty:
            continue

        monthList = dfYear["Month"].unique()
        for month in monthList:
            dfMonth = dfYear[dfYear["Month"] == month]

            dayList = dfMonth["Day"].unique()
            for day in dayList:
                dfDay = dfMonth[dfMonth["Day"] == day]

                dfClean = cleanup(dfDay)

                # Get df with min max mean for each numeric column
                dfSummary = summaryDaily(dfClean)

                # Update the databasegapeblastersupreme

                dfSummary.to_csv(
                    "Data/WeatherDataAggDaily.csv", mode="a", header=False, index=False
                )

                print("Processed ClimateID: {} for {}".format(id, year))
                # sys.exit(0)

Processing ClimateID: 4012403
Processed ClimateID: 4012403 for 2020


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
