In [43]:
import pandas as pd
import io
import os
from google.colab import drive
import numpy as np
from datetime import datetime

In [44]:
# Mount Google Drive inside the Colab runtime so that later cells can save
# files under the mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


#### 1) First, we define a function that downloads hourly weather data from the Government of Canada climate data website

In [45]:
def getHourlyData(stationID, year, month):
    """Download one month of bulk climate data for a station as a DataFrame.

    Parameters
    ----------
    stationID : value formatted into the ``stationID`` query parameter.
    year : value formatted into the ``Year`` query parameter.
    month : value formatted into the ``Month`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The CSV returned by the bulk-data endpoint, parsed by ``pd.read_csv``.

    Notes
    -----
    The request always uses ``format=csv`` and ``timeframe=1``
    (presumably the hourly timeframe, going by the function name).
    """
    query_template = "format=csv&stationID={}&Year={}&Month={}&timeframe=1"
    filled_query = query_template.format(stationID, year, month)
    endpoint = "http://climate.weather.gc.ca/climate_data/bulk_data_e.html?" + filled_query
    # pandas fetches the URL itself and parses the response as CSV.
    return pd.read_csv(endpoint, skiprows=0)

#### 2) We get the weather for the McTavish weather station (ID=30165) for 2009–2020, and from station ID=5415 for 2005–2008

In [46]:
def _fetch_station_frames(station_id, years):
    """Download every month (1-12) of each year in `years` for one station.

    Returns a list of DataFrames in chronological request order
    (year by year, month by month), one frame per getHourlyData call.
    """
    return [
        getHourlyData(station_id, year, month)
        for year in years
        for month in range(1, 13)
    ]


# Station 5415 is queried for 2005-2008.
stationID = 5415
year_list = list(range(2005, 2009))
frames_1 = _fetch_station_frames(stationID, year_list)

# Station 30165 is queried for 2009-2020. Building the list with range()
# guarantees each year appears exactly once (the hand-written list repeated
# 2019, which downloaded and appended that year's data twice).
stationID = 30165
year_list = list(range(2009, 2021))
frames_2 = _fetch_station_frames(stationID, year_list)

In [47]:
# Stack the monthly frames of each station into one frame per station,
# then stack the two stations (first station's rows come first).
weather_data_1, weather_data_2 = (
    pd.concat(monthly_frames) for monthly_frames in (frames_1, frames_2)
)
frames = [weather_data_1, weather_data_2]
weather_data = pd.concat(frames)

#### 3) Recode time into shifts

In [48]:
# Data cleaning
# Work on a copy so the downloaded frame is left untouched.
climate_raw = weather_data.copy()


def _to_shift(clock):
    """Map a time string to a shift: 1 before 08:00, 2 before 16:00, else 3."""
    # Plain string comparison, exactly as the times appear in the 'Time' column.
    if clock < '08:00':
        return 1
    if clock < '16:00':
        return 2
    return 3


climate_raw['Shift'] = climate_raw['Time'].apply(_to_shift)
# The first 10 characters of 'Date/Time' are kept as the date.
climate_raw['Date'] = climate_raw['Date/Time'].str[:10]

# Keep useful columns and give them short names.
# Dict order fixes the column order of the resulting frame.
short_names = {
    'Temp (°C)': 'Temp_DC',
    'Rel Hum (%)': 'Humid_percent',
    'Wind Dir (10s deg)': 'Win_Dir',
    'Wind Spd (km/h)': 'Wind_Speed',
    'Stn Press (kPa)': 'Stn_Press',
}
climate = climate_raw[['Date', 'Shift', *short_names]].rename(columns=short_names)

# Aggregate per (Date, Shift); every measurement column gets a suffix naming
# the statistic it holds.
by_date_shift = climate.groupby(['Date', 'Shift'])
climate_mean = by_date_shift.mean().add_suffix('_Mean')
climate_min = by_date_shift.min().add_suffix('_Min')
climate_max = by_date_shift.max().add_suffix('_Max')

# Join data: mean columns first, then max, then min, aligned on (Date, Shift).
climate_final = climate_mean.join(climate_max).join(climate_min)

In [49]:
# Check data
# Restrict to the study period with a label slice on the index (both ends
# inclusive), then preview the first rows.
study_period = slice('2005-01-01', '2020-06-30')
climate_final = climate_final.loc[study_period]
climate_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Temp_DC_Mean,Humid_percent_Mean,Win_Dir_Mean,Wind_Speed_Mean,Stn_Press_Mean,Temp_DC_Max,Humid_percent_Max,Win_Dir_Max,Wind_Speed_Max,Stn_Press_Max,Temp_DC_Min,Humid_percent_Min,Win_Dir_Min,Wind_Speed_Min,Stn_Press_Min
Date,Shift,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2005-01-01,1,5.225,73.625,26.25,34.375,100.9075,9.2,91.0,27.0,46.0,101.44,3.1,60.0,24.0,28.0,100.47
2005-01-01,2,0.625,65.875,28.875,24.375,102.17375,2.6,78.0,30.0,28.0,102.71,-1.1,54.0,27.0,19.0,101.6
2005-01-01,3,-5.575,52.25,21.5,18.5,103.3175,-1.9,57.0,36.0,28.0,103.64,-9.5,49.0,2.0,11.0,102.86
2005-01-02,1,-13.875,64.25,4.75,21.5,103.78125,-10.7,69.0,6.0,24.0,103.83,-15.9,59.0,3.0,19.0,103.69
2005-01-02,2,-13.9375,76.5,4.625,23.25,102.9775,-11.2,87.0,6.0,28.0,103.63,-15.8,71.0,4.0,20.0,102.19


#### 4) Output data to Google Drive

In [50]:
# Write the aggregated table to the mounted Google Drive. The target folder is
# created first because to_csv does not create missing parent directories and
# would otherwise fail on a Drive that lacks it.
output_path = '/content/drive/My Drive/Data/YCBS-299/Weather_data_2005_2020.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
climate_final.to_csv(output_path)