---
# IESO (Power) Data Analysis and Preparation
---

## Setup

In [2]:
# Libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Base URLs of the project's data folders on GitHub (raw file access).
# Both folders live under the same repository root, so build them from it.
_repo_raw_root = 'https://raw.githubusercontent.com/VernonNaidoo-Toronto/3253_ML_Group_7_Electricity_Project/master/'
IESO_path = _repo_raw_root + 'RAW_DATA/IESO_Data/'
weather_path = _repo_raw_root + 'DATA/Weather_Data/'

  import pandas.util.testing as tm


In [3]:
# Set default plot styles.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases no longer accept the old 'seaborn' name, so use whichever name this
# installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Figure size and font sizes applied to every plot in the notebook
plt.rcParams.update({
    'figure.figsize': (16, 6),
    'axes.titlesize': 20,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 14,
})

## Load Ontario Power SUPPLY DATA

### Load files, concatenate, add date index to create hourly DataFrame: **power_supply_hourly**

In [4]:
# Read one hourly-output CSV per year from GitHub, parsing 'Date' as a datetime:
supply_by_year = {
  year: pd.read_csv(f'{IESO_path}{year}_Hourly_Output.csv', parse_dates=['Date'])
  for year in (2017, 2018, 2019)
}
supply_2017 = supply_by_year[2017]
supply_2018 = supply_by_year[2018]
supply_2019 = supply_by_year[2019]

# Report the size of each yearly frame:
for year, df in supply_by_year.items():
  print(f'Loaded {df.shape[0]} records and {df.shape[1]} columns from the {year} file.')

Loaded 8760 records and 9 columns from the 2017 file.
Loaded 8760 records and 9 columns from the 2018 file.
Loaded 8760 records and 9 columns from the 2019 file.


In [5]:
# Stack the three yearly frames into a single frame with a fresh 0..n-1 row index
yearly_frames = [supply_2017, supply_2018, supply_2019]
power_supply_hourly = pd.concat(yearly_frames, ignore_index=True)

# Normalise the column labels to lowercase
cols = list(map(str.lower, power_supply_hourly.columns))
power_supply_hourly.columns = cols

In [6]:
# Move date column to index.
# Guarded so the cell can be re-run: once 'date' is the index it is no longer a
# column, and an unconditional set_index('date') would raise KeyError.
# Note: the index holds the calendar date only; the hour stays in the 'hour'
# column, so each index value repeats once per hourly row of that day.
if 'date' in power_supply_hourly.columns:
  power_supply_hourly = power_supply_hourly.set_index('date')

# Show sample records (fixed seed so the same rows are shown on every run)
print('Shape:',power_supply_hourly.shape)
display(power_supply_hourly.sample(5, random_state=0))

Shape: (26280, 8)


Unnamed: 0_level_0,hour,nuclear,gas,hydro,wind,solar,biofuel,total output
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-26,10,11579,492,5038,2128,8,27,19272
2017-02-15,23,9826,557,4287,2447,0,27,17144
2018-09-24,14,10427,868,3431,2025,224,28,17003
2019-04-12,14,9796,254,3990,2935,112,63,17150
2018-04-16,11,9416,2379,4020,1952,65,78,17910


### Data Preparation

#### Check for missing values; check datatypes; add day column

In [7]:
# Summarise the hourly frame: index range, column dtypes and non-null counts
power_supply_hourly.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26280 entries, 2017-01-01 to 2019-12-31
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   hour          26280 non-null  int64
 1   nuclear       26280 non-null  int64
 2   gas           26280 non-null  int64
 3   hydro         26280 non-null  int64
 4   wind          26280 non-null  int64
 5   solar         26280 non-null  int64
 6   biofuel       26280 non-null  int64
 7   total output  26280 non-null  int64
dtypes: int64(8)
memory usage: 1.8 MB


In [8]:
# Check for missing values: NONE FOUND
# Count nulls for every column at once, then keep only the columns that have any.
nulls_per_column = power_supply_hourly.isna().sum()
columns_with_nulls = nulls_per_column[nulls_per_column > 0]

for col, null_count in columns_with_nulls.items():
  print(f'The {col} column has {null_count} rows without values.')

col_count = len(columns_with_nulls) # number of columns with nulls
print(f'{col_count} columns have some missing values.')

0 columns have some missing values.


In [9]:
# Derive the weekday name of each row from the date index and store it as 'day'
weekday_names = power_supply_hourly.index.strftime('%A')
power_supply_hourly = power_supply_hourly.assign(day=weekday_names)
power_supply_hourly.head(3)

Unnamed: 0_level_0,hour,nuclear,gas,hydro,wind,solar,biofuel,total output,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-01,1,11592,234,2455,2433,0,1,16715,Sunday
2017-01-01,2,11591,235,2523,1994,0,1,16344,Sunday
2017-01-01,3,11596,234,2464,1647,0,1,15942,Sunday


### Export CSV Files for Machine Learning

Output Files:
- **power_supply_hourly**
- **power_supply_daily**

#### Create and export power_supply_hourly.csv

In [10]:
# Write the hourly frame, including its date index, to a CSV in the working directory:
filename_with_path = 'power_supply_hourly.csv'
power_supply_hourly.to_csv(path_or_buf=filename_with_path, index=True)

In [11]:
# Read the exported hourly file back and preview its first rows, indexed by 'date':
filename_with_path = 'power_supply_hourly.csv'
test = pd.read_csv(filename_with_path)
test.head(3).set_index('date')

Unnamed: 0_level_0,hour,nuclear,gas,hydro,wind,solar,biofuel,total output,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-01,1,11592,234,2455,2433,0,1,16715,Sunday
2017-01-01,2,11591,235,2523,1994,0,1,16344,Sunday
2017-01-01,3,11596,234,2464,1647,0,1,15942,Sunday


#### Create and export power_supply_daily.csv

In [13]:
# Resample to daily totals.
# Only the numeric generation columns are aggregated:
#  - the text 'day' column is excluded up front instead of relying on how the
#    installed pandas version treats non-numeric columns inside sum();
#  - 'hour' is dropped because a sum of hour numbers is not wanted in the output.
power_supply_daily = (
    power_supply_hourly
    .select_dtypes(include='number')
    .drop(columns='hour')
    .resample('D')
    .sum()
)

# Add weekday column, computed from the daily index
power_supply_daily['day'] = power_supply_daily.index.strftime('%A')
power_supply_daily.head(3)

Unnamed: 0_level_0,nuclear,gas,hydro,wind,solar,biofuel,total output,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-01,278488,5660,82156,28018,778,29,395129,Sunday
2017-01-02,278199,7418,88439,25860,568,25,400509,Monday
2017-01-03,278362,15329,100371,15468,63,863,410456,Tuesday


In [14]:
# Write the daily frame, including its date index, to a CSV in the working directory:
filename_with_path = 'power_supply_daily.csv'
power_supply_daily.to_csv(path_or_buf=filename_with_path, index=True)

In [15]:
# Read the exported daily file back and preview its first rows, indexed by 'date':
filename_with_path = 'power_supply_daily.csv'
test = pd.read_csv(filename_with_path)
test.head(3).set_index('date')

Unnamed: 0_level_0,nuclear,gas,hydro,wind,solar,biofuel,total output,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-01,278488,5660,82156,28018,778,29,395129,Sunday
2017-01-02,278199,7418,88439,25860,568,25,400509,Monday
2017-01-03,278362,15329,100371,15468,63,863,410456,Tuesday
