In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Power output HQ dataset; only the timestamp and wind-power columns are needed.
# `usecols` stops pandas from loading the unused columns, and the trailing
# selection pins the column order to (Date, Wind) regardless of the file's order.
power_df = pd.read_csv(
    "../historique-production-electricite-quebec.csv",
    usecols=["Date", "Wind"],
)[["Date", "Wind"]]

There are some duplicate dates caused by time-zone (daylight-saving) issues. They are printed below.

In [3]:
# Check for duplicated 'Date' strings in the original dataset.
# Take a real copy: a plain assignment would only alias power_df, so the
# in-place 'Date' conversions in later cells would also overwrite the
# original string timestamps held in this frame.
power_df_strtime = power_df.copy()
print("Duplicated places:")
# duplicated() flags the second and later occurrences of each repeated value
print(power_df_strtime[power_df_strtime['Date'].duplicated()])

# Optional (disabled): inspect the rows around each duplicated index
#for trouble in [5476,14385,22988,28901,41485]:
#    print(f"Displaying around troublesome index: {trouble}")
#    print(power_df.loc[trouble-2:trouble+3])

Duplicated places:
                            Date     Wind
5476   2020-11-01T01:30:00-05:00  2737.00
14385  2021-11-07T01:30:00-05:00  2584.00
22988  2019-11-03T01:30:00-05:00   686.00
28901  2022-11-06T01:30:00-05:00  2576.00
41485  2023-03-12T03:30:00-04:00   142.97


Convert all time strings to pandas `Timestamp` objects, normalized to UTC (`utc=True`).

In [5]:
# Parse the time strings into Timestamps, converting every offset to UTC
utc_dates = pd.to_datetime(power_df['Date'], utc=True)
power_df['Date'] = utc_dates

However, converting to UTC moves some data points from year N into year N+1, so we shift every timestamp back by 5 hours (the -05:00 offset). After the shift, the first hour is 2019-01-01 00:00:00 and the last hour is 2023-12-31 23:00:00.

In [6]:
# Shift the UTC timestamps back by a fixed 5 hours (the -05:00 offset seen in the
# raw data). Series arithmetic is vectorized, so no per-row apply is needed.
power_df['Date'] = power_df['Date'] - pd.Timedelta(hours=5)

In [15]:
# Optional sanity check (disabled): list the distinct UTC offsets present in 'Date'.
#power_df['TimeZone'] = power_df['Date'].apply(lambda x: x.utcoffset())
#print(power_df['TimeZone'].unique())

After this, we can treat our data as time-zone independent, so we remove the time-zone information altogether.

In [7]:
# Drop the time-zone information, leaving naive timestamps
naive_dates = power_df['Date'].dt.tz_localize(None)
power_df['Date'] = naive_dates

In [9]:
# Sort the dataset chronologically
power_df = power_df.sort_values('Date')

# Truncate every timestamp to the start of its hour (YYYY-MM-DD HH:00:00) for
# consistent formatting with the weather data. The lowercase 'h' alias is used
# because the uppercase 'H' alias is deprecated in recent pandas releases.
power_df['Date'] = power_df['Date'].dt.floor('h')

# Calendar helper columns, computed with the vectorized .dt accessor
power_df['Year'] = power_df['Date'].dt.year.astype('int64')      # cast keeps the column int64
power_df['MonthDay'] = power_df['Date'].dt.strftime('%m-%d')     # zero-padded 'MM-DD' string

  power_df['Date'] = power_df['Date'].apply(lambda x: x.floor('H'))


In [7]:
# Sanity check: the last entry should still fall inside 2023
power_df['Date'].to_numpy()[-1]

np.datetime64('2023-12-31T23:00:00.000000000')

In [7]:
# Complete hourly grid the power data is expected to cover (both endpoints inclusive).
# Lowercase "h" is the current hourly alias; uppercase "H" is deprecated in recent pandas.
timestamps = pd.date_range(start="2019-01-01 00:00:00", end="2023-12-31 23:00:00", freq="h")

  timestamps = pd.date_range(start="2019-01-01 00:00:00", end="2023-12-31 23:00:00", freq="H")


In [8]:
date_col = power_df['Date']

# Hours of the expected hourly grid that never appear in the power data
missing = timestamps.difference(date_col)
print("Missing timestamps:")
print(missing, len(missing))

# Every row whose timestamp occurs more than once (keep=False marks all copies)
duplicates = date_col[date_col.duplicated(keep=False)]
print("Duplicate timestamps:")
print(duplicates, len(duplicates))

Missing timestamps:
DatetimeIndex(['2019-11-03', '2020-11-01', '2021-11-07', '2022-11-06',
               '2023-11-05'],
              dtype='datetime64[ns]', freq=None) 5
Duplicate timestamps:
22988   2019-11-03 01:00:00
17194   2019-11-03 01:00:00
5475    2020-11-01 01:00:00
5476    2020-11-01 01:00:00
14385   2021-11-07 01:00:00
14383   2021-11-07 01:00:00
28901   2022-11-06 01:00:00
11380   2022-11-06 01:00:00
589     2023-03-12 02:00:00
41485   2023-03-12 02:00:00
Name: Date, dtype: datetime64[ns] 10


In [23]:
# Count the rows from 12 h before 2023-01-01 00:00 to 12 h after 2023-03-12 12:00 (both ends inclusive)
window_start = pd.Timestamp('2023-01-01 00:00:00') - pd.Timedelta(hours=12)
window_end = pd.Timestamp('2023-03-12 12:00:00') + pd.Timedelta(hours=12)
in_window = power_df['Date'].between(window_start, window_end)
print(len(power_df[in_window]))

1718


In [16]:
# Count the rows on 2020-03-08, from 00:00 through 23:00 inclusive
day_start = pd.Timestamp('2020-03-08 00:00:00')
day_end = pd.Timestamp('2020-03-08 23:00:00')
print(len(power_df[power_df['Date'].between(day_start, day_end)]))

24


In [31]:
# Number of rows per calendar year, 2019 through 2023
year_col = power_df['Year'].astype(int)
for year in range(2019, 2024):
    print(int((year_col == year).sum()))

8760
8784
8760
8760
8760


In [16]:
# All rows that share a timestamp with at least one other row, in chronological order
is_dupe = power_df['Date'].duplicated(keep=False)
dupes_df = power_df[is_dupe]
dupes_df_sorted = dupes_df.sort_values('Date')

# For each distinct duplicated timestamp, show the rows from 3 h before to 2 h after it
for ts in dupes_df_sorted['Date'].unique():
    print(f"\nNearby rows for duplicated timestamp: {ts}")
    window_lo = ts - pd.Timedelta(hours=3)
    window_hi = ts + pd.Timedelta(hours=2)
    nearby = power_df[power_df['Date'].between(window_lo, window_hi)]
    print(nearby.sort_values('Date'))


Nearby rows for duplicated timestamp: 2019-11-03 01:00:00
                     Date   Wind  Year MonthDay
17192 2019-11-02 22:00:00  795.0  2019    11-02
17193 2019-11-02 23:00:00  773.0  2019    11-02
22988 2019-11-03 01:00:00  686.0  2019    11-03
17194 2019-11-03 01:00:00  712.0  2019    11-03
22989 2019-11-03 02:00:00  702.0  2019    11-03
17195 2019-11-03 03:00:00  670.0  2019    11-03

Nearby rows for duplicated timestamp: 2020-11-01 01:00:00
                     Date    Wind  Year MonthDay
20062 2020-10-31 22:00:00  2202.0  2020    10-31
5474  2020-10-31 23:00:00  2459.0  2020    10-31
5475  2020-11-01 01:00:00  2212.0  2020    11-01
5476  2020-11-01 01:00:00  2737.0  2020    11-01
5477  2020-11-01 02:00:00  2723.0  2020    11-01
5478  2020-11-01 03:00:00  2400.0  2020    11-01

Nearby rows for duplicated timestamp: 2021-11-07 01:00:00
                     Date    Wind  Year MonthDay
31894 2021-11-06 22:00:00  2855.0  2021    11-06
31895 2021-11-06 23:00:00  2819.0  2021    11-

### The above confirms that all of the duplicated timestamps occur where the clocks changed (daylight-saving transitions). We will need to manually correct these 5 entries.

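In each November pair, one of the two 01:00 rows should become 00:00, the hour reported missing for that date. I'm not sure yet how to deal with the duplicate 2023-03-12 02:00:00; note also that 2023-11-05 00:00 is missing but has no duplicated row to take it from.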

In [None]:
# TODO: manually replace the affected timestamps here (or find a better approach later)


10

# Weather files

In [32]:
# File names of the weather data for the different farms.
# os.listdir returns entries in arbitrary order and includes every entry in the
# directory, so keep only the .csv files and sort them: weather_files[0] then
# refers to the same farm on every run, and stray non-CSV entries are never read.
weather_files = sorted(
    name for name in os.listdir('../windfarm_weather_data')
    if name.lower().endswith('.csv')
)

In [33]:
# Load the weather data of the first farm in the list
first_weather_file = weather_files[0]
farm_wdf = pd.read_csv(f"../windfarm_weather_data/{first_weather_file}")

In [34]:
# Preview the first five rows of the farm's weather data
farm_wdf.head(n=5)

Unnamed: 0.1,Unnamed: 0,time,temperature_2m,relative_humidity_2m,wind_speed_10m,wind_direction_10m,location
0,0,2019-01-01T00:00,-0.3,83,16.8,138,Arthabaska wind farm
1,1,2019-01-01T01:00,-0.2,88,18.7,140,Arthabaska wind farm
2,2,2019-01-01T02:00,0.2,90,19.7,142,Arthabaska wind farm
3,3,2019-01-01T03:00,0.6,87,18.9,140,Arthabaska wind farm
4,4,2019-01-01T04:00,-0.1,99,13.2,119,Arthabaska wind farm


In [5]:
# Quick look at a single farm (the first file) to see what the weather data looks like
farm_wdf = pd.read_csv(f"../windfarm_weather_data/{weather_files[0]}")

farm_wdf['time'] = pd.to_datetime(farm_wdf['time'], utc=True)    # parse to Timestamps, labelled as UTC
farm_wdf = farm_wdf.sort_values('time')                          # chronological order

# Same fixed 5-hour shift that is applied to the power data (vectorized, no per-row apply)
farm_wdf['time'] = farm_wdf['time'] - pd.Timedelta(hours=5)
# NOTE(review): 'time' stays tz-aware here while power_df['Date'] is made tz-naive;
# confirm both are brought to the same form before merging.

farm_wdf['Year'] = farm_wdf['time'].dt.year.astype('int64')      # year column (int64)
farm_wdf['MonthDay'] = farm_wdf['time'].dt.strftime('%m-%d')     # zero-padded 'MM-DD' string

# Keep only the years covered by the power data (2019-2023). Besides the 2024 rows,
# this also drops the first few rows, which the 5-hour shift moves from
# 2019-01-01 into 2018-12-31 and which have no matching power data either.
farm_wdf = farm_wdf[farm_wdf['Year'].between(2019, 2023)]

In [6]:
# Check every farm's weather file for repeated timestamps.
farms_with_duplicates = []
for farm_name in weather_files:
    # Only the 'time' column is needed. A dedicated name is used so that the
    # farm_wdf frame prepared in the earlier cell is not overwritten.
    farm_times = pd.to_datetime(
        pd.read_csv(f"../windfarm_weather_data/{farm_name}", usecols=['time'])['time']
    )
    # Sorting or applying a constant 5-hour shift cannot create or remove
    # duplicates, so the parsed timestamps are checked directly.
    if farm_times.duplicated().any():
        farms_with_duplicates.append(farm_name)
        print(f"{farm_name.replace(' wind farm hourly weather 2019-2024.csv','')} has some duplicate timestamps.")

# Report the clean case explicitly; silence would also result from an empty file list.
if not farms_with_duplicates:
    print(f"No duplicate timestamps found in {len(weather_files)} weather file(s).")
    

### It seems the weather files do not have any duplicates and have handled the time-zone shifts well. Yay.

I'll merge the datasets after I manually clean the power data.