# Import libraries and read dataset

In [43]:
import pandas as pd

In [44]:
dataset_path = 'opsd_germany_daily.csv'

# Read the CSV with the 'Date' column as the index; parse_dates=True asks
# pandas to parse that index into datetimes.
opsd_daily = pd.read_csv(dataset_path, index_col='Date', parse_dates=True)

# Derive calendar helper columns from the date index.
date_index = opsd_daily.index
opsd_daily['Year'] = date_index.year
opsd_daily['Month'] = date_index.month
opsd_daily['Weekday Name'] = date_index.day_name()

# Preview the first three rows.
opsd_daily.head(3)

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar,Year,Month,Weekday Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-01-01,1069.184,,,,2006,1,Sunday
2006-01-02,1380.521,,,,2006,1,Monday
2006-01-03,1442.533,,,,2006,1,Tuesday


# 1. Time-based indexing

In [None]:
"""
- When processing time series data, pandas can use dates and times for organizing, analyzing and querying data.
"""

# Select the rows for a specific range of dates via label-based slicing.
jan_2014_days = slice('2014-01-20', '2014-01-22')
opsd_daily.loc[jan_2014_days]

In [None]:
"""
- partial-string indexing: select data for a specific month, year, or other period.
"""

# A partial date string selects every row that falls in that period.
february_2012 = '2012-02'
opsd_daily.loc[february_2012]

# 2. Visualizing time-series data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
"""
- sns.set(): a function from seaborn that changes the default matplotlib parameters to make the plots look better.
- rc: a dictionary in which we specify the matplotlib parameters we want to override for the plots.
- 'figure.figsize':(11, 4) : the size of the figure in inches, width 11 and height 4.
- plot(): a method from pandas that plots the data in the specified column.
- linewidth=0.5: the width of the line in the plot.
"""

# Apply the seaborn theme and set the default figure size to 11 x 4 inches.
figure_defaults = {'figure.figsize': (11, 4)}
sns.set(rc=figure_defaults)

# Plot the daily consumption series as a thin line.
consumption = opsd_daily['Consumption']
consumption.plot(linewidth=0.5)

In [None]:
"""
- cols_plot: a list of the columns we want to plot.
- opsd_daily[cols_plot]: select the columns we want to plot.
- marker: the symbol used to mark the data points in the plot.
- alpha: the opacity of the data points; 0.5 makes them 50% transparent, which reduces the effect of overplotting.
- linestyle: 'None' to remove the line connecting the data points.
- subplots=True: create a separate plot for each column.
"""

cols_plot = ['Consumption', 'Solar', 'Wind']

# Dots only (no connecting line), half-transparent.
scatter_style = dict(marker='.', alpha=0.5, linestyle='None')

# subplots=True gives each selected column its own panel.
axes = opsd_daily[cols_plot].plot(subplots=True, figsize=(11, 9), **scatter_style)
for panel in axes:
    panel.set_ylabel('Daily Totals (GWh)')
plt.show()

# 3. Seasonality

In [None]:
# One colour per month (12 entries).
custom_palette = ['#FF9999', '#FFB266', '#CCCC66', '#99CC66', '#66CC66', '#66CCCC', '#6699CC', '#9999CC', '#CC99CC', '#FF99CC', '#FF6666', '#FF9966']

# One box-plot panel per series, stacked vertically and sharing the month axis.
fig, axes = plt.subplots(3, 1, figsize=(11, 10), sharex=True)
for name, ax in zip(['Consumption', 'Solar', 'Wind'], axes):
    # Passing `palette` without `hue` is deprecated in seaborn 0.13 and will be
    # removed in 0.14. Mapping hue to the x variable keeps the per-month
    # colours, and legend=False suppresses the redundant legend.
    sns.boxplot(
        data=opsd_daily,
        x='Month',
        y=name,
        hue='Month',
        palette=custom_palette,
        legend=False,
        ax=ax,
    )
    ax.set_ylabel('GWh')
    ax.set_title(name)
    # Only the bottom panel keeps its x-axis label.
    if ax is not axes[-1]:
        ax.set_xlabel('')

# 4. Frequencies

In [None]:
# Build a daily DatetimeIndex running from 1998-03-10 to 1998-03-15.
pd.date_range(start='1998-03-10', end='1998-03-15', freq='D')

In [None]:
"""
- forward fill: fill missing values with the last known value.
"""

# Three non-consecutive days, so the sample has gaps in its date index.
sample_dates = ['2013-02-03', '2013-02-06', '2013-02-08']
times_sample = pd.to_datetime(sample_dates)

# Keep only the consumption column (as a DataFrame) for those days.
consum_sample = opsd_daily[['Consumption']].loc[times_sample].copy()
consum_sample

In [None]:
# Convert the sample to daily frequency without a fill method, so the
# dates missing from the sample become NaN.
consum_freg = consum_sample.asfreq('D')

# Same conversion, but each gap is filled with the last known value.
forward_filled = consum_sample['Consumption'].asfreq('D', method='ffill')
consum_freg['Consumption - Forward Fill'] = forward_filled
consum_freg

# 5. Resampling

In [None]:
"""
- resampling: a technique for changing the frequency of time series data (e.g. from daily to monthly).
    + 'D': daily frequency
    + 'W': weekly frequency
    + 'ME': month-end frequency (spelled 'M' before pandas 2.2)
    + 'YE': year-end frequency (spelled 'A' before pandas 2.2)
- resampling for lower frequency (downsampling): aggregation of the data.
- resampling for higher frequency (upsampling): interpolation of the data.
"""

data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']

# Group the daily rows into weekly bins, then average each bin.
weekly_bins = opsd_daily[data_columns].resample('W')
opsd_weekly_mean = weekly_bins.mean()
opsd_weekly_mean.head(3)

In [None]:
"""
- Weekly resampling: the mean of the data for each week.
"""

start, end = '2017-01', '2017-06'

# Restrict both series to the same window before plotting.
daily_solar = opsd_daily.loc[start:end, 'Solar']
weekly_solar = opsd_weekly_mean.loc[start:end, 'Solar']

fig, ax = plt.subplots()
ax.plot(daily_solar, label='Daily', marker='.', linestyle='-', linewidth=0.5)
ax.plot(weekly_solar, label='Weekly Mean Resample', marker='o', markersize=8, linestyle='-')
ax.set_ylabel("Solar Production (GWh)")
# The legend identifies each line by the label given above.
ax.legend()
plt.show()

In [None]:
"""
- Yearly resampling: the sum of the data for each year.
"""

# Yearly totals; min_count=360 leaves a year as NaN unless it has at least
# 360 non-NA daily values.
yearly_totals = opsd_daily[data_columns].resample('YE').sum(min_count=360)

# Index by the plain year number and label the index 'Year'.
opsd_annual = yearly_totals.set_index(yearly_totals.index.year).rename_axis('Year')

# Ratio of wind + solar production to consumption for each year.
renewables_share = opsd_annual['Wind+Solar'].div(opsd_annual['Consumption'])
opsd_annual['Wind+Solar/Consumption'] = renewables_share
opsd_annual.tail(3)

In [None]:
# Bar chart of the wind + solar share for the years from 2012 onwards.
share_since_2012 = opsd_annual.loc[2012:, 'Wind+Solar/Consumption']
ax = share_since_2012.plot.bar(color='C0')

# Pin the y-axis to 0-0.3 (0-30%) and label the chart.
ax.set(
    ylabel='Fraction',
    ylim=(0, 0.3),
    title='Wind + Solar Share of Annual Electricity Consumption',
)
plt.xticks(rotation=0)  # keep the x-axis tick labels horizontal

# 6. Rolling Windows

In [None]:
"""
- Rolling windows: 
    + a common technique for smoothing time series data by dividing the data into time windows.
    + unlike downsampling, where the bins do not overlap and the output has a lower frequency than the input,
      rolling windows overlap each other and the output has the same frequency as the input.
"""

# 7-row centred rolling mean of each data column.
weekly_window = opsd_daily[data_columns].rolling(window=7, center=True)
opsd_7d = weekly_window.mean()
opsd_7d.head(10)

# 7. Trends

In [None]:
"""
- trends: 
    + a feature of time series data that represents the general direction in which the data is moving.
    + with rolling windows, we can smooth the data to better identify trends.
"""

import matplotlib.dates as mdates

# 365-row centred rolling mean; a window needs at least 360 non-NA values
# to produce a result.
opsd_365d = opsd_daily[data_columns].rolling(window=365, center=True, min_periods=360).mean()

# Layers drawn in order: raw daily points, 7-day rolling mean, 365-day trend.
consumption_layers = [
    (opsd_daily, dict(marker='.', markersize=2, color='0.6', linestyle='None', label='Daily')),
    (opsd_7d, dict(linewidth=2, label='7-d Rolling Mean')),
    (opsd_365d, dict(color='0.2', linewidth=3, label='Trend (365-d Rolling Mean)')),
]

fig, ax = plt.subplots()
for frame, line_style in consumption_layers:
    ax.plot(frame['Consumption'], **line_style)

# One major x-tick per year, then legend, axis labels and title.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Consumption (GWh)')
ax.set_title('Trends in Electricity Consumption')
plt.show()


In [None]:
# Plot the 365-day rolling means of wind, solar and combined production.
fig, ax = plt.subplots()
for nm in ['Wind', 'Solar', 'Wind+Solar']:
    ax.plot(opsd_365d[nm], label=nm)

# Axis-level settings do not depend on the series being drawn, so they are
# applied once after all lines exist instead of on every loop iteration.
ax.xaxis.set_major_locator(mdates.YearLocator())  # one major x-tick per year
ax.legend()
ax.set_ylabel('Production (GWh)')
ax.set_title('Trends in Electricity Production (365-d Rolling Means)')
plt.show()

In [48]:
# Show the 'Solar' column on its own.
opsd_daily.loc[:, 'Solar']

Date
2006-01-01       NaN
2006-01-02       NaN
2006-01-03       NaN
2006-01-04       NaN
2006-01-05       NaN
               ...  
2017-12-27    16.530
2017-12-28    14.162
2017-12-29    29.854
2017-12-30     7.467
2017-12-31    19.980
Name: Solar, Length: 4383, dtype: float64