### Time Series

A time series is a sequence of numbers ordered along the time axis.

In [1]:
import pandas as pd

In [2]:
# Convert the 'Datetime' column from object dtype to datetime.
data = pd.read_csv('/datasets/energy_consumption.csv')

data['Datetime'] = pd.to_datetime(data['Datetime'])
# info() writes its report to stdout and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
data.info()

# Alternative: have read_csv parse column 0 as dates while loading.
data = pd.read_csv('/datasets/energy_consumption.csv', parse_dates=[0])

In [None]:
# Load the CSV with column 0 parsed as dates and the 'Datetime' column used
# as the table index. (Calling df.set_index() after loading is the other way
# to get the same index.)
data = pd.read_csv(
    '/datasets/energy_consumption.csv',
    index_col='Datetime',
    parse_dates=[0],
)

In [None]:
# Put the dates into chronological order.
import pandas as pd

data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(axis=0, ascending=True, inplace=True)
# Index.is_monotonic is deprecated and was removed in pandas 2.0;
# is_monotonic_increasing is the explicit equivalent check.
print(data.index.is_monotonic_increasing)
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

In [None]:
# Select the data from January to June 2018.
import pandas as pd

data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(inplace=True)
# Label-based slice on the datetime index; both month bounds are included.
data = data.loc['2018-01':'2018-06']
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

In [None]:
# Plot the time series graph of the current `data` frame.
data.plot()

### Resampling
- change the interval of a time series

In [None]:
# resample() changes the interval of the series and groups the values by it.
# The argument is the new interval, written as a count plus a unit code.
# '1H' = one hour
data.resample('1H') 

# '2W' = two weeks
data.resample('2W')

In [None]:
# resample() works like groupby(): it only forms the groups, so an aggregation
# such as mean() or max() is called afterwards to get one value per interval.
# Mean for each hour.
data.resample('1H').mean()

# Maximum for each two-week interval.
data.resample('2W').max()

In [None]:
# Plot the graph of average electricity consumption by year.
import pandas as pd

# Load, order chronologically, then average all readings within each year.
data = (
    pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
    .sort_index()
    .resample('1Y')
    .mean()
)
data.plot()

In [None]:
import pandas as pd

# Load the series and put it into chronological order.
data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data = data.sort_index()

# Keep January-June 2018, then total the values within each calendar day.
first_half_2018 = data['2018-01':'2018-06']
data = first_half_2018.resample('1D').sum()
data.plot()

### Rolling mean

Rolling mean or moving average is a method of smoothing the data in a time series. 
- The method involves finding the values least susceptible to fluctuations, that is, the arithmetic mean.

*Here's how the method works:*
- The interval for averaging (window size) is selected experimentally.
    - The larger the interval, the stronger the smoothing. 
- Then the window starts to "roll" almost from the beginning to the end of the time series. 
    - The mean value is calculated at each point.
    
In the moving average, the windows overlap and cannot go beyond the series. 
- So the number of obtained means will be slightly less than the number of the initial values of the series.

In [None]:
# rolling() creates a rolling window over `data`; the argument is the window
# size. On its own it aggregates nothing yet.
# Window size 7.
data.rolling(7)

In [None]:
# Call mean() on the rolling window to aggregate the values in each window.
# Rolling mean with window size 7.
data.rolling(7).mean()

In [None]:
import pandas as pd

data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(inplace=True)
# Daily totals for January-June 2018.
data = data['2018-01':'2018-06'].resample('1D').sum()
# Take the rolling mean of the consumption column itself (window size 10).
# Assigning the whole frame's rolling mean to one column only works while the
# frame has exactly one column.
data['rolling_mean'] = data['PJME_MW'].rolling(10).mean()
data.plot()

### Trends and Seasonality

In [None]:
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

# Daily totals for January-June 2018.
data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data = data.sort_index()
data = data['2018-01':'2018-06'].resample('1D').sum()

# Split the series into trend, seasonal and residual components.
decomposed = seasonal_decompose(data)

# One stacked panel per component, drawn top to bottom.
panels = (
    (decomposed.trend, 'Trend'),
    (decomposed.seasonal, 'Seasonality'),
    (decomposed.resid, 'Residuals'),
)

plt.figure(figsize=(6, 8))
for position, (component, title) in enumerate(panels, start=1):
    plt.subplot(len(panels), 1, position)
    # Draw on the subplot just selected: plt.gca() returns the current axes.
    component.plot(ax=plt.gca())
    plt.title(title)
plt.tight_layout()

In [None]:
#Plot a graph of the seasonal component for the first 15 days of January 2018.
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(inplace=True)
# Daily totals for January-June 2018.
data = data['2018-01':'2018-06'].resample('1D').sum()

# Decompose once; the result holds the trend, seasonal and residual parts.
decomposed = seasonal_decompose(data)

# Plot only the seasonal component for 1-15 January 2018.
decomposed.seasonal['2018-01-01':'2018-01-15'].plot()

### Stationary series

In [None]:
import pandas as pd

# Same dataset location as every other cell in this notebook; a bare relative
# path would depend on the kernel's working directory.
data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(inplace=True)
# Daily totals for January-June 2018.
data = data['2018-01':'2018-06'].resample('1D').sum()
# Rolling mean and standard deviation of the consumption column (window 15),
# plotted alongside the series itself.
data['mean'] = data['PJME_MW'].rolling(15).mean()
data['std'] = data['PJME_MW'].rolling(15).std()
data.plot()

In [1]:
import pandas as pd

# Small demo series.
data = pd.Series([0.5, 0.7, 2.4, 3.2])
print(data)

# shift() moves every value one position down the index; the first slot has
# no predecessor and becomes NaN.
shifted = data.shift()
print(shifted)

0    0.5
1    0.7
2    2.4
3    3.2
dtype: float64
0    NaN
1    0.5
2    0.7
3    2.4
dtype: float64


In [2]:
import pandas as pd

# Small demo series.
data = pd.Series([0.5, 0.7, 2.4, 3.2])
print(data)

# With fill_value, the slot vacated by the shift gets that value instead of NaN.
shifted_filled = data.shift(fill_value=0)
print(shifted_filled)

0    0.5
1    0.7
2    2.4
3    3.2
dtype: float64
0    0.0
1    0.5
2    0.7
3    2.4
dtype: float64


### Time Series Difference

In [None]:
import pandas as pd

# Daily totals for January-June 2018.
data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data = data.sort_index()
daily = data['2018-01':'2018-06'].resample('1D').sum()

# Time series difference: each value minus the value one step before it.
data = daily - daily.shift(1)

# Rolling mean and standard deviation (window 15) of the differenced column.
window = data['PJME_MW'].rolling(15)
data['mean'] = window.mean()
data['std'] = window.std()
data.plot()

## Time Series Forecasting

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.Series([0.1, 0.5, 2.3, 1.2, 1.5])
# shuffle=False keeps the original order, so the test set is taken from the
# end of the series.
train, test = train_test_split(data, test_size=0.2, shuffle=False)

print(data)
for label, part in (('Training set:', train), ('Test set:', test)):
    print(label)
    print(part)

0    0.1
1    0.5
2    2.3
3    1.2
4    1.5
dtype: float64
Training set:
0    0.1
1    0.5
2    2.3
3    1.2
dtype: float64
Test set:
4    1.5
dtype: float64


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load, order chronologically and total the values per day.
data = (
    pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
    .sort_index()
    .resample('1D')
    .sum()
)

# Unshuffled 80/20 split, so both parts stay in chronological order.
train, test = train_test_split(data, shuffle=False, test_size=0.2)

# Show the date range covered by each part.
for part in (train, test):
    print(part.index.min(), part.index.max())

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(inplace=True)
data = data.resample('1D').sum()

# Unshuffled split: the test set is the chronological tail of the series.
train, test = train_test_split(data, shuffle=False, test_size=0.2)

print("Mean daily power consumption:", test['PJME_MW'].mean())

# Constant baseline: predict the training-set median for every test row.
# The array is 1-D with one entry per test row so it lines up with the target
# column, instead of being shaped like the whole frame and multiplied by ones
# a second time.
pred_median = np.full(len(test), train['PJME_MW'].median())
print("MAE:", mean_absolute_error(test['PJME_MW'], pred_median))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

data = pd.read_csv('/datasets/energy_consumption.csv', index_col=[0], parse_dates=[0])
data.sort_index(inplace=True)
data = data.resample('1D').sum()

# Unshuffled split: the test set is the chronological tail of the series.
train, test = train_test_split(data, shuffle=False, test_size=0.2)

print("Mean daily power consumption:", test['PJME_MW'].mean())

# Previous-value baseline: each prediction is the value one step earlier.
# Work on the target column only, and fill the first slot (which has no
# predecessor inside the test set) with the last training value.
target = test['PJME_MW']
pred_previous = target.shift(fill_value=train['PJME_MW'].iloc[-1])
print("MAE:", mean_absolute_error(target, pred_previous))