In [1]:
import pandas as pd
import numpy as np

## Windowing operations :

- pandas contains a compact set of APIs for performing windowing operations: operations that perform an aggregation over a sliding partition of values.
- The API works similarly to the groupby API: a Series or DataFrame calls the windowing method with the necessary parameters and then calls the aggregation function on the result.

## Overview

pandas supports 4 types of windowing operations:

- Rolling window: Generic fixed or variable sliding window over the values.

- Weighted window: Weighted, non-rectangular window supplied by the scipy.signal library.

- Expanding window: Accumulating window over the values.

- Exponentially Weighted window: Accumulating and exponentially weighted window over the values.

In [2]:
# Basic example: an integer Series holding 0..9, reused by the cells below.

a = pd.Series(data=range(10))
a

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [3]:
# Re-display the Series so the input sits right next to the window results below.
a

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [4]:
# Sum over a sliding window of 2 rows (current value + previous value).
# This is NOT a running total: row 0 is NaN because an integer window needs
# `window` observations by default (min_periods defaults to the window size).
a.rolling(2).sum()

0     NaN
1     1.0
2     3.0
3     5.0
4     7.0
5     9.0
6    11.0
7    13.0
8    15.0
9    17.0
dtype: float64

In [5]:
# Expanding window: each row is the sum of all values up to and including it,
# i.e. the cumulative sum (returned as float64).
a.expanding().sum()

0     0.0
1     1.0
2     3.0
3     6.0
4    10.0
5    15.0
6    21.0
7    28.0
8    36.0
9    45.0
dtype: float64

Note :

- Windowing operations currently only support numeric data (integer and float) and will always return float64 values

Rolling Windows:

- Definition: A rolling window is a fixed-size subset of data that moves along the dataset, allowing you to perform operations within this window

In [6]:
# Rolling: a Series of 0..9 on a daily DatetimeIndex, so that both
# offset-based ('2D') and integer windows can be demonstrated.

s = pd.Series(
    range(10),
    index=pd.date_range(start='1/1/2022', periods=10),
)
s

2022-01-01    0
2022-01-02    1
2022-01-03    2
2022-01-04    3
2022-01-05    4
2022-01-06    5
2022-01-07    6
2022-01-08    7
2022-01-09    8
2022-01-10    9
Freq: D, dtype: int64

In [7]:
# Offset window: each row sums the observations from the last 2 days
# (the current day and the day before). For offset windows min_periods
# defaults to 1, so the first row is 0.0 rather than NaN.
s.rolling(window = '2D').sum()

2022-01-01     0.0
2022-01-02     1.0
2022-01-03     3.0
2022-01-04     5.0
2022-01-05     7.0
2022-01-06     9.0
2022-01-07    11.0
2022-01-08    13.0
2022-01-09    15.0
2022-01-10    17.0
Freq: D, dtype: float64

In [8]:
# Integer window: each row sums the last 2 rows regardless of their timestamps.
# min_periods defaults to the window size here, so the first row is NaN.
s.rolling(2).sum()

2022-01-01     NaN
2022-01-02     1.0
2022-01-03     3.0
2022-01-04     5.0
2022-01-05     7.0
2022-01-06     9.0
2022-01-07    11.0
2022-01-08    13.0
2022-01-09    15.0
2022-01-10    17.0
Freq: D, dtype: float64

In [9]:
# Expanding: a small frame with a grouping key 'A' and numeric values 'B'.

df = pd.DataFrame(
    {
        'A': ['a', 'b', 'a', 'b', 'a'],
        'B': [1, 2, 3, 4, 1],
    }
)
df

Unnamed: 0,A,B
0,a,1
1,b,2
2,a,3
3,b,4
4,a,1


In [10]:
# Expanding (cumulative) sum of B computed separately within each group of A.
# The result is indexed by (group key, original row label).
df.groupby('A').expanding().sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,1.0
a,2,4.0
a,4,5.0
b,1,2.0
b,3,6.0


In [11]:
# Exercise with simple data: 'a' is 1..10 with no gaps, 'b' contains missing values.

data = pd.DataFrame(
    {
        'a': range(1, 11),
        'b': [10, np.nan, np.nan, 20, np.nan, np.nan, 30, np.nan, 40, 50],
    }
)
data

Unnamed: 0,a,b
0,1,10.0
1,2,
2,3,
3,4,20.0
4,5,
5,6,
6,7,30.0
7,8,
8,9,40.0
9,10,50.0


In [12]:
# Cumulative sum (cumsum): running total of column 'a', stored as a new column.

data['CumSum'] = data['a'].cumsum()
data

Unnamed: 0,a,b,CumSum
0,1,10.0,1
1,2,,3
2,3,,6
3,4,20.0,10
4,5,,15
5,6,,21
6,7,30.0,28
7,8,,36
8,9,40.0,45
9,10,50.0,55


In [13]:
# Cumulative product (cumprod): running product of column 'a'
# (for 1..10 this is the factorial sequence).

data['cumprod'] = data ['a'].cumprod()
data

Unnamed: 0,a,b,CumSum,cumprod
0,1,10.0,1,1
1,2,,3,2
2,3,,6,6
3,4,20.0,10,24
4,5,,15,120
5,6,,21,720
6,7,30.0,28,5040
7,8,,36,40320
8,9,40.0,45,362880
9,10,50.0,55,3628800


In [14]:
# Cumulative minimum (cummin) and cumulative maximum (cummax) of column 'a':
# the smallest / largest value seen so far at each row.
# NOTE(review): the first column is spelled 'cumin' (single 'm'); it is kept
# as is because the displayed tables below use that header.

data['cumin'] = data['a'].cummin()
data['cummax'] = data['a'].cummax()

data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax
0,1,10.0,1,1,1,1
1,2,,3,2,1,2
2,3,,6,6,1,3
3,4,20.0,10,24,1,4
4,5,,15,120,1,5
5,6,,21,720,1,6
6,7,30.0,28,5040,1,7
7,8,,36,40320,1,8
8,9,40.0,45,362880,1,9
9,10,50.0,55,3628800,1,10


In [15]:
# Rolling sum over a window of 2 rows.

# Each row is the current value of 'a' plus the previous one; row 0 is NaN
# because the window is not yet full.
data['r_sum'] = data['a'].rolling(2).sum()
data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax,r_sum
0,1,10.0,1,1,1,1,
1,2,,3,2,1,2,3.0
2,3,,6,6,1,3,5.0
3,4,20.0,10,24,1,4,7.0
4,5,,15,120,1,5,9.0
5,6,,21,720,1,6,11.0
6,7,30.0,28,5040,1,7,13.0
7,8,,36,40320,1,8,15.0
8,9,40.0,45,362880,1,9,17.0
9,10,50.0,55,3628800,1,10,19.0


In [16]:
# Rolling count: number of non-NaN values of 'b' inside each 2-row window.

data['r_count_b'] = data['b'].rolling(2).count()
data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax,r_sum,r_count_b
0,1,10.0,1,1,1,1,,
1,2,,3,2,1,2,3.0,1.0
2,3,,6,6,1,3,5.0,0.0
3,4,20.0,10,24,1,4,7.0,1.0
4,5,,15,120,1,5,9.0,1.0
5,6,,21,720,1,6,11.0,0.0
6,7,30.0,28,5040,1,7,13.0,1.0
7,8,,36,40320,1,8,15.0,1.0
8,9,40.0,45,362880,1,9,17.0,1.0
9,10,50.0,55,3628800,1,10,19.0,2.0


In [17]:
# Rolling min and max over a window of 4 rows.
# The first 3 rows are NaN because fewer than 4 observations are available.

data['r_min'] = data['a'].rolling(4).min()
data['r_max'] = data['a'].rolling(4).max()

data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax,r_sum,r_count_b,r_min,r_max
0,1,10.0,1,1,1,1,,,,
1,2,,3,2,1,2,3.0,1.0,,
2,3,,6,6,1,3,5.0,0.0,,
3,4,20.0,10,24,1,4,7.0,1.0,1.0,4.0
4,5,,15,120,1,5,9.0,1.0,2.0,5.0
5,6,,21,720,1,6,11.0,0.0,3.0,6.0
6,7,30.0,28,5040,1,7,13.0,1.0,4.0,7.0
7,8,,36,40320,1,8,15.0,1.0,5.0,8.0
8,9,40.0,45,362880,1,9,17.0,1.0,6.0,9.0
9,10,50.0,55,3628800,1,10,19.0,2.0,7.0,10.0


In [18]:
# Rolling mean and median.
# Note the different window sizes: the mean uses 3 rows, the median uses 2 rows.

data['r_mean'] = data['a'].rolling(3).mean()
data['r_median'] = data['a'].rolling(2).median()

data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax,r_sum,r_count_b,r_min,r_max,r_mean,r_median
0,1,10.0,1,1,1,1,,,,,,
1,2,,3,2,1,2,3.0,1.0,,,,1.5
2,3,,6,6,1,3,5.0,0.0,,,2.0,2.5
3,4,20.0,10,24,1,4,7.0,1.0,1.0,4.0,3.0,3.5
4,5,,15,120,1,5,9.0,1.0,2.0,5.0,4.0,4.5
5,6,,21,720,1,6,11.0,0.0,3.0,6.0,5.0,5.5
6,7,30.0,28,5040,1,7,13.0,1.0,4.0,7.0,6.0,6.5
7,8,,36,40320,1,8,15.0,1.0,5.0,8.0,7.0,7.5
8,9,40.0,45,362880,1,9,17.0,1.0,6.0,9.0,8.0,8.5
9,10,50.0,55,3628800,1,10,19.0,2.0,7.0,10.0,9.0,9.5


In [19]:
# Rolling standard deviation over a window of 3 rows.
# Rolling.std uses ddof=1 (sample standard deviation) by default.

data['r_std'] = data['a'].rolling(3).std()
data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax,r_sum,r_count_b,r_min,r_max,r_mean,r_median,r_std
0,1,10.0,1,1,1,1,,,,,,,
1,2,,3,2,1,2,3.0,1.0,,,,1.5,
2,3,,6,6,1,3,5.0,0.0,,,2.0,2.5,1.0
3,4,20.0,10,24,1,4,7.0,1.0,1.0,4.0,3.0,3.5,1.0
4,5,,15,120,1,5,9.0,1.0,2.0,5.0,4.0,4.5,1.0
5,6,,21,720,1,6,11.0,0.0,3.0,6.0,5.0,5.5,1.0
6,7,30.0,28,5040,1,7,13.0,1.0,4.0,7.0,6.0,6.5,1.0
7,8,,36,40320,1,8,15.0,1.0,5.0,8.0,7.0,7.5,1.0
8,9,40.0,45,362880,1,9,17.0,1.0,6.0,9.0,8.0,8.5,1.0
9,10,50.0,55,3628800,1,10,19.0,2.0,7.0,10.0,9.0,9.5,1.0


In [20]:
# Check the standard deviation by hand for the first window [1, 2, 3].
# np.std defaults to ddof=0 (population std), so it does not match the rolling result.

np.std([1,2,3])

0.816496580927726

In [21]:
# With ddof=1 (sample std) the value matches the rolling std of 1.0.
np.std([1,2,3], ddof = 1)

1.0

In [22]:
# Rolling variance over a window of 3 rows.
# Rolling.var uses ddof=1 (sample variance) by default.

data['r_var'] = data['a'].rolling(3).var()
data

Unnamed: 0,a,b,CumSum,cumprod,cumin,cummax,r_sum,r_count_b,r_min,r_max,r_mean,r_median,r_std,r_var
0,1,10.0,1,1,1,1,,,,,,,,
1,2,,3,2,1,2,3.0,1.0,,,,1.5,,
2,3,,6,6,1,3,5.0,0.0,,,2.0,2.5,1.0,1.0
3,4,20.0,10,24,1,4,7.0,1.0,1.0,4.0,3.0,3.5,1.0,1.0
4,5,,15,120,1,5,9.0,1.0,2.0,5.0,4.0,4.5,1.0,1.0
5,6,,21,720,1,6,11.0,0.0,3.0,6.0,5.0,5.5,1.0,1.0
6,7,30.0,28,5040,1,7,13.0,1.0,4.0,7.0,6.0,6.5,1.0,1.0
7,8,,36,40320,1,8,15.0,1.0,5.0,8.0,7.0,7.5,1.0,1.0
8,9,40.0,45,362880,1,9,17.0,1.0,6.0,9.0,8.0,8.5,1.0,1.0
9,10,50.0,55,3628800,1,10,19.0,2.0,7.0,10.0,9.0,9.5,1.0,1.0


In [23]:
# Check the variance by hand for the first window [1, 2, 3].
# np.var defaults to ddof=0 (population variance), so it does not match the rolling result.

np.var([1,2,3])

0.6666666666666666

In [24]:
# With ddof=1 (sample variance) the value matches the rolling variance of 1.0.
np.var([1,2,3], ddof = 1)

1.0

Centering windows
- By default the labels are set to the right edge of the window, but a center keyword is available so the labels can be set at the center.

In [25]:
# Two integer columns (1..10 and 11..20) for the centered-window example.
df = pd.DataFrame(
    {
        'a': range(1, 11),
        'b': range(11, 21),
    }
)
df

Unnamed: 0,a,b
0,1,11
1,2,12
2,3,13
3,4,14
4,5,15
5,6,16
6,7,17
7,8,18
8,9,19
9,10,20


In [26]:
# With center=True the result is labelled at the middle of the window, so each
# row sums the previous row, the current row and the next row
# (e.g. row 1 of 'a' is 1 + 2 + 3 = 6). The first and last rows are NaN
# because they have no neighbour on one side.
df.rolling(3, center = True).sum()

Unnamed: 0,a,b
0,,
1,6.0,36.0
2,9.0,39.0
3,12.0,42.0
4,15.0,45.0
5,18.0,48.0
6,21.0,51.0
7,24.0,54.0
8,27.0,57.0
9,,


Rolling window endpoints

The inclusion of the interval endpoints in rolling window calculations can be specified with the closed parameter:

- 'right' - close right endpoint

- 'left' - close left endpoint

- 'both' - close both endpoints

- 'neither' - open endpoints

For example, having the right endpoint open is useful in many problems that require that there is no contamination from present information back to past information. This allows the rolling window to compute statistics “up to that point in time”, but not including that point in time.

In [27]:
# Six observations at one-second spacing with a gap at 09:00:05, used to show
# how the `closed` option changes time-based windows.
df = pd.DataFrame(
    {"x": range(1, 7)},
    index=[
        pd.Timestamp(f"20130101 09:00:0{second}")
        for second in (1, 2, 3, 4, 6, 7)
    ],
)
df

Unnamed: 0,x
2013-01-01 09:00:01,1
2013-01-01 09:00:02,2
2013-01-01 09:00:03,3
2013-01-01 09:00:04,4
2013-01-01 09:00:06,5
2013-01-01 09:00:07,6


In [28]:
# For a row at time t and a '2s' window, `closed` decides which endpoints of
# the interval between t-2s and t are included in the sum:
df['right'] = df.rolling('2s', closed = 'right').x.sum() # (t-2s, t]: includes the current row, excludes t-2s (default for offset windows)
df['left'] = df.rolling('2s', closed = 'left').x.sum() # [t-2s, t): includes t-2s, excludes the current row
df['both'] = df.rolling('2s', closed = 'both').x.sum() # [t-2s, t]: includes both endpoints
df['neither'] = df.rolling('2s', closed = 'neither').x.sum() # (t-2s, t): excludes both endpoints
df

Unnamed: 0,x,right,left,both,neither
2013-01-01 09:00:01,1,1.0,,1.0,
2013-01-01 09:00:02,2,3.0,1.0,3.0,1.0
2013-01-01 09:00:03,3,5.0,3.0,6.0,2.0
2013-01-01 09:00:04,4,7.0,5.0,9.0,3.0
2013-01-01 09:00:06,5,5.0,4.0,9.0,
2013-01-01 09:00:07,6,11.0,5.0,11.0,5.0


Custom window rolling
- In addition to accepting an integer or offset as a window argument, rolling also accepts a BaseIndexer subclass that allows a user to define a custom method for calculating window bounds. 
- The BaseIndexer subclass will need to define a get_window_bounds method that returns a tuple of two arrays, the first being the starting indices of the windows and second being the ending indices of the windows. 
- Additionally, num_values, min_periods, center, closed and step will automatically be passed to `get_window_bounds`, and the defined method must always accept these arguments.