[Reference](https://medium.com/swlh/window-statistic-a-new-pair-of-hands-with-python-pandas-sql-py-spark-1096070d88d3)

# Python

In [4]:
# demo input: the integers 0..9
sequence = list(range(10))

# window bounds, given as offsets relative to the current position
# (both offsets are included in the window)
start = -2
stop = 1

length = len(sequence)
result = []

for i in range(length):
    # slide the window to position i, clipping it at both ends of the sequence
    lower = max(i + start, 0)
    upper = min(i + stop + 1, length)

    # mean of the elements currently covered by the window
    chunk = sequence[lower:upper]
    result.append(sum(chunk) / len(chunk))

print(sequence)  # input values
print(result)  # windowed means, one per input position

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0.5, 1.0, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.0]


In [5]:
# demo input: the integers 0..9
sequence = list(range(10))

def window(sequence, start, stop):
    """Yield, for every position in ``sequence``, the slice covered by a sliding window.

    For position ``i`` the window spans indices ``i + start`` through
    ``i + stop``, both inclusive, clipped to the bounds of ``sequence``.

    Args:
        sequence: Sliceable sequence supporting ``len()`` (e.g. a list).
        start: Offset of the first window element relative to ``i``
            (negative values look backwards).
        stop: Offset of the last window element relative to ``i``.

    Yields:
        One slice of ``sequence`` per position, in order. The slice is empty
        when the window lies entirely outside the sequence or when
        ``start > stop``.
    """
    length = len(sequence)

    for i in range(length):
        window_start = max(i + start, 0)
        # Clamp the upper bound at 0 as well: a negative slice bound would be
        # interpreted as counting from the end and return unrelated elements.
        window_stop = max(min(i + stop + 1, length), 0)

        yield sequence[window_start:window_stop]

# build the generator: each item it yields is the window slice around one
# position of sequence (offsets -2 .. +1)
window_iterator = window(sequence, -2, 1)


def avg(some_list):
    """Return the arithmetic mean of some_list (ZeroDivisionError if it is empty)."""
    return sum(some_list) / len(some_list)


print(sequence)  # input values
print([avg(chunk) for chunk in window_iterator])  # mean of every window

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0.5, 1.0, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.0]


In [6]:
# imported here so the cell also runs on a fresh kernel
import pandas as pd

# make a sample DataFrame with a single 'data' column
df = pd.DataFrame([3, 2, 5, 1, 6], columns=['data'])

# centered rolling mean over 3 rows, stored in a new column; rows without a
# full 3-row window (here the first and last) get NaN because min_periods=3
df['mean'] = df['data'].rolling(window=3, center=True, min_periods=3).mean()

# SQL

```sql
-- Average usage per user. The frame spans the whole partition, so every row
-- of a user receives the same value and ORDER BY does not affect the result.
SELECT
    usage,
    AVG(usage) OVER (
        PARTITION BY user
        ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS still_total_average
FROM table_name;
```

```sql
-- Trailing moving average: for each row (ordered by date) average the current
-- row and the two rows before it.
-- The alias starts with a digit, so it must be a quoted identifier; unquoted
-- it is a syntax error in standard SQL.
SELECT
    usage,
    AVG(usage) OVER (
        ORDER BY date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS "3_day_moving_average"
FROM table_name;
```

# Apache Spark

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import mean

# get (or create) a Spark session and load the JSON log into a DataFrame
spark = (
    SparkSession.builder
    .appName('Some_Name')
    .getOrCreate()
)
path = "data/some_log.json"
some_log = spark.read.json(path)

# window of size 3: the two preceding rows plus the current row,
# with rows ordered by 'column_name'
specified_window = Window.orderBy('column_name').rowsBetween(-2, Window.currentRow)

# mean of 'data' over that window, shown as a new column named '3_day_mean'
three_day_mean = mean('data').over(specified_window)
some_log.withColumn('3_day_mean', three_day_mean).show()
```