#### Moving averages on time series data in PySpark, a technique commonly used in fintech projects


What Is a Moving Average?
A moving average is a time series technique for analyzing and determining trends in data. Moving averages, sometimes called rolling means, rolling averages, or running averages, are calculated as the mean of the current value and a specified number of immediately preceding values for each point in time. The main idea is to examine how these averages behave over time instead of examining the behavior of the original (raw) data points.

In [None]:

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql import functions as f
# Spark configuration: run on the local master under the application name "My app".
conf = SparkConf().setMaster("local").setAppName("My app")

# Hand the configuration to the builder. Without config(conf=conf) the settings
# above are never applied and the session is created with defaults only.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [19]:
# Location of the input CSV file.
filePath = "C:\\Users\\ershiaa\\Downloads\\dataset_1.csv"

# Read the file through the CSV data source, using the first line as column names.
read = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(filePath)
)

# Keep only the columns needed for the moving-average example and preview them.
read_select = read.select(["sno", "Stocks", "Date_ist", "price"])
read_select.show()

+---+--------+----------+-----+
|sno|  Stocks|  Date_ist|price|
+---+--------+----------+-----+
|  1|   Apple|2013-01-10|  100|
|  2|   Apple|2013-01-11|  200|
|  3|   Apple|2013-01-12|  300|
|  4|   Apple|2013-01-13|  400|
|  5|Ericsson|2013-01-14|  500|
|  6|Ericsson|2013-01-15|  600|
|  7|Ericsson|2013-01-16|  700|
|  8|Ericsson|2013-01-17|  800|
|  9| Walmart|2013-01-18|  900|
| 10| Walmart|2013-01-19| 1000|
| 11| Walmart|2013-01-20| 1100|
+---+--------+----------+-----+



In [3]:
# Expose the selected columns to Spark SQL under the view name "dataset".
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and can be re-run without error.
read_select.createOrReplaceTempView("dataset")




In [4]:
# Three-row moving average per stock, computed with Spark SQL.
# - The frame "2 preceding .. current row" averages the current price with up to
#   two earlier prices of the same stock (fewer at the start of each partition).
# - price is loaded from CSV as a string, so it is cast to double explicitly
#   instead of relying on implicit coercion inside avg().
# - Date_ist is ordered as text, which is chronological for yyyy-MM-dd values.
avgLastThree = spark.sql("""
    select *,
           avg(cast(price as double)) over (
               partition by Stocks
               order by Date_ist
               rows between 2 preceding and current row
           ) as moving_avg
    from dataset
""")

In [5]:
# Display the SQL result: moving_avg is the mean of the current price and up to
# two preceding prices within the same stock.
avgLastThree.show()

+---+--------+----------+-----+----------+
|sno|  Stocks|  Date_ist|price|moving_avg|
+---+--------+----------+-----+----------+
|  1|   Apple|2013-01-10|  100|     100.0|
|  2|   Apple|2013-01-11|  200|     150.0|
|  3|   Apple|2013-01-12|  300|     200.0|
|  4|   Apple|2013-01-13|  400|     300.0|
|  5|Ericsson|2013-01-14|  500|     500.0|
|  6|Ericsson|2013-01-15|  600|     550.0|
|  7|Ericsson|2013-01-16|  700|     600.0|
|  8|Ericsson|2013-01-17|  800|     700.0|
|  9| Walmart|2013-01-18|  900|     900.0|
| 10| Walmart|2013-01-19| 1000|     950.0|
| 11| Walmart|2013-01-20| 1100|    1000.0|
+---+--------+----------+-----+----------+



In [27]:
# Manual check of the last Apple row: the mean of prices 200, 300 and 400
# should equal the moving_avg shown for 2013-01-13.
(200+300+400)/3

300.0

In [6]:
# Inspect the column types: the columns read from the CSV are strings, while the
# computed moving_avg column is a double.
avgLastThree.printSchema()

root
 |-- sno: string (nullable = true)
 |-- Stocks: string (nullable = true)
 |-- Date_ist: string (nullable = true)
 |-- price: string (nullable = true)
 |-- moving_avg: double (nullable = true)



In [22]:
# Window specification for the DataFrame API: one partition per stock, ordered
# by date, spanning the two preceding rows through the current row.
w = (
    Window
    .partitionBy("Stocks")
    .orderBy("Date_ist")
    .rowsBetween(-2, Window.currentRow)
)

In [23]:
# Add the moving average using the DataFrame API and the window `w`.
# price is a string column in read_select, so it is cast to double explicitly
# instead of relying on implicit coercion inside avg().
df = read_select.withColumn(
    "moving_avg",
    f.avg(f.col("price").cast("double")).over(w),
)

In [24]:
# Display the DataFrame-API result; the values should match the SQL version shown earlier.
df.show()

+---+--------+----------+-----+----------+
|sno|  Stocks|  Date_ist|price|moving_avg|
+---+--------+----------+-----+----------+
|  1|   Apple|2013-01-10|  100|     100.0|
|  2|   Apple|2013-01-11|  200|     150.0|
|  3|   Apple|2013-01-12|  300|     200.0|
|  4|   Apple|2013-01-13|  400|     300.0|
|  5|Ericsson|2013-01-14|  500|     500.0|
|  6|Ericsson|2013-01-15|  600|     550.0|
|  7|Ericsson|2013-01-16|  700|     600.0|
|  8|Ericsson|2013-01-17|  800|     700.0|
|  9| Walmart|2013-01-18|  900|     900.0|
| 10| Walmart|2013-01-19| 1000|     950.0|
| 11| Walmart|2013-01-20| 1100|    1000.0|
+---+--------+----------+-----+----------+

