In [12]:
# Build (or reuse) the SparkSession for this notebook. It targets the Spark
# master at spark-master:7077 and points the MongoDB connector's input and
# output URIs at the Stocks.Source namespace on the rs0 replica set.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("pyspark-notebook2")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1g")
    .config(
        "spark.mongodb.input.uri",
        "mongodb://mongo1:27017,mongo2:27018,mongo3:27019/Stocks.Source?replicaSet=rs0",
    )
    .config(
        "spark.mongodb.output.uri",
        "mongodb://mongo1:27017,mongo2:27018,mongo3:27019/Stocks.Source?replicaSet=rs0",
    )
    # Pull in the MongoDB Spark connector package when the session starts.
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0",
    )
    .getOrCreate()
)

In [9]:
# Read the MongoDB data into a DataFrame through the "mongo" data source.
# No URI is passed here; presumably the connector falls back to the
# spark.mongodb.input.uri set on the session -- confirm against connector docs.
df = (
    spark.read
    .format("mongo")
    .load()
)
# Sanity check: print the schema to confirm the data was loaded.
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_symbol: string (nullable = true)
 |-- price: double (nullable = true)
 |-- tx_time: string (nullable = true)



In [13]:
# The schema shows tx_time loaded as a string. Cast it to a timestamp so it is
# handled as a point in time rather than as text.
df = df.withColumn(
    "tx_time",
    df["tx_time"].cast("timestamp"),
)

In [14]:
# Next, add a 'movingAverage' column holding a centered moving average of price:
# within each company_symbol, every row is averaged with the row before it and
# the row after it (only the neighbours that exist at the partition edges).
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# A ROWS frame is only well-defined on an ordered window. Without an orderBy,
# which rows count as "previous" and "next" depends on the order Spark happens
# to read the data in, so the result can change between runs. Ordering by
# tx_time makes the neighbours the adjacent ticks in time.
price_window = (
    Window.partitionBy("company_symbol")
    .orderBy("tx_time")
    .rowsBetween(-1, 1)
)

movAvg = df.withColumn("movingAverage", F.avg("price").over(price_window))


In [15]:
# Print the leading rows of movAvg as a text table so the new movingAverage
# column can be inspected next to price and tx_time.
movAvg.show()

+--------------------+--------------------+--------------+-----+-------------------+------------------+
|                 _id|        company_name|company_symbol|price|            tx_time|     movingAverage|
+--------------------+--------------------+--------------+-----+-------------------+------------------+
|{5f527ac22f6a1552...|ITCHY ACRE CORPOR...|           IAC|43.38|2020-09-04 13:34:58|43.385000000000005|
|{5f527ac32f6a1552...|ITCHY ACRE CORPOR...|           IAC|43.39|2020-09-04 13:34:59| 43.39666666666667|
|{5f527ac42f6a1552...|ITCHY ACRE CORPOR...|           IAC|43.42|2020-09-04 13:35:00|43.419999999999995|
|{5f527ac52f6a1552...|ITCHY ACRE CORPOR...|           IAC|43.45|2020-09-04 13:35:01|43.443333333333335|
|{5f527ac62f6a1552...|ITCHY ACRE CORPOR...|           IAC|43.46|2020-09-04 13:35:02|             43.46|
|{5f527ac72f6a1552...|ITCHY ACRE CORPOR...|           IAC|43.47|2020-09-04 13:35:03| 43.47666666666667|
|{5f527ac82f6a1552...|ITCHY ACRE CORPOR...|           IAC| 43.5|

In [16]:
# Persist movAvg (including the movingAverage column) back to MongoDB through
# the "mongo" data source, in append mode with replaceDocument enabled.
# NOTE(review): presumably this writes to the spark.mongodb.output.uri
# namespace and replaces documents that match on _id -- confirm against the
# connector documentation.
(
    movAvg.write
    .format("mongo")
    .option("replaceDocument", "true")
    .mode("append")
    .save()
)

In [10]:
# The MongoDB Aggregation Framework can pre-filter, sort or aggregate the data
# before it is loaded. This pipeline groups by company_name, keeps the maximum
# price per company, and sorts the groups by that maximum, highest first.
pipeline = "[{'$group': {_id:'$company_name', 'maxprice': {$max:'$price'}}},{$sort:{'maxprice':-1}}]"

aggPipelineDF = (
    spark.read
    .format("mongo")
    .option("pipeline", pipeline)
    .option("partitioner", "MongoSinglePartitioner")
    .load()
)

aggPipelineDF.show()

+--------------------+--------+
|                 _id|maxprice|
+--------------------+--------+
|FRUSTRATING CHAOS...|    87.6|
|HOMELY KIOSK UNLI...|   86.48|
| CREEPY GIT HOLDINGS|    83.4|
|GREASY CHAMPION C...|   81.76|
|COMBATIVE TOWNSHI...|   72.18|
|FROTHY MIDNIGHT P...|   66.81|
|ITCHY ACRE CORPOR...|   44.42|
|LACKADAISICAL SAV...|   42.34|
|CORNY PRACTITIONE...|   38.55|
|TRITE JACKFRUIT P...|   22.62|
+--------------------+--------+



In [6]:
# Stop the SparkSession now that the notebook's work is done.
spark.stop()