In [1]:
import datetime
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col
import pyspark.sql.functions as f

In [2]:
spark = SparkSession.builder.master("local[*]").appName("cas2").getOrCreate()
spark

## Weather Data

In [3]:
sc = spark.sparkContext 

#!!! MAKE SURE THAT YOUR TEXTFILE HEADER IS THIS EXACT STRING: !!!
"STATION_ID,DATE,LOCATION,WIND,TMP,DEW,SLP"

weather_path_2017 = 'out_2017.txt'
weather_path_2018 = 'out_2018.txt'
weather_path_2019 = 'out_2019.txt'
weather_path_2020 = 'out_2020.txt'

In [19]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType,DateType
schema = StructType([StructField('STATION_ID', StringType(), False),\
                    StructField('DATE', StringType(), False),\
                    StructField('LOCATION', StringType(), False),\
                    StructField('WIND', StringType(), False),\
                    StructField('TMP', StringType(), False),\
                    StructField('DEW', StringType(), False),\
                    StructField('SLP', StringType(), False)])

df_2017 = spark.read.option("header", "true").option("dateFormat", "yyyy-mm-dd").csv(weather_path_2017)
df_2017 = df_2017.withColumn("splitcount", f.size(f.split(df_2017.WIND, " ")))
df_2017 = df_2017.filter("splitcount == 5")
df_2017 = df_2017.withColumn('wind_angle', f.split(df_2017['WIND'], ' ').getItem(0)).withColumn('wind_qualityAngle', f.split(df_2017['WIND'], ' ').getItem(1)).withColumn('wind_type', f.split(df_2017['WIND'], ' ').getItem(2)).withColumn('wind_speed', f.split(df_2017['WIND'], ' ').getItem(3)).withColumn('wind_qualitySpeed', f.split(df_2017['WIND'], ' ').getItem(4))
df_2017 = df_2017.withColumn('tmp_val', f.split(df_2017['TMP'], ' ').getItem(0)).withColumn('tmp_quality', f.split(df_2017['TMP'], ' ').getItem(1))
df_2017 = df_2017.withColumn('dew_val', f.split(df_2017['DEW'], ' ').getItem(0)).withColumn('dew_quality', f.split(df_2017['DEW'], ' ').getItem(1))
df_2017 = df_2017.withColumn('slp_val', f.split(df_2017['SLP'], ' ').getItem(0)).withColumn('slp_quality', f.split(df_2017['SLP'], ' ').getItem(1))
df_2017 = df_2017.drop("splitcount", "WIND", "TMP", "DEW", "SLP", "wind_qualityAngle", "wind_type", "wind_qualitySpeed", "tmp_quality", "dew_quality", "slp_quality")

df_2018 = spark.read.option("header", "true").option("dateFormat", "yyyy-mm-dd").csv(weather_path_2018)
df_2018 = df_2018.withColumn("splitcount", f.size(f.split(df_2018.WIND, " ")))
df_2018 = df_2018.filter("splitcount == 5")
df_2018 = df_2018.withColumn('wind_angle', f.split(df_2018['WIND'], ' ').getItem(0)).withColumn('wind_qualityAngle', f.split(df_2018['WIND'], ' ').getItem(1)).withColumn('wind_type', f.split(df_2018['WIND'], ' ').getItem(2)).withColumn('wind_speed', f.split(df_2018['WIND'], ' ').getItem(3)).withColumn('wind_qualitySpeed', f.split(df_2018['WIND'], ' ').getItem(4))
df_2018 = df_2018.withColumn('tmp_val', f.split(df_2018['TMP'], ' ').getItem(0)).withColumn('tmp_quality', f.split(df_2018['TMP'], ' ').getItem(1))
df_2018 = df_2018.withColumn('dew_val', f.split(df_2018['DEW'], ' ').getItem(0)).withColumn('dew_quality', f.split(df_2018['DEW'], ' ').getItem(1))
df_2018 = df_2018.withColumn('slp_val', f.split(df_2018['SLP'], ' ').getItem(0)).withColumn('slp_quality', f.split(df_2018['SLP'], ' ').getItem(1))
df_2018 = df_2018.drop("splitcount", "WIND", "TMP", "DEW", "SLP", "wind_qualityAngle", "wind_type", "wind_qualitySpeed", "tmp_quality", "dew_quality", "slp_quality")

df_2019 = spark.read.option("header", "true").option("dateFormat", "yyyy-mm-dd").csv(weather_path_2019)
df_2019 = df_2019.withColumn("splitcount", f.size(f.split(df_2019.WIND, " ")))
df_2019 = df_2019.filter("splitcount == 5")
df_2019 = df_2019.withColumn('wind_angle', f.split(df_2019['WIND'], ' ').getItem(0)).withColumn('wind_qualityAngle', f.split(df_2019['WIND'], ' ').getItem(1)).withColumn('wind_type', f.split(df_2019['WIND'], ' ').getItem(2)).withColumn('wind_speed', f.split(df_2019['WIND'], ' ').getItem(3)).withColumn('wind_qualitySpeed', f.split(df_2019['WIND'], ' ').getItem(4))
df_2019 = df_2019.withColumn('tmp_val', f.split(df_2019['TMP'], ' ').getItem(0)).withColumn('tmp_quality', f.split(df_2019['TMP'], ' ').getItem(1))
df_2019 = df_2019.withColumn('dew_val', f.split(df_2019['DEW'], ' ').getItem(0)).withColumn('dew_quality', f.split(df_2019['DEW'], ' ').getItem(1))
df_2019 = df_2019.withColumn('slp_val', f.split(df_2019['SLP'], ' ').getItem(0)).withColumn('slp_quality', f.split(df_2019['SLP'], ' ').getItem(1))
df_2019 = df_2019.drop("splitcount", "WIND", "TMP", "DEW", "SLP", "wind_qualityAngle", "wind_type", "wind_qualitySpeed", "tmp_quality", "dew_quality", "slp_quality")

union = df_2017.union(df_2018).union(df_2019).sort(['STATION_ID', 'DATE'])

# select stations which easy start date and then first 1000 of those
df_union_selection = union.filter(union.DATE == "2017-01-01").select('STATION_ID').limit(1000)

# filter these over the complete dataset
filtered_union= df_union_selection.join(union, 'STATION_ID')


# fill the easy start stations with min and max date
df_union_selection = df_union_selection.withColumn("min_date", f.lit("2017-01-01").cast('date')).withColumn("max_date", f.lit("2019-12-31").cast('date'))

# expand to add all days
df_union_selection = df_union_selection.withColumn('DATE', f.explode(f.expr('sequence(min_date, max_date, interval 1 day)'))).drop("min_date", "max_date")

# left join to make sure all dates are there (missing dates will get null)
df_union_selection = df_union_selection.join(filtered_union, ["STATION_ID", "DATE"], "left").sort(['STATION_ID', 'DATE'])
df_union_selection = df_union_selection.withColumn("wind_speed", df_union_selection.wind_speed.cast('int')).withColumn("wind_angle", df_union_selection.wind_angle.cast('int')).withColumn("tmp_val", df_union_selection.tmp_val.cast('int')).withColumn("dew_val", df_union_selection.dew_val.cast('int')).withColumn("slp_val", df_union_selection.slp_val.cast('int'))
df_union_selection = df_union_selection.replace([999, 9999], None)
print(df_union_selection.show(5000))
# w_forward = Window.partitionBy().orderBy('STATION_ID').rowsBetween(Window.unboundedPreceding,Window.currentRow)
# w_backward = Window.partitionBy().orderBy('STATION_ID').rowsBetween(Window.currentRow,Window.unboundedFollowing)
# df_2017_selection = df_2017_selection.withColumn('wind_angle',f.last('wind_angle',ignorenulls=True).over(w_forward)).withColumn('wind_angle',f.first('wind_angle',ignorenulls=True).over(w_backward))
# df_2017_selection = df_2017_selection.withColumn('wind_speed',f.last('wind_speed',ignorenulls=True).over(w_forward)).withColumn('wind_speed',f.first('wind_speed',ignorenulls=True).over(w_backward))
# df_2017_selection = df_2017_selection.withColumn('tmp_val',f.last('tmp_val',ignorenulls=True).over(w_forward)).withColumn('tmp_val',f.first('tmp_val',ignorenulls=True).over(w_backward))
# df_2017_selection = df_2017_selection.withColumn('dew_val',f.last('dew_val',ignorenulls=True).over(w_forward)).withColumn('dew_val',f.first('dew_val',ignorenulls=True).over(w_backward))
# df_2017_selection = df_2017_selection.withColumn('slp_val',f.last('slp_val',ignorenulls=True).over(w_forward)).withColumn('slp_val',f.first('slp_val',ignorenulls=True).over(w_backward))
    # .withColumn('wind_speed',f.last('wind_speed',ignorenulls=True).over(w_forward))\
    # .withColumn('wind_speed',f.first('wind_speed',ignorenulls=True).over(w_backward))




print(df_union_selection.groupBy('STATION_ID').count().show())

+-----------+----------+--------------------+----------+----------+-------+-------+-------+
| STATION_ID|      DATE|            LOCATION|wind_angle|wind_speed|tmp_val|dew_val|slp_val|
+-----------+----------+--------------------+----------+----------+-------+-------+-------+
|01001099999|2017-01-01|JAN MAYEN NOR NAV...|       340|        90|    -37|    -84|  10263|
|01001099999|2017-01-02|JAN MAYEN NOR NAV...|       330|        90|    -49|    -93|  10190|
|01001099999|2017-01-03|JAN MAYEN NOR NAV...|        10|       110|    -21|    -32|   9958|
|01001099999|2017-01-04|JAN MAYEN NOR NAV...|       330|       100|    -54|    -87|  10215|
|01001099999|2017-01-05|JAN MAYEN NOR NAV...|       210|       100|     13|      4|  10083|
|01001099999|2017-01-06|JAN MAYEN NOR NAV...|       219|       106|      2|    -46|   9763|
|01001099999|2017-01-07|JAN MAYEN NOR NAV...|       308|        51|    -39|    -65|  10074|
|01001099999|2017-01-08|JAN MAYEN NOR NAV...|       280|       110|    -25|    -

In [5]:
# df_date_split = df_good_cols.withColumn('year', f.split(df_good_cols['DATE'], '-').getItem(0)).withColumn('month', f.split(df_good_cols['DATE'], '-').getItem(1)).withColumn('day', f.split(df_good_cols['DATE'], '-').getItem(2))
# df_date_split = df_date_split.drop("DATE")
# df_date_split.show()

In [6]:
# from pyspark.sql.functions import col, avg
# df_date_split.select(avg('month')).collect()

## Stocks data

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType,DateType
stocks_file = "Stocks.txt"
df_stocks = spark.read.option("header", "false").csv(stocks_file)
df_stocks =  df_stocks.selectExpr(
    '_c0 AS Stock',
    '_c1 AS Date',
    '_c2 AS Price',
    '_c3 AS Volume',
)

#
df_stocks_selection = df_stocks.filter((df_stocks.Date.contains("2017") | df_stocks.Date.contains("2018") | df_stocks.Date.contains("2019")))
df_stocks_selection_filtered = df_stocks_selection.filter(df_stocks_selection.Date == "01/02/2017").select('Stock').limit(1000)
selected_stocks = df_stocks_selection_filtered.join(df_stocks_selection, 'Stock')
#
stocks_dates = df_stocks_selection_filtered.withColumn("min_date", f.lit("2017-01-01").cast('date')).withColumn("max_date", f.lit("2019-12-31").cast('date'))
stocks_dates = stocks_dates.withColumn('Date', f.explode(f.expr('sequence(min_date, max_date, interval 1 day)'))).drop("min_date", "max_date")

modifiedDF = selected_stocks.withColumn("Date", f.to_date("Date", "MM/dd/yyyy")).dropDuplicates(["Stock", "Date"])
stocks_dates = stocks_dates.join(modifiedDF, ["Stock", "Date"], "left").sort(['Stock', 'Date'])

# w_forward = Window.partitionBy().orderBy('Stock').rowsBetween(Window.unboundedPreceding,Window.currentRow)
# w_backward = Window.partitionBy().orderBy('Stock').rowsBetween(Window.currentRow,Window.unboundedFollowing)
# stocks_dates = stocks_dates.withColumn('Price',f.last('Price',ignorenulls=True).over(w_forward)).withColumn('Price',f.first('Price',ignorenulls=True).over(w_backward))
# w_forward2 = Window.partitionBy().orderBy('Stock').rowsBetween(Window.unboundedPreceding,Window.currentRow)
# w_backward2 = Window.partitionBy().orderBy('Stock').rowsBetween(Window.currentRow,Window.unboundedFollowing)
# stocks_dates = stocks_dates.withColumn('Volume',f.last('Volume',ignorenulls=True).over(w_forward2)).withColumn('Volume',f.first('Volume',ignorenulls=True).over(w_backward2))
print(stocks_dates.show())
# df_stocks = df_stocks.withColumn("Price",df_stocks.Price.cast('float'))
# df_stocks = df_stocks.withColumn("Volume",df_stocks.Volume.cast('int'))


+--------------------+----------+-----+------+
|               Stock|      Date|Price|Volume|
+--------------------+----------+-----+------+
|100.Asien--Austra...|2017-01-01| null|  null|
|100.Asien--Austra...|2017-01-02| 10.6|  1518|
|100.Asien--Austra...|2017-01-03| 10.6|  3444|
|100.Asien--Austra...|2017-01-04|10.65|  8253|
|100.Asien--Austra...|2017-01-05|10.87| 10861|
|100.Asien--Austra...|2017-01-06|11.03|  3622|
|100.Asien--Austra...|2017-01-07| null|  null|
|100.Asien--Austra...|2017-01-08| null|  null|
|100.Asien--Austra...|2017-01-09|10.87|   150|
|100.Asien--Austra...|2017-01-10|10.87|  1387|
|100.Asien--Austra...|2017-01-11|10.87|  8785|
|100.Asien--Austra...|2017-01-12|11.03| 30199|
|100.Asien--Austra...|2017-01-13|   11|  1098|
|100.Asien--Austra...|2017-01-14| null|  null|
|100.Asien--Austra...|2017-01-15| null|  null|
|100.Asien--Austra...|2017-01-16|10.87|  8780|
|100.Asien--Austra...|2017-01-17| 10.5|  2109|
|100.Asien--Austra...|2017-01-18|10.55|  9613|
|100.Asien--A

In [8]:
df_stocks_dates = df_stocks.withColumn('month', f.split(df_stocks['Date'], '/').getItem(0).cast('int')).withColumn('day', f.split(df_stocks['Date'], '/').getItem(1).cast('int')).withColumn('year', f.split(df_stocks['Date'], '/').getItem(2).cast('int'))
# df_stocks_dates = df_stocks_dates.drop("Date")
df_stocks_dates.show()


+--------------------+----------+-----+------+-----+---+----+
|               Stock|      Date|Price|Volume|month|day|year|
+--------------------+----------+-----+------+-----+---+----+
|32843.Nordamerika...|01/01/2016|18.98| 50635|    1|  1|2016|
|32843.Nordamerika...|01/04/2016|18.52| 51616|    1|  4|2016|
|32843.Nordamerika...|01/05/2016|19.15| 54898|    1|  5|2016|
|32843.Nordamerika...|01/06/2016|19.71| 41555|    1|  6|2016|
|32843.Nordamerika...|01/07/2016|19.17| 44430|    1|  7|2016|
|32843.Nordamerika...|01/08/2016|18.94| 72673|    1|  8|2016|
|32843.Nordamerika...|01/11/2016| 19.1| 45426|    1| 11|2016|
|32843.Nordamerika...|01/12/2016|19.39| 61457|    1| 12|2016|
|32843.Nordamerika...|01/13/2016|19.27| 61805|    1| 13|2016|
|32843.Nordamerika...|01/14/2016|19.17| 35597|    1| 14|2016|
|32843.Nordamerika...|01/15/2016|18.81| 69227|    1| 15|2016|
|32843.Nordamerika...|01/18/2016|18.81| 69227|    1| 18|2016|
|32843.Nordamerika...|01/19/2016|18.82| 23700|    1| 19|2016|
|32843.N

In [9]:
df_stocks_dates.printSchema()

root
 |-- Stock: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- year: integer (nullable = true)



## SparkSQL Test

In [10]:
stock1 = "32843.Nordamerika_USA-NASDAQ_CRA-International-Inc._CRAI"
stock2 = "41574.Nordamerika_USA-OTC_Polydex-Pharmaceuticals_POLXF"
stock3 = "23349.Europa_Schweden_BlackPearl-Resources-Inc.-Reg.-Shares-SDRs-1-o._02155Z"
df_stock_1 = df_stocks_dates.filter(f"Stock = '{stock1}'").limit(10).registerTempTable("stock1")
df_stock_2 = df_stocks_dates.filter(f"Stock = '{stock2}'").limit(10).registerTempTable("stock2")
df_stock_3 = df_stocks_dates.filter(f"Stock = '{stock3}'").limit(10).registerTempTable("stock3")

# spark.sql("CACHE TABLE testCache1 OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM stock1")
# spark.sql("CACHE TABLE testCache2 OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM stock2")
# spark.sql("CACHE TABLE testCache3 OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM stock3")

spark.sql("CACHE TABLE stock1")
spark.sql("CACHE TABLE stock2")
spark.sql("CACHE TABLE stock3")


# UNCOMMENT IF FIRST TIME
spark.sql(\
  "CREATE TEMP VIEW COLS3 AS SELECT \
  stock1.volume X, stock2.volume Y1, stock3.volume Y2 FROM stock1\
  INNER JOIN stock2 ON stock1.day = stock2.day\
   INNER JOIN stock3 ON stock1.Date = stock3.Date"\
     ).show()






++
||
++
++



In [11]:
spark.sql("SELECT * FROM COLS3").show()

spark.sql("SELECT *, CASE WHEN Y1 >= Y2 THEN 'ONE' WHEN Y2 > Y1 THEN 'TWO' END AS Y_star FROM COLS3").show()

+-----+-----+------+
|    X|   Y1|    Y2|
+-----+-----+------+
|50635|12000|200784|
|51616|  400|170002|
|54898|  100| 66159|
|41555| 3200| 66159|
|44430| 4976|387422|
|72673| 4976|121141|
|45426|  100| 90265|
|61457|  100|128003|
|61805|  100| 51492|
+-----+-----+------+

+-----+-----+------+------+
|    X|   Y1|    Y2|Y_star|
+-----+-----+------+------+
|50635|12000|200784|   TWO|
|51616|  400|170002|   ONE|
|54898|  100| 66159|   TWO|
|41555| 3200| 66159|   TWO|
|44430| 4976|387422|   ONE|
|72673| 4976|121141|   ONE|
|45426|  100| 90265|   TWO|
|61457|  100|128003|   TWO|
|61805|  100| 51492|   TWO|
+-----+-----+------+------+

