In [1]:
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as f

In [2]:
# Create (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("cas2")
    .getOrCreate()
)
# Leave the session as the cell's last expression so the notebook displays it.
spark

## Weather Data

In [3]:
# SparkContext that belongs to the `spark` session.
sc = spark.sparkContext

# Reminder only: the weather text files must start with exactly this header line.
# (The bare string below is an expression statement and has no effect at runtime.)
"STATION_ID,DATE,LOCATION,WIND,TMP,DEW,SLP"

# One weather export per year, located next to the notebook.
weather_path_2017, weather_path_2018, weather_path_2019, weather_path_2020 = (
    'out_2017.txt',
    'out_2018.txt',
    'out_2019.txt',
    'out_2020.txt',
)

In [15]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType,DateType

# Names given to the five space-separated parts of the WIND column, in order.
WIND_FIELDS = ("wind_angle", "wind_qualityAngle", "wind_type", "wind_speed", "wind_qualitySpeed")
# Columns holding a "<value> <quality>" pair, with the prefix used for the two derived columns.
VALUE_QUALITY_FIELDS = (("TMP", "tmp"), ("DEW", "dew"), ("SLP", "slp"))
# Number of stations kept per year.
STATIONS_PER_YEAR = 1000


def read_weather(path):
    """Read one weather CSV file (with a header row) into a DataFrame.

    No schema is supplied, so every column -- DATE included -- is read as a string.
    """
    return (
        spark.read
        .option("header", "true")
        # "MM" is month-of-year; the previously used "mm" means minute-of-hour.
        .option("dateFormat", "yyyy-MM-dd")
        .csv(path)
    )


def parse_weather(raw):
    """Split the packed WIND/TMP/DEW/SLP columns into one column per part.

    Rows whose WIND value does not split into exactly five space-separated
    parts are dropped. The packed source columns are removed from the result.
    """
    wind_parts = f.split(f.col("WIND"), " ")
    parsed = raw.filter(f.size(wind_parts) == 5)
    for position, name in enumerate(WIND_FIELDS):
        parsed = parsed.withColumn(name, wind_parts.getItem(position))
    for source, prefix in VALUE_QUALITY_FIELDS:
        parts = f.split(f.col(source), " ")
        parsed = (
            parsed
            .withColumn(f"{prefix}_val", parts.getItem(0))
            .withColumn(f"{prefix}_quality", parts.getItem(1))
        )
    return parsed.drop("WIND", "TMP", "DEW", "SLP")


def select_start_stations(weather, year, limit=STATIONS_PER_YEAR):
    """Return up to `limit` STATION_ID rows that have an observation on January 1st of `year`.

    NOTE(review): limit() without an ordering does not pin down which rows are
    returned, and this lazy selection is re-evaluated by every later action and
    join -- confirm the same stations are used throughout, or order/cache it.
    """
    return (
        weather
        .filter(f.col("DATE") == f"{year}-01-01")
        .select("STATION_ID")
        .limit(limit)
    )


def expand_to_daily(stations, observations, year):
    """Build one row per station and calendar day of `year`, left-joined with the observations.

    Days without a matching observation keep null in the observation columns.

    NOTE(review): DATE is a string in `observations` but a date in the generated
    calendar; the join relies on Spark's implicit cast -- TODO confirm rows match.
    """
    calendar = (
        stations
        .withColumn("min_date", f.lit(f"{year}-01-01").cast("date"))
        .withColumn("max_date", f.lit(f"{year}-12-31").cast("date"))
        .withColumn("DATE", f.explode(f.expr("sequence(min_date, max_date, interval 1 day)")))
        .drop("min_date", "max_date")
    )
    return calendar.join(observations, ["STATION_ID", "DATE"], "left")


# Load and parse each year. printSchema() prints by itself and returns None,
# so it is called directly instead of being wrapped in print().
weather_raw_2017 = read_weather(weather_path_2017)
weather_raw_2017.printSchema()
df_2017 = parse_weather(weather_raw_2017)
df_2018 = parse_weather(read_weather(weather_path_2018))
df_2019 = parse_weather(read_weather(weather_path_2019))

# Select stations that have the easy start date (January 1st), then keep the first 1000 of those.
df_2017_selection = select_start_stations(df_2017, 2017)
df_2018_selection = select_start_stations(df_2018, 2018)
df_2019_selection = select_start_stations(df_2019, 2019)
print(df_2017_selection.count(), df_2018_selection.count(), df_2019_selection.count())

# Restrict the complete dataset to the selected stations.
filtered_2017 = df_2017_selection.join(df_2017, 'STATION_ID')
filtered_2018 = df_2018_selection.join(df_2018, 'STATION_ID')
filtered_2019 = df_2019_selection.join(df_2019, 'STATION_ID')

# Expand every selected station to all days of its year and left join the
# observations, so that missing dates are present with null values.
df_2017_selection = expand_to_daily(df_2017_selection, filtered_2017, 2017)
df_2018_selection = expand_to_daily(df_2018_selection, filtered_2018, 2018)
df_2019_selection = expand_to_daily(df_2019_selection, filtered_2019, 2019)
print(df_2017_selection.count(), df_2018_selection.count(), df_2019_selection.count())

root
 |-- STATION_ID: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- WIND: string (nullable = true)
 |-- TMP: string (nullable = true)
 |-- DEW: string (nullable = true)
 |-- SLP: string (nullable = true)

None
1000 1000 1000
365000 365000 365000


In [None]:
# Disabled experiment: split DATE into separate year / month / day columns.
# NOTE(review): `df_good_cols` is not defined in the visible part of this notebook;
# define it (or switch to an existing frame) before re-enabling this cell.
# df_date_split = df_good_cols.withColumn('year', f.split(df_good_cols['DATE'], '-').getItem(0)).withColumn('month', f.split(df_good_cols['DATE'], '-').getItem(1)).withColumn('day', f.split(df_good_cols['DATE'], '-').getItem(2))
# df_date_split = df_date_split.drop("DATE")
# df_date_split.show()

In [None]:
# Disabled experiment: average of the `month` column of `df_date_split`.
# It needs `df_date_split`, which is only produced by code that is itself commented out.
# from pyspark.sql.functions import col, avg
# df_date_split.select(avg('month')).collect()

## Stocks data

In [18]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType,DateType
stocks_file = "Stocks.txt"

# The stocks file has no header row, so the positional columns are named explicitly.
df_stocks = (
    spark.read
    .option("header", "false")
    .csv(stocks_file)
    .selectExpr(
        '_c0 AS Stock',
        '_c1 AS Date',
        '_c2 AS Price',
        '_c3 AS Volume',
    )
)

# Keep only rows whose Date text mentions one of the years 2017-2019.
df_stocks_selection = df_stocks.filter(
    df_stocks.Date.contains("2017")
    | df_stocks.Date.contains("2018")
    | df_stocks.Date.contains("2019")
)
# show() prints the table itself and returns None, so it is not wrapped in print().
df_stocks_selection.show()

# Stocks that have a row on the first date present in the data.
# The captured output of this notebook starts at 01/02/2017 and skips weekends
# (01/06 is followed by 01/09), so the previous literal "01/01/2017" (a Sunday)
# selected nothing for that data.
# NOTE(review): confirm 01/02/2017 is the first date for all stocks of interest.
stocks_start_date = "01/02/2017"
df_stocks_selection_filtered = (
    df_stocks_selection
    .filter(df_stocks_selection.Date == stocks_start_date)
    .select('Stock')
    .limit(1000)
)
df_stocks_selection_filtered.show()

# Disabled follow-up steps (join back to the full data, expand to all days, cast types):
# selected_stocks = df_stocks_selection_filtered.join(df_stocks_selection, 'Stock')
# print(selected_stocks.show(1200))
#
# stocks_dates = df_stocks_selection_filtered.withColumn("min_date", f.lit("2017-01-01").cast('date')).withColumn("max_date", f.lit("2017-12-31").cast('date'))
# stocks_dates = stocks_dates.withColumn('date', f.explode(f.expr('sequence(min_date, max_date, interval 1 day)'))).drop("min_date", "max_date")
# print(stocks_dates.show(1200))

# print(df_stocks_selection.show())


# df_stocks = df_stocks.withColumn("Price",df_stocks.Price.cast('float'))
# df_stocks = df_stocks.withColumn("Volume",df_stocks.Volume.cast('int'))

# df_stocks.show()
# df_stocks.printSchema()

+--------------------+----------+-------+------+
|               Stock|      Date|  Price|Volume|
+--------------------+----------+-------+------+
|32843.Nordamerika...|01/02/2017|  36.62| 29618|
|32843.Nordamerika...|01/03/2017|  37.03| 46929|
|32843.Nordamerika...|01/04/2017|  36.34| 47533|
|32843.Nordamerika...|01/05/2017|  36.49| 29710|
|32843.Nordamerika...|01/06/2017|  36.16| 56468|
|32843.Nordamerika...|01/09/2017|  34.75| 50080|
|32843.Nordamerika...|01/10/2017|  34.57| 34604|
|32843.Nordamerika...|01/11/2017|  34.74| 41340|
|32843.Nordamerika...|01/12/2017|  35.55| 24844|
|32843.Nordamerika...|01/13/2017|   35.3| 39395|
|32843.Nordamerika...|01/16/2017|   35.3| 39395|
|32843.Nordamerika...|01/17/2017|  35.24| 19939|
|32843.Nordamerika...|01/18/2017|33.2543| 30599|
|32843.Nordamerika...|01/19/2017|  33.17| 20668|
|32843.Nordamerika...|01/20/2017|  33.27| 29281|
|32843.Nordamerika...|01/23/2017|   33.5| 42937|
|32843.Nordamerika...|01/24/2017|  33.55| 38238|
|32843.Nordamerika..

In [None]:
# Date is written as month/day/year; split it once and expose each part as an int column.
date_parts = f.split(f.col('Date'), '/')
df_stocks_dates = (
    df_stocks
    .withColumn('month', date_parts.getItem(0).cast('int'))
    .withColumn('day', date_parts.getItem(1).cast('int'))
    .withColumn('year', date_parts.getItem(2).cast('int'))
)
# The original Date column is kept on purpose (dropping it is disabled):
# df_stocks_dates = df_stocks_dates.drop("Date")
df_stocks_dates.show()


In [None]:
# Print the column names and types of the stocks frame, which now also carries
# the month / day / year columns cast to int.
df_stocks_dates.printSchema()

## SparkSQL Test

In [None]:
stock1 = "32843.Nordamerika_USA-NASDAQ_CRA-International-Inc._CRAI"
stock2 = "41574.Nordamerika_USA-OTC_Polydex-Pharmaceuticals_POLXF"
stock3 = "23349.Europa_Schweden_BlackPearl-Resources-Inc.-Reg.-Shares-SDRs-1-o._02155Z"


def register_stock_view(stock_name, view_name, rows=10):
    """Expose up to `rows` rows of one stock as a temporary SQL view and return them.

    The stock name is compared as a column value instead of being pasted into a
    SQL string, so names containing quotes cannot break the filter.
    createOrReplaceTempView replaces the deprecated registerTempTable and makes
    the cell safe to re-run.
    """
    subset = df_stocks_dates.filter(f.col("Stock") == stock_name).limit(rows)
    subset.createOrReplaceTempView(view_name)
    return subset


# These names now hold the DataFrames; with registerTempTable (which returns
# None) they were always None.
df_stock_1 = register_stock_view(stock1, "stock1")
df_stock_2 = register_stock_view(stock2, "stock2")
df_stock_3 = register_stock_view(stock3, "stock3")

# spark.sql("CACHE TABLE testCache1 OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM stock1")
# spark.sql("CACHE TABLE testCache2 OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM stock2")
# spark.sql("CACHE TABLE testCache3 OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM stock3")

for view_name in ("stock1", "stock2", "stock3"):
    spark.sql(f"CACHE TABLE {view_name}")

# Combine the three volumes into one view. OR REPLACE lets the cell be re-run
# without an "already exists" error. The statement returns no rows, so there is
# nothing to show() here.
# NOTE(review): stock2 is joined on `day` (day of month) while stock3 is joined
# on the full `Date` -- confirm whether stock2 should also join on Date.
spark.sql("""
    CREATE OR REPLACE TEMP VIEW COLS3 AS
    SELECT stock1.volume X, stock2.volume Y1, stock3.volume Y2
    FROM stock1
    INNER JOIN stock2 ON stock1.day = stock2.day
    INNER JOIN stock3 ON stock1.Date = stock3.Date
""")




In [None]:
# Show the joined volumes exactly as stored in the COLS3 view.
cols3_query = "SELECT * FROM COLS3"
spark.sql(cols3_query).show()

# Add Y_star: 'ONE' when Y1 >= Y2, 'TWO' when Y2 > Y1. The CASE has no ELSE,
# so a row matching neither condition gets NULL.
y_star_query = "SELECT *, CASE WHEN Y1 >= Y2 THEN 'ONE' WHEN Y2 > Y1 THEN 'TWO' END AS Y_star FROM COLS3"
spark.sql(y_star_query).show()