In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as f



In [2]:
spark = SparkSession.builder.master("local[*]").appName("cas2").getOrCreate()
spark

## Weather Data

In [3]:
sc = spark.sparkContext 

#!!! MAKE SURE THAT YOUR TEXTFILE HEADER IS THIS EXACT STRING: !!!
"STATION_ID,DATE,LOCATION,WIND,TMP,DEW,SLP"

weather_path = 'out_2020_header_changed.txt'


In [4]:
df1 = spark.read.option("header", "true").csv(weather_path)

# Apply transformations and filter abnormal wind data columns
df_wind_data_counts = df1.withColumn("splitcount", f.size(f.split(df1.WIND, " ")))
# Filter out columns which do not have 5 elements in the wind information
df_filtered_wind_data_counts = df_wind_data_counts.filter("splitcount == 5")
wind_split =df_filtered_wind_data_counts.select(f.split(df_filtered_wind_data_counts.WIND," ")).rdd.flatMap(lambda x: x).toDF(schema=["wind_angle", "wind_qualityAngle", "wind_type", "wind_speed", "wind_qualitySpeed"])
dfnew = df_filtered_wind_data_counts.join(wind_split)
dfnew.show()

+-----------+----------+-----------------+--------------+-------+-------+-------+----------+----------+-----------------+---------+----------+-----------------+
| STATION_ID|      DATE|         LOCATION|          WIND|    TMP|    DEW|    SLP|splitcount|wind_angle|wind_qualityAngle|wind_type|wind_speed|wind_qualitySpeed|
+-----------+----------+-----------------+--------------+-------+-------+-------+----------+----------+-----------------+---------+----------+-----------------+
|99773799999|2020-04-20|SILVER BAY  MN US|260 1 N 0088 1|+0081 1|+9999 9|09973 1|         5|       260|                1|        N|      0088|                1|
|99773799999|2020-04-20|SILVER BAY  MN US|260 1 N 0088 1|+0081 1|+9999 9|09973 1|         5|       330|                1|        N|      0082|                1|
|99773799999|2020-04-20|SILVER BAY  MN US|260 1 N 0088 1|+0081 1|+9999 9|09973 1|         5|       170|                1|        N|      0026|                1|
|99773799999|2020-04-20|SILVER BAY

In [5]:
#Split other columns into multiples
dfnewer = dfnew.withColumn('tmp_val', f.split(dfnew['TMP'], ' ').getItem(0)).withColumn('tmp_quality', f.split(dfnew['TMP'], ' ').getItem(1))
dfnewer = dfnewer.withColumn('dew_val', f.split(dfnewer['DEW'], ' ').getItem(0)).withColumn('dew_quality', f.split(dfnewer['DEW'], ' ').getItem(1))
dfnewer = dfnewer.withColumn('slp_val', f.split(dfnewer['SLP'], ' ').getItem(0)).withColumn('slp_quality', f.split(dfnewer['SLP'], ' ').getItem(1))
dfnewer.show()

+-----------+----------+-----------------+--------------+-------+-------+-------+----------+----------+-----------------+---------+----------+-----------------+-------+-----------+-------+-----------+-------+-----------+
| STATION_ID|      DATE|         LOCATION|          WIND|    TMP|    DEW|    SLP|splitcount|wind_angle|wind_qualityAngle|wind_type|wind_speed|wind_qualitySpeed|tmp_val|tmp_quality|dew_val|dew_quality|slp_val|slp_quality|
+-----------+----------+-----------------+--------------+-------+-------+-------+----------+----------+-----------------+---------+----------+-----------------+-------+-----------+-------+-----------+-------+-----------+
|99773799999|2020-04-20|SILVER BAY  MN US|260 1 N 0088 1|+0081 1|+9999 9|09973 1|         5|       260|                1|        N|      0088|                1|  +0081|          1|  +9999|          9|  09973|          1|
|99773799999|2020-04-20|SILVER BAY  MN US|260 1 N 0088 1|+0081 1|+9999 9|09973 1|         5|       330|             

In [6]:
df_good_cols = dfnewer.drop("splitcount", "WIND", "TMP", "DEW", "SLP")
df_good_cols.show()

+-----------+----------+-----------------+----------+-----------------+---------+----------+-----------------+-------+-----------+-------+-----------+-------+-----------+
| STATION_ID|      DATE|         LOCATION|wind_angle|wind_qualityAngle|wind_type|wind_speed|wind_qualitySpeed|tmp_val|tmp_quality|dew_val|dew_quality|slp_val|slp_quality|
+-----------+----------+-----------------+----------+-----------------+---------+----------+-----------------+-------+-----------+-------+-----------+-------+-----------+
|99773799999|2020-04-20|SILVER BAY  MN US|       260|                1|        N|      0088|                1|  +0081|          1|  +9999|          9|  09973|          1|
|99773799999|2020-04-20|SILVER BAY  MN US|       330|                1|        N|      0082|                1|  +0081|          1|  +9999|          9|  09973|          1|
|99773799999|2020-04-20|SILVER BAY  MN US|       170|                1|        N|      0026|                1|  +0081|          1|  +9999|       

In [10]:
df_date_split = df_good_cols.withColumn('year', f.split(df_good_cols['DATE'], '-').getItem(0)).withColumn('month', f.split(df_good_cols['DATE'], '-').getItem(1)).withColumn('day', f.split(df_good_cols['DATE'], '-').getItem(2))
df_date_split = df_date_split.drop("DATE")
df_date_split.show()

+-----------+-----------------+----------+-----------------+---------+----------+-----------------+-------+-----------+-------+-----------+-------+-----------+----+-----+---+
| STATION_ID|         LOCATION|wind_angle|wind_qualityAngle|wind_type|wind_speed|wind_qualitySpeed|tmp_val|tmp_quality|dew_val|dew_quality|slp_val|slp_quality|year|month|day|
+-----------+-----------------+----------+-----------------+---------+----------+-----------------+-------+-----------+-------+-----------+-------+-----------+----+-----+---+
|99773799999|SILVER BAY  MN US|       260|                1|        N|      0088|                1|  +0081|          1|  +9999|          9|  09973|          1|2020|   04| 20|
|99773799999|SILVER BAY  MN US|       330|                1|        N|      0082|                1|  +0081|          1|  +9999|          9|  09973|          1|2020|   04| 20|
|99773799999|SILVER BAY  MN US|       170|                1|        N|      0026|                1|  +0081|          1|  +999

## Stocks data

In [12]:

stocks_file = "Stocks.txt"
df_stocks = spark.read.option("header", "false").csv(stocks_file)
df_stocks =  df_stocks.selectExpr(
    '_c0 AS Stock',
    '_c1 AS Date',
    '_c2 AS Price',
    '_c3 AS Volume',
)
df_stocks.show()

+--------------------+----------+-----+------+
|               Stock|      Date|Price|Volume|
+--------------------+----------+-----+------+
|32843.Nordamerika...|01/01/2016|18.98| 50635|
|32843.Nordamerika...|01/04/2016|18.52| 51616|
|32843.Nordamerika...|01/05/2016|19.15| 54898|
|32843.Nordamerika...|01/06/2016|19.71| 41555|
|32843.Nordamerika...|01/07/2016|19.17| 44430|
|32843.Nordamerika...|01/08/2016|18.94| 72673|
|32843.Nordamerika...|01/11/2016| 19.1| 45426|
|32843.Nordamerika...|01/12/2016|19.39| 61457|
|32843.Nordamerika...|01/13/2016|19.27| 61805|
|32843.Nordamerika...|01/14/2016|19.17| 35597|
|32843.Nordamerika...|01/15/2016|18.81| 69227|
|32843.Nordamerika...|01/18/2016|18.81| 69227|
|32843.Nordamerika...|01/19/2016|18.82| 23700|
|32843.Nordamerika...|01/20/2016|17.97| 41439|
|32843.Nordamerika...|01/21/2016|17.82| 35240|
|32843.Nordamerika...|01/22/2016|18.62| 79139|
|32843.Nordamerika...|01/25/2016|18.09| 89251|
|32843.Nordamerika...|01/26/2016|17.62| 75300|
|32843.Nordam

In [14]:
df_stocks_dates = df_stocks.withColumn('month', f.split(df_stocks['Date'], '/').getItem(0)).withColumn('day', f.split(df_stocks['Date'], '/').getItem(1)).withColumn('year', f.split(df_stocks['Date'], '/').getItem(2))
# df_stocks_dates = df_stocks_dates.drop("Date")
df_stocks_dates.show()

+--------------------+----------+-----+------+-----+---+----+
|               Stock|      Date|Price|Volume|month|day|year|
+--------------------+----------+-----+------+-----+---+----+
|32843.Nordamerika...|01/01/2016|18.98| 50635|   01| 01|2016|
|32843.Nordamerika...|01/04/2016|18.52| 51616|   01| 04|2016|
|32843.Nordamerika...|01/05/2016|19.15| 54898|   01| 05|2016|
|32843.Nordamerika...|01/06/2016|19.71| 41555|   01| 06|2016|
|32843.Nordamerika...|01/07/2016|19.17| 44430|   01| 07|2016|
|32843.Nordamerika...|01/08/2016|18.94| 72673|   01| 08|2016|
|32843.Nordamerika...|01/11/2016| 19.1| 45426|   01| 11|2016|
|32843.Nordamerika...|01/12/2016|19.39| 61457|   01| 12|2016|
|32843.Nordamerika...|01/13/2016|19.27| 61805|   01| 13|2016|
|32843.Nordamerika...|01/14/2016|19.17| 35597|   01| 14|2016|
|32843.Nordamerika...|01/15/2016|18.81| 69227|   01| 15|2016|
|32843.Nordamerika...|01/18/2016|18.81| 69227|   01| 18|2016|
|32843.Nordamerika...|01/19/2016|18.82| 23700|   01| 19|2016|
|32843.N