## Imports

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, sqrt, pow
import pyspark.sql.functions as f
from math import sqrt


## Config

In [2]:
spark = SparkSession.builder.master("local[*]").appName("app").getOrCreate()

weather_dataset_paths = [
    "out_2017.txt",
    "out_2018.txt",
    "out_2019.txt",
    "out_2020.txt"
]

stocks_dataset_path = "MS1.txt"

number_stocks_and_weather = 20



In [4]:
# spark.stop()

## Functions

In [5]:
def weather_transforms(path):
    # Read data
    df = spark.read \
        .option("header", "true") \
        .option("dateFormat", "yyyy-mm-dd") \
        .csv(path)
    # Split temperature into variables
    df = df.withColumn('TEMP', f.split(df['TMP'], ' ').getItem(0)) 
    # Drop unnecessary columns
    df = df.drop(
        "splitcount", "LOCATION", "WIND", "TMP", "DEW", "SLP", "tmp_quality"
    )

    return df

# Weather data

In [6]:
# Load and concatenate all weather data
union = weather_transforms(weather_dataset_paths[0])
for weather_dataset_path in weather_dataset_paths[1:]:
    union = union.union(weather_transforms(weather_dataset_path))

# Select stations which easy start date and then first 1000 of those
weather_df = union.filter(union.DATE == "2017-01-01") \
    .select('STATION_ID') \
    .limit(number_stocks_and_weather)

# Filter these over the complete dataset
filtered_union = weather_df.join(union, 'STATION_ID')

# Fill the easy start stations with min and max date
weather_df = weather_df \
    .withColumn("min_date", f.lit("2017-01-01").cast('date')) \
    .withColumn("max_date", f.lit("2019-12-31").cast('date'))

# Expand to add all days
weather_df = weather_df \
    .withColumn('DATE', f.explode(f.expr('sequence(min_date, max_date, interval 1 day)'))) \
    .drop("min_date", "max_date")

# Left join to make sure all dates are there (missing dates will get null)
weather_df = weather_df \
    .join(filtered_union, ["STATION_ID", "DATE"], "left") \
    .sort(['STATION_ID', 'DATE'])
weather_df = weather_df \
    .withColumn("TEMP" , weather_df.TEMP.cast('int'))
weather_df = weather_df.replace([999, 9999], None)

weather_df = weather_df.na.fill(20) # TODO

weather_df.persist()

weather_df.show()
print(f"Number of rows: {weather_df.count()}")


+-----------+----------+----+
| STATION_ID|      DATE|TEMP|
+-----------+----------+----+
|02265099999|2017-01-01|  -5|
|02265099999|2017-01-02| -87|
|02265099999|2017-01-03| -95|
|02265099999|2017-01-04| -82|
|02265099999|2017-01-05|-214|
|02265099999|2017-01-06|-269|
|02265099999|2017-01-07| -62|
|02265099999|2017-01-08| -30|
|02265099999|2017-01-09| -58|
|02265099999|2017-01-10| -25|
|02265099999|2017-01-11|   5|
|02265099999|2017-01-12|  -5|
|02265099999|2017-01-13| -19|
|02265099999|2017-01-14|-105|
|02265099999|2017-01-15|-199|
|02265099999|2017-01-16| -70|
|02265099999|2017-01-17|-217|
|02265099999|2017-01-18|  -8|
|02265099999|2017-01-19|  39|
|02265099999|2017-01-20| -30|
+-----------+----------+----+
only showing top 20 rows

Number of rows: 21900


# Stock data

In [7]:
# Load data
stocks_df = spark.read \
    .option("header", "false") \
    .csv(stocks_dataset_path)

# Rename columns and drop volume
stocks_df = stocks_df.selectExpr(
    '_c0 AS STOCK',
    '_c1 AS DATE',
    '_c2 AS PRICE',
    '_c3 AS VOLUME'
)
stocks_df = stocks_df.drop("VOLUME")

# Select 1000 stocks with appropriate years
df_stocks_selection = stocks_df \
    .filter((stocks_df.DATE.contains("2017") | stocks_df.DATE.contains("2018") | stocks_df.DATE.contains("2019")))
df_stocks_selection_filtered = df_stocks_selection \
    .filter(df_stocks_selection.DATE == "01/02/2017") \
    .select('STOCK') \
    .limit(number_stocks_and_weather)
selected_stocks = df_stocks_selection_filtered \
    .join(df_stocks_selection, 'STOCK')

# 
stocks_df = df_stocks_selection_filtered \
    .withColumn("min_date", f.lit("2017-01-01").cast('date')) \
    .withColumn("max_date", f.lit("2019-12-31").cast('date'))
stocks_df = stocks_df \
    .withColumn('DATE', f.explode(f.expr('sequence(min_date, max_date, interval 1 day)'))) \
    .drop("min_date", "max_date")

# 
modifiedDF = selected_stocks \
    .withColumn("DATE", f.to_date("DATE", "MM/dd/yyyy")) \
    .dropDuplicates(["STOCK", "DATE"])
stocks_df = stocks_df \
    .join(modifiedDF, ["Stock", "DATE"], "left") \
    .sort(['STOCK', 'DATE'])

stocks_df = stocks_df.withColumn("PRICE", stocks_df.PRICE.cast("int"))

stocks_df = stocks_df.na.fill(20) # TODO

stocks_df.persist()

stocks_df.show()
print(f"Number of rows: {stocks_df.count()}")

+--------------------+----------+-----+
|               STOCK|      DATE|PRICE|
+--------------------+----------+-----+
|12173.Asien--Aust...|2017-01-01|   20|
|12173.Asien--Aust...|2017-01-02|20201|
|12173.Asien--Aust...|2017-01-03|20499|
|12173.Asien--Aust...|2017-01-04|19780|
|12173.Asien--Aust...|2017-01-05|19550|
|12173.Asien--Aust...|2017-01-06|19999|
|12173.Asien--Aust...|2017-01-07|   20|
|12173.Asien--Aust...|2017-01-08|   20|
|12173.Asien--Aust...|2017-01-09|19600|
|12173.Asien--Aust...|2017-01-10|19825|
|12173.Asien--Aust...|2017-01-11|19215|
|12173.Asien--Aust...|2017-01-12|19501|
|12173.Asien--Aust...|2017-01-13|19397|
|12173.Asien--Aust...|2017-01-14|   20|
|12173.Asien--Aust...|2017-01-15|   20|
|12173.Asien--Aust...|2017-01-16|19329|
|12173.Asien--Aust...|2017-01-17|19674|
|12173.Asien--Aust...|2017-01-18|19900|
|12173.Asien--Aust...|2017-01-19|19850|
|12173.Asien--Aust...|2017-01-20|19808|
+--------------------+----------+-----+
only showing top 20 rows

Number of rows

# Part 2

In [7]:
"""
SQL query used:

SELECT sum(X*Y') / (sqrt(sum(X))*sqrt(sum(Y')))
FROM (
    SELECT S.DATE, S.STOCK, W.STATION_ID_1, W.STATION_ID_2, S.PRICE as X, W.Y'
    FROM stocks AS S, (
        SELECT W1.DATE, W1.STATION_ID AS STATION_ID_1, W2.STATION_ID AS STATION_ID_2, avg|max|min(W1.TEMP, W2.TEMP) as W.Y'
        FROM weather AS W1, weather AS W2,
        WHERE W1.DATE = W2.DATE
    ) AS W
    WHERE S.DATE = W.DATE
)
GROUPBY STOCK, STATION_ID_1, STATION_ID_2

"""

"\nSQL query used:\n\nSELECT sum(X*Y') / (sqrt(sum(X))*sqrt(sum(Y')))\nFROM (\n    SELECT S.DATE, S.STOCK, W.STATION_ID_1, W.STATION_ID_2, S.PRICE as X, W.Y'\n    FROM stocks AS S, (\n        SELECT W1.DATE, W1.STATION_ID AS STATION_ID_1, W2.STATION_ID AS STATION_ID_2, avg|max|min(W1.TEMP, W2.TEMP) as W.Y'\n        FROM weather AS W1, weather AS W2,\n        WHERE W1.DATE = W2.DATE\n    ) AS W\n    WHERE S.DATE = W.DATE\n)\nGROUPBY STOCK, STATION_ID_1, STATION_ID_2\n\n"

In [8]:
# agg_function = f.greatest
# agg_function = f.least
agg_function = lambda col1, col2: (col(col1) + col(col2)) / 2

Y_df = weather_df \
    .withColumnRenamed("STATION_ID", "STATION_ID_1") \
    .withColumnRenamed("TEMP", "TEMP_1") \
    .join(
        weather_df \
            .withColumnRenamed("STATION_ID", "STATION_ID_2") \
            .withColumnRenamed("TEMP", "TEMP_2"), 
        "DATE"
    )
Y_df = Y_df.withColumn("Y'", agg_function("TEMP_1", "TEMP_2"))
Y_df = Y_df.drop("TEMP_1", "TEMP_2")
Y_df.persist()
Y_df.show()
print(f"Number of rows: {Y_df.count()}")

+----------+------------+------------+------+
|      DATE|STATION_ID_1|STATION_ID_2|    Y'|
+----------+------------+------------+------+
|2017-01-01| 02265099999| 94804099999|  85.0|
|2017-01-01| 02265099999| 94677099999|  95.5|
|2017-01-01| 02265099999| 94584099999| 156.5|
|2017-01-01| 02265099999| 83650099999| 127.5|
|2017-01-01| 02265099999| 76749399999| 137.5|
|2017-01-01| 02265099999| 71889099999| -22.0|
|2017-01-01| 02265099999| 71453099999| -20.0|
|2017-01-01| 02265099999| 71450099999| -37.0|
|2017-01-01| 02265099999| 71322099999|-171.5|
|2017-01-01| 02265099999| 70333325518|  31.0|
|2017-01-01| 02265099999| 70259526559| -27.5|
|2017-01-01| 02265099999| 68487099999|  90.0|
|2017-01-01| 02265099999| 63708099999| 112.5|
|2017-01-01| 02265099999| 62398099999|  37.5|
|2017-01-01| 02265099999| 54292099999| -76.0|
|2017-01-01| 02265099999| 43278099999| 115.5|
|2017-01-01| 02265099999| 41862199999|  63.0|
|2017-01-01| 02265099999| 28552099999| -17.0|
|2017-01-01| 02265099999| 06022499

In [9]:
combined_df = stocks_df \
    .withColumnRenamed("PRICE", "X") \
    .join(Y_df, "DATE")

combined_df_grouped = combined_df \
    .groupBy("STOCK", "STATION_ID_1", "STATION_ID_2") \
    .agg(
        f.sqrt(f.sum(f.pow("X", 2))).alias("X_norm"),
        f.sqrt(f.sum(f.pow("Y'", 2))).alias("Y_norm"),
        f.sum(col("X") * col("Y'")).alias("XY")
    )

combined_df_grouped = combined_df_grouped \
    .withColumn("COSINE_SIM", col("XY") / (col("Y_norm") * col("X_norm")))
combined_df_grouped = combined_df_grouped.drop("X_norm", "Y_norm", "XY")
combined_df_grouped.persist()
combined_df_grouped.show()

print(f"Number of rows: {combined_df_grouped.count()}")

+--------------------+------------+------------+-------------------+
|               STOCK|STATION_ID_1|STATION_ID_2|         COSINE_SIM|
+--------------------+------------+------------+-------------------+
|42814.Nordamerika...| 02265099999| 94677099999| 0.8642378548973022|
|4438.Asien--Austr...| 02265099999| 76749399999| 0.7725067131938536|
|43297.Nordamerika...| 02265099999| 43278099999| 0.8487977948316912|
|13784.Europa_Deut...| 06022499999| 94677099999| 0.5193424493119109|
|28549.Futures--In...| 28552099999| 94804099999| 0.7120823821302918|
|32843.Nordamerika...| 28552099999| 62398099999| 0.6929047123707867|
|31474.Nordamerika...| 28552099999| 54292099999|0.11055468034407347|
|41574.Nordamerika...| 41862199999| 71450099999|0.45457949948720755|
|34271.Nordamerika...| 43278099999| 94584099999| 0.5726049863036856|
|13784.Europa_Deut...| 43278099999| 71453099999| 0.4919507413036301|
|38565.Nordamerika...| 62398099999| 71450099999| 0.6591657325739723|
|37239.Nordamerika...| 68487099999

In [10]:
print(weather_df.rdd.getNumPartitions())
print(stocks_df.rdd.getNumPartitions())
print(Y_df.rdd.getNumPartitions())
print(combined_df_grouped.rdd.getNumPartitions())

200
200
200
200


In [11]:
print(combined_df_grouped.agg({"COSINE_SIM": "min"}).collect()[0])
print(combined_df_grouped.agg({"COSINE_SIM": "max"}).collect()[0])

Row(min(COSINE_SIM)=-0.45379823228059263)
Row(max(COSINE_SIM)=0.992982338961731)


## Part 3

In [14]:
# Data nog handmatig verplaatsen
# weather_df = weather_df \
#     .withColumn("DATE" , weather_df.DATE.cast('string'))
# weather_df.repartition(1).write.csv("weather_processed.csv")
# stocks_df = stocks_df \
#     .withColumn("DATE" , stocks_df.DATE.cast('string'))
# stocks_df.repartition(1).write.csv("stocks_processed")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, sqrt, pow
import pyspark.sql.functions as f
from math import sqrt
spark = SparkSession.builder.master("local[*]").appName("app").getOrCreate()

In [40]:
def Y_agg_func(vec1, vec2, mode):
    if mode == "max":
        return [max(vec1[i], vec2[i]) for i in range(len(vec1))]
    elif mode == "min":
        return [min(vec1[i], vec2[i]) for i in range(len(vec1))]
    elif mode == "avg":
        return [(vec1[i] + vec2[i])/2 for i in range(len(vec1))]

def cosine_sim(vec1, vec2):
    norm1 = sqrt(sum([x**2 for x in vec1]))
    norm2 = sqrt(sum([x**2 for x in vec2]))
    dot   = sum([vec1[i]*vec2[i] for i in range(len(vec1))])
    return dot/(norm1*norm2)

In [56]:
weather_rdd = spark.sparkContext.textFile("weather_processed.csv")
# weather_rdd = weather_rdd.repartition(64)
weather_rdd_pivot = weather_rdd \
    .map(lambda x: (x.split(",")[0], [x.split(",")[1], x.split(",")[2]])) \
    .groupByKey() \
    .mapValues(list) \
    .sortBy(lambda x: x[1][0]) \
    .map(lambda x: (x[0], [int(y[1]) for y in x[1]])) 

stocks_rdd = spark.sparkContext.textFile("stocks_processed.csv")
# stocks_rdd = stocks_rdd.repartition(64)
stocks_rdd_pivot = stocks_rdd \
    .map(lambda x: (x.split(",")[0], [x.split(",")[1], x.split(",")[2]])) \
    .groupByKey() \
    .mapValues(list) \
    .sortBy(lambda x: x[1][0]) \
    .map(lambda x: (x[0], [int(y[1]) for y in x[1]])) 

weather_rdd_cartesian = weather_rdd_pivot.cartesian(weather_rdd_pivot)
weather_rdd_cartesian = weather_rdd_cartesian \
    .filter(lambda x: x[0][0] != x[1][0]) \
    .map(lambda x: ((x[0][0], x[1][0]), Y_agg_func(x[0][1], x[1][1], "avg")))


final_rdd_cartesian = stocks_rdd_pivot.cartesian(weather_rdd_cartesian)
final_rdd_cartesian = final_rdd_cartesian \
    .map(lambda x: ((x[0][0], x[1][0][0], x[1][0][1]), cosine_sim(x[0][1], x[1][1])))

output = final_rdd_cartesian.collect()
print(len(output))
output
# 1 minute

7600


[(('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '28552099999'),
  0.07376060399181888),
 (('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '71322099999'),
  -0.1420166006893375),
 (('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '71453099999'),
  0.34289612768302047),
 (('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '71889099999'),
  0.46920612216401986),
 (('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '02265099999'),
  0.15228133001629388),
 (('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '70259526559'),
  0.34686201167678604),
 (('12173.Asien--Australien--Südamerika--Afrika_Südafrika_The-SPAR-Group-Ltd._02602Z',
   '54292099999',
   '714