In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import input_file_name, lit
from pyspark.sql import functions as F

In [40]:
#Create a SparkSession
spark = SparkSession.builder.appName("ETFs").getOrCreate()

In [41]:


existing_schema = StructType([
    StructField("Date", StringType(), True),
    StructField("Open", FloatType(), True),
    StructField("High", FloatType(), True),
    StructField("Low", FloatType(), True),
    StructField("Close", FloatType(), True),
    StructField("Adj Close", FloatType(), True),
    StructField("Volume", FloatType(), True)

])

In [42]:
input_path = "../data/stocks_etfs/A.csv"
stock_df = spark.read.csv(input_path, header=True)

In [43]:
meta_symbol = spark.read.csv("../data/symbols_valid_meta.csv", header=True)
symbol_mapping = meta_symbol.select("Symbol", "Security Name").rdd.collectAsMap()
symbol_name = input_path.split("/")[-1].split(".")[0]


In [44]:
stock_df.show(10)

+----------+------------------+------------------+------------------+------------------+------------------+--------+
|      Date|              Open|              High|               Low|             Close|         Adj Close|  Volume|
+----------+------------------+------------------+------------------+------------------+------------------+--------+
|1999-11-18| 32.54649353027344|   35.765380859375|28.612302780151367|31.473533630371094| 27.06866455078125|62546300|
|1999-11-19|30.713520050048828| 30.75822639465332| 28.47818374633789|28.880542755126953|24.838577270507812|15234100|
|1999-11-22|29.551143646240234|31.473533630371094| 28.65700912475586|31.473533630371094| 27.06866455078125| 6577800|
|1999-11-23|30.400571823120117|31.205293655395508|28.612302780151367|28.612302780151367|24.607879638671875| 5975600|
|1999-11-24|28.701717376708984|29.998210906982422|28.612302780151367|29.372318267822266|25.261524200439453| 4843200|
|1999-11-26|29.238197326660156|29.685264587402344|29.14878463745

In [45]:
stock_df = stock_df.withColumn("Symbol", F.lit(symbol_name))
stock_df = stock_df.withColumn("Security Name", F.lit(symbol_mapping.get(symbol_name)))

In [46]:
stock_df.show(10)

+----------+------------------+------------------+------------------+------------------+------------------+--------+------+--------------------+
|      Date|              Open|              High|               Low|             Close|         Adj Close|  Volume|Symbol|       Security Name|
+----------+------------------+------------------+------------------+------------------+------------------+--------+------+--------------------+
|1999-11-18| 32.54649353027344|   35.765380859375|28.612302780151367|31.473533630371094| 27.06866455078125|62546300|     A|Agilent Technolog...|
|1999-11-19|30.713520050048828| 30.75822639465332| 28.47818374633789|28.880542755126953|24.838577270507812|15234100|     A|Agilent Technolog...|
|1999-11-22|29.551143646240234|31.473533630371094| 28.65700912475586|31.473533630371094| 27.06866455078125| 6577800|     A|Agilent Technolog...|
|1999-11-23|30.400571823120117|31.205293655395508|28.612302780151367|28.612302780151367|24.607879638671875| 5975600|     A|Agilent

In [38]:
import pandas as pd

pd.read_parquet("../data/processed_stocks_etfs/A.parquet", engine="pyarrow")

Unnamed: 0,Symbol,Security Name,Date,Open,High,Low,Close,Adj Close,Volume
0,A,"Agilent Technologies, Inc. Common Stock",1999-11-18,32.546494,35.765381,28.612303,31.473534,27.068665,62546300
1,A,"Agilent Technologies, Inc. Common Stock",1999-11-19,30.713520,30.758226,28.478184,28.880543,24.838577,15234100
2,A,"Agilent Technologies, Inc. Common Stock",1999-11-22,29.551144,31.473534,28.657009,31.473534,27.068665,6577800
3,A,"Agilent Technologies, Inc. Common Stock",1999-11-23,30.400572,31.205294,28.612303,28.612303,24.607880,5975600
4,A,"Agilent Technologies, Inc. Common Stock",1999-11-24,28.701717,29.998211,28.612303,29.372318,25.261524,4843200
...,...,...,...,...,...,...,...,...,...
5119,A,"Agilent Technologies, Inc. Common Stock",2020-03-26,70.000000,74.449997,69.650002,73.720001,73.532867,3267500
5120,A,"Agilent Technologies, Inc. Common Stock",2020-03-27,71.550003,73.209999,70.279999,70.910004,70.730003,1829800
5121,A,"Agilent Technologies, Inc. Common Stock",2020-03-30,71.059998,73.180000,71.059998,72.669998,72.669998,1486200
5122,A,"Agilent Technologies, Inc. Common Stock",2020-03-31,72.339996,72.800003,70.500000,71.620003,71.620003,1822100
