In [1]:
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import input_file_name, lit, col, isnull
from pyspark.sql import functions as F
print(f"PySpark Version : {pyspark.__version__}")

PySpark Version : 3.4.1


In [2]:
#Create a spark Context class, with custom config
conf = SparkConf()
conf.set('spark.default.parallelism', 700)
conf.set('spark.sql.shuffle.partitions', 700)
conf.set('spark.driver.memory', '30g')
conf.set('spark.driver.cores', 8)
conf.set('spark.executor.cores', 8)
conf.set('spark.executor.memory', '30g')
sc = SparkContext.getOrCreate(conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/26 00:49:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
## Create spark session
spark = SparkSession.builder.master('local[*]').\
                config('spark.sql.debug.maxToStringFields', '100').\
                appName("ETFs Spark Airflow Docker").getOrCreate()

In [4]:

existing_schema = StructType([
    StructField("Date", StringType(), False),
    StructField("Open", FloatType(), False),
    StructField("High", FloatType(), False),
    StructField("Low", FloatType(), False),
    StructField("Close", FloatType(), False),
    StructField("Adj Close", FloatType(), False),
    StructField("Volume", FloatType(), False)

])

In [5]:
input_path = "../data/stocks_etfs/A.csv"
stock_df = spark.read.csv(input_path, header=True, schema=existing_schema)

In [6]:
stock_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Adj Close: float (nullable = true)
 |-- Volume: float (nullable = true)



In [7]:
meta_symbol = spark.read.csv("../data/symbols_valid_meta.csv", header=True)
symbol_mapping = meta_symbol.select("Symbol", "Security Name").rdd.collectAsMap()
symbol_name = input_path.split("/")[-1].split(".")[0]


                                                                                

In [8]:
stock_df.show(10)

+----------+---------+---------+---------+---------+---------+---------+
|      Date|     Open|     High|      Low|    Close|Adj Close|   Volume|
+----------+---------+---------+---------+---------+---------+---------+
|1999-11-18|32.546494| 35.76538|28.612303|31.473534|27.068665|6.25463E7|
|1999-11-19| 30.71352|30.758226|28.478184|28.880543|24.838577|1.52341E7|
|1999-11-22|29.551144|31.473534| 28.65701|31.473534|27.068665|6577800.0|
|1999-11-23|30.400572|31.205294|28.612303|28.612303| 24.60788|5975600.0|
|1999-11-24|28.701717| 29.99821|28.612303|29.372318|25.261524|4843200.0|
|1999-11-26|29.238197|29.685265|29.148785|29.461731|25.338428|1729400.0|
|1999-11-29| 29.32761|30.355865|29.014664|30.132332|25.915169|4074700.0|
|1999-11-30| 30.04292| 30.71352|29.282904|30.177038|25.953619|4310000.0|
|1999-12-01|30.177038|31.071173|29.953505| 30.71352|26.415012|2957300.0|
|1999-12-02|31.294706|32.188843|30.892345|31.562946|27.145563|3069800.0|
+----------+---------+---------+---------+---------

In [9]:
stock_df = stock_df.withColumn("Symbol", F.lit(symbol_name))
stock_df = stock_df.withColumn("Security Name", F.lit(symbol_mapping.get(symbol_name)))

In [10]:
stock_df.show(10)

+----------+---------+---------+---------+---------+---------+---------+------+--------------------+
|      Date|     Open|     High|      Low|    Close|Adj Close|   Volume|Symbol|       Security Name|
+----------+---------+---------+---------+---------+---------+---------+------+--------------------+
|1999-11-18|32.546494| 35.76538|28.612303|31.473534|27.068665|6.25463E7|     A|Agilent Technolog...|
|1999-11-19| 30.71352|30.758226|28.478184|28.880543|24.838577|1.52341E7|     A|Agilent Technolog...|
|1999-11-22|29.551144|31.473534| 28.65701|31.473534|27.068665|6577800.0|     A|Agilent Technolog...|
|1999-11-23|30.400572|31.205294|28.612303|28.612303| 24.60788|5975600.0|     A|Agilent Technolog...|
|1999-11-24|28.701717| 29.99821|28.612303|29.372318|25.261524|4843200.0|     A|Agilent Technolog...|
|1999-11-26|29.238197|29.685265|29.148785|29.461731|25.338428|1729400.0|     A|Agilent Technolog...|
|1999-11-29| 29.32761|30.355865|29.014664|30.132332|25.915169|4074700.0|     A|Agilent Tech

In [11]:
stock_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- Close: float (nullable = true)
 |-- Adj Close: float (nullable = true)
 |-- Volume: float (nullable = true)
 |-- Symbol: string (nullable = false)
 |-- Security Name: string (nullable = false)



In [12]:
# Create a DataFrame
data_f = [("Alice", 25),
        ("Bob", 30),
        ("Charlie", 35)]
  
#

In [13]:
data_f = pd.DataFrame(columns=["name", "age"], data=data_f)

NameError: name 'pd' is not defined

In [8]:
data_f

Unnamed: 0,name,age
0,Alice,25
1,Bob,30
2,Charlie,35


In [14]:
df = spark.createDataFrame(data_f, ["name", "age"])

In [15]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [16]:
df.show()



+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



                                                                                

In [4]:
import pandas as pd

pd.read_parquet("../data/processed_stocks_etfs/A.parquet", engine="pyarrow")

Unnamed: 0,Symbol,Security Name,Date,Open,High,Low,Close,Adj Close,Volume
0,A,"Agilent Technologies, Inc. Common Stock",1999-11-18,32.546494,35.765381,28.612303,31.473534,27.068665,62546300
1,A,"Agilent Technologies, Inc. Common Stock",1999-11-19,30.713520,30.758226,28.478184,28.880543,24.838577,15234100
2,A,"Agilent Technologies, Inc. Common Stock",1999-11-22,29.551144,31.473534,28.657009,31.473534,27.068665,6577800
3,A,"Agilent Technologies, Inc. Common Stock",1999-11-23,30.400572,31.205294,28.612303,28.612303,24.607880,5975600
4,A,"Agilent Technologies, Inc. Common Stock",1999-11-24,28.701717,29.998211,28.612303,29.372318,25.261524,4843200
...,...,...,...,...,...,...,...,...,...
5119,A,"Agilent Technologies, Inc. Common Stock",2020-03-26,70.000000,74.449997,69.650002,73.720001,73.532867,3267500
5120,A,"Agilent Technologies, Inc. Common Stock",2020-03-27,71.550003,73.209999,70.279999,70.910004,70.730003,1829800
5121,A,"Agilent Technologies, Inc. Common Stock",2020-03-30,71.059998,73.180000,71.059998,72.669998,72.669998,1486200
5122,A,"Agilent Technologies, Inc. Common Stock",2020-03-31,72.339996,72.800003,70.500000,71.620003,71.620003,1822100
