In [2]:
# Initialize the Spark environment and imports the necessary libraries to work with Spark and SparkSQL

In [33]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [34]:
# Creates a SparkSession to manage the execution of Spark SQL jobs and read/write data in Spark

In [35]:
# Logger.getLogger("org").setLevel(Level.ERROR)         # set the logging level to ERROR for Spark logs
# set the name for the spark application, run Spark locally using all available cores, 
# get location for Spark SQL warehouse, creates or gets an existing SparkSession
spark = SparkSession \
    .builder \
    .appName("csv2parquet") \
    .master("local[*]") \
    .config("spark.sql.warehouse.dir", "file:///tmp") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/25 22:46:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/25 22:46:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [36]:
# Creates a SparkContext object and set log level to "Error" to avoid unnecessary log messages

In [37]:
sc = spark.sparkContext

In [38]:
sc.setLogLevel("ERROR")

In [42]:
# reads a csv file into a DataFrame, display the first few rows without truncating

In [44]:
ds = spark.read.format("csv").option("header", "false").option("quote", "\"").load("/notebooks/ticker_symbol.csv")

                                                                                

In [45]:
ds.show(truncate=False)

+------+-------------------------------+--------+---------------------------+--------------+----+
|_c0   |_c1                            |_c2     |_c3                        |_c4           |_c5 |
+------+-------------------------------+--------+---------------------------+--------------+----+
|Ticker|Name                           |Exchange|CategoryName               |CategoryNumber|NULL|
|AUB.AX|Austbrokers Holdings Limited   |ASX     |Accident & Health Insurance|431           |NULL|
|GLRE  |Greenlight Capital Re, Ltd.    |NMS     |Accident & Health Insurance|431           |NULL|
|SFG   |StanCorp Financial Group Inc.  |NYQ     |Accident & Health Insurance|431           |NULL|
|AMIC  |American Independence Corp.    |NGM     |Accident & Health Insurance|431           |NULL|
|GTS   |Triple-S Management Corporation|NYQ     |Accident & Health Insurance|431           |NULL|
|LRE.L |Lancashire Holdings Limited    |LSE     |Accident & Health Insurance|431           |NULL|
|EIG   |Employers Ho

In [46]:
# renames the columns of the DataFrame

In [47]:
heading = ["Ticker", "Name", "Exchange", "CategoryName", "CategoryNumber", "_c5"]

In [48]:
df = ds.toDF(*heading)

In [49]:
# show the updated DataFrame after renaming

In [50]:
df.show(truncate=False)

+------+-------------------------------+--------+---------------------------+--------------+----+
|Ticker|Name                           |Exchange|CategoryName               |CategoryNumber|_c5 |
+------+-------------------------------+--------+---------------------------+--------------+----+
|Ticker|Name                           |Exchange|CategoryName               |CategoryNumber|NULL|
|AUB.AX|Austbrokers Holdings Limited   |ASX     |Accident & Health Insurance|431           |NULL|
|GLRE  |Greenlight Capital Re, Ltd.    |NMS     |Accident & Health Insurance|431           |NULL|
|SFG   |StanCorp Financial Group Inc.  |NYQ     |Accident & Health Insurance|431           |NULL|
|AMIC  |American Independence Corp.    |NGM     |Accident & Health Insurance|431           |NULL|
|GTS   |Triple-S Management Corporation|NYQ     |Accident & Health Insurance|431           |NULL|
|LRE.L |Lancashire Holdings Limited    |LSE     |Accident & Health Insurance|431           |NULL|
|EIG   |Employers Ho

In [51]:
# cast the CategoryNumber column to integer data type and disply with the first 3 rows

In [52]:
df_with_datatype = df.selectExpr("Ticker", \
                    "Name", \
                    "Exchange", \
                    "CategoryName", \
                    "cast(CategoryNumber as int) CategoryNumber")

In [53]:
df_with_datatype.show(3, False)

+------+----------------------------+--------+---------------------------+--------------+
|Ticker|Name                        |Exchange|CategoryName               |CategoryNumber|
+------+----------------------------+--------+---------------------------+--------------+
|Ticker|Name                        |Exchange|CategoryName               |NULL          |
|AUB.AX|Austbrokers Holdings Limited|ASX     |Accident & Health Insurance|431           |
|GLRE  |Greenlight Capital Re, Ltd. |NMS     |Accident & Health Insurance|431           |
+------+----------------------------+--------+---------------------------+--------------+
only showing top 3 rows



In [54]:
# save to parquet format(columnar) for analytics queries

In [55]:
# Save the DataFrame to Parquet format, overwrite if existing.
# Parquet is Columnar, good for Analytics query
df_with_datatype.write.mode("Overwrite").parquet("/notebooks/ticker_symbol.parquet")

                                                                                

In [56]:
# Read and run SQL queries and show the first 3 rows, renames the columns and print the schema

In [57]:
# Read the Parquet data back and run SQL query on it
read_parquet_df = spark.read.parquet("/notebooks/ticker_symbol.parquet")
read_parquet_df.show(3, False)

+------+----------------------------+--------+---------------------------+--------------+
|Ticker|Name                        |Exchange|CategoryName               |CategoryNumber|
+------+----------------------------+--------+---------------------------+--------------+
|Ticker|Name                        |Exchange|CategoryName               |NULL          |
|AUB.AX|Austbrokers Holdings Limited|ASX     |Accident & Health Insurance|431           |
|GLRE  |Greenlight Capital Re, Ltd. |NMS     |Accident & Health Insurance|431           |
+------+----------------------------+--------+---------------------------+--------------+
only showing top 3 rows



In [60]:
TickerSymbol = read_parquet_df.toDF("Ticker", "Name", "Exchange", "CategoryName", "CategoryNumber")
TickerSymbol.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Exchange: string (nullable = true)
 |-- CategoryName: string (nullable = true)
 |-- CategoryNumber: integer (nullable = true)



In [61]:
# create a temporary view, have SQL queries to filter specific ticker symbols

In [62]:
TickerSymbol.createOrReplaceTempView("TickerSymbol")
spark.sql("SELECT * from TickerSymbol where Ticker in ('IBM', 'MSFT', 'HPQ', 'GE')").show(20, False)

+------+-------------------------------------------+--------+-------------------------------+--------------+
|Ticker|Name                                       |Exchange|CategoryName                   |CategoryNumber|
+------+-------------------------------------------+--------+-------------------------------+--------------+
|MSFT  |Microsoft Corporation                      |NMS     |Business Software & Services   |826           |
|HPQ   |Hewlett-Packard Company                    |NYQ     |Diversified Computer Systems   |810           |
|GE    |General Electric Company                   |NYQ     |Diversified Machinery          |622           |
|IBM   |International Business Machines Corporation|NYQ     |Information Technology Services|824           |
+------+-------------------------------------------+--------+-------------------------------+--------------+

