In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [2]:
spark = SparkSession.builder \
    .appName("TestReadBronze") \
    .getOrCreate()

25/03/23 12:27:31 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
df = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/src/data/Oil")

df.show(5)  # Hiển thị 5 dòng đầu
df.printSchema()  # Xem schema của DataFrame

                                                                                

+----------+-----+-----+-----+-----+-------+--------+
|      Date|Price| Open| High|  Low|   Vol.|Change %|
+----------+-----+-----+-----+-----+-------+--------+
|11/28/2013|92.28|92.19|92.36|92.08|   NULL|  -0.03%|
|11/27/2013| 92.3| 93.5| 93.6|91.77|230.88K|  -1.47%|
|11/26/2013|93.68|94.21|94.69|93.43|160.18K|  -0.44%|
|11/25/2013|94.09|94.15| 94.5|93.08|270.86K|  -0.79%|
|11/22/2013|94.84|95.29|95.57|94.05|257.84K|  -0.63%|
+----------+-----+-----+-----+-----+-------+--------+
only showing top 5 rows

root
 |-- Date: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Vol.: string (nullable = true)
 |-- Change %: string (nullable = true)



In [4]:
# Select only the desired columns (dropping "Vol." and "Change %")
df_selected = df.select("Date", "Price", "Open", "High", "Low")

# Create a window spec; here ordering by "Date", can be adjusted as needed
window_spec = Window.orderBy("Date")
df_with_index = df_selected.withColumn("index", row_number().over(window_spec))

# Reorder columns to get "index", "Date", "Price", "Open", "High", "Low"
df_with_index = df_with_index.select("index", "Date", "Price", "Open", "High", "Low")
df_with_index.show()

25/03/23 12:27:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/23 12:27:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/23 12:27:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----+----------+-----+-----+-----+-----+
|index|      Date|Price| Open| High|  Low|
+-----+----------+-----+-----+-----+-----+
|    1|01/01/2014| 98.7|98.61|98.76|98.54|
|    2|01/01/2015|54.56|53.73|54.85|53.72|
|    3|01/01/2018|60.24|60.26|60.28|60.15|
|    4|01/01/2019|45.89|45.77|45.95|45.73|
|    5|01/01/2020|61.33|61.52|61.54|61.33|
|    6|01/01/2024|71.97|71.67|72.24|71.67|
|    7|01/02/1985|25.92|26.15| 26.2|25.86|
|    8|01/02/1986|25.56|25.85|25.96| 25.2|
|    9|01/02/1987|18.13|17.92|18.19| 17.9|
|   10|01/02/1990|22.89|21.81|22.92|21.79|
|   11|01/02/1991|26.49|28.07|28.45|26.35|
|   12|01/02/1992|19.49| 19.1|19.52| 19.1|
|   13|01/02/1996|19.81|19.52|19.83|19.41|
|   14|01/02/1997|25.69|25.88|26.05| 25.6|
|   15|01/02/1998|17.43|17.68|17.68| 17.4|
|   16|01/02/2001|27.21| 26.9| 27.4|26.65|
|   17|01/02/2002|21.01| 19.9|21.05|19.72|
|   18|01/02/2003|31.85| 31.6|32.09| 31.4|
|   19|01/02/2007|61.05|61.05|61.05|61.05|
|   20|01/02/2008|99.62|96.05|100.0|96.05|
+-----+----

In [None]:
# Write the DataFrame to a Silver table
df_with_index.write \
    .mode("overwrite") \
    .format("parquet") \
    .save("/src/data/Silver/Oil")