In [1]:
from pyspark.sql import Window, SparkSession, DataFrame
from pyspark.sql import functions as F

In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Training_model")
    .getOrCreate()
)

In [3]:
# Load the gold-layer table by its catalog identifier.
# A bare `spark.read.format("iceberg")` statement only builds a DataFrameReader
# and throws it away, so it has no influence on this read and is not needed.
# NOTE(review): presumably `datalake` is configured as an Iceberg catalog on
# this session — confirm in the Spark configuration.
df = spark.read.table("datalake.gold.all_gold_data")

In [4]:
# Attach a consecutive, 0-based row index in Date order.
# monotonically_increasing_id() is NOT used here: its ids are unique and
# increasing but not consecutive (the partition id is encoded in the upper
# bits), so range filters on row_num would silently miss every row that does
# not live in the first partition.
# An un-partitioned window pulls all rows into a single partition to number
# them; that is acceptable for a table of a few thousand rows.
# NOTE(review): assumes Date is unique per row; ties would still get
# consecutive numbers, but their relative order would be arbitrary.
df_with_index = df.withColumn(
    "row_num",
    F.row_number().over(Window.orderBy("Date")) - 1,
)

In [5]:
# Preview the first five rows, including the generated row_num column.
df_with_index.show(n=5)

                                                                                

+----------+-----+-----+------+---------------+--------------+---------------+--------------+---------+-------+---------------+--------------+--------------+-------+-------+
|      Date| gold|  oil|s_p500|us_10_year_bond|us_2_year_bond|us_3_month_bond|us_5_year_bond|us_dollar|usd_vnd|vn_10_year_bond|vn_2_year_bond|vn_5_year_bond|vnd_usd|row_num|
+----------+-----+-----+------+---------------+--------------+---------------+--------------+---------+-------+---------------+--------------+--------------+-------+-------+
|1995-01-02| NULL| NULL|  NULL|          7.831|          NULL|          5.629|          NULL|     NULL|11042.0|           NULL|          NULL|          NULL|   NULL|      0|
|1995-01-03|380.9|17.44| 459.1|          7.873|         7.723|          5.655|         7.873|    89.21|11042.0|           NULL|          NULL|          NULL|   NULL|      1|
|1995-01-04|375.3|17.48| 460.7|          7.818|          7.62|          5.812|         7.819|    89.35|11040.0|           NULL|   

In [6]:
# Total number of rows to be split into batches (count() runs a Spark job).
total_rows = df_with_index.count()
# Number of rows handled per batch.
batch_size = 1_000

In [7]:
# Show the total row count that drives the batching loop.
display(total_rows)

8357

In [8]:
# Columns kept in every batch; selecting them also leaves out the helper
# row_num column, so no separate drop() is required.
export_columns = [
    "Date",
    "gold",
    "oil",
    "s_p500",
    "us_10_year_bond",
    "us_2_year_bond",
    "us_3_month_bond",
    "us_5_year_bond",
    "us_dollar",
    "usd_vnd",
]

for start in range(1, total_rows + 1, batch_size):
    # start/end are 1-based, inclusive row positions (used for the log line).
    end = start + batch_size - 1
    print(f"Processing rows {start} to {end}")

    # row_num is 0-based, so rows start..end correspond to row_num
    # start-1..end-1 (between() is inclusive on both sides). Filtering with
    # `row_num >= start` and `row_num < end` would skip row_num 0 and the
    # boundary row of every batch (1000, 2000, ...).
    batch = (
        df_with_index
        .filter(F.col("row_num").between(start - 1, end - 1))
        .select(*export_columns)
    )

    # Save to CSV (append mode)
    # batch.write.mode("append").saveAsTable("all_gold_data.csv")

Processing rows 1 to 1000
Processing rows 1001 to 2000
Processing rows 2001 to 3000
Processing rows 3001 to 4000
Processing rows 4001 to 5000
Processing rows 5001 to 6000
Processing rows 6001 to 7000
Processing rows 7001 to 8000
Processing rows 8001 to 9000


In [9]:
# Stop the Spark session at the end of the notebook.
spark.stop()