In [2]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [22]:
data = spark.range(0, 5)

data.write.format("delta").save("data/delta-table")

AnalysisException: [DELTA_PATH_EXISTS] Cannot write to already existent path file:/Users/alfio/projects/upc/BDMP1/data/delta-table without setting OVERWRITE = 'true'.

In [6]:
df = spark.read.format("delta").load("data/delta-table")
df.show()

In [31]:
data = spark.read.option("multiline","true").json('data.json')

In [23]:
data.select("articles").show()

+--------------------+
|            articles|
+--------------------+
|[{Eric Todisco, A...|
+--------------------+



In [11]:
data.write.format("delta").save("data/delta-json")
deltaT = DeltaTable.forPath(spark, "data/delta-json")


TypeError: 'DeltaTable' object is not subscriptable

In [55]:
data = spark.range(10, 20)
data.write.format("delta").mode("overwrite").save("data/delta-table")

In [56]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, "data/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
  condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })

# Delete every even value
deltaTable.delete(condition = expr("id % 2 == 0"))

# Upsert (merge) new data
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge(
    newData.alias("newData"),
    "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show()


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [61]:
df = spark.read.format("delta") \
  .option("versionAsOf", 3)\
  .load("data/delta-table")

df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [40]:
streamingDf = spark.readStream.format("rate").load()

stream = streamingDf \
  .selectExpr("value as id") \
  .writeStream.format("delta") \
  .option("checkpointLocation", "data/checkpoint") \
  .start("/data/delta-table")


25/03/23 17:42:55 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [45]:
stream.stop()

In [51]:
data = DeltaTable.forPath(spark, "data/delta-table").toDF()

data.

60

In [54]:
df = spark.read.format("delta").load("data/delta_tables/api_data")
df.show()

+----------+---+---------+-----+
|      date| id|     name|value|
+----------+---+---------+-----+
|2025-03-23|  1|Product A|  100|
|2025-03-23|  2|Product B|  200|
|2025-03-23|  3|Product C|  300|
+----------+---+---------+-----+



In [36]:
from newsapi import NewsApiClient
from dotenv import load_dotenv
import os
load_dotenv()
api_key = os.getenv("NEWS_API")
newsapi = NewsApiClient(api_key)

In [39]:
CATEGORIES = ['entertainment', 'technology', 'sports']
for category in CATEGORIES:
    print(category)
    top_headlines = newsapi.get_top_headlines(page_size=100,
                                              category=category,
                                              language='en')
    if top_headlines['status'] == 'ok' and top_headlines['totalResults'] > 0:
        spark.createDataFrame(top_headlines['articles']).write.format("delta").save(f"data/delta-{category}")

entertainment
technology
sports


25/03/24 15:48:37 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/03/24 15:48:37 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/03/24 15:48:37 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


In [81]:
import json
with open('data.json', 'w') as f:
    json.dump(top_headlines, f)

In [47]:
deltaT = DeltaTable.forPath(spark, "data/delta-sports")
deltaT.toDF().sort('publishedAt', ascending=False).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|              author|             content|         description|         publishedAt|              source|               title|                 url|          urlToImage|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|           Ben Morse|Oscar Piastri won...|Oscar Piastri won...|2025-03-23T13:17:00Z|{name -> CNN, id ...|Oscar Piastri win...|https://www.cnn.c...|https://media.cnn...|
|    Harrison Goodman|A look at the key...|A look at the key...|2025-03-23T13:06:00Z|{name -> New York...|Breaking down bes...|https://nypost.co...|https://nypost.co...|
|         Steve Serby|                NULL|RJ Luis Jr. showe...|2025-03-23T12:23:00Z|{name -> New York...|RJ Luis Jr. devas...|https://nypost.co...|ht