In [1]:
import os

import pyspark
from pyspark.sql.functions import current_timestamp
from delta import *

In [2]:
def create_spark_session(
    app_name="LetsTalk",
    keyfile=None,
    gcs_connector_jar="../jars/gcs-connector-hadoop3-latest.jar",
    master="local[*]",
):
    """Build (or reuse) a SparkSession configured for Delta Lake and GCS access.

    Called with no arguments this produces the same configuration as before,
    except that the service-account keyfile can now be supplied through the
    environment instead of being tied to one developer's home directory.

    Args:
        app_name: Spark application name.
        keyfile: Path to the Google service-account JSON keyfile. When
            ``None``, the ``GOOGLE_APPLICATION_CREDENTIALS`` environment
            variable is used if set; otherwise the historical project path
            is used as a last resort.
        gcs_connector_jar: Path to the GCS Hadoop connector jar passed to
            ``spark.jars``.
        master: Spark master URL.

    Returns:
        The session returned by ``getOrCreate()`` on the Delta-configured
        builder.
    """
    if keyfile is None:
        # Prefer the environment so the notebook is portable across machines;
        # the legacy absolute path is kept only for backward compatibility.
        keyfile = os.environ.get(
            "GOOGLE_APPLICATION_CREDENTIALS",
            "/Users/alfio/projects/upc/LetsTalkBDM/gcs.json",
        )

    conf = (
        pyspark.conf.SparkConf()
        .setAppName(app_name)
        .setMaster(master)
        # Delta Lake catalog and SQL extension.
        .set(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        # Google Cloud Storage filesystem and service-account authentication.
        .set("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
        .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
        .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", keyfile)
        .set("spark.sql.shuffle.partitions", "4")
        .set("spark.jars", gcs_connector_jar)
    )

    # The application name already travels inside `conf`, so it is not
    # repeated on the builder.
    builder = pyspark.sql.SparkSession.builder.config(conf=conf)
    return configure_spark_with_delta_pip(builder).getOrCreate()



In [11]:
# Obtain the shared session; create_spark_session() ends in getOrCreate(),
# so re-running this cell reuses an already-running session.
spark = create_spark_session()

# With eager evaluation on, a cell whose last expression is a DataFrame
# displays its rows directly instead of only the schema repr.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [26]:
# Open the entertainment-news Delta table from the local landing zone and
# expose its current contents as a DataFrame.
deltane = DeltaTable.forPath(
    spark,
    "../data/letstalk_landing_zone_bdma/delta_news/entertainment",
)
dfne = deltane.toDF()

In [43]:
# Total row count followed by the count after removing fully identical rows;
# a gap between the two numbers means exact duplicate records were ingested.
print(dfne.count(), dfne.dropDuplicates().count(), sep="\n")

422
386


In [16]:
# How many articles share each publication timestamp, most frequent first.
(
    dfne
    .groupBy("publishedAt")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+--------------------+-----+
|         publishedAt|count|
+--------------------+-----+
|2025-04-25T02:00:00Z|    3|
|2025-04-22T00:45:04Z|    3|
|2025-04-25T20:38:00Z|    2|
|2025-04-25T23:01:39Z|    2|
|2025-04-26T01:01:00Z|    2|
|2025-04-17T02:47:45Z|    2|
|2025-04-16T14:00:00Z|    2|
|2025-04-25T20:57:06Z|    2|
|2025-04-17T07:01:36Z|    2|
|2025-04-24T20:31:40Z|    2|
|2025-04-17T04:01:00Z|    2|
|2025-04-25T02:58:00Z|    2|
|2025-04-17T02:29:12Z|    2|
|2025-04-25T22:30:00Z|    2|
|2025-04-23T13:00:00Z|    2|
|2025-04-26T00:17:00Z|    2|
|2025-04-16T19:00:00Z|    2|
|2025-04-22T00:55:31Z|    2|
|2025-04-27T11:00:00Z|    2|
|2025-04-22T00:02:58Z|    2|
+--------------------+-----+
only showing top 20 rows



In [19]:
# Open the TMDB "upcoming" Delta table from the local landing zone and
# expose its current contents as a DataFrame.
deltatu = DeltaTable.forPath(
    spark,
    "../data/letstalk_landing_zone_bdma/delta_tmdb/upcoming",
)
dftu = deltatu.toDF()

In [23]:
# Number of rows per movie id, most repeated first; ids with count > 1
# appear in more than one ingested snapshot.
(
    dftu
    .groupBy("id")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+-------+-----+
|     id|count|
+-------+-----+
|1244944|    3|
|1126166|    3|
|1064486|    3|
| 995926|    3|
|1233575|    3|
|1212855|    3|
|1226406|    3|
|1241436|    3|
| 324544|    3|
|1353117|    3|
|1388366|    3|
|1380415|    2|
| 970450|    2|
| 575265|    2|
| 986056|    2|
|1181107|    2|
|1233413|    2|
|1232546|    2|
|1249289|    1|
|1249213|    1|
+-------+-----+
only showing top 20 rows



In [24]:
# Inspect every stored row for one of the most repeated ids to see which
# columns differ between its copies.
dftu.where(dftu["id"] == 1244944)

adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,ingestion_time,begin_date,end_date
False,/3lEV4CoKoeT2cZ4f...,"[27, 9648]",1244944,en,The Woman in the ...,In the aftermath ...,172.9102,/n0WS2TsNcS6dtaZK...,2025-03-27,The Woman in the ...,False,6.27,37,2025-04-17 16:59:...,2025-04-23,2025-05-14
False,/3lEV4CoKoeT2cZ4f...,"[27, 9648]",1244944,en,The Woman in the ...,In the aftermath ...,157.7558,/n0WS2TsNcS6dtaZK...,2025-03-27,The Woman in the ...,False,6.0,97,2025-04-27 12:53:...,2025-04-30,2025-05-21
False,/3lEV4CoKoeT2cZ4f...,"[27, 9648]",1244944,en,The Woman in the ...,In the aftermath ...,262.5169,/n0WS2TsNcS6dtaZK...,2025-03-27,The Woman in the ...,False,6.0,89,2025-04-23 08:47:...,2025-04-30,2025-05-21
