In [69]:
%load_ext autoreload
%autoreload 2

import os

import pyspark
from delta import *
from dags.lib.IncrementalLoader import IncrementalLoader
from dags.lib.Processer import *
# Explicit imports stay last so they take precedence over anything the wildcard imports above export.
from pyspark.sql.functions import col, lit, max as spark_max, when

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
def create_spark_session(
    keyfile_path=None,
    gcs_connector_jar="../jars/gcs-connector-hadoop3-latest.jar",
    app_name="LetsTalk",
):
    """Build (or reuse) a local Spark session configured for Delta Lake and GCS.

    Args:
        keyfile_path: Path to the GCP service-account JSON key. When omitted,
            the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable is used;
            if that is unset too, the original developer-local path is kept as
            a last-resort fallback so existing calls behave as before.
        gcs_connector_jar: Path to the GCS Hadoop connector jar passed to
            ``spark.jars``.
        app_name: Spark application name.

    Returns:
        The ``SparkSession`` produced by ``getOrCreate()``.
    """
    if keyfile_path is None:
        # Prefer the standard Google credentials variable over a path that
        # only exists on one machine.
        keyfile_path = os.environ.get(
            "GOOGLE_APPLICATION_CREDENTIALS",
            "/Users/alfio/projects/upc/BDMP2/gcs.json",
        )

    conf = (
        pyspark.conf.SparkConf()
        .setAppName(app_name)
        .setMaster("local[*]")
        # Delta Lake: catalog implementation and SQL extension.
        .set(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        # Google Cloud Storage access through the Hadoop connector.
        .set("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
        .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
        .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", keyfile_path)
        .set("spark.sql.shuffle.partitions", "4")
        .set("spark.jars", gcs_connector_jar)
    )

    # The application name already travels inside `conf`, so it is not set
    # a second time on the builder.
    builder = pyspark.sql.SparkSession.builder.config(conf=conf)
    return configure_spark_with_delta_pip(builder).getOrCreate()



In [71]:
# Create the Spark session through the helper defined above.
spark = create_spark_session()
# Enable eager evaluation so a bare DataFrame expression at the end of a cell is rendered as a table.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [72]:
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
absolute_path_to_landing = '/Users/alfio/projects/upc/BDMP2/data/letstalk_landing_zone_bdma'
# Sub-path of the table; reused further down to build the trusted-zone save path.
table_subpath = 'delta_news/entertainment'
loader = IncrementalLoader(spark, absolute_path_to_landing, table_subpath)
# Presumably returns only rows not processed yet (the run logs "Using CDF from version 14") — confirm in IncrementalLoader.
df = loader.get_new_data()
# Peek at the first five rows.
df.head(5)

Using CDF from version 14


[Row(author='Matt Webb Mitovich', content='The following contains spoilers from the Season 2 finale of NCIS: Sydney, which aired April 25 on CBS.\r\nThe NCIS: Sydney team eked out another lives-saving win in Season 2’s two-part finale — though … [+7754 chars]', description="'NCIS Sydney' showrunner Morgan O'Neill teases what's next for Mackey, JD and the team after events of Season 2 finale.", publishedAt='2025-04-26T01:00:00Z', source=Row(id=None, name='TVLine'), title='NCIS: Sydney Boss Breaks Down (Sweaty!) Finale’s Fraught Closing Moments, Teases ‘Bigger’ Season 3 - TVLine', url='https://tvline.com/interviews/ncis-sydney-season-2-finale-mackey-jd-romance-blue-mystery-1235439603/', urlToImage='https://tvline.com/wp-content/uploads/2025/04/ncis-sydney-season-2-finale-recap.jpg?w=650'),
 Row(author="Anthony D'Alessandro", content='There are many questions that Star Wars fans have about the George Lucas produced and co-written 1980 sequel The Empire Strikes Backwhich kicked off the 16t

In [73]:
# Wrap the freshly loaded rows in a NewsProcessor; the class is not imported by name,
# so it presumably arrives through the wildcard import of dags.lib.Processer — TODO confirm.
processor = NewsProcessor(spark, df)

In [75]:
# Run the cleaning steps in order. Each call is made for its effect on the processor
# (presumably on `processor.df`); semantics live in NewsProcessor — verify there.
processor.ensure_schema()
# Logs "Removed N simple duplicate(s)" when run.
processor.remove_clear_duplicates()
processor.name_to_id()
# Logs "Removed N hidden duplicate(s)"; presumably 'url' is the identity key and
# 'publishedAt' decides which record wins — TODO confirm.
processor.remove_hidden_duplicates(['url'], ['publishedAt'])
# NOTE(review): these names are capitalised and the resulting frame shows the columns as
# 'Title', 'Description', 'Content' while the other columns stay lower/camel case —
# confirm the renaming is intentional.
processor.normalize_text(['Title', 'Description', 'Content'])
processor.expand_source()
# Newest articles first.
processor.order_by('publishedAt', ascending=False)

Removed 0 simple duplicate(s)
Removed 0 hidden duplicate(s)


In [76]:
# Display the processed DataFrame (rendered as a table because eager evaluation is enabled on the session).
processor.df

url,author,Content,Description,publishedAt,source,Title,urlToImage
http://deadline.c...,Natalie Oganesyan,the rock amp ro...,the rock roll h...,2025-04-28 03:33:00,deadline,outkast cyndi la...,https://deadline....
http://www.vultur...,Sara Holdren,the new musical r...,real women have ...,2025-04-28 03:30:23,vulture,bright and bold i...,https://pyxis.nym...
https://deadline....,Glenn Garner,the american film...,stars like elle f...,2025-04-27 04:20:01,deadline,afi life achievem...,https://deadline....
https://www.yourt...,Kate Rose,the love horoscop...,sunday s new moon...,2025-04-27 03:03:23,yourtango,love horoscopes a...,https://www.yourt...
https://www.count...,Anna Logan,did you know gigi...,from hollywood to...,2025-04-26 15:17:46,countryliving.com,celebrities se...,https://hips.hear...
https://www.slash...,Kieran Fisher,the original sno...,how the heck did ...,2025-04-26 15:00:00,/film,the rachel zegler...,https://www.slash...
https://tvline.co...,Matt Webb Mitovich,the following con...,fire country s xl...,2025-04-26 05:19:51,tvline,fire country shoc...,https://tvline.co...
https://tvline.co...,Matt Webb Mitovich,the following con...,ncis sydney sho...,2025-04-26 03:00:00,tvline,ncis sydney boss...,https://tvline.co...
https://www.wwno....,Joseph King,the first weekend...,lil wayne will cl...,2025-04-26 00:30:00,wwno.org,lil wayne s new o...,https://npr.brigh...
http://deadline.c...,Anthony D'Alessandro,there are many qu...,george lucas the ...,2025-04-26 00:08:00,deadline,george lucas on w...,https://deadline....


In [82]:
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
absolute_path_to_trusted = '/Users/alfio/projects/upc/BDMP2/data/letstalk_trusted_zone_bdma'
# Trusted-zone location for the same table sub-path that was read from the landing zone.
save_path = os.path.join(absolute_path_to_trusted, table_subpath)

# Manual test fixture: overwrite the url of one author's article so that record no longer
# matches its previous url. Presumably this is meant to exercise the "new unique record"
# path of the merge that follows — remove before running against real data.
processor.df = processor.df.withColumn(
    "url",
    when(col("author") == "Sara Holdren", "https:test.com").otherwise(col("url")),
)

In [84]:
# Merge the processed rows into the trusted-zone table at `save_path`;
# presumably ['url'] is the list of key columns used to detect overlapping records — TODO confirm.
processor.merge_with_trusted(save_path, ['url'])

INFO:root:Saving unique records from overlapping ones
INFO:root:Added new 1 unique records
INFO:root:Appending non overlapping records
INFO:root:Adding new 0 records
