In [45]:
try:
    spark.stop()
except Exception:
    pass

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Iceberg via REST")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "rest")
    .config("spark.sql.catalog.local.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.local.warehouse", "s3://lake/warehouse")
    .config("spark.sql.catalog.local.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.local.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.local.s3.path-style-access", "true")
    .config("spark.sql.catalog.local.s3.access-key-id", "admin")
    .config("spark.sql.catalog.local.s3.secret-access-key", "admin123")
    .config("spark.sql.catalog.local.s3.region", "us-east-1")
    .getOrCreate()
)

spark

25/11/03 01:10:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [46]:
spark.sql("SHOW NAMESPACES IN local").show(truncate=False)

+---------+
|namespace|
+---------+
|raw      |
+---------+



In [47]:
spark.sql("SHOW TABLES IN local.raw").show(truncate=False)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|raw      |avito    |false      |
+---------+---------+-----------+



In [48]:
tbl = "local.raw.avito"
spark.sql(f"SELECT COUNT(*) AS rows FROM {tbl}").show()

+----+
|rows|
+----+
| 220|
+----+



In [49]:
# Load RAW table
raw_df = spark.table("local.raw.avito")   # or local.raw.sarouty

In [50]:
# Créer un DataFrame "Silver" minimal
silver_df = raw_df.select("id").distinct()

# Afficher quelques lignes
silver_df.show(5, truncate=False)

print("✅ Silver dataset initialized with only 'id' column.")
print("Total IDs:", silver_df.count())

+--------+
|id      |
+--------+
|56926340|
|57153825|
|56045169|
|57153879|
|57153823|
+--------+
only showing top 5 rows

✅ Silver dataset initialized with only 'id' column.
Total IDs: 153


In [51]:
from pyspark.sql import functions as F, types as T

# 1) Définir le schéma du JSON dans "payload"
payload_schema = T.StructType([
    T.StructField("id", T.StringType()),
    T.StructField("url", T.StringType()),
    T.StructField("error", T.StringType()),
    T.StructField("title", T.StringType()),
    T.StructField("price_text", T.StringType()),
    T.StructField("breadcrumbs", T.StringType()),
    T.StructField("category", T.StringType()),
    T.StructField("description", T.StringType()),
    T.StructField("attributes", T.StringType()),  # JSON imbriqué sous forme de string
    T.StructField("equipments", T.StringType()),
    T.StructField("seller_name", T.StringType()),
    T.StructField("seller_type", T.StringType()),
    T.StructField("published_date", T.StringType()),
    T.StructField("image_urls", T.StringType()),
])

# 2) Parser le JSON depuis la colonne string "payload"
parsed = (raw_df
    .select(
        *[c for c in raw_df.columns if c != "payload"],  # ex: garder ingest_ts s'il existe
        F.from_json(F.col("payload"), payload_schema).alias("p")
    )
    .filter(F.col("p").isNotNull())  # ignorer les lignes avec JSON invalide
)

# 3) Parser le JSON imbriqué "attributes" -> Map<String,String>
attrs_map = F.from_json(F.col("p.attributes"), T.MapType(T.StringType(), T.StringType()))

# 4) Nettoyages utiles:
# - price_value (MAD) à partir de "price_text" (ex: "6 000 DH" -> 6000.0)
price_value = F.regexp_replace(F.col("p.price_text"), r"[^0-9]", "").cast("double")

# - image_urls -> array<string> en splittant sur " | " et trim de chaque url
image_urls_arr = F.transform(
    F.split(F.col("p.image_urls"), r"\s*\|\s*"),
    lambda x: F.trim(x)
)

# - equipments -> array<string> en splittant sur ";"
equipments_arr = F.transform(
    F.split(F.col("p.equipments"), r"\s*;\s*"),
    lambda x: F.trim(x)
)

# 5) Construire le DataFrame silver (colonnes à plat)
silver_df = parsed.select(
    F.col("p.id").alias("id"),
    F.col("p.url").alias("url"),
    F.col("p.title").alias("title"),
    F.col("p.price_text").alias("price_text"),
    price_value.alias("price_value_mad"),
    F.col("p.category").alias("category"),
    F.col("p.breadcrumbs").alias("breadcrumbs"),
    F.col("p.description").alias("description"),
    F.col("p.seller_name").alias("seller_name"),
    F.col("p.seller_type").alias("seller_type"),
    F.col("p.published_date").alias("published_date_text"),
    image_urls_arr.alias("image_urls"),
    equipments_arr.alias("equipments"),
    attrs_map.alias("attributes_map"),
    # garder le timestamp d'ingestion s'il est présent dans ton raw_df
    *([F.col("ingest_ts")] if "ingest_ts" in raw_df.columns else [])
)

In [61]:
import pandas as pd
from IPython.display import display

# Keep things compact
pd.set_option("display.max_columns", 20)   # don't try to show hundreds
pd.set_option("display.max_colwidth", 80)  # clamp long cells to ~80 chars

# pick a small sample and flatten newlines so rows stay short
pdf = (silver_df.limit(10)
       .toPandas()
       .replace({r'[\r\n\t]+': ' '}, regex=True))

# simple, compact table with ellipsis in long cells
display(
    pdf.style
      .set_table_styles([
          {'selector': 'table', 'props': [('table-layout','fixed'), ('width','100%')]},
          {'selector': 'th, td', 'props': [
              ('max-width','280px'),
              ('white-space','nowrap'),
              ('overflow','hidden'),
              ('text-overflow','ellipsis')
          ]}
      ])
      .hide(axis='index')  # remove row numbers
)

id,url,title,price,description,seller_name,seller_type,published_date_text,image_urls,equipments,attributes_map,ingest_ts,offre,type,city,neighborhood,site
56995728,https://www.avito.ma/fr/agdal/autre_immobilier/Appartement_usage_de_bureau_à_louer_à_l_Agdal_56995728.htm,Appartement usage de bureau à louer à l'Agdal,13500.0,"Haut Agdal un appartement à usage bureau à louer dans un immeuble bien entretenu, situé au 1er étage d'un immeuble, sur une superficie de 145 m², se composant de double réception, 3 chambres une salle de bain avec baignoire et salle de douche, une cuisine et une place au garage. Réf:B2025972",akkor immobilier,Particulier,2025-11-02 17:25:03,"['https://content.avito.ma/classifieds/images/10139702658?t=images', 'https://content.avito.ma/classifieds/images/10139702533?t=images', 'https://content.avito.ma/classifieds/images/10139702532?t=images', 'https://content.avito.ma/classifieds/images/10139702534?t=images', 'https://content.avito.ma/classifieds/images/10139702626?t=images', 'https://content.avito.ma/classifieds/images/10139702529?t=images', 'https://content.avito.ma/classifieds/images/10139702641?t=images', 'https://content.avito.ma/classifieds/images/10139702659?t=images', 'https://content.avito.ma/classifieds/images/10139702660?t=images', 'https://content.avito.ma/classifieds/images/10139702531?t=images', 'https://content.avito.ma/classifieds/images/10139702530?t=images']",,,2025-11-03 00:25:15.002000,rent,Autre Immobilier,Rabat,Agdal,avito
56995816,https://www.avito.ma/fr/autre_secteur/appartements/splendide_appartement_56995816.htm,splendide appartement,9000.0,"splendide appartement bien situé dans une résidence familiale propre, composé de 2 chambres, salon, séjour, cuisine équipée, parking sous-sol.",opera immobilier,Particulier,2025-11-02 22:25:04,"['https://content.avito.ma/classifieds/images/10139703349?t=images', 'https://content.avito.ma/classifieds/images/10139703351?t=images', 'https://content.avito.ma/classifieds/images/10139703354?t=images', 'https://content.avito.ma/classifieds/images/10139703353?t=images', 'https://content.avito.ma/classifieds/images/10139703376?t=images', 'https://content.avito.ma/classifieds/images/10139703358?t=images', 'https://content.avito.ma/classifieds/images/10139703361?t=images', 'https://content.avito.ma/classifieds/images/10139703362?t=images', 'https://content.avito.ma/classifieds/images/10139703365?t=images', 'https://content.avito.ma/classifieds/images/10139703369?t=images', 'https://content.avito.ma/classifieds/images/10139703368?t=images', 'https://content.avito.ma/classifieds/images/10139703370?t=images', 'https://content.avito.ma/classifieds/images/10139703373?t=images']","['2', '2', '139', '2 mois', '300', '1', 'Ascenseur', 'Balcon', 'Chauffage', 'Climatisation', 'Concierge', 'Cuisine équipée']","{'Salle de bain': '2', 'Frais de syndic / mois': '300', 'Chambres': '2', 'Caution': '2 mois', 'Salons': '1', 'Surface totale': '139'}",2025-11-03 00:25:15.002000,rent,Appartements,Agadir,Autre secteur,avito
56952547,https://www.avito.ma/fr/palmier/local/Local_commercial_Palmier_proche_de_l_université_56952547.htm,Local commercial Palmier proche de l'université,17000.0,"A louer local commercial dans un complexe neuf proche de l'Université et d'un hôtel un local de 37 m² plus 19 m² de mezzanine aménagé avec Clim, gaine , et escalier avec place au parking idéal pour tous types de commerce -food & beverage - concept store - superette- librairie papeterie - café d'autant qu'il y a un espace pour une terrasse",Clavis Immobilier,Particulier,2025-11-02 08:25:06,"['https://content.avito.ma/classifieds/images/10139271691?t=images', 'https://content.avito.ma/classifieds/images/10139271692?t=images', 'https://content.avito.ma/classifieds/images/10139271707?t=images', 'https://content.avito.ma/classifieds/images/10139271708?t=images', 'https://content.avito.ma/classifieds/images/10139271705?t=images', 'https://content.avito.ma/classifieds/images/10139271722?t=images', 'https://content.avito.ma/classifieds/images/10139271721?t=images', 'https://content.avito.ma/classifieds/images/10139271720?t=images', 'https://content.avito.ma/classifieds/images/10139271725?t=images', 'https://content.avito.ma/classifieds/images/10139271726?t=images']","['1', '76', 'Chauffage', 'Climatisation', 'Parking', 'Sécurité']","{'Salle de bain': '1', 'Surface totale': '76'}",2025-11-03 00:25:15.002000,rent,Local,Casablanca,Palmier,avito
56620071,https://www.avito.ma/fr/val_fleuri/appartements/Studio_meublé_Val_Fleuri_TRAMWAY_56620071.htm,Studio meublé Val Fleuri TRAMWAY,6300.0,"Studio bien meublé à louer, à Val Fleuri, 45m², ETG1, 1 Salon balcon, 1 Cuisine équipée, 1 Chambre à coucher,1 SDB douche, ascenseur parking concierge Loyer 6300 Dh par mois",ABAMNY Immobilier,Particulier,2025-11-03 00:58:12,"['https://content.avito.ma/classifieds/images/10141320194?t=images', 'https://content.avito.ma/classifieds/images/10141320200?t=images', 'https://content.avito.ma/classifieds/images/10141320202?t=images', 'https://content.avito.ma/classifieds/images/10141320211?t=images', 'https://content.avito.ma/classifieds/images/10141320220?t=images', 'https://content.avito.ma/classifieds/images/10141320224?t=images', 'https://content.avito.ma/classifieds/images/10141320228?t=images', 'https://content.avito.ma/classifieds/images/10141320229?t=images', 'https://content.avito.ma/classifieds/images/10141320230?t=images']","['1', '1', '45', '1 mois', '1', '45', 'Ascenseur', 'Balcon', 'Climatisation', 'Concierge', 'Cuisine équipée', 'Meublé']","{'Salle de bain': '1', 'Surface habitable': '45', 'Chambres': '1', 'Caution': '1 mois', 'Salons': '1', 'Surface totale': '45'}",2025-11-03 00:25:15.002000,rent,Appartements,الدار البيضاء,Val Fleuri,avito
56920617,https://www.avito.ma/fr/bourgogne/local/Magasin_à_louer_plein_centre_ville_56920617.htm,Magasin à louer plein centre ville,10000.0,Local à louer totalement aménagé disponible en place centre-ville. Si vous êtes vraiment intéressé. Appelle-moi sur le numéro Cafe sur l’annonce. Merci,New Adresse SARL,Particulier,2025-11-03 00:59:13,"['https://content.avito.ma/classifieds/images/10141252952?t=images', 'https://content.avito.ma/classifieds/images/10141252967?t=images', 'https://content.avito.ma/classifieds/images/10141252968?t=images', 'https://content.avito.ma/classifieds/images/10141252971?t=images']","['1', '100']","{'Salle de bain': '1', 'Surface totale': '100'}",2025-11-03 00:25:15.002000,rent,Local,Casablanca,Bourgogne,avito
57153882,https://www.avito.ma/fr/mehdia/appartements/Appartement_à_louer_53_m²_à_Mehdia_57153882.htm,Appartement à louer 53 m² à Mehdia,0.0,"Joli appartement à louer à Mehdia plage, juste pour famille maximum 4 personnes ou couple marié.",Laila dihi,Particulier,2025-11-03 01:37:16,"['https://content.avito.ma/classifieds/images/10141391161?t=images', 'https://content.avito.ma/classifieds/images/10141391162?t=images', 'https://content.avito.ma/classifieds/images/10141391163?t=images']","['1', '0', '1', '53', 'Studio', '1', 'Ascenseur', 'Balcon', 'Chauffage', 'Climatisation', 'Concierge', 'Cuisine équipée']","{""Type d'appartement"": 'Studio', 'Salle de bain': '0', 'Étage': '1', 'Surface habitable': '53', 'Chambres': '1', 'Salons': '1'}",2025-11-03 00:45:30.003000,rent,Appartements,Mehdia,Toute la ville,avito
57153879,https://www.avito.ma/fr/anza/appartements/Appartement_à_louer_70_m²_à_Agadir_57153879.htm,Appartement à louer 70 m² à Agadir,2300.0,"Je mets en location mensuelle un appartement de 70 m2, un salon plus une chambre plus un Hall, quartier Hassania Anza (très calme) en face collège Ibn-Khaldoun, 7 mins au centre ville 15 mins à Taghazout,",Abdou,Particulier,2025-11-03 01:37:18,"['https://content.avito.ma/classifieds/images/10141390686?t=images', 'https://content.avito.ma/classifieds/images/10141390687?t=images', 'https://content.avito.ma/classifieds/images/10141390688?t=images', 'https://content.avito.ma/classifieds/images/10141390689?t=images', 'https://content.avito.ma/classifieds/images/10141390691?t=images', 'https://content.avito.ma/classifieds/images/10141390696?t=images', 'https://content.avito.ma/classifieds/images/10141390697?t=images', 'https://content.avito.ma/classifieds/images/10141390705?t=images']","['2', '0', '98', '1 mois', '0']","{'Salle de bain': '0', 'Chambres': '2', 'Caution': '1 mois', 'Salons': '0', 'Surface totale': '98'}",2025-11-03 00:45:30.003000,rent,Appartements,Agadir,Anza,avito
57031780,https://www.avito.ma/fr/abdelmoumen/appartements/Appartement_vide_à_louer_Abdelmoumen_57031780.htm,Appartement vide à louer Abdelmoumen,7500.0,"Appartement vide de 116m2 à louer longue durée, très bien situé au croisement de bd Abdelmoumen et Anoual. Composé de deux salons, un avec cheminée, deux chambres et deux salles de bain. Loué VIDE, 6ème étage avec ascenseur. Loyer 7500 dhs.",MedZaim immobilier,Particulier,2025-11-03 01:35:20,"['https://content.avito.ma/classifieds/images/10140077051?t=images', 'https://content.avito.ma/classifieds/images/10140077050?t=images', 'https://content.avito.ma/classifieds/images/10140077052?t=images', 'https://content.avito.ma/classifieds/images/10140077053?t=images', 'https://content.avito.ma/classifieds/images/10140077055?t=images', 'https://content.avito.ma/classifieds/images/10140077054?t=images', 'https://content.avito.ma/classifieds/images/10140077056?t=images']","['2', '2', '2', '116', '6', 'Climatisation', 'Concierge', 'Parking', 'Sécurité']","{'Salle de bain': '2', 'Étage': '6', 'Surface habitable': '116', 'Chambres': '2', 'Salons': '2'}",2025-11-03 00:45:30.003000,rent,Appartements,Casablanca,Abdelmoumen,avito
57105722,https://www.avito.ma/fr/hermitage/appartements/studio_à_louer_hermitage_57105722.htm,studio à louer hermitage,5500.0,"chambre salon cuisine séparée douche ascenseur garage près de toutes commodités endroit excellent ensoleillé ,",Winks immobilier,Particulier,2025-11-02 17:35:03,"['https://content.avito.ma/classifieds/images/10141074289?t=images', 'https://content.avito.ma/classifieds/images/10141074288?t=images', 'https://content.avito.ma/classifieds/images/10141074286?t=images', 'https://content.avito.ma/classifieds/images/10141074285?t=images', 'https://content.avito.ma/classifieds/images/10141074283?t=images', 'https://content.avito.ma/classifieds/images/10141074282?t=images', 'https://content.avito.ma/classifieds/images/10141074280?t=images', 'https://content.avito.ma/classifieds/images/10141074279?t=images']","['1', '1', '1', '50', '4']","{'Salle de bain': '1', 'Étage': '4', 'Surface habitable': '50', 'Chambres': '1', 'Salons': '1'}",2025-11-02 23:35:15.003000,rent,Appartements,Casablanca,Hermitage,avito
57104674,https://www.avito.ma/fr/hay_el_fath/local/Magasin_centre_commercial_ait_baha_57104674.htm,Magasin centre commercial ait baha,0.0,Magasin à louer centre commercial AIT BAHA pour plus d'info contacter,amr,Particulier,2025-11-02 16:35:04,"['https://content.avito.ma/classifieds/images/10140879814?t=images', 'https://content.avito.ma/classifieds/images/10140879815?t=images', 'https://content.avito.ma/classifieds/images/10140879816?t=images', 'https://content.avito.ma/classifieds/images/10140879826?t=images']","['0', '1', 'Parking', 'Sécurité']","{'Salle de bain': '0', 'Surface totale': '1'}",2025-11-02 23:35:15.003000,rent,Local,Rabat,Hay el Fath,avito


In [53]:
silver_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price_text: string (nullable = true)
 |-- price_value_mad: double (nullable = true)
 |-- category: string (nullable = true)
 |-- breadcrumbs: string (nullable = true)
 |-- description: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- published_date_text: string (nullable = true)
 |-- image_urls: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- equipments: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- attributes_map: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- ingest_ts: timestamp (nullable = true)



In [54]:
# Drop the 'price_text' column
silver_df = silver_df.drop('price_text')

# Rename 'price_value_mad' to 'price'
silver_df = silver_df.withColumnRenamed('price_value_mad', 'price')

# Replace NULL 'price' values with 0.0
silver_df = silver_df.fillna({'price': 0.0})

silver_df.select('price').show(10, truncate=False)

+-------+
|price  |
+-------+
|8500.0 |
|2500.0 |
|6000.0 |
|0.0    |
|5000.0 |
|35000.0|
|0.0    |
|9880.0 |
|14000.0|
|6500.0 |
+-------+
only showing top 10 rows



In [55]:
from pyspark.sql import functions as F

# Transform 'category' into 'offre' and 'category_type'
silver_df = (
    silver_df
    # Create 'offre' column for 'rent' or 'sale'
    .withColumn(
        "offre",
        F.when(F.col("category").contains("à louer"), "rent")
         .when(F.col("category").contains("à vendre"), "sale")
         .otherwise(None)
    )
    # Create 'category_type' column with only the property type (e.g., Maisons, Appartements)
    .withColumn(
        "type",
        F.when(
            F.col("category").contains("à louer") | F.col("category").contains("à vendre"),
            F.split(F.col("category"), ",")[0]
        ).otherwise(None)
    )
    # Drop the original 'category' column
    .drop("category")
)

# Check the result
silver_df.select("offre", "type").show(30, truncate=False)


+-----+----------------+
|offre|type            |
+-----+----------------+
|rent |Appartements    |
|rent |Appartements    |
|rent |Appartements    |
|rent |Autre Immobilier|
|rent |Appartements    |
|rent |Local           |
|rent |Appartements    |
|rent |Local           |
|rent |Appartements    |
|rent |Local           |
|rent |Appartements    |
|rent |Appartements    |
|rent |Appartements    |
|rent |Appartements    |
|rent |Appartements    |
|sale |Villas et Riads |
|sale |Appartements    |
|sale |Maisons         |
|sale |Maisons         |
|rent |Local           |
|rent |Appartements    |
|rent |Appartements    |
|rent |Appartements    |
|rent |Bureaux         |
|rent |Appartements    |
|rent |Bureaux         |
|rent |Appartements    |
|rent |Local           |
|rent |Bureaux         |
|rent |Appartements    |
+-----+----------------+
only showing top 30 rows



In [60]:
from pyspark.sql import functions as F

# Split 'breadcrumbs' based on '>'
split_breadcrumbs = F.split(F.col("breadcrumbs"), " > ")

# Create new columns for each segment
silver_df = (
    silver_df
    .withColumn("city", split_breadcrumbs.getItem(2))  # e.g., "Casablanca"
    .withColumn("neighborhood", split_breadcrumbs.getItem(3))  # e.g., "Maarif"
    .withColumn("site", split_breadcrumbs.getItem(4))  # e.g., "Avito Immobilier"
    .drop("breadcrumbs")  # Drop the original column if not needed
)

# Check the result
silver_df.select(
     "city", "neighborhood", "site", 
).show(30, truncate=False)


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `breadcrumbs` cannot be resolved. Did you mean one of the following? [`image_urls`, `city`, `id`, `offre`, `price`].;
'Project [id#1007, url#1008, title#1009, price#1097, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135, split('breadcrumbs,  > , -1)[2] AS city#1296, neighborhood#1198, site#1271]
+- Project [id#1007, url#1008, title#1009, price#1097, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135, city#1181, neighborhood#1198, avito AS site#1271]
   +- Project [id#1007, url#1008, title#1009, price#1097, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135, city#1181, neighborhood#1198, site#1216]
      +- Project [id#1007, url#1008, title#1009, price#1097, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135, city#1181, neighborhood#1198, split(breadcrumbs#1013,  > , -1)[4] AS site#1216]
         +- Project [id#1007, url#1008, title#1009, price#1097, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135, city#1181, split(breadcrumbs#1013,  > , -1)[3] AS neighborhood#1198]
            +- Project [id#1007, url#1008, title#1009, price#1097, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135, split(breadcrumbs#1013,  > , -1)[2] AS city#1181]
               +- Project [id#1007, url#1008, title#1009, price#1097, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, type#1135]
                  +- Project [id#1007, url#1008, title#1009, price#1097, category#1012, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, offre#1119, CASE WHEN (Contains(category#1012, à louer) OR Contains(category#1012, à vendre)) THEN split(category#1012, ,, -1)[0] ELSE cast(null as string) END AS type#1135]
                     +- Project [id#1007, url#1008, title#1009, price#1097, category#1012, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980, CASE WHEN Contains(category#1012, à louer) THEN rent WHEN Contains(category#1012, à vendre) THEN sale ELSE cast(null as string) END AS offre#1119]
                        +- Project [id#1007, url#1008, title#1009, coalesce(nanvl(price#1068, cast(null as double)), cast(0.0 as double)) AS price#1097, category#1012, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980]
                           +- Project [id#1007, url#1008, title#1009, price_value_mad#1011 AS price#1068, category#1012, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980]
                              +- Project [id#1007, url#1008, title#1009, price_value_mad#1011, category#1012, breadcrumbs#1013, description#1014, seller_name#1015, seller_type#1016, published_date_text#1017, image_urls#1018, equipments#1019, attributes_map#1020, ingest_ts#980]
                                 +- Project [p#1003.id AS id#1007, p#1003.url AS url#1008, p#1003.title AS title#1009, p#1003.price_text AS price_text#1010, cast(regexp_replace(p#1003.price_text, [^0-9], , 1) as double) AS price_value_mad#1011, p#1003.category AS category#1012, p#1003.breadcrumbs AS breadcrumbs#1013, p#1003.description AS description#1014, p#1003.seller_name AS seller_name#1015, p#1003.seller_type AS seller_type#1016, p#1003.published_date AS published_date_text#1017, transform(split(p#1003.image_urls, \s*\|\s*, -1), lambdafunction(trim(lambda x_6#1035, None), lambda x_6#1035, false)) AS image_urls#1018, transform(split(p#1003.equipments, \s*;\s*, -1), lambdafunction(trim(lambda x_7#1036, None), lambda x_7#1036, false)) AS equipments#1019, from_json(MapType(StringType,StringType,true), p#1003.attributes, Some(Etc/UTC)) AS attributes_map#1020, ingest_ts#980]
                                    +- Filter isnotnull(p#1003)
                                       +- Project [id#978, ingest_ts#980, from_json(StructField(id,StringType,true), StructField(url,StringType,true), StructField(error,StringType,true), StructField(title,StringType,true), StructField(price_text,StringType,true), StructField(breadcrumbs,StringType,true), StructField(category,StringType,true), StructField(description,StringType,true), StructField(attributes,StringType,true), StructField(equipments,StringType,true), StructField(seller_name,StringType,true), StructField(seller_type,StringType,true), StructField(published_date,StringType,true), StructField(image_urls,StringType,true), payload#979, Some(Etc/UTC)) AS p#1003]
                                          +- SubqueryAlias local.raw.avito
                                             +- RelationV2[id#978, payload#979, ingest_ts#980] local.raw.avito local.raw.avito


In [59]:
from pyspark.sql import functions as F

# Change all values in the 'site' column to 'avito'
silver_df = silver_df.withColumn('site', F.lit('avito'))

# Show the updated data
silver_df.select('site').show(10, truncate=False)

+-----+
|site |
+-----+
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
+-----+
only showing top 10 rows

