In [67]:
try:
    spark.stop()
except Exception:
    pass

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Iceberg via REST")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "rest")
    .config("spark.sql.catalog.local.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.local.warehouse", "s3://lake/warehouse")
    .config("spark.sql.catalog.local.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.local.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.local.s3.path-style-access", "true")
    .config("spark.sql.catalog.local.s3.access-key-id", "admin")
    .config("spark.sql.catalog.local.s3.secret-access-key", "admin123")
    .config("spark.sql.catalog.local.s3.region", "us-east-1")
    .getOrCreate()
)

spark

25/11/03 01:16:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [68]:
spark.sql("SHOW NAMESPACES IN local").show(truncate=False)

+---------+
|namespace|
+---------+
|raw      |
+---------+



In [119]:
spark.sql("SHOW TABLES IN local.raw").show(truncate=False)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|silver   |avito    |false      |
+---------+---------+-----------+



In [97]:
import os

# Define the export path relative to the current directory of the Jupyter notebook
export_path = os.path.join(os.getcwd(), "exported_avito_table")

# Export the Iceberg table to Parquet format in the current directory
spark.read.format("iceberg").load("local.raw.avito") \
    .write.format("parquet").save(export_path)

# Confirm the files were saved by checking the directory content
print(f"Table exported to: {export_path}")

25/11/03 01:25:08 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

Table exported to: /opt/work/src/notebooks/exported_avito_table


In [100]:
# Load RAW table
raw_df = spark.table("local.raw.avito")   # or local.raw.sarouty

In [101]:
# Cr√©er un DataFrame "Silver" minimal
silver_df = raw_df.select("id").distinct()

# Afficher quelques lignes
silver_df.show(5, truncate=False)

print("‚úÖ Silver dataset initialized with only 'id' column.")
print("Total IDs:", silver_df.count())

                                                                                

+--------+
|id      |
+--------+
|57084338|
|29806631|
|57020711|
|37621281|
|56920617|
+--------+
only showing top 5 rows

‚úÖ Silver dataset initialized with only 'id' column.
Total IDs: 284


                                                                                

In [102]:
from pyspark.sql import functions as F, types as T

# 1) D√©finir le sch√©ma du JSON dans "payload"
payload_schema = T.StructType([
    T.StructField("id", T.StringType()),
    T.StructField("url", T.StringType()),
    T.StructField("error", T.StringType()),
    T.StructField("title", T.StringType()),
    T.StructField("price_text", T.StringType()),
    T.StructField("breadcrumbs", T.StringType()),
    T.StructField("category", T.StringType()),
    T.StructField("description", T.StringType()),
    T.StructField("attributes", T.StringType()),  # JSON imbriqu√© sous forme de string
    T.StructField("equipments", T.StringType()),
    T.StructField("seller_name", T.StringType()),
    T.StructField("seller_type", T.StringType()),
    T.StructField("published_date", T.StringType()),
    T.StructField("image_urls", T.StringType()),
])

# 2) Parser le JSON depuis la colonne string "payload"
parsed = (raw_df
    .select(
        *[c for c in raw_df.columns if c != "payload"],  # ex: garder ingest_ts s'il existe
        F.from_json(F.col("payload"), payload_schema).alias("p")
    )
    .filter(F.col("p").isNotNull())  # ignorer les lignes avec JSON invalide
)

# 3) Parser le JSON imbriqu√© "attributes" -> Map<String,String>
attrs_map = F.from_json(F.col("p.attributes"), T.MapType(T.StringType(), T.StringType()))

# 4) Nettoyages utiles:
# - price_value (MAD) √† partir de "price_text" (ex: "6 000 DH" -> 6000.0)
price_value = F.regexp_replace(F.col("p.price_text"), r"[^0-9]", "").cast("double")

# - image_urls -> array<string> en splittant sur " | " et trim de chaque url
image_urls_arr = F.transform(
    F.split(F.col("p.image_urls"), r"\s*\|\s*"),
    lambda x: F.trim(x)
)

# - equipments -> array<string> en splittant sur ";"
equipments_arr = F.transform(
    F.split(F.col("p.equipments"), r"\s*;\s*"),
    lambda x: F.trim(x)
)

# 5) Construire le DataFrame silver (colonnes √† plat)
silver_df = parsed.select(
    F.col("p.id").alias("id"),
    F.col("p.url").alias("url"),
    F.col("p.title").alias("title"),
    F.col("p.price_text").alias("price_text"),
    price_value.alias("price_value_mad"),
    F.col("p.category").alias("category"),
    F.col("p.breadcrumbs").alias("breadcrumbs"),
    F.col("p.description").alias("description"),
    F.col("p.seller_name").alias("seller_name"),
    F.col("p.seller_type").alias("seller_type"),
    F.col("p.published_date").alias("published_date_text"),
    image_urls_arr.alias("image_urls"),
    equipments_arr.alias("equipments"),
    attrs_map.alias("attributes_map"),
    # garder le timestamp d'ingestion s'il est pr√©sent dans ton raw_df
    *([F.col("ingest_ts")] if "ingest_ts" in raw_df.columns else [])
)

In [103]:
import pandas as pd
from IPython.display import display

# Keep things compact
pd.set_option("display.max_columns", 20)   # don't try to show hundreds
pd.set_option("display.max_colwidth", 80)  # clamp long cells to ~80 chars

# pick a small sample and flatten newlines so rows stay short
pdf = (silver_df.limit(10)
       .toPandas()
       .replace({r'[\r\n\t]+': ' '}, regex=True))

# simple, compact table with ellipsis in long cells
display(
    pdf.style
      .set_table_styles([
          {'selector': 'table', 'props': [('table-layout','fixed'), ('width','100%')]},
          {'selector': 'th, td', 'props': [
              ('max-width','280px'),
              ('white-space','nowrap'),
              ('overflow','hidden'),
              ('text-overflow','ellipsis')
          ]}
      ])
      .hide(axis='index')  # remove row numbers
)

id,url,title,price_text,price_value_mad,category,breadcrumbs,description,seller_name,seller_type,published_date_text,image_urls,equipments,attributes_map,ingest_ts
57050424,https://www.avito.ma/fr/route_de_casablanca/terrains_et_fermes/Terrain_17_km_Marrakech_surface_4500_m¬≤_57050424.htm,Terrain 17 km Marrakech surface 4500 m¬≤,600 000 DH,600000.0,"Terrains et fermes, √† vendre",Home Icon Accueil > Tout le Maroc > Marrakech > Route de Casablanca > Avito Immobilier > Terrains et fermes > Terrain 17 km Marrakech surface 4500 m¬≤,Terrain 17 km Marrakech surface 4500 m¬≤,ÿ®Ÿäÿπ Ÿàÿ¥ÿ±ÿßÿ°,Particulier,2025-11-02 14:30:03,"['https://content.avito.ma/classifieds/images/10140264120?t=images', 'https://content.avito.ma/classifieds/images/10140264121?t=images', 'https://content.avito.ma/classifieds/images/10140264116?t=images', 'https://content.avito.ma/classifieds/images/10140264130?t=images', 'https://content.avito.ma/classifieds/images/10140264119?t=images', 'https://content.avito.ma/classifieds/images/10140264131?t=images']","['4500', 'Villa']","{'Surface totale': '4500', 'Zoning': 'Villa'}",2025-11-03 01:30:15.002000
57084338,https://www.avito.ma/fr/av_mohammed_v/villas_et_riads/Villa_Haut_Standing___Morocco_Mall___Centre_Ville_57084338.htm,Villa Haut Standing - Morocco Mall / Centre-Ville,11 500 000 DH,11500000.0,"Villas et Riads, √† vendre",Home Icon Accueil > Tout le Maroc > Marrakech > Av Mohammed V > Avito Immobilier > Ventes Immobili√®res > Villas et Riads > Villa Haut Standing - Morocco Mall / Centre-Ville,"üìç Emplacement d‚Äôexception Nich√©e sur le prestigieux Boulevard Mohammed VI, √† 5 minutes du centre-ville et du Morocco Mall, cette villa contemporaine allie √©l√©gance, confort et fort potentiel locatif, dans un cadre moderne et lumineux. üíº Usage : habitation principale ou investissement locatif haut rendement üí∞ Prix de vente : 11 500 000dh üìà D√©j√† exploiter en location nuit√©e avec une rentabilit√© de 150 000dh/mois üèó Ann√©e de construction : 2018 üíº Frais d‚Äôagence : 2,5 % HT üìê Caract√©ristiques principales ‚Ä¢ üè† Terrain : 600 m¬≤ - Surface habitable : 233 m¬≤ sur trois niveaux ‚Ä¢ üß± Architecture moderne & finitions haut standing ‚Ä¢ üåø Jardin paysager entourant la villa ‚Ä¢ üèä‚Äç‚ôÇ Piscine priv√©e de 15 m x 4 m (non chauff√©e) ‚Ä¢ üöó Parking int√©rieur üè° Sous-sol ‚Ä¢ üõè 1 Chambre avec salle de bain et dressing ‚Ä¢ üõã Double s√©jour avec vue sur le jardin ‚Ä¢ üé¨ Salle de cin√©ma ‚Ä¢ üé± Espace billard üè† Rez-de-chauss√©e ‚Ä¢ üõã Triple s√©jour spacieux avec chemin√©e et vue sur piscine ‚Ä¢ üçΩ Cuisine am√©ricaine √©quip√©e moderne et fonctionnelle ‚Ä¢ üöª WC invit√©s ‚Ä¢ üåû Espaces de vie ouverts et lumineux gr√¢ce aux baies vitr√©es üõè √âtage ‚Ä¢ üëë Suite parentale senior avec terrasse priv√©e, salle de bain et grand dressing ‚Ä¢ üõå Deux suites suppl√©mentaires, chacune avec salle de bain et dressing privatif ‚òÄ Rooftop ‚Ä¢ Vue panoramique imprenable sur les montagnes de l'Atlas üîë Atouts majeurs ‚úÖ Emplacement prestigieux ‚úÖ Villa r√©cente & haut standing ‚úÖ Excellente rentabilit√© locative ‚úÖ Finitions premium ‚úÖ Id√©ale pour vivre ou investir",ORIGINAL STAY,Particulier,2025-11-02 15:30:05,"['https://content.avito.ma/classifieds/images/10140653996?t=images', 'https://content.avito.ma/classifieds/images/10140654085?t=images', 'https://content.avito.ma/classifieds/images/10140654089?t=images', 'https://content.avito.ma/classifieds/images/10140654125?t=images', 'https://content.avito.ma/classifieds/images/10140654129?t=images', 'https://content.avito.ma/classifieds/images/10140654145?t=images', 'https://content.avito.ma/classifieds/images/10140654159?t=images', 'https://content.avito.ma/classifieds/images/10140654193?t=images', 'https://content.avito.ma/classifieds/images/10140654220?t=images', 'https://content.avito.ma/classifieds/images/10140654319?t=images', 'https://content.avito.ma/classifieds/images/10140654348?t=images', 'https://content.avito.ma/classifieds/images/10140654352?t=images', 'https://content.avito.ma/classifieds/images/10140654370?t=images', 'https://content.avito.ma/classifieds/images/10140654411?t=images', 'https://content.avito.ma/classifieds/images/10140654422?t=images']","['4', '4', '598', 'Bon √©tat', 'Imm√©diate', '5', 'Cuisine √©quip√©e', 'Jardin', 'Parking', 'Piscine', 'Terrasse']","{'Condition': 'Bon √©tat', 'Disponibilit√©': 'Imm√©diate', 'Salle de bain': '4', 'Chambres': '4', 'Salons': '5', 'Surface totale': '598'}",2025-11-03 01:30:15.002000
57132830,https://www.avito.ma/fr/route_de_casablanca/appartements/Appartement_√†_vendre_97_m¬≤_√†_Marrakech_57132830.htm,Appartement √† vendre 97 m¬≤ √† Marrakech,1 400 000 DH,1400000.0,"Appartements, √† vendre",Home Icon Accueil > Tout le Maroc > Marrakech > Route de Casablanca > Avito Immobilier > Ventes Immobili√®res > Appartements > Appartement √† vendre 97 m¬≤ √† Marrakech,üè° Appartement √† vendre üìç Emplacement : √Ä proximit√© de Marjane et McDonald's Road de Gaza üìè Superficie : 97 m¬≤ üè¢ √âtage : 3·µâ √©tage (avec ascenseur) üöó Parking : Place de stationnement au sous-sol Description : Bel appartement lumineux situ√© dans un emplacement calme et recherch√©. Il se compose de : Deux chambres avec placards et balcon Un grand salon spacieux Une cuisine tr√®s grande et fonctionnelle Une terrasse ensoleill√©e offrant une belle exposition toute la journ√©e üí∞ Prix : 1 400 000 DH,Souk Immo,Particulier,2025-11-02 14:30:07,"['https://content.avito.ma/classifieds/images/10141174978?t=images', 'https://content.avito.ma/classifieds/images/10141174979?t=images', 'https://content.avito.ma/classifieds/images/10141174980?t=images', 'https://content.avito.ma/classifieds/images/10141174981?t=images', 'https://content.avito.ma/classifieds/images/10141174995?t=images', 'https://content.avito.ma/classifieds/images/10141174994?t=images', 'https://content.avito.ma/classifieds/images/10141174996?t=images', 'https://content.avito.ma/classifieds/images/10141174997?t=images', 'https://content.avito.ma/classifieds/images/10141175044?t=images', 'https://content.avito.ma/classifieds/images/10141175045?t=images', 'https://content.avito.ma/classifieds/images/10141175043?t=images', 'https://content.avito.ma/classifieds/images/10141175046?t=images', 'https://content.avito.ma/classifieds/images/10141175065?t=images', 'https://content.avito.ma/classifieds/images/10141175068?t=images', 'https://content.avito.ma/classifieds/images/10141175066?t=images', 'https://content.avito.ma/classifieds/images/10141175067?t=images', 'https://content.avito.ma/classifieds/images/10141175115?t=images', 'https://content.avito.ma/classifieds/images/10141175114?t=images', 'https://content.avito.ma/classifieds/images/10141175113?t=images', 'https://content.avito.ma/classifieds/images/10141175116?t=images']","['2', '2', '97', 'Bon √©tat', 'Imm√©diate', '1', 'Ascenseur', 'Balcon', 'Climatisation', 'Concierge', 'Cuisine √©quip√©e', 'Meubl√©']","{'Condition': 'Bon √©tat', 'Disponibilit√©': 'Imm√©diate', 'Salle de bain': '2', 'Chambres': '2', 'Salons': '1', 'Surface totale': '97'}",2025-11-03 01:30:15.002000
57135550,https://www.avito.ma/fr/chefchaouen/terrains_et_fermes/Terrain_urbain_titr√©_√†_vendre_√†_Chefchaouen_57135550.htm,Terrain urbain titr√© √† vendre √† Chefchaouen,300 DH,300.0,"Terrains et fermes, √† vendre",Home Icon Accueil > Tout le Maroc > Chefchaouen > Toute la ville > Avito Immobilier > Terrains et fermes > Terrain urbain titr√© √† vendre √† Chefchaouen,"Terrain urbain titr√© √† vendre, zone touristique Superficie 4332 m√®tres Situ√© √† l'entr√©e de Chefchaouen en direction de la route de T√©touan dans le quartier de Tourafine Contient plus de 150 oliviers grands et petits et un certain nombre d'arbres fruitiers √Ä c√¥t√© de la route nationale principale Vente directe (sans interm√©diaire)",Loutfi,Particulier,2025-11-03 02:28:09,"['https://content.avito.ma/classifieds/images/10141271962?t=images', 'https://content.avito.ma/classifieds/images/10141271973?t=images', 'https://content.avito.ma/classifieds/images/10141271972?t=images', 'https://content.avito.ma/classifieds/images/10141271974?t=images', 'https://content.avito.ma/classifieds/images/10141271966?t=images', 'https://content.avito.ma/classifieds/images/10141271970?t=images', 'https://content.avito.ma/classifieds/images/10141271968?t=images', 'https://content.avito.ma/classifieds/images/10141271963?t=images', 'https://content.avito.ma/classifieds/images/10141271969?t=images', 'https://content.avito.ma/classifieds/images/10141271961?t=images', 'https://content.avito.ma/classifieds/images/10141271964?t=images', 'https://content.avito.ma/classifieds/images/10141271967?t=images']","['4332', 'Industriel', 'Titr√©']","{'Surface totale': '4332', 'Zoning': 'Industriel'}",2025-11-03 01:30:15.002000
37621456,https://www.avito.ma/fr/sidi_bernoussi/appartements/Appartement_√†_vendre_64_m¬≤_√†_Casablanca_37621456.htm,Appartement √† vendre 64 m¬≤ √† Casablanca,8 500 DH,8500.0,"Appartements, √† vendre",Home Icon Accueil > Tout le Maroc > Casablanca > Sidi Bernoussi > Avito Immobilier > Ventes Immobili√®res > Appartements > Appartement √† vendre 64 m¬≤ √† Casablanca,"R√©sidence s√©curis√©e avec cam√©ras de surveillance, ascenseur, parking sous-sol, parabole collective, piscine et climatisation pr√©-install√©e. Avec une finition de bon standing, les appartements de r√©sidences El Yassamine disposent de cuisines √©quip√©es, Bois de ch√™ne naturel, Volets roulants motoris√©s, Douche italienne, Sanitaire haut de gamme, ‚Ä¶ La r√©sidence est √† proximit√© de : √âcoles priv√©e et publique, commerces de proximit√©, boulangeries, p√¢tisseries, caf√©s, pharmacies, hammam, plateaux de bureau. R√©servez votre appartement et b√©n√©ficier de remises exceptionnelles sur le prix de vente.",Groupe Ikamati,Particulier,2025-11-03 02:21:12,"['https://content.avito.ma/classifieds/images/9757591781?t=images', 'https://content.avito.ma/classifieds/images/7742218029?t=images', 'https://content.avito.ma/classifieds/images/9790388927?t=images', 'https://content.avito.ma/classifieds/images/9777514240?t=images', 'https://content.avito.ma/classifieds/images/9768290784?t=images', 'https://content.avito.ma/classifieds/images/9785829271?t=images']","['2', '1', '1', '64', '4']","{'Salle de bain': '1', '√âtage': '4', 'Surface habitable': '64', 'Chambres': '2', 'Salons': '1'}",2025-11-03 01:30:15.002000
37619566,https://www.avito.ma/fr/autre_secteur/appartements/Appartement_Riad_Essalam_Beni_Yekhlef__Mohammedia_37619566.htm,Appartement Riad Essalam Beni Yekhlef- Mohammedia,195 000 DH,195000.0,"Appartements, √† vendre",Home Icon Accueil > Tout le Maroc > Mohammedia > Autre secteur > Avito Immobilier > Ventes Immobili√®res > Appartements > Appartement Riad Essalam Beni Yekhlef- Mohammedia,R√©sidences √©conomiques avec ascenseurs. Une Chambre/ salon/ cuisine/salle de bain. Prix : 195 000DH Livraison imm√©diate.,Groupe Ikamati,Particulier,2025-11-03 02:17:14,"['https://content.avito.ma/classifieds/images/2533557603?t=images', 'https://content.avito.ma/classifieds/images/2539801094?t=images', 'https://content.avito.ma/classifieds/images/2516510965?t=images', 'https://content.avito.ma/classifieds/images/2504023983?t=images', 'https://content.avito.ma/classifieds/images/2582417689?t=images', 'https://content.avito.ma/classifieds/images/2572806528?t=images']","['1', '1', 'Neuf', 'Imm√©diate', '0', 'Economique', 'Ascenseur']","{'Condition': 'Neuf', 'Disponibilit√©': 'Imm√©diate', 'Salle de bain': '1', 'Standing': 'Economique', 'Chambres': '1', 'Salons': '0'}",2025-11-03 01:30:15.002000
57138067,https://www.avito.ma/fr/autre_secteur/terrains_et_fermes/Terrain_√†_vendre_pr√®s_de_Tanger_Opportunit√©_Unique_57138067.htm,Terrain √† vendre pr√®s de Tanger Opportunit√© Unique,,,"Terrains et fermes, √† vendre",Home Icon Accueil > Tout le Maroc > Tanger > Autre secteur > Avito Immobilier > Terrains et fermes > Terrain √† vendre pr√®s de Tanger Opportunit√© Unique,"Ce terrain titr√© de 22 738 m¬≤ est id√©alement situ√© au c≈ìur de la zone commerciale et industrielle de Melloussa. Il b√©n√©ficie d‚Äôun zoning ¬´ Zone de service et animation ¬ª, permettant des projets h√¥teliers, touristiques, commerciaux et de services (restaurants, agences bancaires, agences de voyage, bureaux, stations-service, etc.). La construction est autoris√©e jusqu‚Äô√† R plus 3, offrant de nombreuses possibilit√©s de d√©veloppement. De plus, il b√©n√©ficie d‚Äôun acc√®s facile √† T√©touan, √† Tanger, au port Tanger Med, ainsi qu‚Äôaux zones industrielles comme celles de Mellousa et de Renault (l‚Äôusine Renault). L‚Äôautoroute √† proximit√© permet √©galement un acc√®s rapide envers Rabat. Ne manquez pas cette opportunit√© rare. Pour toute demande de renseignements, veuillez nous contacter au num√©ro suivant, disponible √©galement sur WhatsApp.",Ben,Particulier,2025-11-02 18:10:04,"['https://content.avito.ma/classifieds/images/10141229399?t=images', 'https://content.avito.ma/classifieds/images/10141229434?t=images', 'https://content.avito.ma/classifieds/images/10141229435?t=images', 'https://content.avito.ma/classifieds/images/10141229437?t=images', 'https://content.avito.ma/classifieds/images/10141229438?t=images']","['22738', 'Service public', 'Titr√©']","{'Surface totale': '22738', 'Zoning': 'Service public'}",2025-11-03 00:10:15.002000
57028285,https://www.avito.ma/fr/sidi_rahal/villas_et_riads/VILLA_√Ä_VENDRE_SUR_SIDI_RAHAL_EXCEPTIONNELLE_57028285.htm,VILLA √Ä VENDRE SUR SIDI RAHAL EXCEPTIONNELLE,3 100 000 DH,3100000.0,"Villas et Riads, √† vendre",Home Icon Accueil > Tout le Maroc > Sidi Rahal > Toute la ville > Avito Immobilier > Ventes Immobili√®res > Villas et Riads > VILLA √Ä VENDRE SUR SIDI RAHAL EXCEPTIONNELLE,VILLA √Ä VENDRE SUR SIDI RAHAL -,Century21ollier,Particulier,2025-11-02 12:10:06,"['https://content.avito.ma/classifieds/images/10140044485?t=images', 'https://content.avito.ma/classifieds/images/10140044496?t=images', 'https://content.avito.ma/classifieds/images/10140044500?t=images', 'https://content.avito.ma/classifieds/images/10140044504?t=images', 'https://content.avito.ma/classifieds/images/10140044517?t=images', 'https://content.avito.ma/classifieds/images/10140044526?t=images', 'https://content.avito.ma/classifieds/images/10140044531?t=images', 'https://content.avito.ma/classifieds/images/10140044538?t=images']","['6', '3', '1553', 'Bon √©tat', 'Imm√©diate', '1', 'Balcon', 'Chauffage', 'Climatisation', 'Garage', 'Jardin', 'Parking']","{'Condition': 'Bon √©tat', 'Disponibilit√©': 'Imm√©diate', ""Nombre d'√©tage"": '1', 'Salle de bain': '3', 'Chambres': '6', 'Surface totale': '1553'}",2025-11-03 00:10:15.002000
57027014,https://www.avito.ma/fr/souissi/terrains_et_fermes/Terrain_Lala_Zineb_de_2343m¬≤_57027014.htm,Terrain Lala Zineb de 2343m¬≤,,,"Terrains et fermes, √† vendre",Home Icon Accueil > Tout le Maroc > Rabat > Souissi > Avito Immobilier > Terrains et fermes > Terrain Lala Zineb de 2343m¬≤,"Terrain √† vendre Lala Zineb Terrain en vente √† Souissi Pour plus d'informations, veuillez contacter M. Rashid",High immo,Particulier,2025-11-02 23:10:07,['https://content.avito.ma/classifieds/images/10140028831?t=images'],['2343'],{'Surface totale': '2343'},2025-11-03 00:10:15.002000
57066031,https://www.avito.ma/fr/autre_secteur/local/Local_commercial_3_garages_57066031.htm,Local commercial 3 garages,15 000 DH,15000.0,"Local, √† vendre",Home Icon Accueil > Tout le Maroc > Casablanca > Autre secteur > Avito Immobilier > Local > Local commercial 3 garages,"Soci√©t√© by home vous propose des magasines sur * bouskoura Victoria superficies 66m2 commerce et 12m2 voiture * Dar bouaaza errahma 41m2, RDC 21m2 suppent et 50m2 cave. dans une r√©sidence haut-standing. pour plus d'informations contactez nous.",By home,Particulier,2025-11-03 00:59:09,"['https://content.avito.ma/classifieds/images/10140466849?t=images', 'https://content.avito.ma/classifieds/images/10140466850?t=images', 'https://content.avito.ma/classifieds/images/10140466852?t=images', 'https://content.avito.ma/classifieds/images/10140466851?t=images']","['Neuf', 'Imm√©diate', '1', '66', 'Moins de 1 an', 'Chauffage', 'Climatisation', 'Parking', 'S√©curit√©']","{'Condition': 'Neuf', 'Disponibilit√©': 'Imm√©diate', 'Salle de bain': '1', 'Surface totale': '66', '√Çge du bien': 'Moins de 1 an'}",2025-11-03 00:10:15.002000


In [104]:
silver_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price_text: string (nullable = true)
 |-- price_value_mad: double (nullable = true)
 |-- category: string (nullable = true)
 |-- breadcrumbs: string (nullable = true)
 |-- description: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- published_date_text: string (nullable = true)
 |-- image_urls: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- equipments: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- attributes_map: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- ingest_ts: timestamp (nullable = true)



In [105]:
# Drop the 'price_text' column
silver_df = silver_df.drop('price_text')

# Rename 'price_value_mad' to 'price'
silver_df = silver_df.withColumnRenamed('price_value_mad', 'price')

# Replace NULL 'price' values with 0.0
silver_df = silver_df.fillna({'price': 0.0})

silver_df.select('price').show(10, truncate=False)

+---------+
|price    |
+---------+
|0.0      |
|3100000.0|
|0.0      |
|15000.0  |
|490000.0 |
|2500000.0|
|8500.0   |
|2500.0   |
|6000.0   |
|0.0      |
+---------+
only showing top 10 rows



In [106]:
from pyspark.sql import functions as F

# Transform 'category' into 'offre' and 'category_type'
silver_df = (
    silver_df
    # Create 'offre' column for 'rent' or 'sale'
    .withColumn(
        "offre",
        F.when(F.col("category").contains("√† louer"), "rent")
         .when(F.col("category").contains("√† vendre"), "sale")
         .otherwise(None)
    )
    # Create 'category_type' column with only the property type (e.g., Maisons, Appartements)
    .withColumn(
        "type",
        F.when(
            F.col("category").contains("√† louer") | F.col("category").contains("√† vendre"),
            F.split(F.col("category"), ",")[0]
        ).otherwise(None)
    )
    # Drop the original 'category' column
    .drop("category")
)

# Check the result
silver_df.select("offre", "type").show(30, truncate=False)


+-----+------------------+
|offre|type              |
+-----+------------------+
|rent |Bureaux           |
|rent |Appartements      |
|rent |Appartements      |
|rent |Appartements      |
|rent |Bureaux           |
|rent |Appartements      |
|rent |Appartements      |
|rent |Appartements      |
|rent |Appartements      |
|rent |Appartements      |
|rent |Appartements      |
|rent |Appartements      |
|rent |Local             |
|rent |Bureaux           |
|rent |Appartements      |
|rent |Bureaux           |
|rent |Appartements      |
|rent |Local             |
|sale |Terrains et fermes|
|sale |Villas et Riads   |
|sale |Terrains et fermes|
|sale |Local             |
|sale |Appartements      |
|sale |Maisons           |
|sale |Terrains et fermes|
|sale |Villas et Riads   |
|sale |Appartements      |
|sale |Terrains et fermes|
|sale |Appartements      |
|sale |Appartements      |
+-----+------------------+
only showing top 30 rows



In [107]:
from pyspark.sql import functions as F

# Split 'breadcrumbs' based on '>'
split_breadcrumbs = F.split(F.col("breadcrumbs"), " > ")

# Create new columns for each segment
silver_df = (
    silver_df
    .withColumn("city", split_breadcrumbs.getItem(2))  # e.g., "Casablanca"
    .withColumn("neighborhood", split_breadcrumbs.getItem(3))  # e.g., "Maarif"
    .withColumn("site", split_breadcrumbs.getItem(4))  # e.g., "Avito Immobilier"
    .drop("breadcrumbs")  # Drop the original column if not needed
)

# Check the result
silver_df.select(
     "city", "neighborhood", "site", 
).show(30, truncate=False)


+-----------+-------------------+----------------+
|city       |neighborhood       |site            |
+-----------+-------------------+----------------+
|Sal√©       |Hay Chmaou         |Avito Immobilier|
|K√©nitra    |Autre secteur      |Avito Immobilier|
|Tanger     |Centre ville       |Avito Immobilier|
|Casablanca |Sidi Bernoussi     |Avito Immobilier|
|Temara     |Autre secteur      |Avito Immobilier|
|El Jadida  |Saada              |Avito Immobilier|
|Casablanca |Laimoune           |Avito Immobilier|
|El Jadida  |Autre secteur      |Avito Immobilier|
|Mohammedia |Quartier du Parc   |Avito Immobilier|
|Casablanca |Roches Noires      |Avito Immobilier|
|Casablanca |Belv√©d√®re          |Avito Immobilier|
|Tanger     |De La Plage        |Avito Immobilier|
|Casablanca |Ain Sebaa          |Avito Immobilier|
|Casablanca |Lissasfa           |Avito Immobilier|
|Rabat      |Hay Riad           |Avito Immobilier|
|Marrakech  |Targa              |Avito Immobilier|
|Casablanca |Val Fleuri    

In [108]:
from pyspark.sql import functions as F

# Change all values in the 'site' column to 'avito'
silver_df = silver_df.withColumn('site', F.lit('avito'))

# Show the updated data
silver_df.select('site').show(10, truncate=False)

+-----+
|site |
+-----+
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
|avito|
+-----+
only showing top 10 rows



In [109]:
# Group by 'seller_name' and 'seller_type', and count the occurrences
silver_df.groupBy('seller_type').count().orderBy('count', ascending=False).show(10, truncate=False)



+-----------+-----+
|seller_type|count|
+-----------+-----+
|Particulier|548  |
|Boutique   |1    |
|NULL       |1    |
+-----------+-----+



                                                                                

In [110]:
from pyspark.sql import functions as F

# Convert 'seller_type' to lowercase
silver_df = silver_df.withColumn("seller_type", F.lower(F.col("seller_type")))

# Show the distinct values of 'seller_type' with counts, including NULLs
silver_df.groupBy('seller_type').count().orderBy('count', ascending=False).show(30, truncate=False)

# Show rows where 'seller_type' is NULL
silver_df.filter(F.col('seller_type').isNull()).show(5, truncate=False)

+-----------+-----+
|seller_type|count|
+-----------+-----+
|particulier|548  |
|boutique   |1    |
|NULL       |1    |
+-----------+-----+



                                                                                

+--------+---------------------------------------------------------------------------------------------+-----+-----+-----------+-----------+-----------+-------------------+----------+----------+--------------+-----------------------+-----+----+----+------------+-----+
|id      |url                                                                                          |title|price|description|seller_name|seller_type|published_date_text|image_urls|equipments|attributes_map|ingest_ts              |offre|type|city|neighborhood|site |
+--------+---------------------------------------------------------------------------------------------+-----+-----+-----------+-----------+-----------+-------------------+----------+----------+--------------+-----------------------+-----+----+----+------------+-----+
|57153737|https://www.avito.ma/fr/b√©ni_mellal/appartements/Bel_appartement_√©quip√©_√†_vendre_57153737.htm|NULL |0.0  |NULL       |NULL       |NULL       |NULL               |NULL      |NULL  

In [111]:
# Drop rows where 'seller_type' is NULL
silver_df = silver_df.filter(F.col('seller_type').isNotNull())

# Show the updated result to confirm the rows are dropped
silver_df.select('seller_type').distinct().show(10, truncate=False)

+-----------+
|seller_type|
+-----------+
|particulier|
|boutique   |
+-----------+



                                                                                

In [112]:
# Group by 'seller_name' and 'seller_type', and count the occurrences
silver_df.groupBy('seller_name').count().orderBy('count', ascending=False).show(5, truncate=False)



+------------------+-----+
|seller_name       |count|
+------------------+-----+
|Groupe Ikamati    |22   |
|Allo immo         |21   |
|Immo continental  |16   |
|MedZaim immobilier|15   |
|SAKAN LIK GROUPE  |13   |
+------------------+-----+
only showing top 5 rows



                                                                                

In [113]:
from pyspark.sql.functions import to_timestamp

# Convert 'published_date_text' to timestamp and rename it to 'published_date'
silver_df = silver_df.withColumn(
    "published_date", 
    to_timestamp("published_date_text", "yyyy-MM-dd HH:mm:ss")  # Adjust format if needed
).drop("published_date_text")  # Drop the original 'published_date_text' column

# Show the result
silver_df.select("id", "published_date").show(5, truncate=False)

+--------+-------------------+
|id      |published_date     |
+--------+-------------------+
|57050424|2025-11-02 14:30:03|
|57084338|2025-11-02 15:30:05|
|57132830|2025-11-02 14:30:07|
|57135550|2025-11-03 02:28:09|
|37621456|2025-11-03 02:21:12|
+--------+-------------------+
only showing top 5 rows



In [114]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType
import re

# Define a UDF to clean the 'equipments' list, handling NoneType
def clean_equipments(equipments):
    # Handle NoneType case
    if equipments is None:
        return []
    
    # Filter out unwanted entries (numbers and time-related strings)
    cleaned = [item for item in equipments if not re.match(r'^\d+$', str(item)) and not re.match(r'.*(mois|an).*', str(item))]
    return cleaned

# Register the UDF
clean_equipments_udf = udf(clean_equipments, ArrayType(StringType()))

# Apply the UDF to clean 'equipments' directly (in place)
silver_df = silver_df.withColumn("equipments", clean_equipments_udf(col("equipments")))

# Show the cleaned 'equipments' column
silver_df.select("equipments").show(5, truncate=False)

+-------------------------------------------------------------------+
|equipments                                                         |
+-------------------------------------------------------------------+
|[Ascenseur, Balcon, Concierge, Cuisine √©quip√©e, Parking, S√©curit√©] |
|[Ascenseur, Climatisation, C√¢blage t√©l√©phonique, Parking, S√©curit√©]|
|[]                                                                 |
|[Ascenseur, Concierge, Parking, S√©curit√©]                          |
|[S√©curit√©]                                                         |
+-------------------------------------------------------------------+
only showing top 5 rows



In [115]:
from pyspark.sql import functions as F

# Extract the keys from 'attributes_map'
keys = silver_df.select(F.explode(F.map_keys(F.col("attributes_map"))).alias("attribute")).distinct().rdd.flatMap(lambda x: x).collect()

# For each key, create a new column with its corresponding value from 'attributes_map'
for key in keys:
    silver_df = silver_df.withColumn(
        key, 
        F.when(F.col("attributes_map").getItem(key).isNotNull(), 
               F.col("attributes_map").getItem(key)).otherwise(None)
    )

# Drop the 'attributes_map' column after extracting the keys and values
silver_df = silver_df.drop("attributes_map")

# Show the result with the new columns and drop the 'attributes_map' column
silver_df.select("id", *keys).show(5, truncate=False)

                                                                                

+--------+-----------------+-------+--------------+--------+--------------+-----+-------------+-------------+----------------+--------+----------------------+---------+--------------+-------------+------+------------------+
|id      |Surface habitable|Caution|Zoning        |Standing|Surface totale|√âtage|√Çge du bien  |Salle de bain|Nombre de pi√®ces|Chambres|Frais de syndic / mois|Condition|Nombre d'√©tage|Disponibilit√©|Salons|Type d'appartement|
+--------+-----------------+-------+--------------+--------+--------------+-----+-------------+-------------+----------------+--------+----------------------+---------+--------------+-------------+------+------------------+
|57138067|NULL             |NULL   |Service public|NULL    |22738         |NULL |NULL         |NULL         |NULL            |NULL    |NULL                  |NULL     |NULL          |NULL         |NULL  |NULL              |
|57028285|NULL             |NULL   |NULL          |NULL    |1553          |NULL |NULL         |3   

In [116]:
import pandas as pd
from IPython.display import display

# Keep things compact
pd.set_option("display.max_columns", 20)   # don't try to show hundreds
pd.set_option("display.max_colwidth", 80)  # clamp long cells to ~80 chars

# pick a small sample and flatten newlines so rows stay short
pdf = (silver_df.limit(10)
       .toPandas()
       .replace({r'[\r\n\t]+': ' '}, regex=True))

# simple, compact table with ellipsis in long cells
display(
    pdf.style
      .set_table_styles([
          {'selector': 'table', 'props': [('table-layout','fixed'), ('width','100%')]},
          {'selector': 'th, td', 'props': [
              ('max-width','280px'),
              ('white-space','nowrap'),
              ('overflow','hidden'),
              ('text-overflow','ellipsis')
          ]}
      ])
      .hide(axis='index')  # remove row numbers
)

id,url,title,price,description,seller_name,seller_type,image_urls,equipments,ingest_ts,offre,type,city,neighborhood,site,published_date,Surface habitable,Caution,Zoning,Standing,Surface totale,√âtage,√Çge du bien,Salle de bain,Nombre de pi√®ces,Chambres,Frais de syndic / mois,Condition,Nombre d'√©tage,Disponibilit√©,Salons,Type d'appartement
57138067,https://www.avito.ma/fr/autre_secteur/terrains_et_fermes/Terrain_√†_vendre_pr√®s_de_Tanger_Opportunit√©_Unique_57138067.htm,Terrain √† vendre pr√®s de Tanger Opportunit√© Unique,0.0,"Ce terrain titr√© de 22 738 m¬≤ est id√©alement situ√© au c≈ìur de la zone commerciale et industrielle de Melloussa. Il b√©n√©ficie d‚Äôun zoning ¬´ Zone de service et animation ¬ª, permettant des projets h√¥teliers, touristiques, commerciaux et de services (restaurants, agences bancaires, agences de voyage, bureaux, stations-service, etc.). La construction est autoris√©e jusqu‚Äô√† R plus 3, offrant de nombreuses possibilit√©s de d√©veloppement. De plus, il b√©n√©ficie d‚Äôun acc√®s facile √† T√©touan, √† Tanger, au port Tanger Med, ainsi qu‚Äôaux zones industrielles comme celles de Mellousa et de Renault (l‚Äôusine Renault). L‚Äôautoroute √† proximit√© permet √©galement un acc√®s rapide envers Rabat. Ne manquez pas cette opportunit√© rare. Pour toute demande de renseignements, veuillez nous contacter au num√©ro suivant, disponible √©galement sur WhatsApp.",Ben,particulier,"['https://content.avito.ma/classifieds/images/10141229399?t=images', 'https://content.avito.ma/classifieds/images/10141229434?t=images', 'https://content.avito.ma/classifieds/images/10141229435?t=images', 'https://content.avito.ma/classifieds/images/10141229437?t=images', 'https://content.avito.ma/classifieds/images/10141229438?t=images']","['Service public', 'Titr√©']",2025-11-03 00:10:15.002000,sale,Terrains et fermes,Tanger,Autre secteur,avito,2025-11-02 18:10:04,,,Service public,,22738.0,,,,,,,,,,,
57028285,https://www.avito.ma/fr/sidi_rahal/villas_et_riads/VILLA_√Ä_VENDRE_SUR_SIDI_RAHAL_EXCEPTIONNELLE_57028285.htm,VILLA √Ä VENDRE SUR SIDI RAHAL EXCEPTIONNELLE,3100000.0,VILLA √Ä VENDRE SUR SIDI RAHAL -,Century21ollier,particulier,"['https://content.avito.ma/classifieds/images/10140044485?t=images', 'https://content.avito.ma/classifieds/images/10140044496?t=images', 'https://content.avito.ma/classifieds/images/10140044500?t=images', 'https://content.avito.ma/classifieds/images/10140044504?t=images', 'https://content.avito.ma/classifieds/images/10140044517?t=images', 'https://content.avito.ma/classifieds/images/10140044526?t=images', 'https://content.avito.ma/classifieds/images/10140044531?t=images', 'https://content.avito.ma/classifieds/images/10140044538?t=images']","['Bon √©tat', 'Imm√©diate', 'Balcon', 'Chauffage', 'Climatisation', 'Garage', 'Jardin', 'Parking']",2025-11-03 00:10:15.002000,sale,Villas et Riads,Sidi Rahal,Toute la ville,avito,2025-11-02 12:10:06,,,,,1553.0,,,3.0,,6.0,,Bon √©tat,1.0,Imm√©diate,,
57027014,https://www.avito.ma/fr/souissi/terrains_et_fermes/Terrain_Lala_Zineb_de_2343m¬≤_57027014.htm,Terrain Lala Zineb de 2343m¬≤,0.0,"Terrain √† vendre Lala Zineb Terrain en vente √† Souissi Pour plus d'informations, veuillez contacter M. Rashid",High immo,particulier,['https://content.avito.ma/classifieds/images/10140028831?t=images'],[],2025-11-03 00:10:15.002000,sale,Terrains et fermes,Rabat,Souissi,avito,2025-11-02 23:10:07,,,,,2343.0,,,,,,,,,,,
57066031,https://www.avito.ma/fr/autre_secteur/local/Local_commercial_3_garages_57066031.htm,Local commercial 3 garages,15000.0,"Soci√©t√© by home vous propose des magasines sur * bouskoura Victoria superficies 66m2 commerce et 12m2 voiture * Dar bouaaza errahma 41m2, RDC 21m2 suppent et 50m2 cave. dans une r√©sidence haut-standing. pour plus d'informations contactez nous.",By home,particulier,"['https://content.avito.ma/classifieds/images/10140466849?t=images', 'https://content.avito.ma/classifieds/images/10140466850?t=images', 'https://content.avito.ma/classifieds/images/10140466852?t=images', 'https://content.avito.ma/classifieds/images/10140466851?t=images']","['Neuf', 'Imm√©diate', 'Chauffage', 'Climatisation', 'Parking', 'S√©curit√©']",2025-11-03 00:10:15.002000,sale,Local,Casablanca,Autre secteur,avito,2025-11-03 00:59:09,,,,,66.0,,Moins de 1 an,1.0,,,,Neuf,,Imm√©diate,,
56708685,https://www.avito.ma/fr/alliance/appartements/√Ä_VENDRE_Bel_appartement_lumineux_2_fa√ßades_56708685.htm,√Ä VENDRE Bel appartement lumineux 2 fa√ßades,490000.0,"Bonjour, Je met en vente mon appartement presque neuf jamais habit√© situ√© au quartier Alliance Mehdia. Composition : 2 chambres, 1 salon, 2 salles de bain, 2 balcons. L‚Äôappartement est de deux fa√ßades oppos√©es offrant une excellente a√©ration et un ensoleillement optimal toute la journ√©e. Orientation Rabat. Prix raisonnable et l√©g√®rement n√©gociable. Les interm√©diaires √† s‚Äôabstenir svp.",Bouchra,particulier,"['https://content.avito.ma/classifieds/images/10137258041?t=images', 'https://content.avito.ma/classifieds/images/10137258042?t=images', 'https://content.avito.ma/classifieds/images/10137258040?t=images', 'https://content.avito.ma/classifieds/images/10137258043?t=images', 'https://content.avito.ma/classifieds/images/10137258038?t=images', 'https://content.avito.ma/classifieds/images/10137258045?t=images']","['Neuf', 'Imm√©diate', 'Balcon']",2025-11-03 00:10:15.002000,sale,Appartements,K√©nitra,alliance,avito,2025-11-03 00:58:11,,,,,,,,2.0,,2.0,100.0,Neuf,,Imm√©diate,1.0,
55924046,https://www.avito.ma/fr/ain_atig/maisons/Maison_√†_vendre_96_m¬≤_√†_Temara_Ain_Aatig_55924046.htm,Maison √† vendre 96 m¬≤ √† Temara Ain Aatig,2500000.0,"AGENCE MOSTAGIMMO Met en Vente une nouvelle Maison 2 Fa√ßade ou Secteur Ain Aatig bien en solier (Sud et ouest). Superficie Sous Terrain 96m et Habitable 270m. Compos√© pour Chaque √©tage : * RDC : GRANDE MAGASIN * 1er √©tage : 3 Chambres Avec Placard, Salon, Salle de Bain, Cuisine. * 2em √©tage : 3 Chambres Avec Placard, Salon, Salle de Bain, Cuisine. Pour plus d'informations ou faire une Visite, Contacter Votre Agence immobili√®re MOSTAGIMMO.",AGENCE IMMOBILIER MOSTAGIMMO,particulier,"['https://content.avito.ma/classifieds/images/10129786457?t=images', 'https://content.avito.ma/classifieds/images/10129786471?t=images', 'https://content.avito.ma/classifieds/images/10129786470?t=images', 'https://content.avito.ma/classifieds/images/10129786469?t=images', 'https://content.avito.ma/classifieds/images/10129786486?t=images', 'https://content.avito.ma/classifieds/images/10129786488?t=images', 'https://content.avito.ma/classifieds/images/10129786493?t=images', 'https://content.avito.ma/classifieds/images/10129786494?t=images', 'https://content.avito.ma/classifieds/images/10129786501?t=images', 'https://content.avito.ma/classifieds/images/10129786502?t=images', 'https://content.avito.ma/classifieds/images/10129786503?t=images', 'https://content.avito.ma/classifieds/images/10129786504?t=images', 'https://content.avito.ma/classifieds/images/10129786505?t=images', 'https://content.avito.ma/classifieds/images/10129786529?t=images', 'https://content.avito.ma/classifieds/images/10129786530?t=images']","['Neuf', 'Balcon', 'Garage']",2025-11-03 00:10:15.002000,sale,Maisons,Temara,Ain Atig,avito,2025-11-03 00:57:13,,,,,270.0,,,2.0,,6.0,,Neuf,2.0,,2.0,
57020711,https://www.avito.ma/fr/lissasfa/bureaux/Bureau_√†_louer_d_une_superficie_141_m¬≤_57020711.htm,Bureau √† louer d'une superficie 141 m¬≤,12690.0,"FBC met √† votre disposition un plateau bureau 311 √† louer de 141 m¬≤, situ√© dans un espace professionnel, s√©curis√©, calme et propre. Situ√© au 3√®me √©tage, bien √©quip√© : climatisation, 2 ascenseurs, cloisons, chauffage, parking et internet haut d√©bit (fibre optique), kitchenette, acc√®s direct √† deux autoroutes, celle de rond-point Azbane et la Rocade sud-ouest. Pour plus d'information, n'h√©sitez pas √† nous contacter.",FACILITIES BUSINESS CENTER,particulier,"['https://content.avito.ma/classifieds/images/10139961083?t=images', 'https://content.avito.ma/classifieds/images/10139961084?t=images', 'https://content.avito.ma/classifieds/images/10139961070?t=images', 'https://content.avito.ma/classifieds/images/10139961074?t=images', 'https://content.avito.ma/classifieds/images/10139961073?t=images', 'https://content.avito.ma/classifieds/images/10139961069?t=images', 'https://content.avito.ma/classifieds/images/10139961068?t=images', 'https://content.avito.ma/classifieds/images/10139961072?t=images', 'https://content.avito.ma/classifieds/images/10139961076?t=images', 'https://content.avito.ma/classifieds/images/10139961077?t=images', 'https://content.avito.ma/classifieds/images/10139961078?t=images', 'https://content.avito.ma/classifieds/images/10139961082?t=images']","['Ascenseur', 'Chauffage', 'Climatisation', 'C√¢blage t√©l√©phonique', 'Parking', 'S√©curit√©']",2025-11-03 00:17:15.003000,rent,Bureaux,Casablanca,Lissasfa,avito,2025-11-02 11:17:06,,,,,141.0,3.0,,0.0,3.0,,,,,,,
57055336,https://www.avito.ma/fr/hay_riad/appartements/Appartement_en_location_√†_Hay_Riad_Rabat_57055336.htm,Appartement en location √† Hay Riad Rabat,0.0,"Bel appartement de 195 m¬≤ en location au 1 er √©tage situ√© √† Hay Riad Rabat Compos√© de deux salons avec chemin√©e s√©jour cuisine salle d'inviter deux chambres avec salle de bain suite parental avec salle de bain et drissing prix de location 16000dhs / mois pour plus d'informations veuillez contacter nous ,",Immosaadaoui,particulier,"['https://content.avito.ma/classifieds/images/10140311388?t=images', 'https://content.avito.ma/classifieds/images/10140311398?t=images', 'https://content.avito.ma/classifieds/images/10140311399?t=images', 'https://content.avito.ma/classifieds/images/10140311404?t=images', 'https://content.avito.ma/classifieds/images/10140311389?t=images', 'https://content.avito.ma/classifieds/images/10140311393?t=images', 'https://content.avito.ma/classifieds/images/10140311387?t=images']",[],2025-11-03 00:17:15.003000,rent,Appartements,Rabat,Hay Riad,avito,2025-11-02 18:17:08,195.0,,,,,1.0,,,,4.0,,,,,,
57140550,https://www.avito.ma/fr/targa/bureaux/Bureaux_priv√©s_Domiciliation_√†_Targa__Marrakech_57140550.htm,"Bureaux priv√©s-Domiciliation √† Targa, Marrakech",1200.0,üè¢ Bureaux priv√©s & domiciliation √† Marrakech Besoin d‚Äôun espace professionnel cl√© en main √† Marrakech ? Excellentia Business Center vous propose : ‚úÖ Bureaux priv√©s √©quip√©s et climatis√©s ‚Äì pr√™ts √† l‚Äôemploi ‚úÖ Internet fibre optique haut d√©bit inclus ‚úÖ Service de domiciliation 100 % en ligne ‚Äì obtenez votre adresse professionnelle sans vous d√©placer ‚úÖ R√©ception et gestion de votre courrier Offrez √† votre entreprise une image professionnelle et un environnement de travail moderne au c≈ìur de Marrakech. üìû Contactez-nous d√®s aujourd‚Äôhui et profitez d‚Äôune solution compl√®te pour votre bureau et domiciliation.,Excellentia Business Center,particulier,"['https://content.avito.ma/classifieds/images/10141255291?t=images', 'https://content.avito.ma/classifieds/images/10141255293?t=images', 'https://content.avito.ma/classifieds/images/10141255292?t=images', 'https://content.avito.ma/classifieds/images/10141255294?t=images', 'https://content.avito.ma/classifieds/images/10141255301?t=images', 'https://content.avito.ma/classifieds/images/10141255302?t=images']","['Ascenseur', 'Climatisation', 'C√¢blage t√©l√©phonique', 'Parking']",2025-11-03 00:17:15.003000,rent,Bureaux,Marrakech,Targa,avito,2025-11-02 13:17:10,,,,,7.0,0.0,,1.0,5.0,,,,,,,
56620071,https://www.avito.ma/fr/val_fleuri/appartements/Studio_meubl√©_Val_Fleuri_TRAMWAY_56620071.htm,Studio meubl√© Val Fleuri TRAMWAY,6300.0,"Studio bien meubl√© √† louer, √† Val Fleuri, 45m¬≤, ETG1, 1 Salon balcon, 1 Cuisine √©quip√©e, 1 Chambre √† coucher,1 SDB douche, ascenseur parking concierge Loyer 6300 Dh par mois",ABAMNY Immobilier,particulier,"['https://content.avito.ma/classifieds/images/10141320194?t=images', 'https://content.avito.ma/classifieds/images/10141320200?t=images', 'https://content.avito.ma/classifieds/images/10141320202?t=images', 'https://content.avito.ma/classifieds/images/10141320211?t=images', 'https://content.avito.ma/classifieds/images/10141320220?t=images', 'https://content.avito.ma/classifieds/images/10141320224?t=images', 'https://content.avito.ma/classifieds/images/10141320228?t=images', 'https://content.avito.ma/classifieds/images/10141320229?t=images', 'https://content.avito.ma/classifieds/images/10141320230?t=images']","['Ascenseur', 'Balcon', 'Climatisation', 'Concierge', 'Cuisine √©quip√©e', 'Meubl√©']",2025-11-03 00:17:15.003000,rent,Appartements,Casablanca,Val Fleuri,avito,2025-11-03 00:58:12,45.0,1 mois,,,45.0,,,1.0,,1.0,,,,,1.0,


In [117]:
silver_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = false)
 |-- description: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- image_urls: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- equipments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- offre: string (nullable = true)
 |-- type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- site: string (nullable = false)
 |-- published_date: timestamp (nullable = true)
 |-- Surface habitable: string (nullable = true)
 |-- Caution: string (nullable = true)
 |-- Zoning: string (nullable = true)
 |-- Standing: string (nullable = true)
 |-- Surface totale: string (nullable = true)
 |-- √âtage: string (nullable = true)
 |-- √Çge 

In [118]:
# %%  Rename French columns to English (ASCII, snake_case) and sanitize all names

from unicodedata import normalize
import re

# 1) Explicit mapping for known French fields -> English
rename_map = {
    "offre": "offer",
    "type": "property_type",

    "Surface habitable": "living_area",
    "Surface totale": "total_area",
    "√âtage": "floor",
    "√Çge du bien": "property_age",
    "Salle de bain": "bathrooms",
    "Nombre de pi√®ces": "rooms",
    "Chambres": "bedrooms",
    "Frais de syndic / mois": "hoa_fee_per_month",
    "Condition": "condition",
    "Nombre d'√©tage": "floors",
    "Disponibilit√©": "availability",
    "Salons": "living_rooms",
    "Type d'appartement": "apartment_type",
    "Zoning": "zoning",
    "Standing": "standing",
    "Caution": "deposit",
}

# Apply the explicit renames only if the column exists
for old, new in rename_map.items():
    if old in silver_df.columns:
        silver_df = silver_df.withColumnRenamed(old, new)

# 2) Global sanitizer to ensure ASCII + snake_case and remove any lingering accents/spaces
def to_snake_ascii(name: str) -> str:
    # remove accents ‚Üí ASCII
    ascii_name = normalize("NFKD", name).encode("ascii", "ignore").decode("ascii")
    # lowercase
    ascii_name = ascii_name.lower()
    # replace non-alphanumeric with underscores
    ascii_name = re.sub(r"[^a-z0-9]+", "_", ascii_name)
    # collapse multiple underscores and trim edges
    ascii_name = re.sub(r"_+", "_", ascii_name).strip("_")
    return ascii_name

# Build a stable rename plan to avoid collisions
current_cols = silver_df.columns
sanitized = [to_snake_ascii(c) for c in current_cols]

# If any collisions after sanitization, make them unique by suffixing _2, _3, ...
seen = {}
unique_sanitized = []
for s in sanitized:
    if s not in seen:
        seen[s] = 1
        unique_sanitized.append(s)
    else:
        seen[s] += 1
        unique_sanitized.append(f"{s}_{seen[s]}")

# Apply sanitized names
for old, new in zip(current_cols, unique_sanitized):
    if old != new:
        silver_df = silver_df.withColumnRenamed(old, new)

# Quick check
silver_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = false)
 |-- description: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- image_urls: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- equipments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- offer: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- site: string (nullable = false)
 |-- published_date: timestamp (nullable = true)
 |-- living_area: string (nullable = true)
 |-- deposit: string (nullable = true)
 |-- zoning: string (nullable = true)
 |-- standing: string (nullable = true)
 |-- total_area: string (nullable = true)
 |-- floor: string (nullable = true)
 |-- propert