In [22]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [63]:
try:
    spark.stop()
except Exception:
    pass

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Iceberg via REST")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.rest.type", "rest")
    .config("spark.sql.catalog.rest.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.rest.warehouse", "s3://lake/warehouse")
    .config("spark.sql.catalog.rest.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.rest.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.rest.s3.path-style-access", "true")
    .config("spark.sql.catalog.rest.s3.access-key-id", "admin")
    .config("spark.sql.catalog.rest.s3.secret-access-key", "admin123")
    .config("spark.sql.catalog.rest.s3.region", "us-east-1")
    .getOrCreate()
)

spark

In [64]:
spark.sql("SHOW NAMESPACES IN rest").show(truncate=False)

+---------+
|namespace|
+---------+
|raw      |
|silver   |
+---------+



In [65]:
spark.sql("SHOW TABLES IN rest.raw").show(truncate=False)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|raw      |avito    |false      |
|raw      |mubawab  |false      |
+---------+---------+-----------+



In [53]:
# Load RAW table


In [68]:
## spark.sql("TRUNCATE TABLE rest.raw.mubawab")

DataFrame[]

In [73]:
spark.sql("SELECT COUNT(*) FROM rest.raw.mubawab").show()

+--------+
|count(1)|
+--------+
|     407|
+--------+



In [74]:
raw_mubawab = spark.table("rest.raw.mubawab") 

In [75]:
import pandas as pd

pd.set_option('display.max_colwidth', None)  # show full column text
pd.set_option('display.width', None)         # no wrapping
pd.set_option('display.max_rows', 10)        # adjust as you like

raw_mubawab.limit(1).toPandas()

                                                                                

Unnamed: 0,id,payload,ingest_ts
0,8248870,"{""id"": ""8248870"", ""url"": ""https://www.mubawab.ma/fr/a/8248870/location-longue-dur%C3%A9e-meubl%C3%A9"", ""error"": null, ""listing_type"": ""location"", ""title"": ""Location longue durée meublé"", ""price"": 14000, ""location_text"": ""Hivernage à Marrakech"", ""features_main_json"": ""{\""Type de bien\"": \""Appartement\"", \""Etat\"": \""Bon état\"", \""Surface\"": \""150 m²\"", \""Pièces\"": \""3 Pièces\"", \""Chambres\"": \""2 Chambres\"", \""Salles de bain\"": \""1 Salle de bain\""}"", ""features_amenities_json"": ""[\""Terrasse\"", \""Garage\""]"", ""description_text"": ""L’appartement est très bien situé à l’hivernage proche de tout commerce 2 chambres1 séjourUne grande terrassePlace de parking"", ""gallery_urls"": ""[\""https://www.mubawab-media.com/ad/8/248/870F/h/f59c37cb-fef0-498a-9e0a-a013cbf99294_81373544.avif\"", \""https://www.mubawab-media.com/ad/8/248/870F/h/1a6018eb-4585-4763-b76a-c2f862844f26_81373545.avif\"", \""https://www.mubawab-media.com/ad/8/248/870F/h/9fa3a823-0e59-4256-8dd0-76f017a54c70_81373546.avif\"", \""https://www.mubawab-media.com/ad/8/248/870F/h/466b2caa-d4a7-4efe-8c2c-34b466e71a7e_81373547.avif\"", \""https://www.mubawab-media.com/ad/8/248/870F/h/58002fbe-29a2-4e6b-b24b-42f6450893e2_81373548.avif\"", \""https://www.mubawab-media.com/ad/8/248/870F/h/822dd847-b15b-4204-a6a1-03f774d8eaea_81373549.avif\"", \""https://www.mubawab-media.com/ad/8/248/870F/h/5ff5269c-785d-468e-b7d9-10d8a98b7232_81373550.avif\""]"", ""agency_name"": ""Marrakech Homes"", ""agency_url"": ""https://www.mubawab.ma/fr/b/10207/marrakech-homes""}",2025-11-08 19:14:00.003


In [76]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, from_json, split, regexp_replace, trim, when, size

# Schema inside payload (based on your sample; extra fields are nullable by default)
payload_schema = StructType([
    StructField("id", StringType()),
    StructField("url", StringType()),
    StructField("error", StringType()),
    StructField("listing_type", StringType()),      # "location" | "vente" | etc.
    StructField("title", StringType()),
    StructField("price", DoubleType()),             # already numeric in your sample
    StructField("location_text", StringType()),     # e.g. "Samlalia à Marrakech"
    StructField("features_amenities_json", StringType()),  # JSON array as string
    StructField("description_text", StringType()),
    StructField("features_main_json", StringType()),       # JSON object as string
    StructField("gallery_urls", StringType()),             # JSON array as string
    StructField("agency_name", StringType()),
    StructField("agency_url", StringType()),
    # Some listings might have these; keep them if your scraper adds later:
    StructField("published_date", StringType(), True),
])

# Parse the outer payload
raw_mubawab = (
    raw_mubawab
    .withColumn("j", from_json(col("payload"), payload_schema))
    .select(
        col("id").alias("record_id"),
        col("ingest_ts"),
        col("j.*")
    )
)

In [77]:

pd.set_option('display.max_colwidth', 30)  
pd.set_option('display.width', 20)         
pd.set_option('display.max_rows', 15)  

raw_mubawab.limit(1).toPandas()

Unnamed: 0,record_id,ingest_ts,id,url,error,listing_type,title,price,location_text,features_amenities_json,description_text,features_main_json,gallery_urls,agency_name,agency_url,published_date
0,8247341,2025-11-08 19:11:30.003,8247341,https://www.mubawab.ma/fr/...,,location,TARGA villa de standing me...,30000.0,Hay Targa à Marrakech,"[""Jardin"", ""Terrasse"", ""Pi...",À LOUER – Villa de standin...,"{""Type de bien"": ""Villa"", ...","[""https://www.mubawab-medi...",Les Tailleurs de l'Immobilier,https://www.mubawab.ma/fr/...,


In [61]:
from pyspark.sql.functions import col, count

(
    raw_mubawab.groupBy("features_main_json")
          .agg(count("*").alias("count"))
          .orderBy(col("count").asc())
          .show(10, truncate=False)
)




+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features_main_json                                                                                                                                                             |count|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|{"Type de bien": "Appartement", "Etat": "Bon état", "Années": "10-20 ans", "Étage du bien": "9ème"}                                                                            |1    |
|{"Type de bien": "Appartement", "Etat": "Nouveau", "Années": "Moins d'un an", "Étage du bien": "1er", "Orientation": "Sud", "Type du sol": "Marbre"}                           |1    |
|{"Type de bien": "Villa", "Surface de la parcelle": "350 m²", "Etat": "Bon état

                                                                                

## id

In [78]:
raw_mubawab.select("id", "record_id").filter(col("id") != col("record_id")).show(truncate=False)

+---+---------+
|id |record_id|
+---+---------+
+---+---------+



In [79]:
raw_mubawab = raw_mubawab.drop("record_id")

In [80]:
from pyspark.sql.functions import col, count


dupes = (
    raw_mubawab.groupBy("id")
      .agg(count("*").alias("count"))
      .filter(col("count") > 1)
      .orderBy(col("count").desc())
)

dupes.show(truncate=False)
print("Total duplicate IDs:", dupes.count())

+---+-----+
|id |count|
+---+-----+
+---+-----+

Total duplicate IDs: 0


In [81]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

df = spark.table("rest.raw.avito")  # or rest.silver.avito / rest.raw.mubawab

# Window: group by id, order by most recent ingest_ts
w = Window.partitionBy("id").orderBy(col("ingest_ts").desc())

# Keep only first row (latest per id)
raw_mubawab = (
    raw_mubawab.withColumn("rn", row_number().over(w))
      .filter(col("rn") == 1)
      .drop("rn")
)

## url

In [82]:
from pyspark.sql.functions import col, trim

raw_mubawab = raw_mubawab.filter(
    (col("url").isNotNull()) &
    (trim(col("url")) != "")
)

## price

In [83]:
raw_mubawab.select("price").distinct().show(10, truncate=False)

+---------+
|price    |
+---------+
|4800.0   |
|330000.0 |
|90000.0  |
|1100000.0|
|1000000.0|
|4250000.0|
|35000.0  |
|5600.0   |
|21000.0  |
|110000.0 |
+---------+
only showing top 10 rows



In [84]:
from pyspark.sql.functions import when, col, lit

raw_mubawab = raw_mubawab.withColumn(
    "price",
    when((col("price") <= 0) | col("price").isNull(), lit(None)).otherwise(col("price"))
)

## seller

In [85]:
from pyspark.sql.functions import col, count

raw_mubawab.groupBy("agency_name") \
    .agg(count("*").alias("count")) \
    .orderBy(col("count").desc()) \
    .show(truncate=False)

+------------------------------------------------------+-----+
|agency_name                                           |count|
+------------------------------------------------------+-----+
|NULL                                                  |122  |
|8th Avenue                                            |24   |
|Castle Agency                                         |7    |
|Le Comptoir, Agence Immobilière et Sté de Construction|7    |
|MDK Immobilier                                        |7    |
|ANDALOUSSI IMMOBILIER (APS)                           |5    |
|ghita store immo                                      |5    |
|Kay Realestate                                        |5    |
|Clean World Service                                   |5    |
|BY-TAK IMMOBILIER                                     |4    |
|Capital Foncier                                       |4    |
|Nourdine Immo                                         |4    |
|Immoscout24                                           

In [86]:
from pyspark.sql.functions import col, lower, trim, when

raw_mubawab = (
    raw_mubawab
    .withColumn(
        "seller",
        when(
            (col("agency_name").isNull()) |
            (trim(col("agency_name")) == "") |
            (lower(trim(col("agency_name"))).isin("nan", "null", "unknown")),
            "unknown"
        ).otherwise(lower(trim(col("agency_name"))))
    )
    .drop("agency_name")
)

## images 

In [87]:
from pyspark.sql.functions import split, trim, col, from_json, expr
from pyspark.sql.types import ArrayType, StringType


# --- MUBAWAB ---
raw_mubawab = (
    raw_mubawab
    .withColumn(
        "images",
        from_json(col("gallery_urls"), ArrayType(StringType()))
    )
    .drop("gallery_urls")
)

In [88]:
raw_mubawab.select("images").distinct().show(1, truncate=False)

+---------------------------------------------------------------------+
|images                                                               |
+---------------------------------------------------------------------+
|[https://www.mubawab-media.com/ad/8/248/491F/h/photo_0_81369390.avif]|
+---------------------------------------------------------------------+
only showing top 1 row



## equipments

In [89]:
raw_mubawab.groupBy("features_amenities_json") \
    .agg(count("*").alias("count")) \
    .orderBy(col("count").desc()) \
    .show(10, truncate=False)

+------------------------------------------------+-----+
|features_amenities_json                         |count|
+------------------------------------------------+-----+
|NULL                                            |56   |
|["Ascenseur", "Concierge"]                      |8    |
|["Garage", "Ascenseur", "Concierge", "Sécurité"]|6    |
|["Terrasse", "Garage"]                          |5    |
|["Terrasse"]                                    |5    |
|["Ascenseur"]                                   |3    |
|["Terrasse", "Meublé"]                          |3    |
|["Garage"]                                      |3    |
|["Garage", "Ascenseur", "Concierge", "Internet"]|3    |
|["Meublé"]                                      |3    |
+------------------------------------------------+-----+
only showing top 10 rows



## offre 

In [90]:
raw_mubawab.groupBy("listing_type") \
    .agg(count("*").alias("count")) \
    .orderBy(col("count").desc()) \
    .show(50, truncate=False)

+------------+-----+
|listing_type|count|
+------------+-----+
|location    |271  |
|vente       |136  |
+------------+-----+



In [91]:
from pyspark.sql.functions import col

raw_mubawab = raw_mubawab.withColumnRenamed("listing_type", "offre")

## city & nighbrhood

In [92]:
raw_mubawab.groupBy("location_text") \
    .agg(count("*").alias("count")) \
    .orderBy(col("count").desc()) \
    .show(5, truncate=False)



+-------------------+-----+
|location_text      |count|
+-------------------+-----+
|Guéliz à Marrakech |22   |
|Dar Bouazza        |13   |
|Agdal à Marrakech  |12   |
|Maârif à Casablanca|11   |
|Souissi à Rabat    |10   |
+-------------------+-----+
only showing top 5 rows



                                                                                

In [93]:
from pyspark.sql.functions import split, trim, when, col

# Split "location_text" into parts by "à"
split_col = split(col("location_text"), " à ")

# Create new columns
raw_mubawab = (
    raw_mubawab
    .withColumn("neighborhood", trim(split_col.getItem(0)))  # text before "à"
    .withColumn("city", trim(split_col.getItem(1)))           # text after "à"
)

# Optional: handle cases where "à" doesn't exist (e.g. only city name)
raw_mubawab = raw_mubawab.withColumn(
    "city",
    when(col("city").isNull(), col("neighborhood")).otherwise(col("city"))
)

# Verify results
raw_mubawab.select("location_text", "neighborhood", "city").show(10, truncate=False)

+-------------------------------+-------------------+----------+
|location_text                  |neighborhood       |city      |
+-------------------------------+-------------------+----------+
|Samlalia à Marrakech           |Samlalia           |Marrakech |
|Agdal à Marrakech              |Agdal              |Marrakech |
|Moujahidine à Tanger           |Moujahidine        |Tanger    |
|Branes 1 à Tanger              |Branes 1           |Tanger    |
|Riad Al Atlas à Marrakech      |Riad Al Atlas      |Marrakech |
|Maârif à Casablanca            |Maârif             |Casablanca|
|Ain Diab à Casablanca          |Ain Diab           |Casablanca|
|CIL (Hay Salam) à Casablanca   |CIL (Hay Salam)    |Casablanca|
|Maârif à Casablanca            |Maârif             |Casablanca|
|Route de Ouarzazate à Marrakech|Route de Ouarzazate|Marrakech |
+-------------------------------+-------------------+----------+
only showing top 10 rows



In [94]:
raw_mubawab = raw_mubawab.drop("location_text")

In [95]:
from pyspark.sql.functions import lit

raw_mubawab = raw_mubawab.withColumn("site", lit("mubawab"))

In [96]:
raw_mubawab.select("site").show(10, truncate=False)

+-------+
|site   |
+-------+
|mubawab|
|mubawab|
|mubawab|
|mubawab|
|mubawab|
|mubawab|
|mubawab|
|mubawab|
|mubawab|
|mubawab|
+-------+
only showing top 10 rows



In [97]:
raw_mubawab.select("features_main_json").show(10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features_main_json                                                                                                                                                                    |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"Type de bien": "Appartement", "Etat": "Bon état"}                                                                                                                                   |
|{"Type de bien": "Appartement", "Etat": "Bon état", "Années": "1-5 ans", "Étage du bien": "2ème", "Orientation": "Est", "Type du sol": "Marbre"}                                      |
|{"Type de bien": "Appartement", "Etat": "À rénover", "Étage du bien": "3èm

In [98]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import MapType, StringType

# Parse the JSON column into a map (key → value)
raw_mubawab = raw_mubawab.withColumn(
    "features_map",
    from_json(col("features_main_json"), MapType(StringType(), StringType()))
)

# Extract the value of "Type de bien"
raw_mubawab = raw_mubawab.withColumn(
    "property_type",
    col("features_map")["Type de bien"]
)

# Optional: drop the intermediate parsed map if you don’t need it
raw_mubawab = raw_mubawab.drop("features_map")

# Preview
raw_mubawab.select("features_main_json", "property_type").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
|features_main_json                                                                                                                              |property_type|
+------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
|{"Type de bien": "Appartement", "Etat": "Bon état"}                                                                                             |Appartement  |
|{"Type de bien": "Appartement", "Etat": "Bon état", "Années": "1-5 ans", "Étage du bien": "2ème", "Orientation": "Est", "Type du sol": "Marbre"}|Appartement  |
|{"Type de bien": "Appartement", "Etat": "À rénover", "Étage du bien": "3ème"}                                                                   |Appartement  |
|{"Type de bien": "Appartement", "

In [99]:
raw_mubawab.select("property_type").show(10, truncate=False)

+-------------+
|property_type|
+-------------+
|Appartement  |
|Appartement  |
|Appartement  |
|Appartement  |
|Appartement  |
|Appartement  |
|Villa        |
|Appartement  |
|Appartement  |
|Villa        |
+-------------+
only showing top 10 rows



                                                                                

In [100]:
raw_mubawab.select("features_main_json",).show(10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features_main_json                                                                                                                                                                    |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"Type de bien": "Appartement", "Etat": "Bon état"}                                                                                                                                   |
|{"Type de bien": "Appartement", "Etat": "Bon état", "Années": "1-5 ans", "Étage du bien": "2ème", "Orientation": "Est", "Type du sol": "Marbre"}                                      |
|{"Type de bien": "Appartement", "Etat": "À rénover", "Étage du bien": "3èm

In [101]:
from pyspark.sql.functions import from_json, col, trim
from pyspark.sql.types import MapType, StringType
import unicodedata, re

# 1) Parse JSON → Map(String,String)
raw_mubawab = raw_mubawab.withColumn(
    "feat_map",
    from_json(col("features_main_json"), MapType(StringType(), StringType()))
)

# 2) Collect ALL distinct keys present in the map
keys_df = raw_mubawab.selectExpr("explode(map_keys(feat_map)) as k").distinct()
keys = [r["k"] for r in keys_df.collect() if r["k"] is not None]

# 3) Helper to sanitize keys into safe column names
def sanitize(name: str) -> str:
    # remove accents
    name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("ascii")
    # to snake_case
    name = re.sub(r"[^0-9a-zA-Z]+", "_", name).strip("_")
    return name.lower()

# 4) Create one column per key (values kept exactly as-is)
for k in keys:
    raw_mubawab = raw_mubawab.withColumn(sanitize(k), trim(col("feat_map")[k]))

# 5) Optional: drop the intermediate map and/or original JSON column
raw_mubawab = raw_mubawab.drop("feat_map")  # keep features_main_json if you still need the raw string
# raw_moteur = raw_moteur.drop("features_main_json")  # uncomment if you want to drop the raw JSON

# (Optional) quick peek
# raw_moteur.select("features_main_json", *[sanitize(k) for k in keys]).show(20, truncate=False)


In [102]:
raw_mubawab.limit(1).toPandas()

25/11/08 19:18:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,ingest_ts,id,url,error,offre,title,price,features_amenities_json,description_text,features_main_json,...,etage_du_bien,detail_1,annees,constructibilite,salles_de_bain,livraison,pieces,orientation,etat,nombre_d_etages
0,2025-11-08 19:17:30.003,8246872,https://www.mubawab.ma/fr/...,,location,Très bel appartement à la ...,9000.0,"[""Piscine"", ""Meublé""]","Un petit bijou à location,...","{""Type de bien"": ""Appartem...",...,,,,,,,,,Bon état,


In [103]:
raw_mubawab = raw_mubawab.drop("features_main_json")

In [104]:
raw_mubawab.printSchema()

root
 |-- ingest_ts: timestamp (nullable = true)
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- error: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: double (nullable = true)
 |-- features_amenities_json: string (nullable = true)
 |-- description_text: string (nullable = true)
 |-- agency_url: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- neighborhood: string (nullable = true)
 |-- city: string (nullable = true)
 |-- site: string (nullable = false)
 |-- property_type: string (nullable = true)
 |-- type_de_terrain: string (nullable = true)
 |-- type_de_bien: string (nullable = true)
 |-- surface: string (nullable = true)
 |-- statut_du_terrain: string (nullable = true)
 |-- surface_de_la_parcelle: string (nullable = true)
 |-- chambres: string (nu