In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for the local Bronze-layer work.
spark = (
    SparkSession.builder
    .appName("RealEstate_Bronze_Local")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

In [4]:
# Colab-only upload step; in the recorded run the uploaded file was DataLake.zip.
from google.colab import files
uploaded = files.upload()

Saving DataLake.zip to DataLake.zip


In [5]:
import zipfile
import os

zip_path = "/content/DataLake.zip"
extract_path = "/content/"

# Unpack the uploaded archive; the context manager closes the file afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

# List the working directory to confirm the extracted folder is present.
os.listdir("/content")

['.config', 'DataLake.zip', 'DataLake', 'sample_data']

In [8]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when one exists.
builder = SparkSession.builder.appName("Bronze")
spark = builder.getOrCreate()

raw_base = "/content/DataLake/raw"

# Each offer folder is read through a glob spanning four nested directory levels.
ventes_glob = f"{raw_base}/avito/ventes/*/*/*/*/*.parquet"
locations_glob = f"{raw_base}/avito/locations/*/*/*/*/*.parquet"

# Avito "ventes" listings (Parquet)
df_ventes = spark.read.parquet(ventes_glob)

# Avito "locations" listings (Parquet)
df_locations = spark.read.parquet(locations_glob)

# Stack both offers into one frame; unionByName matches columns by name, not position.
df = df_ventes.unionByName(df_locations)

In [9]:
# Show the schema of the combined raw frame (all strings except seller_is_store in the recorded run).
df.printSchema()

root
 |-- attributes: string (nullable = true)
 |-- breadcrumbs: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- description: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- id: string (nullable = true)
 |-- images: string (nullable = true)
 |-- location: string (nullable = true)
 |-- price_text: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- scraping_time: string (nullable = true)
 |-- seller_is_store: boolean (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- ingest_ts: string (nullable = true)



In [10]:
# Column order for the Bronze frame, grouped by theme.
bronze_columns = [
    # Identifiers
    "id", "url", "source_site", "offre",
    # Main business content
    "title", "description", "price_text", "location",
    # Dates
    "published_date", "scraping_time", "ingest_ts",
    # Metadata
    "category_label", "breadcrumbs", "breadcrumbs_list", "attributes",
    # Media
    "images", "equipments",
    # Seller info
    "seller_name", "seller_url", "seller_is_store",
]

bronze_df = df.select(*bronze_columns)

In [11]:
# Confirm the reordered column layout after the select.
bronze_df.printSchema()


root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- price_text: string (nullable = true)
 |-- location: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- scraping_time: string (nullable = true)
 |-- ingest_ts: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- breadcrumbs: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- images: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- seller_is_store: boolean (nullable = true)



In [12]:
# Total number of rows loaded (76 in the recorded run).
bronze_df.count()

76

In [13]:
from pyspark.sql import functions as F

# Count rows missing either key field (id or url).
missing_key = F.col("id").isNull() | F.col("url").isNull()
bronze_df.where(missing_key).count()


0

In [14]:
# List URLs that occur more than once; an empty table means there are no duplicates.
url_counts = bronze_df.groupBy("url").count()
url_counts.where("count > 1").show()

+---+-----+
|url|count|
+---+-----+
+---+-----+



In [15]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Rank the rows of each URL by scraping_time, ascending, and keep rank 1 only.
# NOTE(review): ascending order keeps the OLDEST scrape per URL, ties on
# scraping_time are broken arbitrarily, and scraping_time is still a string at
# this point so the ordering is lexicographic — confirm this is the intended rule.
w = Window.partitionBy("url").orderBy(F.asc("scraping_time"))

ranked = bronze_df.withColumn("rn", F.row_number().over(w))
bronze_df = ranked.filter(F.col("rn") == 1).drop("rn")

In [16]:
# Count rows with no source_site value.
bronze_df.where("source_site IS NULL").count()

0

In [17]:
# Row count per offer type, largest group first.
offre_counts = bronze_df.groupBy("offre").count()
offre_counts.orderBy(F.col("count").desc()).show(truncate=False)

+--------+-----+
|offre   |count|
+--------+-----+
|location|38   |
|vente   |38   |
+--------+-----+



In [18]:
# Sample the raw price strings; the recorded values look like "1 350 000 DH".
bronze_df.select("price_text").show(20, truncate=False)

+------------+
|price_text  |
+------------+
|7 500 DH    |
|8 500 DH    |
|13 000 DH   |
|35 000 DH   |
|9 000 DH    |
|1 350 000 DH|
|850 000 DH  |
|3 000 DH    |
|1 300 000 DH|
|195 000 DH  |
|15 000 DH   |
|620 000 DH  |
|460 000 DH  |
|3 600 DH    |
|32 000 DH   |
|260 000 DH  |
|640 000 DH  |
|240 000 DH  |
|560 000 DH  |
|900 000 DH  |
+------------+
only showing top 20 rows



In [19]:
from pyspark.sql import functions as F

# Count rows that have no price text at all.
bronze_df.where(F.isnull("price_text")).count()

0

In [20]:
from pyspark.sql import functions as F

# Keep only the digits of price_text ("1 350 000 DH" -> "1350000"). Removing
# every non-digit already strips the "DH" suffix and the spacing, so no separate
# currency-removal pass is needed.
# NOTE(review): a decimal separator would be removed too ("7 500,50" -> "750050");
# the sampled data shows whole amounts only — confirm for the full feed.
price_digits = F.regexp_replace(F.col("price_text"), r"[^\d]", "")

bronze_df = (
    bronze_df
    # 1) Build the numeric "price" column from "price_text".
    .withColumn(
        "price",
        # Nothing left to parse (blank text, the literal "null", or text with
        # no digits at all) -> NULL. Testing the cleaned digits rather than the
        # raw text keeps an empty string away from the cast, which raises under
        # ANSI SQL mode. A NULL price_text stays NULL through the cast.
        F.when(price_digits == "", None)
         .otherwise(price_digits.cast("double"))
    )
    # 2) Drop the original text column.
    .drop("price_text")
)

In [21]:
# Check the parsed numeric prices against the raw strings shown earlier.
bronze_df.select("price").show(20, truncate=False)

+---------+
|price    |
+---------+
|7500.0   |
|8500.0   |
|13000.0  |
|35000.0  |
|9000.0   |
|1350000.0|
|850000.0 |
|3000.0   |
|1300000.0|
|195000.0 |
|15000.0  |
|620000.0 |
|460000.0 |
|3600.0   |
|32000.0  |
|260000.0 |
|640000.0 |
|240000.0 |
|560000.0 |
|900000.0 |
+---------+
only showing top 20 rows



In [23]:
from pyspark.sql.types import DecimalType

# Re-type price as a whole-number decimal. Casting NULL yields NULL, so missing
# prices stay missing without an explicit branch.
price_type = DecimalType(20, 0)
bronze_df = bronze_df.withColumn("price", F.col("price").cast(price_type))

In [24]:
# Verify that price is now decimal(20,0) and price_text is gone.
bronze_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- scraping_time: string (nullable = true)
 |-- ingest_ts: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- breadcrumbs: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- images: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- seller_is_store: boolean (nullable = true)
 |-- price: decimal(20,0) (nullable = true)



In [25]:
# Sample the location column (the recorded rows are blank).
bronze_df.select("location").show(5, truncate=False)

+--------+
|location|
+--------+
|        |
|        |
|        |
|        |
|        |
+--------+
only showing top 5 rows



In [26]:
# Number of distinct location values (1 in the recorded run).
bronze_df.select("location").distinct().count()

1

In [27]:
# Drop location: the recorded run shows a single distinct, blank value.
# NOTE(review): that was observed on a 76-row extract — confirm it holds for the full feed.
bronze_df = bronze_df.drop("location")

In [28]:
# Sample the breadcrumbs column (a single " > "-separated string in the recorded rows).
bronze_df.select("breadcrumbs").show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|breadcrumbs                                                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|Accueil > Tout le Maroc > Casablanca > 2 Mars > Avito Immobilier > Locations Immobilières > Appartements > Appartement à louer avec. Une grande terrasse|
|Accueil > Tout le Maroc > Casablanca > Abdelmoumen > Avito Immobilier > Bureaux > A louer bureau 100 m2                                                 |
|Accueil > Tout le Maroc > Rabat > Agdal > Avito Immobilier > Locations Immobilières > Appartements > Appartement à louer à l'Agdal                      |
|Accueil > Tout le Maroc > Rabat > Agdal > Avito Immobilier > Bureaux 

In [29]:
# Sample breadcrumbs_list (the same trail stored as a bracketed, quoted list string).
bronze_df.select("breadcrumbs_list").show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|breadcrumbs_list                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|["Accueil", "Tout le Maroc", "Casablanca", "2 Mars", "Avito Immobilier", "Locations Immobilières", "Appartements", "Appartement à louer avec. Une grande terrasse"]|
|["Accueil", "Tout le Maroc", "Casablanca", "Abdelmoumen", "Avito Immobilier", "Bureaux", "A louer bureau 100 m2"]                                                  |
|["Accueil", "Tout le Maroc", "Rabat", "Agdal", "Avito Immobilier", "Locations Immobilières", "Appartements", "Appartement à louer à l'Agdal"]                      |
|["A

In [30]:
# Count rows with no breadcrumbs_list value.
bronze_df.where("breadcrumbs_list IS NULL").count()

0

In [31]:
# Drop breadcrumbs; in the sampled rows breadcrumbs_list carries the same trail.
# NOTE(review): equivalence was only eyeballed on a few rows — confirm before relying on it.
bronze_df = bronze_df.drop("breadcrumbs")

In [32]:
# Confirm location and breadcrumbs are no longer in the schema.
bronze_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- scraping_time: string (nullable = true)
 |-- ingest_ts: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- images: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- seller_is_store: boolean (nullable = true)
 |-- price: decimal(20,0) (nullable = true)



In [33]:
# Sample seller names (free text with mixed casing in the recorded rows).
bronze_df.select("seller_name").show(5, truncate=False)

+----------------+
|seller_name     |
+----------------+
|khalid          |
|bercker fisli   |
|akkor immobilier|
|ANOUAR IMMO     |
|vpn uniwell     |
+----------------+
only showing top 5 rows



In [34]:
# Row count per seller_is_store value, largest group first.
store_counts = bronze_df.groupBy("seller_is_store").count()
store_counts.orderBy(F.col("count").desc()).show(truncate=False)

+---------------+-----+
|seller_is_store|count|
+---------------+-----+
|false          |76   |
+---------------+-----+



In [35]:
# Drop seller_is_store: every one of the 76 recorded rows is false.
# NOTE(review): a constant value in this extract does not prove the flag is always false — confirm.
bronze_df = bronze_df.drop("seller_is_store")

In [36]:
# Sample category_label (values such as "Appartements, à louer" in the recorded rows).
bronze_df.select("category_label").show(5, truncate=False)

+---------------------+
|category_label       |
+---------------------+
|Appartements, à louer|
|Bureaux, à louer     |
|Appartements, à louer|
|Bureaux, à louer     |
|Bureaux, à louer     |
+---------------------+
only showing top 5 rows



In [37]:
# Drop scraping_time (already used as the sort key of the earlier per-URL dedup).
# NOTE(review): no inspection of this column is shown before the drop — confirm it is not needed downstream.
bronze_df = bronze_df.drop("scraping_time")

In [38]:
# Sample published_date; recorded values look like "2025-11-24T22:52:39.000Z".
bronze_df.select("published_date").show(5, truncate=False)

+------------------------+
|published_date          |
+------------------------+
|2025-11-24T22:52:39.000Z|
|2025-11-24T22:47:57.000Z|
|2025-11-24T14:44:19.000Z|
|2025-11-24T23:31:50.000Z|
|2025-11-24T22:51:34.000Z|
+------------------------+
only showing top 5 rows



In [39]:
# Sample ingest_ts; recorded values use a 'T' separator and six fractional digits.
bronze_df.select("ingest_ts").show(5, truncate=False)

+--------------------------+
|ingest_ts                 |
+--------------------------+
|2025-11-25T00:16:34.926803|
|2025-11-25T00:16:34.926803|
|2025-11-25T00:16:34.926803|
|2025-11-25T00:16:34.926803|
|2025-11-25T00:16:34.926803|
+--------------------------+
only showing top 5 rows



In [40]:
bronze_df = (
    bronze_df
    # published_date, e.g. "2025-11-24T22:52:39.000Z".
    # "X" parses the trailing "Z" as a zero UTC offset; quoting it as a literal
    # 'Z' would ignore the zone and read the value in the session time zone.
    .withColumn(
        "published_date",
        F.to_timestamp("published_date", "yyyy-MM-dd'T'HH:mm:ss.SSSX")
    )
    # ingest_ts, e.g. "2025-11-25T00:16:34.926803".
    # The values carry a 'T' between date and time, so the pattern must too:
    # a pattern with a space there does not match (NULL result, or an error
    # under ANSI SQL mode).
    .withColumn(
        "ingest_ts",
        F.to_timestamp("ingest_ts", "yyyy-MM-dd'T'HH:mm:ss.SSSSSS")
    )
)

In [41]:
# Confirm both date columns are now timestamps. The schema alone does not show
# whether parsing succeeded, so also inspect the values for NULLs.
bronze_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- published_date: timestamp (nullable = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- category_label: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- images: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- price: decimal(20,0) (nullable = true)



```
bronze/
  avito/
    ventes/
      part-0000...parquet
    locations/
      part-0000...parquet

  mubawab/
    ventes/
    locations/
```


