In [6]:
# %%
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

# =================================
# 1. Spark session & ADLS config
# =================================
# Load environment variables via python-dotenv so ADLS_ACCOUNT_KEY can be read below.
load_dotenv()

spark = SparkSession.builder.appName("RealEstate_Avito_Vente_Silver").getOrCreate()

# Target storage account and container used to build abfss:// paths.
storage_account = "strealestatehamza"
container = "realestate"

# The account key is taken from the environment; abort when it is unset or empty.
adls_key = os.getenv("ADLS_ACCOUNT_KEY")
if adls_key is None or adls_key == "":
    raise RuntimeError("ADLS_ACCOUNT_KEY missing from .env")

# Register the same key for the dfs endpoint first, then the blob endpoint.
for _account_key_conf in (
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
):
    spark.conf.set(_account_key_conf, adls_key)

# =================================
# 2. Bronze location for Avito "vente" + load
# =================================
# Ingest-date folder to read from the bronze layer.
ingest_date = "2025-11-25"

# Root of the bronze layer, addressed through the abfss (dfs) endpoint.
bronze_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net/bronze"
avito_vente_bronze_path = f"{bronze_base}/avito/vente/{ingest_date}"
print("Reading from:", avito_vente_bronze_path)

# Read the whole ingest-date folder as Parquet.
df_bronze_avito_vente = spark.read.parquet(avito_vente_bronze_path)

# Sanity check: row count first, then the schema.
avito_vente_row_count = df_bronze_avito_vente.count()
print("Row count:", avito_vente_row_count)
df_bronze_avito_vente.printSchema()

Reading from: abfss://realestate@strealestatehamza.dfs.core.windows.net/bronze/avito/vente/2025-11-25
Row count: 2797
root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location: string (nullable = true)
 |-- published_date: timestamp (nullable = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- category_label: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- images: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- price: decimal(20,0) (nullable = true)
 |-- ingest_date: date (nullable = true)



In [None]:
# List the distinct raw `location` values in the Avito bronze data.
# The previous draft referenced an undefined name (`lcoation.diiferntvalue`)
# and raised NameError; select the column by name and de-duplicate instead.
# show() prints the first 20 rows by default; truncate=False keeps full strings.
df_bronze_avito_vente.select("location").distinct().show(truncate=False)

In [7]:
# %%
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

# =================================
# 1. Spark session & ADLS config
# =================================
# Load environment variables via python-dotenv so ADLS_ACCOUNT_KEY can be read below.
load_dotenv()

# NOTE(review): getOrCreate() returns an already-running session when one exists,
# so this appName presumably has no effect if an earlier cell created the session — confirm.
spark = SparkSession.builder.appName("RealEstate_Mubawab_Vente_Silver").getOrCreate()

# Target storage account and container used to build abfss:// paths.
storage_account = "strealestatehamza"
container = "realestate"

# The account key is taken from the environment; abort when it is unset or empty.
adls_key = os.getenv("ADLS_ACCOUNT_KEY")
if adls_key is None or adls_key == "":
    raise RuntimeError("ADLS_ACCOUNT_KEY missing from .env")

# Azure auth: same key for the dfs endpoint first, then the blob endpoint.
for _account_key_conf in (
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
):
    spark.conf.set(_account_key_conf, adls_key)

# =================================
# 2. Bronze location for Mubawab "vente"
# =================================
# Ingest-date folder to read (kept equal to the Avito date for consistency).
ingest_date = "2025-11-25"

# Root of the bronze layer, addressed through the abfss (dfs) endpoint.
bronze_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net/bronze"
mubawab_vente_bronze_path = f"{bronze_base}/mubawab/vente/{ingest_date}"
print("Reading from:", mubawab_vente_bronze_path)

# =================================
# 3. Load Bronze
# =================================
# Read the whole ingest-date folder as Parquet.
df_bronze_mubawab_vente = spark.read.parquet(mubawab_vente_bronze_path)

# =================================
# 4. Inspect Data
# =================================
# Row count first, then the schema.
mubawab_vente_row_count = df_bronze_mubawab_vente.count()
print("Row count:", mubawab_vente_row_count)
df_bronze_mubawab_vente.printSchema()

# Optional preview: first 10 rows with column values left untruncated.
df_bronze_mubawab_vente.show(10, truncate=False)

Reading from: abfss://realestate@strealestatehamza.dfs.core.windows.net/bronze/mubawab/vente/2025-11-25
Row count: 3039
root
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- source_site: string (nullable = true)
 |-- offre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- location: string (nullable = true)
 |-- published_date: timestamp (nullable = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- category_label: string (nullable = true)
 |-- breadcrumbs_list: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- images: string (nullable = true)
 |-- equipments: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- seller_url: string (nullable = true)
 |-- price: decimal(20,0) (nullable = true)
 |-- ingest_date: date (nullable = true)

+-------+--------------------------------------------------------------------------------------------------------------------+