In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.types import DecimalType

# Create (or reuse) the Spark session for this notebook; the application is
# registered under the name "Bronze_Avito_Mubawab".
session_builder = SparkSession.builder.appName("Bronze_Avito_Mubawab")
spark = session_builder.getOrCreate()

# Bare last expression: the notebook renders the session object.
spark

In [None]:
# NOTE(review): `raw_base` is not defined in any cell visible here; it must be
# set before this cell runs or the f-strings below raise NameError.
# Each listing type is read from its own sub-tree: four wildcard directory
# levels, then the parquet files themselves.
avito_ventes_path = f"{raw_base}/avito/ventes/*/*/*/*/*.parquet"
avito_locations_path = f"{raw_base}/avito/locations/*/*/*/*/*.parquet"

avito_ventes_df = spark.read.parquet(avito_ventes_path)
avito_locations_df = spark.read.parquet(avito_locations_path)

# Stack both datasets, matching columns by name rather than by position.
df_avito = avito_ventes_df.unionByName(avito_locations_df)

In [None]:
# NOTE(review): `raw_base` is not defined in any cell visible here; it must be
# set before this cell runs or the f-strings below raise NameError.
# NOTE(review): these globs end with a bare `*` (no `*.parquet` suffix and one
# level fewer than the Avito globs) — confirm this matches the Mubawab layout.
mubawab_ventes_path = f"{raw_base}/mubawab/ventes/*/*/*/*"
mubawab_locations_path = f"{raw_base}/mubawab/locations/*/*/*/*"

mubawab_ventes_df = spark.read.parquet(mubawab_ventes_path)
mubawab_locations_df = spark.read.parquet(mubawab_locations_path)

# Stack both datasets, matching columns by name rather than by position.
df_mubawab = mubawab_ventes_df.unionByName(mubawab_locations_df)

In [None]:
# Columns kept for the Avito bronze frame, grouped by theme.
avito_columns = [
    # Identifiers
    "id", "url", "source_site", "offre",
    # Main business content
    "title", "description", "price_text", "location",
    # Dates
    "published_date", "scraping_time", "ingest_ts",
    # Metadata
    "category_label", "breadcrumbs", "breadcrumbs_list", "attributes",
    # Media
    "images", "equipments",
    # Seller info
    "seller_name", "seller_url", "seller_is_store",
]

# Project the raw union onto exactly these columns, in this order.
bronze_avito = df_avito.select(*avito_columns)

bronze_avito.printSchema()

In [None]:
# Columns kept for the Mubawab bronze frame, grouped by theme.
mubawab_columns = [
    # Identifiers
    "id", "url", "source_site", "offre",
    # Main business content
    "title", "description", "price_text", "location",
    # Dates
    "published_date", "scraping_time", "ingest_ts",
    # Metadata
    "category_label", "breadcrumbs", "breadcrumbs_list", "attributes",
    # Media
    "images", "equipments",
    # Seller info
    "seller_name", "seller_url", "seller_is_store",
]

# Project the raw union onto exactly these columns, in this order.
bronze_mubawab = df_mubawab.select(*mubawab_columns)

bronze_mubawab.printSchema()

In [None]:
# Data-quality check: count Avito rows that are missing `id` or `url`.
null_count_avito = bronze_avito.filter(
    F.col("id").isNull() | F.col("url").isNull()
).count()

# Display the result; a bare assignment produces no cell output, which would
# leave the check invisible to the reader.
null_count_avito

In [None]:
# Data-quality check: count Mubawab rows that are missing `id` or `url`.
null_count_mubawab = bronze_mubawab.filter(
    F.col("id").isNull() | F.col("url").isNull()
).count()

# Display the result; a bare assignment produces no cell output, which would
# leave the check invisible to the reader.
null_count_mubawab

In [None]:
# List Avito URLs that appear on more than one row.
avito_url_counts = bronze_avito.groupBy("url").count()
avito_url_counts.where("count > 1").show()

In [None]:
# List Mubawab URLs that appear on more than one row.
mubawab_url_counts = bronze_mubawab.groupBy("url").count()
mubawab_url_counts.where("count > 1").show()

In [None]:
# Deduplicate Avito rows on `url`: number each URL's rows by ascending
# `scraping_time` and keep only row number 1.
# NOTE(review): ascending order keeps the earliest scrape, and Spark sorts NULLs
# first for ascending order, so a row with a NULL `scraping_time` would win —
# confirm both are intended.
# NOTE(review): rows tied on (`url`, `scraping_time`) are numbered in no
# guaranteed order — confirm whether a tie-breaker is needed.
w = Window.partitionBy("url").orderBy(F.col("scraping_time").asc())

avito_ranked = bronze_avito.withColumn("rn", F.row_number().over(w))
bronze_avito = avito_ranked.filter("rn = 1").drop("rn")

In [None]:
# Deduplicate Mubawab rows on `url`: number each URL's rows by ascending
# `scraping_time` and keep only row number 1.
# NOTE(review): ascending order keeps the earliest scrape, and Spark sorts NULLs
# first for ascending order, so a row with a NULL `scraping_time` would win —
# confirm both are intended.
# NOTE(review): rows tied on (`url`, `scraping_time`) are numbered in no
# guaranteed order — confirm whether a tie-breaker is needed.
w = Window.partitionBy("url").orderBy(F.col("scraping_time").asc())

mubawab_ranked = bronze_mubawab.withColumn("rn", F.row_number().over(w))
bronze_mubawab = mubawab_ranked.filter("rn = 1").drop("rn")

In [None]:
# Row counts per `offre` value for Avito, most frequent first, untruncated.
avito_offre_counts = bronze_avito.groupBy("offre").count()
avito_offre_counts.sort("count", ascending=False).show(truncate=False)

In [None]:
# Row counts per `offre` value for Mubawab, most frequent first, untruncated.
mubawab_offre_counts = bronze_mubawab.groupBy("offre").count()
mubawab_offre_counts.sort("count", ascending=False).show(truncate=False)

In [None]:
# Peek at 20 raw Avito `price_text` values, untruncated.
avito_price_text = bronze_avito.select("price_text")
avito_price_text.show(n=20, truncate=False)

In [None]:
# Peek at 20 raw Mubawab `price_text` values, untruncated.
mubawab_price_text = bronze_mubawab.select("price_text")
mubawab_price_text.show(n=20, truncate=False)

In [None]:
# Parse the free-text Avito price into a numeric `price` column.
#
# Every non-digit character is stripped in one pass. That already removes a
# "DH" suffix, spaces and separators, so no separate "dh" regex is needed.
# NOTE(review): decimal separators are stripped too ("1200,50" -> 120050), and
# text holding several numbers is concatenated — confirm `price_text` never
# carries a fractional part or a range.
price_digits = F.regexp_replace(F.col("price_text"), r"[^\d]", "")

bronze_avito = (
    bronze_avito
    .withColumn(
        "price",
        # Only cast when at least one digit is left. NULL input, empty strings,
        # the literal text "null" and other digit-free text all fall through to
        # NULL (when() without otherwise() yields NULL). This avoids casting an
        # empty string, which is an error when ANSI casting is enabled.
        F.when(F.length(price_digits) > 0, price_digits.cast("double")),
    )
    .drop("price_text")
)

bronze_avito.select("price").show(20, truncate=False)

In [None]:
# Parse the free-text Mubawab price into a numeric `price` column.
#
# Every non-digit character is stripped in one pass. That already removes a
# "DH" suffix, spaces and separators, so no separate "dh" regex is needed.
# NOTE(review): decimal separators are stripped too ("1200,50" -> 120050), and
# text holding several numbers is concatenated — confirm `price_text` never
# carries a fractional part or a range.
price_digits = F.regexp_replace(F.col("price_text"), r"[^\d]", "")

bronze_mubawab = (
    bronze_mubawab
    .withColumn(
        "price",
        # Only cast when at least one digit is left. NULL input, empty strings,
        # the literal text "null" and other digit-free text all fall through to
        # NULL (when() without otherwise() yields NULL). This avoids casting an
        # empty string, which is an error when ANSI casting is enabled.
        F.when(F.length(price_digits) > 0, price_digits.cast("double")),
    )
    .drop("price_text")
)

bronze_mubawab.select("price").show(20, truncate=False)

In [None]:
# Convert Avito `price` to a decimal with precision 20 and scale 0.
# Casting NULL yields NULL, so missing prices stay missing.
avito_price_type = DecimalType(20, 0)
bronze_avito = bronze_avito.withColumn(
    "price", F.col("price").cast(avito_price_type)
)

In [None]:
# Convert Mubawab `price` to a decimal with precision 20 and scale 0.
# Casting NULL yields NULL, so missing prices stay missing.
mubawab_price_type = DecimalType(20, 0)
bronze_mubawab = bronze_mubawab.withColumn(
    "price", F.col("price").cast(mubawab_price_type)
)

In [None]:
# Preview five Avito `location` values, then remove the column.
# NOTE(review): the reason for dropping `location` on Avito only is not visible
# in this notebook — confirm against the previewed values.
avito_location = bronze_avito.select("location")
avito_location.show(n=5, truncate=False)

bronze_avito = bronze_avito.drop("location")

In [None]:
# Preview the raw Mubawab `location` values.
bronze_mubawab.select("location").show(n=10, truncate=False)

# Collapse every run of whitespace into a single space, then trim both ends.
collapsed_location = F.regexp_replace(F.col("location"), r"\s+", " ")
bronze_mubawab = bronze_mubawab.withColumn("location", F.trim(collapsed_location))

# Preview again to check the cleaned values.
bronze_mubawab.select("location").show(n=10, truncate=False)

In [None]:
# Show a few Avito rows of each breadcrumb column, one table per column.
for breadcrumb_column in ("breadcrumbs", "breadcrumbs_list"):
    bronze_avito.select(breadcrumb_column).show(5, truncate=False)

In [None]:
# Show a few Mubawab rows of each breadcrumb column, one table per column.
for breadcrumb_column in ("breadcrumbs", "breadcrumbs_list"):
    bronze_mubawab.select(breadcrumb_column).show(5, truncate=False)

In [None]:
# Remove the `breadcrumbs` column from both frames.
bronze_avito, bronze_mubawab = [
    frame.drop("breadcrumbs") for frame in (bronze_avito, bronze_mubawab)
]

In [None]:
# Preview five Avito `seller_name` values, untruncated.
avito_seller_name = bronze_avito.select("seller_name")
avito_seller_name.show(n=5, truncate=False)

In [None]:
# Preview five Mubawab `seller_name` values, untruncated.
mubawab_seller_name = bronze_mubawab.select("seller_name")
mubawab_seller_name.show(n=5, truncate=False)

In [None]:
# Row counts per `seller_is_store` value for Avito, most frequent first.
avito_store_counts = bronze_avito.groupBy("seller_is_store").count()
avito_store_counts.sort("count", ascending=False).show(truncate=False)

In [None]:
# Row counts per `seller_is_store` value for Mubawab, most frequent first.
mubawab_store_counts = bronze_mubawab.groupBy("seller_is_store").count()
mubawab_store_counts.sort("count", ascending=False).show(truncate=False)

In [None]:
# Remove the `seller_is_store` column from both frames.
bronze_avito, bronze_mubawab = [
    frame.drop("seller_is_store") for frame in (bronze_avito, bronze_mubawab)
]

In [None]:
# Preview five Avito `category_label` values, untruncated.
avito_category_label = bronze_avito.select("category_label")
avito_category_label.show(n=5, truncate=False)

In [None]:
# Preview five Mubawab `category_label` values, untruncated.
mubawab_category_label = bronze_mubawab.select("category_label")
mubawab_category_label.show(n=5, truncate=False)

In [None]:
# Remove the `scraping_time` column from both frames.
bronze_avito, bronze_mubawab = [
    frame.drop("scraping_time") for frame in (bronze_avito, bronze_mubawab)
]

In [None]:
# Inspect the raw Avito date columns, one table per column.
for date_column in ("published_date", "ingest_ts"):
    bronze_avito.select(date_column).show(5, truncate=False)

In [None]:
# Parse the Avito date columns into timestamps.
#
# `published_date` is expected to look like 2024-01-31T12:34:56.789Z. The
# trailing "Z" is parsed with the zone-offset pattern letter `X`, so it is read
# as UTC. Quoting it as a literal ('Z') would discard the zone and interpret the
# value in the Spark session time zone instead.
# NOTE(review): assumes the source's "Z" really denotes UTC — confirm.
published_date_format = "yyyy-MM-dd'T'HH:mm:ss.SSSX"
# `ingest_ts` carries no zone designator and microsecond precision.
ingest_ts_format = "yyyy-MM-dd HH:mm:ss.SSSSSS"

bronze_avito = (
    bronze_avito
    .withColumn(
        "published_date",
        F.to_timestamp("published_date", published_date_format)
    )
    .withColumn(
        "ingest_ts",
        F.to_timestamp("ingest_ts", ingest_ts_format)
    )
)

bronze_avito.printSchema()

In [None]:
# Parse the Mubawab date columns into timestamps.
#
# `published_date` is expected to look like 2024-01-31T12:34:56.789Z. The
# trailing "Z" is parsed with the zone-offset pattern letter `X`, so it is read
# as UTC. Quoting it as a literal ('Z') would discard the zone and interpret the
# value in the Spark session time zone instead.
# NOTE(review): assumes the source's "Z" really denotes UTC, and that Mubawab's
# raw values actually use these formats (values that do not match may come
# back as NULL) — confirm.
published_date_format = "yyyy-MM-dd'T'HH:mm:ss.SSSX"
# `ingest_ts` carries no zone designator and microsecond precision.
ingest_ts_format = "yyyy-MM-dd HH:mm:ss.SSSSSS"

bronze_mubawab = (
    bronze_mubawab
    .withColumn(
        "published_date",
        F.to_timestamp("published_date", published_date_format)
    )
    .withColumn(
        "ingest_ts",
        F.to_timestamp("ingest_ts", ingest_ts_format)
    )
)

bronze_mubawab.printSchema()

In [None]:
# Print the current schema of both frames, Avito first.
for bronze_frame in (bronze_avito, bronze_mubawab):
    bronze_frame.printSchema()

In [None]:
# Pull two Avito rows into pandas so the notebook renders them as a table.
avito_preview = bronze_avito.limit(2)
avito_preview.toPandas()

In [None]:
# Pull two Mubawab rows into pandas so the notebook renders them as a table.
mubawab_preview = bronze_mubawab.limit(2)
mubawab_preview.toPandas()