In [1]:
import polars as pl

In [2]:
# Import products.csv
products = pl.read_csv("../data/raw/products_macro.csv")

# Show the first 5 rows

print(products.head(5))

shape: (5, 7)
┌──────────┬───────────────┬───────────────┬────────┬───────────────┬───────────────┬──────────────┐
│ Category ┆ name          ┆ subtitle      ┆ price  ┆ discount_pric ┆ main_image_ur ┆ secondary_im │
│ ---      ┆ ---           ┆ ---           ┆ ---    ┆ e             ┆ l             ┆ age_url      │
│ str      ┆ str           ┆ str           ┆ str    ┆ ---           ┆ ---           ┆ ---          │
│          ┆               ┆               ┆        ┆ str           ┆ str           ┆ str          │
╞══════════╪═══════════════╪═══════════════╪════════╪═══════════════╪═══════════════╪══════════════╡
│ Fruta    ┆ Plátano de    ┆ Pieza 170 g   ┆ 0,36 € ┆ null          ┆ https://prod- ┆ https://prod │
│          ┆ Canarias IGP  ┆ aprox.        ┆        ┆               ┆ mercadona.img ┆ -mercadona.i │
│          ┆               ┆               ┆        ┆               ┆ ix.n…         ┆ mgix.n…      │
│ Fruta    ┆ Banana        ┆ Pieza 170 g   ┆ 0,25 € ┆ null          ┆ https:/

In [47]:
# Show the products with high price desceding

print(products.sort("price", descending=True))


shape: (4_766, 6)
┌──────────────────┬─────────────────┬────────┬────────────────┬─────────────────┬─────────────────┐
│ name             ┆ subtitle        ┆ price  ┆ discount_price ┆ main_image_url  ┆ Category        │
│ ---              ┆ ---             ┆ ---    ┆ ---            ┆ ---             ┆ ---             │
│ str              ┆ str             ┆ str    ┆ str            ┆ str             ┆ str             │
╞══════════════════╪═════════════════╪════════╪════════════════╪═════════════════╪═════════════════╡
│ Escalopines de   ┆ Bandeja 400 g   ┆ 9,98 € ┆ null           ┆ https://prod-me ┆ Pescado fresco  │
│ salmón           ┆ aprox.          ┆        ┆                ┆ rcadona.imgix.n ┆                 │
│                  ┆                 ┆        ┆                ┆ …               ┆                 │
│ Patas de pulpo   ┆ Bandeja 250 g   ┆ 9,95 € ┆ null           ┆ https://prod-me ┆ Pescado fresco  │
│ cocido           ┆ aprox.          ┆        ┆                ┆ rcadona.

In [53]:
# Check products with category Perfume y colonia

print(products.filter(pl.col("Category") == "Perfume y colonia"))


shape: (95, 6)
┌──────────────────────┬────────────────┬───────┬────────────────┬─────────────────────┬───────────┐
│ name                 ┆ subtitle       ┆ price ┆ discount_price ┆ main_image_url      ┆ Category  │
│ ---                  ┆ ---            ┆ ---   ┆ ---            ┆ ---                 ┆ ---       │
│ str                  ┆ str            ┆ f32   ┆ str            ┆ str                 ┆ str       │
╞══════════════════════╪════════════════╪═══════╪════════════════╪═════════════════════╪═══════════╡
│ Agua de colonia S3   ┆ Botella 750 ml ┆ 3.4   ┆ null           ┆ https://prod-mercad ┆ Perfume y │
│ Classic fre…         ┆                ┆       ┆                ┆ ona.imgix.n…        ┆ colonia   │
│ Eau de toilette      ┆ Frasco 300 ml  ┆ 2.35  ┆ null           ┆ https://prod-mercad ┆ Perfume y │
│ mujer Deliplus…      ┆                ┆       ┆                ┆ ona.imgix.n…        ┆ colonia   │
│ Eau de parfum mujer  ┆ Frasco 100 ml  ┆ 9.0   ┆ null           ┆ https://p

In [52]:
# Clean price column: Remove the € symbol, replace commas with periods, and convert to float
products = products.with_columns(
    pl.col("price")
    .str.replace("€", "")  # Remove the € symbol
    .str.replace(r"[^\d,.-]", "")  # Remove any non-numeric characters except commas and dots
    .str.replace(",", ".")  # Replace commas with periods for decimal point consistency
    .cast(pl.Float32)  # Convert to float
    .alias("price")
)

# Check the first 5 rows after cleaning
print(products.head(5))


shape: (5, 6)
┌────────────────┬──────────────────────┬───────┬────────────────┬──────────────────────┬──────────┐
│ name           ┆ subtitle             ┆ price ┆ discount_price ┆ main_image_url       ┆ Category │
│ ---            ┆ ---                  ┆ ---   ┆ ---            ┆ ---                  ┆ ---      │
│ str            ┆ str                  ┆ f32   ┆ str            ┆ str                  ┆ str      │
╞════════════════╪══════════════════════╪═══════╪════════════════╪══════════════════════╪══════════╡
│ Plátano de     ┆ Pieza 170 g aprox.   ┆ 0.36  ┆ null           ┆ https://prod-mercado ┆ Fruta    │
│ Canarias IGP   ┆                      ┆       ┆                ┆ na.imgix.n…          ┆          │
│ Banana         ┆ Pieza 190 g aprox.   ┆ 0.28  ┆ null           ┆ https://prod-mercado ┆ Fruta    │
│                ┆                      ┆       ┆                ┆ na.imgix.n…          ┆          │
│ Plátano macho  ┆ Pieza 290 g aprox.   ┆ 0.84  ┆ null           ┆ https://pr

In [51]:
products

name,subtitle,price,discount_price,main_image_url,Category
str,str,str,str,str,str
"""Plátano de Canarias IGP""","""Pieza 170 g aprox.""","""0,36 €""",,"""https://prod-mercadona.imgix.n…","""Fruta"""
"""Banana""","""Pieza 190 g aprox.""","""0,28 €""",,"""https://prod-mercadona.imgix.n…","""Fruta"""
"""Plátano macho""","""Pieza 290 g aprox.""","""0,84 €""",,"""https://prod-mercadona.imgix.n…","""Fruta"""
"""Uva blanca sin semillas""","""Bandeja 500 g aprox.""","""2,65 €""",,"""https://prod-mercadona.imgix.n…","""Fruta"""
"""Uva roja sin semillas""","""Bandeja 500 g aprox.""","""2,65 €""",,"""https://prod-mercadona.imgix.n…","""Fruta"""
…,…,…,…,…,…
"""Crema de calzado blanco autobr…","""Bote 50 ml""","""0,95 €""",,"""https://prod-mercadona.imgix.n…","""Utensilios de limpieza y calza…"
"""Crema de calzado marrón autobr…","""Bote 50 ml""","""0,95 €""",,"""https://prod-mercadona.imgix.n…","""Utensilios de limpieza y calza…"
"""Crema de calzado negro autobri…","""Bote 50 ml""","""0,95 €""",,"""https://prod-mercadona.imgix.n…","""Utensilios de limpieza y calza…"
"""Esponja de calzado incoloro br…","""1 ud.""","""1,75 €""",,"""https://prod-mercadona.imgix.n…","""Utensilios de limpieza y calza…"
