# [Expressions: Expression expansion](https://docs.pola.rs/user-guide/expressions/expression-expansion/)

## col function

In [2]:
import polars as pl

df = pl.DataFrame(
    {  # As of 14th October 2024, ~3pm UTC
        "ticker": ["AAPL", "NVDA", "MSFT", "GOOG", "AMZN"],
        "company_name": ["Apple", "NVIDIA", "Microsoft", "Alphabet (Google)", "Amazon"],
        "price": [229.9, 138.93, 420.56, 166.41, 188.4],
        "day_high": [231.31, 139.6, 424.04, 167.62, 189.83],
        "day_low": [228.6, 136.3, 417.52, 164.78, 188.44],
        "year_high": [237.23, 140.76, 468.35, 193.31, 201.2],
        "year_low": [164.08, 39.23, 324.39, 121.46, 118.35],
    }
)
df

ticker,company_name,price,day_high,day_low,year_high,year_low
str,str,f64,f64,f64,f64,f64
"""AAPL""","""Apple""",229.9,231.31,228.6,237.23,164.08
"""NVDA""","""NVIDIA""",138.93,139.6,136.3,140.76,39.23
"""MSFT""","""Microsoft""",420.56,424.04,417.52,468.35,324.39
"""GOOG""","""Alphabet (Google)""",166.41,167.62,164.78,193.31,121.46
"""AMZN""","""Amazon""",188.4,189.83,188.44,201.2,118.35


### Explicit expansion by column name

In [2]:
eur_usd_rate = 1.09
result = df.with_columns(
    (
        pl.col(
            "price",        # This is a very easy
            "day_high",     # way to modify a bunch
            "day_low"       # of columns simultaneously
        ) / eur_usd_rate
    ).round(5)
)
result

ticker,company_name,price,day_high,day_low,year_high,year_low
str,str,f64,f64,f64,f64,f64
"""AAPL""","""Apple""",210.91743,212.21101,209.72477,237.23,164.08
"""NVDA""","""NVIDIA""",127.45872,128.07339,125.04587,140.76,39.23
"""MSFT""","""Microsoft""",385.83486,389.02752,383.04587,468.35,324.39
"""GOOG""","""Alphabet (Google)""",152.66972,153.77982,151.17431,193.31,121.46
"""AMZN""","""Amazon""",172.84404,174.15596,172.88073,201.2,118.35


### Expansion by data type

In [3]:
df.with_columns((pl.col(pl.Float64) / eur_usd_rate).round(2))

ticker,company_name,price,day_high,day_low,year_high,year_low
str,str,f64,f64,f64,f64,f64
"""AAPL""","""Apple""",210.92,212.21,209.72,217.64,150.53
"""NVDA""","""NVIDIA""",127.46,128.07,125.05,129.14,35.99
"""MSFT""","""Microsoft""",385.83,389.03,383.05,429.68,297.61
"""GOOG""","""Alphabet (Google)""",152.67,153.78,151.17,177.35,111.43
"""AMZN""","""Amazon""",172.84,174.16,172.88,184.59,108.58


if you'r not sure about the precise data type, you can use multiple

In [4]:
df.with_columns(
    (
        pl.col(
            pl.Float32,
            pl.Float64,
        ) / eur_usd_rate
    ).round(2)
)

ticker,company_name,price,day_high,day_low,year_high,year_low
str,str,f64,f64,f64,f64,f64
"""AAPL""","""Apple""",210.92,212.21,209.72,217.64,150.53
"""NVDA""","""NVIDIA""",127.46,128.07,125.05,129.14,35.99
"""MSFT""","""Microsoft""",385.83,389.03,383.05,429.68,297.61
"""GOOG""","""Alphabet (Google)""",152.67,153.78,151.17,177.35,111.43
"""AMZN""","""Amazon""",172.84,174.16,172.88,184.59,108.58


### expansion by pattern matching

In [5]:
result = df.select(pl.col("ticker", "^.*_high$", "^.*low$"))
result

ticker,day_high,year_high,day_low,year_low
str,f64,f64,f64,f64
"""AAPL""",231.31,237.23,228.6,164.08
"""NVDA""",139.6,140.76,136.3,39.23
"""MSFT""",424.04,468.35,417.52,324.39
"""GOOG""",167.62,193.31,164.78,121.46
"""AMZN""",189.83,201.2,188.44,118.35


it's not possible to mix column names and datatypes when making exptressions

## Selecting all columns

In [6]:
df.select(pl.all()).equals(df)

True

## Excluding columns

that's a lot easier

In [7]:
df.select(pl.all().exclude("^day_.*$"))

ticker,company_name,price,year_high,year_low
str,str,f64,f64,f64
"""AAPL""","""Apple""",229.9,237.23,164.08
"""NVDA""","""NVIDIA""",138.93,140.76,39.23
"""MSFT""","""Microsoft""",420.56,468.35,324.39
"""GOOG""","""Alphabet (Google)""",166.41,193.31,121.46
"""AMZN""","""Amazon""",188.4,201.2,118.35


In [8]:
df.select(pl.col(pl.Float64).exclude("^day_.*$"))

price,year_high,year_low
f64,f64,f64
229.9,237.23,164.08
138.93,140.76,39.23
420.56,468.35,324.39
166.41,193.31,121.46
188.4,201.2,118.35


## Column renaming

### renaming a single column

In [11]:
gbp_usd_rate = 1.31
df.select(
    (pl.col("price") / gbp_usd_rate).alias("price (gbp)"),
    (pl.col("price") / eur_usd_rate).alias("price (eur)"),
)

price (gbp),price (eur)
f64,f64
175.496183,210.917431
106.053435,127.458716
321.038168,385.834862
127.030534,152.669725
143.816794,172.844037


### prefix and suffix

In [15]:
df.select(
    (pl.col("^year_.*$") / gbp_usd_rate).name.prefix("in_eur_"),
    (pl.col("day_high","day_low") / eur_usd_rate).name.suffix("_gbp")
)

in_eur_year_high,in_eur_year_low,day_high_gbp,day_low_gbp
f64,f64,f64,f64
181.091603,125.251908,212.211009,209.724771
107.450382,29.946565,128.073394,125.045872
357.519084,247.625954,389.027523,383.045872
147.564885,92.717557,153.779817,151.174312
153.587786,90.343511,174.155963,172.880734


### dynamic name replacement

In [17]:
df.select(pl.all().name.map(str.upper))

TICKER,COMPANY_NAME,PRICE,DAY_HIGH,DAY_LOW,YEAR_HIGH,YEAR_LOW
str,str,f64,f64,f64,f64,f64
"""AAPL""","""Apple""",229.9,231.31,228.6,237.23,164.08
"""NVDA""","""NVIDIA""",138.93,139.6,136.3,140.76,39.23
"""MSFT""","""Microsoft""",420.56,424.04,417.52,468.35,324.39
"""GOOG""","""Alphabet (Google)""",166.41,167.62,164.78,193.31,121.46
"""AMZN""","""Amazon""",188.4,189.83,188.44,201.2,118.35


## Programmatically generating expressions

Use yield. It allows polars parallelise the execution

In [19]:
def amplitude_expressions(time_periods):
    for tp in time_periods:
        yield (pl.col(f"{tp}_high") - pl.col(f"{tp}_low")).alias(f"{tp}_amplitude")

result = df.with_columns(amplitude_expressions(["day", "year"]))
result

ticker,company_name,price,day_high,day_low,year_high,year_low,day_amplitude,year_amplitude
str,str,f64,f64,f64,f64,f64,f64,f64
"""AAPL""","""Apple""",229.9,231.31,228.6,237.23,164.08,2.71,73.15
"""NVDA""","""NVIDIA""",138.93,139.6,136.3,140.76,39.23,3.3,101.53
"""MSFT""","""Microsoft""",420.56,424.04,417.52,468.35,324.39,6.52,143.96
"""GOOG""","""Alphabet (Google)""",166.41,167.62,164.78,193.31,121.46,2.84,71.85
"""AMZN""","""Amazon""",188.4,189.83,188.44,201.2,118.35,1.39,82.85


## More flexible column selection

In [None]:
import polars.selectors as cs

df.select(cs.string() | cs.ends_with("_high"))

ticker,company_name,day_high,year_high
str,str,f64,f64
"""AAPL""","""Apple""",231.31,237.23
"""NVDA""","""NVIDIA""",139.6,140.76
"""MSFT""","""Microsoft""",424.04,468.35
"""GOOG""","""Alphabet (Google)""",167.62,193.31
"""AMZN""","""Amazon""",189.83,201.2


There are a lot of types to choose from: numeric, boolean, date, string [and more ](https://docs.pola.rs/user-guide/expressions/expression-expansion/#selectors-for-data-types)

In [11]:
df.select(cs.numeric())

price,day_high,day_low,year_high,year_low
f64,f64,f64,f64,f64
229.9,231.31,228.6,237.23,164.08
138.93,139.6,136.3,140.76,39.23
420.56,424.04,417.52,468.35,324.39
166.41,167.62,164.78,193.31,121.46
188.4,189.83,188.44,201.2,118.35


In [12]:
df.select(cs.contains("_"))

company_name,day_high,day_low,year_high,year_low
str,f64,f64,f64,f64
"""Apple""",231.31,228.6,237.23,164.08
"""NVIDIA""",139.6,136.3,140.76,39.23
"""Microsoft""",424.04,417.52,468.35,324.39
"""Alphabet (Google)""",167.62,164.78,193.31,121.46
"""Amazon""",189.83,188.44,201.2,118.35


#### Set operators

- `|` for the union of two sets
- `&` for the intersection
- `-` difference (removes results that meet the right-hand condition)
- `^` symmetric difference (elements that are not shared)
- `~` complement


In [15]:
df.select(cs.contains("_") - cs.numeric())

company_name
str
"""Apple"""
"""NVIDIA"""
"""Microsoft"""
"""Alphabet (Google)"""
"""Amazon"""


### Resolving operator ambiguity

Let's say you want negative boolean values in all the columns that start with has.
Below gives the wrong result, because now the `~` operator is executed on the `.starts_with()` method.

In [17]:
people = pl.DataFrame(
    {
        "name": ["Anna", "Bob"],
        "has_partner": [True, False],
        "has_kids": [False, False],
        "has_tattoos": [True, False],
        "is_alive": [True, True],
    }
)
wrong_result = people.select((~cs.starts_with("has_")).name.prefix("not_"))
wrong_result

not_name,not_is_alive
str,bool
"""Anna""",True
"""Bob""",True


To fix this, you have too use the `.as_expression()`

In [21]:
people.select((~cs.starts_with("has_").as_expr()).name.prefix("not_"))

not_has_partner,not_has_kids,not_has_tattoos
bool,bool,bool
False,True,False
True,True,True


### Debugging selectors

To figure out if you made an expression or an selector, you can use the `cs.is_selector()` method.

In [22]:
cs.is_selector(~cs.starts_with("has_").as_expr())

False

it's a;lso possible to see what columns an selector affects.

In [24]:
cs.expand_selector(people, cs.starts_with("has_"))

('has_partner', 'has_kids', 'has_tattoos')