# [Transformations : Joins](https://docs.pola.rs/user-guide/transformations/joins/)

## Equi joins

In [1]:
from datetime import datetime

import polars as pl

# Two small lookup tables keyed by "property_name". Each one contains a
# property the other lacks ("The Shire" vs. "Sesame Street"), so the join
# strategies shown later produce visibly different results.
props_groups = pl.DataFrame(
    {
        "property_name": [
            "Old Ken Road",
            "Whitechapel Road",
            "The Shire",
            "Kings Cross Station",
            "The Angel, Islington",
        ],
        "group": ["brown", "brown", "fantasy", "stations", "light_blue"],
    }
)
props_prices = pl.DataFrame(
    {
        "property_name": [
            "Old Ken Road",
            "Whitechapel Road",
            "Sesame Street",
            "Kings Cross Station",
            "The Angel, Islington",
        ],
        "cost": [60, 60, 100, 200, 100],
    }
)

# print() separates its arguments with a single space by default, which glues
# the second frame's "shape" header onto the first frame's bottom border.
# Put each frame on its own line instead.
print(props_groups, props_prices, sep="\n")

shape: (5, 2)
┌──────────────────────┬────────────┐
│ property_name        ┆ group      │
│ ---                  ┆ ---        │
│ str                  ┆ str        │
╞══════════════════════╪════════════╡
│ Old Ken Road         ┆ brown      │
│ Whitechapel Road     ┆ brown      │
│ The Shire            ┆ fantasy    │
│ Kings Cross Station  ┆ stations   │
│ The Angel, Islington ┆ light_blue │
└──────────────────────┴────────────┘ shape: (5, 2)
┌──────────────────────┬──────┐
│ property_name        ┆ cost │
│ ---                  ┆ ---  │
│ str                  ┆ i64  │
╞══════════════════════╪══════╡
│ Old Ken Road         ┆ 60   │
│ Whitechapel Road     ┆ 60   │
│ Sesame Street        ┆ 100  │
│ Kings Cross Station  ┆ 200  │
│ The Angel, Islington ┆ 100  │
└──────────────────────┴──────┘


By default, Polars performs an inner join:

In [2]:
# No `how` argument is passed, so this uses the default join strategy.
props_groups.join(
    props_prices,
    on="property_name",
)

property_name,group,cost
str,str,i64
"""Old Ken Road""","""brown""",60
"""Whitechapel Road""","""brown""",60
"""Kings Cross Station""","""stations""",200
"""The Angel, Islington""","""light_blue""",100


In [3]:
# Lower-case the key column so it no longer matches the names in
# `props_prices` verbatim.
props_groups2 = props_groups.with_columns(
    property_name=pl.col("property_name").str.to_lowercase()
)
print(props_groups2)

shape: (5, 2)
┌──────────────────────┬────────────┐
│ property_name        ┆ group      │
│ ---                  ┆ ---        │
│ str                  ┆ str        │
╞══════════════════════╪════════════╡
│ old ken road         ┆ brown      │
│ whitechapel road     ┆ brown      │
│ the shire            ┆ fantasy    │
│ kings cross station  ┆ stations   │
│ the angel, islington ┆ light_blue │
└──────────────────────┴────────────┘


In [4]:
# Same data as `props_prices`, but with the key column exposed as "name".
props_prices2 = props_prices.select(
    name=pl.col("property_name"),
    cost=pl.col("cost"),
)
print(props_prices2)

shape: (5, 2)
┌──────────────────────┬──────┐
│ name                 ┆ cost │
│ ---                  ┆ ---  │
│ str                  ┆ i64  │
╞══════════════════════╪══════╡
│ Old Ken Road         ┆ 60   │
│ Whitechapel Road     ┆ 60   │
│ Sesame Street        ┆ 100  │
│ Kings Cross Station  ┆ 200  │
│ The Angel, Islington ┆ 100  │
└──────────────────────┴──────┘


In [5]:
# The key columns differ in both name and casing, so each side gets its own
# key: a plain column on the left, a lower-cased expression on the right.
result = props_groups2.join(
    props_prices2,
    how="inner",
    left_on="property_name",
    right_on=pl.col("name").str.to_lowercase(),
)
print(result)

shape: (4, 4)
┌──────────────────────┬────────────┬──────────────────────┬──────┐
│ property_name        ┆ group      ┆ name                 ┆ cost │
│ ---                  ┆ ---        ┆ ---                  ┆ ---  │
│ str                  ┆ str        ┆ str                  ┆ i64  │
╞══════════════════════╪════════════╪══════════════════════╪══════╡
│ old ken road         ┆ brown      ┆ Old Ken Road         ┆ 60   │
│ whitechapel road     ┆ brown      ┆ Whitechapel Road     ┆ 60   │
│ kings cross station  ┆ stations   ┆ Kings Cross Station  ┆ 200  │
│ the angel, islington ┆ light_blue ┆ The Angel, Islington ┆ 100  │
└──────────────────────┴────────────┴──────────────────────┴──────┘


Note that 'the shire' is missing from the result: it has no counterpart in `props_prices2`, so the (default) inner join drops it.

## Join strategies

### Inner join

In [6]:
# Same join as the default one, with the strategy spelled out.
props_groups.join(
    props_prices,
    how="inner",
    on="property_name",
)

property_name,group,cost
str,str,i64
"""Old Ken Road""","""brown""",60
"""Whitechapel Road""","""brown""",60
"""Kings Cross Station""","""stations""",200
"""The Angel, Islington""","""light_blue""",100


### Left join

In [7]:
# Left join keyed on "property_name", with `props_groups` as the left frame.
props_groups.join(
    props_prices,
    how="left",
    on="property_name",
)

property_name,group,cost
str,str,i64
"""Old Ken Road""","""brown""",60.0
"""Whitechapel Road""","""brown""",60.0
"""The Shire""","""fantasy""",
"""Kings Cross Station""","""stations""",200.0
"""The Angel, Islington""","""light_blue""",100.0


### Right join

In [9]:
# Right join keyed on "property_name"; the result is kept in `result` so a
# later cell can compare it against the mirrored left join.
result = props_groups.join(
    props_prices,
    how="right",
    on="property_name",
)
result

group,property_name,cost
str,str,i64
"""brown""","""Old Ken Road""",60
"""brown""","""Whitechapel Road""",60
,"""Sesame Street""",100
"""stations""","""Kings Cross Station""",200
"""light_blue""","""The Angel, Islington""",100


In [12]:
# Swap the two frames and use a left join, then put the columns back in the
# order of `result` before comparing the two frames.
result.equals(
    props_prices
    .join(props_groups, how="left", on="property_name")
    .select("group", "property_name", "cost")
)

True

### Full join

In [13]:
# Full join keyed on "property_name", without asking for the key columns to
# be coalesced.
props_groups.join(
    props_prices,
    how="full",
    on="property_name",
)

property_name,group,property_name_right,cost
str,str,str,i64
"""Old Ken Road""","""brown""","""Old Ken Road""",60.0
"""Whitechapel Road""","""brown""","""Whitechapel Road""",60.0
,,"""Sesame Street""",100.0
"""Kings Cross Station""","""stations""","""Kings Cross Station""",200.0
"""The Angel, Islington""","""light_blue""","""The Angel, Islington""",100.0
"""The Shire""","""fantasy""",,


If we wanted to force `join` to coalesce the two `property_name` columns into a single column, we could set `coalesce=True` explicitly:

In [14]:
# Full join again, this time explicitly requesting coalesced key columns.
props_groups.join(
    props_prices,
    how="full",
    on="property_name",
    coalesce=True,
)

property_name,group,cost
str,str,i64
"""Old Ken Road""","""brown""",60.0
"""Whitechapel Road""","""brown""",60.0
"""Sesame Street""",,100.0
"""Kings Cross Station""","""stations""",200.0
"""The Angel, Islington""","""light_blue""",100.0
"""The Shire""","""fantasy""",


### Semi join

Return the rows of the left dataframe that have a match in the right dataframe.
No need to first compare indices like in pandas!

In [15]:
# Semi join keyed on "property_name", with `props_groups` as the left frame.
props_groups.join(
    props_prices,
    how="semi",
    on="property_name",
)

property_name,group
str,str
"""Old Ken Road""","""brown"""
"""Whitechapel Road""","""brown"""
"""Kings Cross Station""","""stations"""
"""The Angel, Islington""","""light_blue"""


### Anti join

Return the rows of the left dataframe that do not have a match in the right dataframe. This is so much better!

In [16]:
# Anti join keyed on "property_name", with `props_groups` as the left frame.
props_groups.join(
    props_prices,
    how="anti",
    on="property_name",
)

property_name,group
str,str
"""The Shire""","""fantasy"""


## Non-equi joins

Only join rows that satisfy a given condition. There is no join strategy option here: `join_where` always performs an inner join.

In [17]:
# Two players and the cash each one holds.
players = pl.DataFrame({"name": ["Alice", "Bob"], "cash": [78, 135]})
print(players)

shape: (2, 2)
┌───────┬──────┐
│ name  ┆ cash │
│ ---   ┆ ---  │
│ str   ┆ i64  │
╞═══════╪══════╡
│ Alice ┆ 78   │
│ Bob   ┆ 135  │
└───────┴──────┘


In [None]:
# Pair players with properties using an inequality predicate: the player's
# "cash" must be strictly greater than the property's "cost".
players.join_where(
    props_prices,
    pl.col("cash") > pl.col("cost"),
)

name,cash,property_name,cost
str,i64,str,i64
"""Bob""",135,"""Sesame Street""",100
"""Bob""",135,"""The Angel, Islington""",100
"""Bob""",135,"""Old Ken Road""",60
"""Bob""",135,"""Whitechapel Road""",60
"""Alice""",78,"""Old Ken Road""",60
"""Alice""",78,"""Whitechapel Road""",60


## Asof join

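A left join where each row is matched to the nearest key instead of an exactly equal key (by default, the last key in the right dataframe that is less than or equal to the left key). That's damn handy.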

In [19]:
# `datetime` is imported together with the other dependencies in the first
# cell, so this cell only builds the data.
# Trades listed in chronological order; the asof join below uses "time" as
# its key and "stock" as its group column.
df_trades = pl.DataFrame(
    {
        "time": [
            datetime(2020, 1, 1, 9, 1, 0),
            datetime(2020, 1, 1, 9, 1, 0),
            datetime(2020, 1, 1, 9, 3, 0),
            datetime(2020, 1, 1, 9, 6, 0),
        ],
        "stock": ["A", "B", "B", "C"],
        "trade": [101, 299, 301, 500],
    }
)
print(df_trades)

shape: (4, 3)
┌─────────────────────┬───────┬───────┐
│ time                ┆ stock ┆ trade │
│ ---                 ┆ ---   ┆ ---   │
│ datetime[μs]        ┆ str   ┆ i64   │
╞═════════════════════╪═══════╪═══════╡
│ 2020-01-01 09:01:00 ┆ A     ┆ 101   │
│ 2020-01-01 09:01:00 ┆ B     ┆ 299   │
│ 2020-01-01 09:03:00 ┆ B     ┆ 301   │
│ 2020-01-01 09:06:00 ┆ C     ┆ 500   │
└─────────────────────┴───────┴───────┘


In [20]:
# Quotes at 09:00, 09:02, 09:04 and 09:06 on 2020-01-01, in chronological
# order.
df_quotes = pl.DataFrame(
    {
        "time": [datetime(2020, 1, 1, 9, minute, 0) for minute in (0, 2, 4, 6)],
        "stock": ["A", "B", "C", "A"],
        "quote": [100, 300, 501, 102],
    }
)
print(df_quotes)

shape: (4, 3)
┌─────────────────────┬───────┬───────┐
│ time                ┆ stock ┆ quote │
│ ---                 ┆ ---   ┆ ---   │
│ datetime[μs]        ┆ str   ┆ i64   │
╞═════════════════════╪═══════╪═══════╡
│ 2020-01-01 09:00:00 ┆ A     ┆ 100   │
│ 2020-01-01 09:02:00 ┆ B     ┆ 300   │
│ 2020-01-01 09:04:00 ┆ C     ┆ 501   │
│ 2020-01-01 09:06:00 ┆ A     ┆ 102   │
└─────────────────────┴───────┴───────┘


In [21]:
# Asof join on "time", with "stock" passed as the `by` grouping column.
df_trades.join_asof(
    df_quotes,
    by="stock",
    on="time",
)

  df_trades.join_asof(df_quotes, on="time", by="stock")


time,stock,trade,quote
datetime[μs],str,i64,i64
2020-01-01 09:01:00,"""A""",101,100.0
2020-01-01 09:01:00,"""B""",299,
2020-01-01 09:03:00,"""B""",301,300.0
2020-01-01 09:06:00,"""C""",500,501.0


In [22]:
# Same asof join as before, with a "1m" tolerance on the "time" key.
df_asof_tolerance_join = df_trades.join_asof(
    df_quotes,
    by="stock",
    on="time",
    tolerance="1m",
)
print(df_asof_tolerance_join)

shape: (4, 4)
┌─────────────────────┬───────┬───────┬───────┐
│ time                ┆ stock ┆ trade ┆ quote │
│ ---                 ┆ ---   ┆ ---   ┆ ---   │
│ datetime[μs]        ┆ str   ┆ i64   ┆ i64   │
╞═════════════════════╪═══════╪═══════╪═══════╡
│ 2020-01-01 09:01:00 ┆ A     ┆ 101   ┆ 100   │
│ 2020-01-01 09:01:00 ┆ B     ┆ 299   ┆ null  │
│ 2020-01-01 09:03:00 ┆ B     ┆ 301   ┆ 300   │
│ 2020-01-01 09:06:00 ┆ C     ┆ 500   ┆ null  │
└─────────────────────┴───────┴───────┴───────┘


  df_asof_tolerance_join = df_trades.join_asof(


**TODO**: have not fully grasped this yet

## Cartesian product

Retrieving the set of all ordered combinations between two sets. Again, very handy.

In [23]:
# The tokens to combine with the players.
tokens = pl.DataFrame({"monopoly_token": ["hat", "shoe", "boat"]})

# Cross join: no key is given, only the "name" column of `players` is used.
players.select("name").join(tokens, how="cross")

name,monopoly_token
str,str
"""Alice""","""hat"""
"""Alice""","""shoe"""
"""Alice""","""boat"""
"""Bob""","""hat"""
"""Bob""","""shoe"""
"""Bob""","""boat"""
