## Polars fast UUID4 string generation

In [1]:
import polars as pl
import polars.selectors as cs
import numpy as np
import uuid

import polars_uuid4

In [2]:
# Show the installed polars version; the outputs recorded in this notebook were produced with it.
pl.__version__

'0.20.3'

##### Make a dataframe with 10 million random numbers

In [3]:
# Ten million random floats next to a constant string column (the scalar "value"
# is repeated on every row), with a row-number column added in front.
df = (
    pl.DataFrame(
        {
            "Random numbers": np.random.rand(10_000_000),
            "A string column": "value",
        }
    )
    .with_row_count()
)
df.tail()

row_nr,Random numbers,A string column
u32,f64,str
9999995,0.410216,"""value"""
9999996,0.072977,"""value"""
9999997,0.763713,"""value"""
9999998,0.536438,"""value"""
9999999,0.703031,"""value"""


##### Create 10 million UUID4s
 * `with_uuid4()` accepts an argument that sets the name of the new column; it defaults to `uuid`

In [4]:
# Add a UUID4 string column through the `uuid` frame namespace (presumably registered
# by importing polars_uuid4 — confirm). No name is passed, so the column is called "uuid".
# The result is only displayed here; `df` itself is not reassigned.
df.uuid.with_uuid4()

row_nr,Random numbers,A string column,uuid
u32,f64,str,str
0,0.758339,"""value""","""{3952aa21-0957…"
1,0.04649,"""value""","""{0708f057-7e56…"
2,0.498708,"""value""","""{e655242c-cad8…"
3,0.726538,"""value""","""{dc0d153c-71bd…"
4,0.161975,"""value""","""{efef8d80-b8d0…"
5,0.391948,"""value""","""{6b8f3261-3554…"
6,0.341304,"""value""","""{5b1b6c85-96dd…"
7,0.965395,"""value""","""{414e16d9-5c73…"
8,0.689368,"""value""","""{ccbacf05-2857…"
9,0.628131,"""value""","""{cb402ac3-3bfe…"


#### Works with a lazy frame too

In [5]:
# Same pipeline expressed lazily: nothing is materialised until .collect().
# Note that `df` is rebound here, so from this cell on it carries the "uuid" column.
df = (
    pl.LazyFrame(
        {
            "Random numbers": np.random.rand(10_000_000),
            "A string column": "value",
        }
    )
    .with_row_count()
    .uuid.with_uuid4()
    .collect()
)
df.tail()

row_nr,Random numbers,A string column,uuid
u32,f64,str,str
9999995,0.313736,"""value""","""{a7a4d264-da44…"
9999996,0.833383,"""value""","""{71fd5366-c708…"
9999997,0.5506,"""value""","""{294ec201-2df2…"
9999998,0.339044,"""value""","""{2fbde60d-a46e…"
9999999,0.896245,"""value""","""{1673458a-565e…"


##### My old way to generate a UUID4 for each row
  * Gets job done.  Creates a UUID4 for each row.
  * Uses python uuid module.
  * Takes a long time (in the polars world).
    * 20.4 s ± 225 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [6]:
%%timeit
uuids = ["{"+str(uuid.uuid4())+"}" for i in range(len(df))]
uuid_series = pl.Series(name="python_UUID", values=uuids)
df.with_columns(
    uuid_series
)

20.4 s ± 225 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


##### Using polars_uuid4 to generate a UUID4 for each row
  * Gets job done.  Creates a UUID4 for each row.
  * Uses rust uuid crate.
  * Much easier to understand/simpler code.
  * ~40x faster than using Python's uuid module to generate UUID4s when the last column in the df is already a string
  * 540 ms ± 6.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [7]:
%%timeit
# `df` here is the frame collected from the LazyFrame cell, so it already ends with a
# string "uuid" column; this times with_uuid4() on that frame.
df.uuid.with_uuid4()

540 ms ± 6.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


##### Not quite as fast if there isn't an existing string column in the dataframe
  * 656 ms ± 6.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [8]:
# Rebuild `df` with only the float column (plus row numbers) so the next timing
# runs against a frame that has no string column at all.
df = (
    pl.DataFrame(
        {
            "Random numbers": np.random.rand(10_000_000),
        }
    )
    .with_row_count()
)
df.tail()

row_nr,Random numbers
u32,f64
9999995,0.334474
9999996,0.006089
9999997,0.295484
9999998,0.065312
9999999,0.644697


In [9]:
%%timeit
# Same call as the earlier benchmark, timed on the frame that holds only row_nr and
# a float column (no existing string column).
df.uuid.with_uuid4()

656 ms ± 6.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
