In [2]:
import random
import re
from essential_generators import DocumentGenerator
# Seed the stdlib `random` module so the random.randint-based keyword sampling
# later in this cell is repeatable.
# NOTE(review): this makes the corpus itself repeatable only if
# DocumentGenerator draws from the stdlib `random` module — confirm against
# essential_generators.
random.seed(42) # for reproducibility
n = 300_000  # number of sentences to generate

# Generate random sentences: n calls to gen.sentence() on one generator instance.
gen = DocumentGenerator()
sentences = [gen.sentence() for _ in range(n)]

# get random keywords
def get_random_keyword(splitted_words):
    """Pick one random token and return it only if it is purely alphabetic.

    Args:
        splitted_words: Non-empty sequence of string tokens, e.g. a sentence
            split on spaces.

    Returns:
        The chosen token when it consists solely of the ASCII letters
        a-z / A-Z, otherwise the empty string.
    """
    # randint is inclusive on both ends, hence len - 1 as the upper bound.
    kw = splitted_words[random.randint(0, len(splitted_words) - 1)]
    # fullmatch rather than match(r'^...$'): `$` also matches just before a
    # trailing newline, which would let a token ending in a newline through.
    if re.fullmatch(r'[a-zA-Z]+', kw):
        return kw
    return ''

# Draw 6000 (random sentence, random token) pairs. get_random_keyword returns
# '' for tokens that are not purely alphabetic, and that placeholder is removed
# from the set before use.
# sorted() instead of list(): iteration order of a set of strings changes with
# per-process hash randomisation, so a plain list(set(...)) would yield a
# different keyword (and result-column) order on every kernel restart even
# though the RNG is seeded.
keywords = sorted(
    {
        get_random_keyword(sentences[random.randint(0, n - 1)].split(" "))
        for _ in range(6000)
    }
    - {''}
)
print(len(keywords)) # 2340

2340


In [3]:
import polars as pl
import pandas as pd
import numpy as np
import duckdb
import dask
import dask.dataframe as dd

In [4]:
# Print the version of each library imported for this benchmark.
# Fixes the misspelled "polas" label.
print(f"polars version: {pl.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"duckdb version: {duckdb.__version__}")
print(f"dask version: {dask.__version__}")

polas version: 0.17.13
pandas version: 2.0.1
numpy version: 1.23.5
duckdb version: 0.7.1
dask version: 2023.5.0


In [8]:
%%timeit -n 3 -r 5
# polars
# Build a single-column polars frame, add one column per keyword holding the
# result of str.contains(keyword) on the sentence column (each column is
# aliased to its keyword), then convert the result to pandas.
# The to_pandas() conversion is part of the timed region.
df = pl.DataFrame({'sentence': sentences})
res_df = df.with_columns([pl.col('sentence').str.contains(keyword).alias(keyword) for keyword in keywords]).to_pandas()
# 1.21 s ± 59.3 ms per loop (mean ± std. dev. of 5 runs, 3 loops each)

1.21 s ± 59.3 ms per loop (mean ± std. dev. of 5 runs, 3 loops each)


In [None]:
%%timeit -n 1 -r 1
# pandas - a
# Pre-allocate every keyword column as False via reindex(fill_value=False),
# then, one keyword per pass, set the rows whose sentence matches to True with
# a boolean-mask .loc assignment.
sentence_df = pd.DataFrame({'sentence': sentences})
df = sentence_df.reindex(columns=['sentence'] + keywords, fill_value=False)
for kw in keywords:
  df.loc[df["sentence"].str.contains(kw), kw] = True
# > 2 minutes per loop

In [None]:
%%timeit -n 1 -r 1
# pandas - b
# Start from the sentence column alone and add one new column per keyword,
# assigning the str.contains(kw) result directly as that column.
df = pd.DataFrame({'sentence': sentences})
for kw in keywords:
  df[kw] = df["sentence"].str.contains(kw)
# > 2 minutes per loop

In [None]:
%%timeit -n 1 -r 1
# pandas - c
# Pre-allocate every keyword column as False, then for each keyword take the
# indices np.where reports for matching rows and set each cell individually
# through df.at.
# NOTE(review): np.where yields positions while df.at takes labels; these
# presumably coincide because the frame is built without an explicit index —
# confirm if the construction changes.
sentence_df = pd.DataFrame({'sentence': sentences})
df = sentence_df.reindex(columns=['sentence'] + keywords, fill_value=False)
for kw in keywords:
  for i in np.where(df["sentence"].str.contains(kw))[0]:
    df.at[i, kw] = True
# > 2 minutes per loop

In [9]:
%%timeit -n 3 -r 2
# duckdb
# Assemble one SELECT statement with a `case when sentence like '%kw%'`
# expression per keyword, each aliased to its keyword, and run it through
# duckdb.sql; to_df() on the result is part of the timed region.
# The query names `df` in its FROM clause; duckdb is expected to resolve that
# to the pandas frame created on the next line.
# NOTE(review): keywords are interpolated straight into the SQL text, both as
# a LIKE pattern and as an unquoted column alias. That is tolerable here only
# because get_random_keyword filters keywords down to letters; do not reuse
# this pattern with arbitrary strings.
df = pd.DataFrame({'sentence': sentences})
query_sql = "select sentence," + ",".join([f"case when sentence like '%{keyword}%' then TRUE else FALSE end as {keyword}" for keyword in keywords]) + " from df"
res_df = duckdb.sql(query_sql).to_df()
# 24.4 s ± 177 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

24.4 s ± 177 ms per loop (mean ± std. dev. of 2 runs, 3 loops each)


In [None]:
%%timeit -n 1 -r 1
# numpy - a
out_arr = np.zeros((len(sentence_df), len(keywords)), dtype=bool)
for j, kw in enumerate(keywords):
  out_arr[[i for i, s in enumerate(sentences) if s.find(kw) > 0], j] = True
# 4min 23s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [None]:
%%timeit -n 1 -r 1
# numpy - b
sentences_np = np.array(sentences)
out_arr = np.zeros((len(sentence_df), len(keywords)), dtype=bool)
for j, kw in enumerate(keywords):
  out_arr[np.char.find(sentences_np, kw) >= 0, j] = True
# > 6 minutes per loop

In [None]:
# Shell out to build the extension module(s) declared in setup.py next to this
# notebook; --inplace puts the built artefacts in the source tree and --force
# rebuilds regardless of timestamps.
# NOTE(review): presumably this is what produces the `contained_keyword` module
# imported in the following cell — confirm against setup.py.
!python setup.py build_ext --inplace --force

In [4]:
import contained_keyword
# Materialise both Python lists as NumPy arrays once, outside the timed cell,
# so the benchmark below measures only the build_binary_matrix call.
keywords_np = np.array(keywords)
sentences_np = np.array(sentences)

In [6]:
%%timeit -n 5 -r 3
# Cython + numpy
# Single call into the contained_keyword module, passing the NumPy arrays of
# sentences and keywords built in the preceding cell.
# NOTE(review): presumably returns a sentences-by-keywords boolean matrix like
# the NumPy variants — confirm against the extension's source.
out_arr = contained_keyword.build_binary_matrix(sentences_np, keywords_np)
# 1.73 s ± 14.7 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)

1.73 s ± 14.7 ms per loop (mean ± std. dev. of 3 runs, 5 loops each)
