## Performance comparison parquet vs (csv, pickle)


In [172]:
import time


def timer(func):
    def wrapper(*args, **kwargs):
        begin = time.time()
        val = func(*args, **kwargs)
        end = time.time()
        print(f"The time taken in {func.__name__}: {end - begin :.2f} seconds")
        return val

    return wrapper

In [173]:
import os
import sys

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

from pathlib import Path

import numpy as np

import predikit as pk

csv_file = Path("./sample_data/Flights 1m.csv")

data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}

# df = pk.DataFrameParser(csv_file, index_col=0)  # fixme
df = pk.DataFrameParser(data)

# dp = pk.DataPrepare(clean_missing=True, clean_indicator=True, verbose=True)
# dp = pk.MissingValuesProcessor(
#     strategy=pk.MissingValueStrategy.MEAN,
#     add_indicator=True,
#     verbose=True,
# )

# dp.fit_transform(df, columns=["Age", "Credit"])

display(df)

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,
3,Linda,50.0,50.0
4,Peter,,200.0
5,Meg,50.0,


In [174]:
def init_df(sample_number: int = 0):
    if sample_number < 0 or sample_number > 5:
        raise ValueError("sample_number must be between 0 and 5")

    data = {
        "Name": ["John", "Meg", "Rose", np.nan, "Peter", "Meg"],
        "Age": [np.nan, 10, 25, 50, 20, 50],
        "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
    }
    samples = {
        0: data,
        1: "./sample_data/airline_bumping.csv",
        2: "./sample_data/stations.pickle",
        3: "./sample_data/Flights 1m.csv",
        4: "./sample_data/Flights 1m.parquet",
        5: "./sample_data/weatherAUS.csv",
    }

    return pk.DataFrameParser(path_or_buf=samples[sample_number])


cols = ["Evaporation", "Sunshine", "Cloud3pm"]

df

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,
3,Linda,50.0,50.0
4,Peter,,200.0
5,Meg,50.0,


# Predikit's MissingValueProcessor vs Sklearn's SimpleImputer


In [175]:
from time import time

from sklearn.impute import SimpleImputer

df = init_df()
display(df)

start = time()
si = SimpleImputer(strategy="most_frequent")
X = si.fit_transform(df)
df[cols] = X
end = time()
sklearn_time = end - start

df = init_df()

start = time()
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MODE)
mvp.fit_transform(df)
end = time()
predikit_time = end - start
print(f"From Sklearn: {sklearn_time:.3f} ms")
print(f"From Predikit: {predikit_time:.3f} ms")

speed_pct = (sklearn_time - predikit_time) / sklearn_time * 100

print(
    f"PrediKit's MissingValueProcessor is {speed_pct:.2f}% faster "
    "than Sklearn's SimpleImputer"
)

Unnamed: 0,Name,Age,Credit
0,John,,
1,Meg,10.0,400.0
2,Rose,25.0,
3,,50.0,2000000.0
4,Peter,20.0,1000000.0
5,Meg,50.0,


From Sklearn: 0.004 ms
From Predikit: 0.005 ms
PrediKit's MissingValueProcessor is -24.63% faster than Sklearn's SimpleImputer


In [176]:
# import timeit

# # Define the setup code
# setup_code = """
# from sklearn.impute import SimpleImputer
# import predikit as pk
# import pandas as pd
# dataset = "./sample_data/weatherAUS.csv"
# cols = ["Evaporation", "Sunshine", "Cloud3pm"]
# def init_df():
#     df = pd.read_csv(dataset)
#     return df[cols]
# """

# # Define the code for SimpleImputer
# si_code = """
# df = init_df()
# si = SimpleImputer(strategy=pk.MissingValueStrategy.MEAN)
# X = si.fit_transform(df)
# df[cols] = X
# """

# # Define the code for MissingValuesProcessor
# mvp_code = """
# df = init_df()
# mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MEAN)
# df = mvp.fit_transform(df)
# """

# # Time the execution of the code
# si_time = timeit.timeit(si_code, setup=setup_code, number=100)
# mvp_time = timeit.timeit(mvp_code, setup=setup_code, number=100)

# # Calculate the speed increase
# speed_pct = (si_time - mvp_time) / si_time * 100

# print(
#     f"PrediKit's MissingValueProcessor is {speed_pct:.2f}% "
#     "faster than Sklearn's SimpleImputer"
# )

# Dask


In [177]:
# import dask.dataframe as dd

# df = dd.read_csv("./sample_data/Flights 1m.csv")

# df.head()

from sklearn.impute import SimpleImputer

In [178]:
%%timeit

df = init_df(5)

si = SimpleImputer(strategy="most_frequent")
X = si.fit_transform(df[cols])
df[cols] = X

192 ms ± 8.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [179]:
%%timeit
df = init_df(5)

mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MODE)
x = mvp.fit_transform(df, cols)

177 ms ± 4.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
