## Performance comparison parquet vs (csv, pickle)


In [43]:
import time


def timer(func):
    def wrapper(*args, **kwargs):
        begin = time.time()
        val = func(*args, **kwargs)
        end = time.time()
        print(f"The time taken in {func.__name__}: {end - begin :.2f} seconds")
        return val

    return wrapper

In [44]:
import os
import sys

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

from pathlib import Path

import numpy as np

import predikit as pk

csv_file = Path("./sample_data/Flights 1m.csv")

data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}

# df = pk.DataFrameParser(csv_file, index_col=0)  # fixme
df = pk.DataFrameParser(data)

# dp = pk.DataPrepare(clean_missing=True, clean_indicator=True, verbose=True)
# dp = pk.MissingValuesProcessor(
#     strategy=pk.MissingValueStrategy.MEAN,
#     add_indicator=True,
#     verbose=True,
# )

# dp.fit_transform(df, columns=["Age", "Credit"])

display(df)

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,
3,Linda,50.0,50.0
4,Peter,,200.0
5,Meg,50.0,


In [45]:
path = "./sample_data/weatherAUS.csv"


cols = ["Evaporation", "Sunshine", "Cloud3pm"]


def init_df():
    df = pk.DataFrameParser(path)

    return df[cols]


df

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,
3,Linda,50.0,50.0
4,Peter,,200.0
5,Meg,50.0,


# Predikit's MissingValueProcessor vs Sklearn's SimpleImputer


In [49]:
from time import time

from sklearn.impute import SimpleImputer

df = init_df()
display(df)

start = time()
si = SimpleImputer(strategy="most_frequent")
X = si.fit_transform(df)
df[cols] = X
end = time()
sklearn_time = end - start

df = init_df()

start = time()
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MODE)
mvp.fit_transform(df)
end = time()
predikit_time = end - start
print(f"From Sklearn: {sklearn_time:.3f} ms")
print(f"From Predikit: {predikit_time:.3f} ms")

speed_pct = (sklearn_time - predikit_time) / sklearn_time * 100

print(
    f"PrediKit's MissingValueProcessor is {speed_pct:.2f}% faster "
    "than Sklearn's SimpleImputer"
)

Unnamed: 0,Evaporation,Sunshine,Cloud3pm
0,,,
1,,,
2,,,2.0
3,,,
4,,,8.0
...,...,...,...
142188,,,
142189,,,
142190,,,
142191,,,


From Sklearn: 0.034 ms
From Predikit: 0.023 ms
PrediKit's MissingValueProcessor is 31.52% faster than Sklearn's SimpleImputer


In [47]:
import timeit

# Define the setup code
setup_code = """
from sklearn.impute import SimpleImputer
import predikit as pk
import pandas as pd
dataset = "./sample_data/weatherAUS.csv"
cols = ["Evaporation", "Sunshine", "Cloud3pm"]
def init_df():
    df = pd.read_csv(dataset)
    return df[cols]
"""

# Define the code for SimpleImputer
si_code = """
df = init_df()
si = SimpleImputer(strategy=pk.MissingValueStrategy.MEAN)
X = si.fit_transform(df)
df[cols] = X
"""

# Define the code for MissingValuesProcessor
mvp_code = """
df = init_df()
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MEAN)
df = mvp.fit_transform(df)
"""

# Time the execution of the code
si_time = timeit.timeit(si_code, setup=setup_code, number=100)
mvp_time = timeit.timeit(mvp_code, setup=setup_code, number=100)

# Calculate the speed increase
speed_pct = (si_time - mvp_time) / si_time * 100

print(
    f"PrediKit's MissingValueProcessor is {speed_pct:.2f}% "
    "faster than Sklearn's SimpleImputer"
)

PrediKit's MissingValueProcessor is 21.67% faster than Sklearn's SimpleImputer


# Dask


In [1]:
# import dask.dataframe as dd

# df = dd.read_csv("./sample_data/Flights 1m.csv")

# df.head()

NameError: name 'df' is not defined