In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably so the local `predikit` checkout can be imported below — confirm
# against the repository layout).
root = os.path.dirname(os.path.abspath("."))
# Guarded so that re-running this cell does not pile up duplicate entries.
if root not in sys.path:
    sys.path.append(root)

import numpy as np
import pandas as pd

import predikit as pk

# Test Load DataFrame


In [2]:
from io import BytesIO
from pathlib import Path


def load_show(path_or_buf, extension=None, n=3, label=None, **kwargs):
    """Load a source through ``pk.DataFrameParser`` and display a preview.

    Parameters
    ----------
    path_or_buf
        Source handed to ``pk.DataFrameParser`` unchanged.
    extension
        Forwarded to the parser as ``extension``; ``None`` by default.
    n : int
        Number of leading rows requested from ``head`` for the preview.
    label
        When truthy, the string ``"From <label>"`` is displayed before the
        preview.
    **kwargs
        Extra keyword arguments forwarded to ``pk.DataFrameParser``.

    Returns
    -------
    None
        The function only displays; nothing is returned.
    """
    parser_args = {"path_or_buf": path_or_buf, "extension": extension, **kwargs}
    frame = pk.DataFrameParser(**parser_args)  # type: ignore

    # Parsing happens first, so a failing source never shows a dangling label.
    if label:
        display(f"From {label}")

    preview = frame.head(n)
    display(preview)


# from a Buffered Input Stream (the extension is given explicitly here)
f = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(f, extension="csv", label="BytesIO")

# from files on disk, in order: csv, pickle, parquet
# (no explicit extension is passed for these)
for f, rows, tag in (
    (Path("./sample_data/airline_bumping.csv"), 4, "csv"),
    (Path("./sample_data/stations.pickle"), 5, "pickle"),
    (Path("./sample_data/Flights 1m.parquet"), 3, "parquet"),
):
    load_show(f, n=rows, label=tag)

# from in-memory containers, in order:
#   a dictionary, a list of dictionaries, a dictionary of Series
for data, tag in (
    (
        {
            "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
            "Age": [np.nan, 23, 25, 50, np.nan, 50],
            "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
        },
        "dict",
    ),
    (
        [
            {"Name": "John", "Age": np.nan, "Credit": np.nan},
            {"Name": "Andrea", "Age": 23, "Credit": 400},
            {"Name": "Rose", "Age": 25, "Credit": np.nan},
            {"Name": "Linda", "Age": 50, "Credit": 50},
            {"Name": "Peter", "Age": np.nan, "Credit": 200},
            {"Name": "Meg", "Age": 50, "Credit": np.nan},
        ],
        "list of dicts",
    ),
    (
        {
            "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
            "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
        },
        "dict of Series",
    ),
):
    load_show(data, n=3, label=tag)

# from a 2d array
# dtype=object keeps every cell as the Python object written below. Without it
# NumPy promotes the mixed str/float rows to a single fixed-width string dtype,
# so np.nan would arrive as the text "nan" (and 23 as "23") rather than as a
# missing value / number.
data = np.array(
    [
        ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        [np.nan, 23, 25, 50, np.nan, 50],
        [np.nan, 400, np.nan, 50, 200, np.nan],
    ],
    dtype=object,
)

# The literal above is laid out field-by-field (3 x 6); transpose it so each
# row holds one person and the three columns line up with `columns` below.
data = data.T
load_show(data, label="2d array", columns=["Name", "Age", "Credit"])


# TODO: add tests for JSON and Excel sources

'From BytesIO'

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


'From csv'

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765


'From pickle'

Unnamed: 0,station_id,station_name,location
0,40010,Austin-Forest Park,"(41.870851, -87.776812)"
1,40020,Harlem-Lake,"(41.886848, -87.803176)"
2,40030,Pulaski-Lake,"(41.885412, -87.725404)"
3,40040,Quincy/Wells,"(41.878723, -87.63374)"
4,40050,Davis,"(42.04771, -87.683543)"


'From parquet'

Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5,19,350,2475,9.083333,12.483334
1,2006-01-02,167,216,343,2475,11.783334,15.766666
2,2006-01-03,-7,-2,344,2475,8.883333,12.133333


'From dict'

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


'From list of dicts'

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


'From dict of Series'

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


'From 2d array'

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


# Handling Missing Values


In [3]:
# Small example frame: "Age" and "Credit" both contain NaN gaps.
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}

df = pk.DataFrameParser(data)
display(df)  # first output: the frame as parsed, gaps still present

# Constructed with no arguments; in the recorded output each NaN ends up
# replaced by its column's mean (Age -> 37.0, Credit -> 216.666667).
mvp = pk.MissingValuesProcessor()

# NOTE(review): the return value is discarded, yet the `df` shown right after
# is already filled in the recorded output — so this appears to rely on
# fit_transform mutating `df` in place. Confirm against MissingValuesProcessor;
# later cells read this same `df`.
mvp.fit_transform(df)
df

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,
3,Linda,50.0,50.0
4,Peter,,200.0
5,Meg,50.0,


Unnamed: 0,Name,Age,Credit
0,John,37.0,216.666667
1,Andrea,23.0,400.0
2,Rose,25.0,216.666667
3,Linda,50.0,50.0
4,Peter,37.0,200.0
5,Meg,50.0,216.666667


In [4]:
# One-hot encode the "Name" column of `df`. The fit_transform call is the
# cell's last expression, so its result is what gets rendered as the output.
cols = ["Name"]
enc = pk.EncodingProcessor(
    pk.CategoricalEncodingStrategies.OneHotEncoder,
    verbose=True,
    sparse_output=True,
)
enc.fit_transform(df, cols=cols)


# ohe = OneHotEncoder(sparse_output=True)
# cols = ["Name", "Age"]
# ohe.fit_transform(df[cols])

Unnamed: 0,Age,Credit,Name_Andrea,Name_John,Name_Linda,Name_Meg,Name_Peter,Name_Rose
0,37.0,216.666667,0.0,1.0,0.0,0.0,0.0,0.0
1,23.0,400.0,1.0,0.0,0.0,0.0,0.0,0.0
2,25.0,216.666667,0.0,0.0,0.0,0.0,0.0,1.0
3,50.0,50.0,0.0,0.0,1.0,0.0,0.0,0.0
4,37.0,200.0,0.0,0.0,0.0,0.0,1.0,0.0
5,50.0,216.666667,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
import platform

# Report which OS the kernel is running on (the recorded run printed "Linux").
print(f"Running on {platform.system()}")

# cross platform
# Export `df` as CSV. No destination is passed here; the recorded log line
# shows the file being written to <home>/predikit_out/out.csv, so the exporter
# presumably falls back to a per-user default directory — confirm in
# pk.DataFrameExporter.
# NOTE(review): `df` is whatever the earlier cells left behind, so this cell
# only works after they have run.
dfe = pk.DataFrameExporter(df, extension=pk.FileExtension.CSV)
dfe.export()

[32;10m2023-12-12 02:35:52,099 (output.py:50) - INFO: [0mExporting to /home/mghali/predikit_out/out.csv ...


Running on Linux
