In [1]:
import sys
import os

# Parent directory of the notebook's working directory. It is put on the import
# path presumably so the local `predikit` package used in later cells resolves
# without being installed -- TODO confirm against the repository layout.
root = os.path.dirname(os.path.abspath("."))

# Only append when missing: re-running this cell must not pile up duplicate
# entries in sys.path.
if root not in sys.path:
    sys.path.append(root)

import pandas as pd
import numpy as np

# Testing DataFrame Loading

In [6]:
from io import BytesIO
from predikit.data.io.input import DataFrameParser
from pathlib import Path


def load_show(file, extension=None, n=3, **kwargs):
    """Load ``file`` through ``DataFrameParser`` and display its first rows.

    Parameters
    ----------
    file
        Source handed to ``DataFrameParser`` as ``file``. This notebook passes
        both an in-memory ``BytesIO`` buffer and ``pathlib.Path`` objects.
    extension
        Forwarded to ``DataFrameParser`` unchanged; ``None`` by default.
    n
        Number of leading rows to render (default 3).
    **kwargs
        Any extra keyword arguments, forwarded to ``DataFrameParser`` as-is.

    Returns
    -------
    None
        The preview is rendered with ``display``; nothing is returned.
    """
    frame = DataFrameParser(file=file, extension=extension, **kwargs).load()
    preview = frame.head(n)
    # `display` is the IPython rich-display helper available in notebook cells.
    display(preview)


# In-memory CSV payload; the format is passed explicitly here (presumably
# because a raw byte buffer carries no file suffix to go by -- TODO confirm).
file = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(file, extension="csv")

# On-disk samples (CSV, pickle, parquet), each paired with the number of rows
# to preview. No extension is passed for these.
for file, preview_rows in (
    (Path("./sample_data/airline_bumping.csv"), 4),
    (Path("./sample_data/movies.p"), 5),
    (Path("./sample_data/Flights 1m.parquet"), 3),
):
    load_show(file, n=preview_rows)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765


Unnamed: 0,id,title,popularity,release_date
0,257,Oliver Twist,20.415572,2005-09-23
1,14290,Better Luck Tomorrow,3.877036,2002-01-12
2,38365,Grown Ups,38.864027,2010-06-24
3,9672,Infamous,3.680896,2006-11-16
4,12819,Alpha and Omega,12.300789,2010-09-17


Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5,19,350,2475,9.083333,12.483334
1,2006-01-02,167,216,343,2475,11.783334,15.766666
2,2006-01-03,-7,-2,344,2475,8.883333,12.133333


# Handling Missing Values

In [3]:
import predikit.data.preprocessors.data_cleansing as dc
from predikit.data.preprocessors._base import MissingValueStrategy

# Toy table: "Age" and "Credit" both contain NaN gaps; "Name" is complete.
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}
df = pd.DataFrame(data)

# Processor configured with the MEAN strategy, indicator columns and verbose
# output switched on.
mvp = dc.MissingValuesProcessor(
    strategy=MissingValueStrategy.MEAN,
    add_indicator=True,
    verbose=True,
)

# The return value is deliberately not captured; only the two numeric columns
# are passed via `cols`.
# NOTE(review): the saved output below shows imputed values and *_isNA columns
# on `df` itself, so fit_transform appears to modify `df` in place -- confirm.
mvp.fit_transform(df, cols=["Age", "Credit"])

# Last expression of the cell: renders the frame.
df



Unnamed: 0,Name,Age,Credit,Age_isNA,Credit_isNA
0,John,37.0,216.666667,1,1
1,Andrea,23.0,400.0,0,0
2,Rose,25.0,216.666667,0,1
3,Linda,50.0,50.0,0,0
4,Peter,37.0,200.0,1,0
5,Meg,50.0,216.666667,0,1
