In [12]:
import predikit as pk
import pandas as pd
import numpy as np
import os
import sys

# Append the parent of the current working directory to the module search path.
# NOTE(review): this runs after `import predikit as pk` at the top of the cell,
# so it cannot influence that import — confirm whether it should come first.
root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

# Test Load DataFrame


In [13]:
from io import BytesIO
from pathlib import Path


def load_show(path_or_buf, extension=None, n=3, label=None, **kwargs):
    """Parse ``path_or_buf`` with ``pk.DataFrameParser`` and display a summary.

    Displayed in order: an optional ``"From {label}"`` banner, the column
    names, the column types (raw, then with ``parsed=True``), the numeric
    columns, the non-numeric columns and the result of ``head(n)``.

    Args:
        path_or_buf: Data source handed to ``pk.DataFrameParser``.
        extension: Forwarded as the parser's ``extension`` argument.
        n: Row count passed to ``head``.
        label: Optional tag for the banner; a falsy value skips the banner.
        **kwargs: Extra keyword arguments forwarded to the parser.
    """
    parsed = pk.DataFrameParser(
        path_or_buf=path_or_buf, extension=extension, **kwargs
    )

    if label:
        display(f"From {label}")

    # Each summary is produced lazily so it is computed right before it is
    # displayed, in the listed order.
    summaries = (
        lambda: parsed.get_column_names(),
        lambda: parsed.get_column_types(),
        lambda: parsed.get_column_types(parsed=True),
        lambda: parsed.get_numeric_columns(),
        lambda: parsed.get_non_numeric_columns(),
        lambda: parsed.head(n),
    )
    for make_summary in summaries:
        display(make_summary())


# From an in-memory byte buffer holding CSV text; the format is passed
# explicitly via `extension` (presumably because a buffer has no file suffix
# to infer it from — TODO confirm).
f = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(f, extension="csv", label="BytesIO")

# From a CSV file on disk (no `extension` argument is passed for paths).
f = Path("./sample_data/airline_bumping.csv")
print(f)
load_show(f, n=4, label="csv")

# From a pickle file.
f = Path("./sample_data/stations.pickle")
load_show(f, n=5, label="pickle")

# From a parquet file.
f = Path("./sample_data/Flights 1m.parquet")
load_show(f, n=3, label="parquet")

# From a dictionary of column name -> list of values (with NaNs).
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}
load_show(data, n=3, label="dict")

# From a list of row dictionaries (same records as the dict above).
data = [
    {"Name": "John", "Age": np.nan, "Credit": np.nan},
    {"Name": "Andrea", "Age": 23, "Credit": 400},
    {"Name": "Rose", "Age": 25, "Credit": np.nan},
    {"Name": "Linda", "Age": 50, "Credit": 50},
    {"Name": "Peter", "Age": np.nan, "Credit": 200},
    {"Name": "Meg", "Age": 50, "Credit": np.nan},
]
load_show(data, label="list of dicts")

# From a dictionary of Series; the two Series have different indexes
# ("a".."c" versus "a".."d").
data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
load_show(data, label="dict of Series")

# From a 2D array: one inner list per feature, transposed below so that each
# row becomes one record.
# NOTE(review): np.array promotes this mix of str and float lists to a single
# string dtype, which matches the all-object column types in the recorded
# output; build the array with dtype=object if numeric columns are expected
# — TODO confirm intent.
data = np.array(
    [
        ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        [np.nan, 23, 25, 50, np.nan, 50],
        [np.nan, 400, np.nan, 50, 200, np.nan],
    ]
)

data = data.T
load_show(data, label="2d array", columns=["Name", "Age", "Credit"])


# TODO: add tests for JSON and Excel sources

'From BytesIO'

['a', 'b', 'c']

{'a': dtype('int64'), 'b': dtype('int64'), 'c': dtype('int64')}

{'a': 'i', 'b': 'i', 'c': 'i'}

['a', 'b', 'c']

None

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


sample_data/airline_bumping.csv


'From csv'

['airline', 'year', 'nb_bumped', 'total_passengers']

{'airline': dtype('O'),
 'year': dtype('int64'),
 'nb_bumped': dtype('int64'),
 'total_passengers': dtype('int64')}

{'airline': 'O', 'year': 'i', 'nb_bumped': 'i', 'total_passengers': 'i'}

['year', 'nb_bumped', 'total_passengers']

['airline']

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765


'From pickle'

['station_id', 'station_name', 'location']

{'station_id': dtype('O'), 'station_name': dtype('O'), 'location': dtype('O')}

{'station_id': 'O', 'station_name': 'O', 'location': 'O'}

None

['station_id', 'station_name', 'location']

Unnamed: 0,station_id,station_name,location
0,40010,Austin-Forest Park,"(41.870851, -87.776812)"
1,40020,Harlem-Lake,"(41.886848, -87.803176)"
2,40030,Pulaski-Lake,"(41.885412, -87.725404)"
3,40040,Quincy/Wells,"(41.878723, -87.63374)"
4,40050,Davis,"(42.04771, -87.683543)"


'From parquet'

['FL_DATE',
 'DEP_DELAY',
 'ARR_DELAY',
 'AIR_TIME',
 'DISTANCE',
 'DEP_TIME',
 'ARR_TIME']

{'FL_DATE': dtype('O'),
 'DEP_DELAY': dtype('int16'),
 'ARR_DELAY': dtype('int16'),
 'AIR_TIME': dtype('int16'),
 'DISTANCE': dtype('int16'),
 'DEP_TIME': dtype('float32'),
 'ARR_TIME': dtype('float32')}

{'FL_DATE': 'O',
 'DEP_DELAY': 'i',
 'ARR_DELAY': 'i',
 'AIR_TIME': 'i',
 'DISTANCE': 'i',
 'DEP_TIME': 'f',
 'ARR_TIME': 'f'}

['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']

['FL_DATE']

Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5,19,350,2475,9.083333,12.483334
1,2006-01-02,167,216,343,2475,11.783334,15.766666
2,2006-01-03,-7,-2,344,2475,8.883333,12.133333


'From dict'

['Name', 'Age', 'Credit']

{'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}

{'Name': 'O', 'Age': 'f', 'Credit': 'f'}

['Age', 'Credit']

['Name']

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


'From list of dicts'

['Name', 'Age', 'Credit']

{'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}

{'Name': 'O', 'Age': 'f', 'Credit': 'f'}

['Age', 'Credit']

['Name']

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


'From dict of Series'

['one', 'two']

{'one': dtype('float64'), 'two': dtype('float64')}

{'one': 'f', 'two': 'f'}

['one', 'two']

None

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


'From 2d array'

['Name', 'Age', 'Credit']

{'Name': dtype('O'), 'Age': dtype('O'), 'Credit': dtype('O')}

{'Name': 'O', 'Age': 'O', 'Credit': 'O'}

None

['Name', 'Age', 'Credit']

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


# Helper Functions


In [14]:
from result import Result


def unwrap_value_or_error(result: Result):
    """Return the payload carried by ``result``, whichever variant it is.

    Args:
        result: Object exposing ``is_ok()``, ``unwrap()`` and ``unwrap_err()``.

    Returns:
        ``result.unwrap()`` when ``result.is_ok()`` is true, otherwise
        ``result.unwrap_err()``.
    """
    return result.unwrap() if result.is_ok() else result.unwrap_err()


def init_df_sample(sample_number: int = 1):
    """Build one of the notebook's sample frames via ``pk.DataFrameParser``.

    Args:
        sample_number: Which sample to load:
            1 - in-memory dict of Name/Age/Credit columns containing NaNs,
            2 - ``./sample_data/airline_bumping.csv``,
            3 - ``./sample_data/Flights 1m.parquet``,
            4 - ``./sample_data/stations.pickle``.

    Returns:
        Whatever ``pk.DataFrameParser`` returns for the selected source.

    Raises:
        ValueError: If ``sample_number`` is not one of the available samples.
    """
    data = {
        "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        "Age": [np.nan, 200, 25, 50, np.nan, 50],
        "Credit": [np.nan, 400, np.nan, 2_000_000, 1_000_000, np.nan],
    }
    samples = {
        1: data,
        2: "./sample_data/airline_bumping.csv",
        3: "./sample_data/Flights 1m.parquet",
        4: "./sample_data/stations.pickle",
    }

    # Validate against the real keys: the previous range check accepted 0 and
    # then failed with a KeyError on the lookup below.
    if sample_number not in samples:
        raise ValueError("sample_number must be between 1 and 4")

    return pk.DataFrameParser(path_or_buf=samples[sample_number])

# Data Cleansing


## Handling Missing Values


In [15]:
df = init_df_sample()
display(df)
# Fill missing values using the MEDIAN strategy.
# Known issue noted by the author: the MODE strategy does not fill all NaNs.
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MEDIAN)
result = mvp.fit_transform(df)

if result.is_err():
    # The outlier-detection cell needs the filled frame, so stop here and
    # surface the underlying error instead of discarding it.
    raise ValueError(
        "Operation must be done for later operation (Outliers Detection): "
        f"{result.unwrap_err()}"
    )

df = result.unwrap()

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,200.0,400.0
2,Rose,25.0,
3,Linda,50.0,2000000.0
4,Peter,,1000000.0
5,Meg,50.0,


## Handling Outliers


In [16]:
# Show the frame as it enters outlier handling.
display(df)
# Z-score based outlier processor with a threshold of 3; `add_indicator=True`
# corresponds to the extra `Age_isOutlier_Z_SCORE` column in the recorded output.
op = pk.OutliersProcessor(
    pk.OutlierDetectionMethod.Z_SCORE,
    threshold=3,
    verbose=True,
    add_indicator=True,
)
# Only the "Age" column is processed.
# NOTE(review): `df` is rebound to its own transformed version, so re-running
# this cell feeds the already-processed frame back in.
df = op.fit_transform(df, columns=["Age"]).unwrap()

df

Unnamed: 0,Name,Age,Credit
0,John,50.0,1000000.0
1,Andrea,200.0,400.0
2,Rose,25.0,1000000.0
3,Linda,50.0,2000000.0
4,Peter,50.0,1000000.0
5,Meg,50.0,1000000.0


[32;10m2023-12-18 02:50:51,146 (data_cleansing.py:606) - INFO: [0mNumber of outliers detected: 2 in Feature Age
[32;10m2023-12-18 02:50:51,148 (data_cleansing.py:610) - INFO: [0mProportion of outlier detected: 33.3%


Unnamed: 0,Name,Age,Credit,Age_isOutlier_Z_SCORE
0,John,50.0,1000000.0,0
1,Andrea,50.0,400.0,1
2,Rose,50.0,1000000.0,1
3,Linda,50.0,2000000.0,0
4,Peter,50.0,1000000.0,0
5,Meg,50.0,1000000.0,0


# Encoding Features


In [17]:
# Columns to encode.
cols = ["Name"]

# One-hot encoder; `verbose` and `sparse_output` are passed through as given.
enc = pk.EncodingProcessor(
    pk.EncodingStrategies.OneHotEncoder, verbose=True, sparse_output=True
)

# The unwrapped result is the cell's displayed output.
enc.fit_transform(df, columns=cols).unwrap()

Unnamed: 0,Age,Credit,Age_isOutlier_Z_SCORE,Name_Andrea,Name_John,Name_Linda,Name_Meg,Name_Peter,Name_Rose
0,50.0,1000000.0,0,0.0,1.0,0.0,0.0,0.0,0.0
1,50.0,400.0,1,1.0,0.0,0.0,0.0,0.0,0.0
2,50.0,1000000.0,1,0.0,0.0,0.0,0.0,0.0,1.0
3,50.0,2000000.0,0,0.0,0.0,1.0,0.0,0.0,0.0
4,50.0,1000000.0,0,0.0,0.0,0.0,0.0,1.0,0.0
5,50.0,1000000.0,0,0.0,0.0,0.0,1.0,0.0,0.0


# Filtering Data


In [29]:
# Column to filter on, and the airline-bumping sample to filter.
col: str = "year"
df = init_df_sample(2)

# "in" filter with the string value "2016"; the recorded log shows it running
# as `year.str.contains('2016', case=False)`.
bf = pk.BasicFilteringProcessor(
    "in", value="2016", case_sensitive=False, verbose=True
)
res = bf.fit_transform(df, column=col)

# Display the filtered frame on success, or the error value otherwise.
unwrap_value_or_error(res)

# Alternative left by the author for the Name/Age/Credit sample:
# bf.set_params(operator=pk.FilterOperator.EQUAL, value=50)
# display(bf.fit_transform(df, column="Age"))

[32;10m2023-12-18 02:52:05,741 (data_filtering.py:105) - INFO: [0mFiltering data by => [year.str.contains('2016', case=False)]


Unnamed: 0,airline,year,nb_bumped,total_passengers
12,DELTA AIR LINES,2016,912,97237060
13,VIRGIN AMERICA,2016,77,5927938
14,JETBLUE AIRWAYS,2016,2140,25990828
15,UNITED AIRLINES,2016,2874,64438132
16,HAWAIIAN AIRLINES,2016,30,8154838
17,EXPRESSJET AIRLINES,2016,2541,16119866
18,SKYWEST AIRLINES,2016,2177,22575383
19,AMERICAN AIRLINES,2016,6598,99348093
20,ALASKA AIRLINES,2016,734,17725197
21,SOUTHWEST AIRLINES,2016,11907,112153048


# String Modifications


In [19]:
# Sample frame whose "Name" values carry stray whitespace, digits and
# punctuation to exercise the string clean-up options below.
data = {
    "Name": [
        "   John   .!,@  ",
        "#A nd@rea$",
        "Rose",
        "L0ind9a",
        "P!e1t@e_r",
        "M+e - g$",
    ],
    "Age": [np.nan, 200, 25, 50, np.nan, 50],
    # 200_00_00 is the integer 2000000 (underscores are digit separators).
    "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
}
df = pk.DataFrameParser(data)

# "title" operation with trimming and removal of whitespace, numbers and
# punctuation enabled; letters are kept (remove_letters=False).
sop = pk.StringOperationsProcessor(
    "title",
    trim=True,
    remove_letters=False,
    remove_whitespace=True,
    remove_numbers=True,
    remove_punctuation=True,
    verbose=False,
)


# Show the raw frame first, then the processed result (or the error value).
display(df)
result = sop.fit_transform(df, columns=["Name"])
unwrap_value_or_error(result)

Unnamed: 0,Name,Age,Credit
0,"John .!,@",,
1,#A nd@rea$,200.0,400.0
2,Rose,25.0,
3,L0ind9a,50.0,2000000.0
4,P!e1t@e_r,,1000000.0
5,M+e - g$,50.0,


Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,200.0,400.0
2,Rose,25.0,
3,Linda,50.0,2000000.0
4,Peter,,1000000.0
5,Meg,50.0,


# Data Cleansing as a Whole


In [34]:
# NOTE(review): `data` is assigned here but not used in this cell — `df` below
# comes from `init_df_sample(2)` instead. Confirm whether it can be dropped or
# whether the cell was meant to run on this frame.
data = {
    "Name": [
        "   John   .!,@  ",
        "#A nd@rea$",
        "Rose",
        "L0ind9a",
        "P!e1t@e_r",
        "M+e - g$",
    ],
    "Age": [np.nan, 200, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
}

# Sample 2 is the airline-bumping CSV.
df = init_df_sample(2)

# One-shot cleanser combining missing-value handling, outlier handling and
# string clean-up options.
# NOTE(review): the recorded output below still contains INFO log lines and a
# pandas SettingWithCopy warning even though verbose=False is passed here.
dc = pk.DataCleanser(
    missing_clean=True,
    missing_strategy="median",
    missing_indicator=True,
    missing_fill_value="",
    outlier_clean=True,
    outlier_method="z",
    outlier_threshold=1.5,
    outlier_indicator=True,
    str_trim=True,
    str_remove_whitespace=True,
    str_remove_letters=True,
    str_remove_numbers=True,
    str_remove_punctuation=True,
    verbose=False,
)

# Only the "total_passengers" column is passed; display the cleansed frame or
# the error value.
result = dc.fit_transform(df, columns=["total_passengers"])
unwrap_value_or_error(result)

[32;10m2023-12-18 04:25:17,773 (data_cleansing.py:972) - INFO: [0m> Cleansing
[32;10m2023-12-18 04:25:17,775 (data_cleansing.py:175) - INFO: [0mNo missing values in features.
[32;10m2023-12-18 04:25:17,777 (data_cleansing.py:986) - INFO: [0m> Outliers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[outlier_indicator] = 0
  data.loc[outliers_mask, column] = data[column].median()


Unnamed: 0,total_passengers,total_passengers_isOutlier_Z_SCORE
0,20696653.5,1
1,6090029.0,0
2,27255038.0,0
3,20696653.5,1
4,8422734.0,0
5,11738812.0,0
6,24516354.0,0
7,20696653.5,1
8,18817924.0,0
9,20696653.5,1


# Output Dataset


In [21]:
import platform

# Report the host OS this cell is running on.
print(f"Running on {platform.system()}")

# Export the current `df` as CSV. No destination is passed here; the recorded
# run logged the path .../predikit_out/out.csv under the user's home directory
# (presumably a per-platform default — TODO confirm).
dfe = pk.DataFrameExporter(df, extension=pk.FileExtension.CSV)
dfe.export()

[32;10m2023-12-18 02:50:51,279 (output.py:87) - INFO: [0mExporting to /home/ahmed/predikit_out/out.csv ...


Running on Linux


In [22]:
from predikit.preprocessing import FeatureSelection

# Small frame with one string column ("Name") and two integer columns.
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [21, 23, 25, 50, 48, 50],
    "Credit": [74512, 400, 56132, 50, 1_000_000, 45121],
}

df = pk.DataFrameParser(data)
# Selector configured to exclude "object" dtype columns.
fs = FeatureSelection(exclude_dtypes=["object"], verbose=True)
# Alternative invocations kept by the author for manual experimentation:
# new_df = fs.fit_transform(df, columns=["Age"], dtypes=["object"])
# new_df = fs.fit_transform(df)
# new_df = fs.fit_transform(df, dtypes=["object"])
# new_df = fs.fit_transform(df, columns=["Age"])
# new_df = fs.fit_transform(df, columns=["Name"], dtypes=["number"])
# NOTE(review): the recorded output for this call is the message
# 'This results in an empty data frame', i.e. what is displayed is not a
# frame — confirm that this is the intended demonstration.
result = fs.fit_transform(df, columns=["Credit", "Age"])
unwrap_value_or_error(result)

'This results in an empty data frame'