In [61]:
import os
import sys

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

import numpy as np
import pandas as pd

import predikit as pk

# Test Load DataFrame


In [62]:
from io import BytesIO
from pathlib import Path


def load_show(path_or_buf, extension=None, n=3, label=None, **kwargs):
    df = pk.DataFrameParser(
        path_or_buf=path_or_buf,
        extension=extension,
        **kwargs,
    )
    if label:
        display(f"From {label}")

    display(df.get_column_names())
    display(df.get_column_types())
    display(df.get_column_types(parsed=True))
    display(df.get_numeric_columns())
    display(df.get_non_numeric_columns())
    display(df.head(n))


# from a Buffered Input Stream
f = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(f, extension="csv", label="BytesIO")

# from a csv file
f = Path("./sample_data/airline_bumping.csv")
print(f)
load_show(f, n=4, label="csv")

# from a pickle file
f = Path("./sample_data/stations.pickle")
load_show(f, n=5, label="pickle")

# from a parquet file
f = Path("./sample_data/Flights 1m.parquet")
load_show(f, n=3, label="parquet")

# from a dictionary
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}
load_show(data, n=3, label="dict")

# from a list of dictionaries
data = [
    {"Name": "John", "Age": np.nan, "Credit": np.nan},
    {"Name": "Andrea", "Age": 23, "Credit": 400},
    {"Name": "Rose", "Age": 25, "Credit": np.nan},
    {"Name": "Linda", "Age": 50, "Credit": 50},
    {"Name": "Peter", "Age": np.nan, "Credit": 200},
    {"Name": "Meg", "Age": 50, "Credit": np.nan},
]
load_show(data, label="list of dicts")

# from a dictionary of Series
data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
load_show(data, label="dict of Series")

# from a 2d array
data = np.array(
    [
        ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        [np.nan, 23, 25, 50, np.nan, 50],
        [np.nan, 400, np.nan, 50, 200, np.nan],
    ]
)

data = data.T
load_show(data, label="2d array", columns=["Name", "Age", "Credit"])


# ToDo => add tests for JSON and EXCEL

'From BytesIO'

['a', 'b', 'c']

{'a': dtype('int64'), 'b': dtype('int64'), 'c': dtype('int64')}

{'a': 'i', 'b': 'i', 'c': 'i'}

['a', 'b', 'c']

None

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


sample_data/airline_bumping.csv


'From csv'

['airline', 'year', 'nb_bumped', 'total_passengers']

{'airline': dtype('O'),
 'year': dtype('int64'),
 'nb_bumped': dtype('int64'),
 'total_passengers': dtype('int64')}

{'airline': 'O', 'year': 'i', 'nb_bumped': 'i', 'total_passengers': 'i'}

['year', 'nb_bumped', 'total_passengers']

['airline']

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765


'From pickle'

['station_id', 'station_name', 'location']

{'station_id': dtype('O'), 'station_name': dtype('O'), 'location': dtype('O')}

{'station_id': 'O', 'station_name': 'O', 'location': 'O'}

None

['station_id', 'station_name', 'location']

Unnamed: 0,station_id,station_name,location
0,40010,Austin-Forest Park,"(41.870851, -87.776812)"
1,40020,Harlem-Lake,"(41.886848, -87.803176)"
2,40030,Pulaski-Lake,"(41.885412, -87.725404)"
3,40040,Quincy/Wells,"(41.878723, -87.63374)"
4,40050,Davis,"(42.04771, -87.683543)"


'From parquet'

['FL_DATE',
 'DEP_DELAY',
 'ARR_DELAY',
 'AIR_TIME',
 'DISTANCE',
 'DEP_TIME',
 'ARR_TIME']

{'FL_DATE': dtype('O'),
 'DEP_DELAY': dtype('int16'),
 'ARR_DELAY': dtype('int16'),
 'AIR_TIME': dtype('int16'),
 'DISTANCE': dtype('int16'),
 'DEP_TIME': dtype('float32'),
 'ARR_TIME': dtype('float32')}

{'FL_DATE': 'O',
 'DEP_DELAY': 'i',
 'ARR_DELAY': 'i',
 'AIR_TIME': 'i',
 'DISTANCE': 'i',
 'DEP_TIME': 'f',
 'ARR_TIME': 'f'}

['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']

['FL_DATE']

Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5,19,350,2475,9.083333,12.483334
1,2006-01-02,167,216,343,2475,11.783334,15.766666
2,2006-01-03,-7,-2,344,2475,8.883333,12.133333


'From dict'

['Name', 'Age', 'Credit']

{'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}

{'Name': 'O', 'Age': 'f', 'Credit': 'f'}

['Age', 'Credit']

['Name']

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


'From list of dicts'

['Name', 'Age', 'Credit']

{'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}

{'Name': 'O', 'Age': 'f', 'Credit': 'f'}

['Age', 'Credit']

['Name']

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


'From dict of Series'

['one', 'two']

{'one': dtype('float64'), 'two': dtype('float64')}

{'one': 'f', 'two': 'f'}

['one', 'two']

None

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


'From 2d array'

['Name', 'Age', 'Credit']

{'Name': dtype('O'), 'Age': dtype('O'), 'Credit': dtype('O')}

{'Name': 'O', 'Age': 'O', 'Credit': 'O'}

None

['Name', 'Age', 'Credit']

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


# Helper Functions


In [63]:
from result import Result


def unwrap_value_or_error(result: Result):
    if result.is_ok():
        return result.unwrap()
    return result.unwrap_err()


def init_df_sample(sample_number: int = 1):
    if sample_number < 0 or sample_number > 5:
        raise ValueError("sample_number must be between 0 and 4")

    data = {
        "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        "Age": [np.nan, 200, 25, 50, np.nan, 50],
        "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
    }
    samples = {
        1: data,
        2: "./sample_data/airline_bumping.csv",
        3: "./sample_data/stations.pickle",
        4: "./sample_data/Flights 1m.csv",
        5: "./sample_data/Flights 1m.parquet",
    }

    return pk.DataFrameParser(path_or_buf=samples[sample_number])

# Data Cleansing


## Handling Missing Values


In [64]:
df = init_df_sample()
display(df)
# bug in MODE doesn't fill all NaNs
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MEDIAN)
result = mvp.fit_transform(df)

if result.is_err():
    raise ValueError(
        "Operation must be done for later operation (Outliers Detection)"
    )

df = result.unwrap()

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,200.0,400.0
2,Rose,25.0,
3,Linda,50.0,2000000.0
4,Peter,,1000000.0
5,Meg,50.0,


## Handling Outliers


In [65]:
df = init_df_sample(2)
display(df)
op = pk.OutliersProcessor(
    "z_score",
    # threshold=3,
    verbose=True,
    add_indicator=True,
)
result = op.fit_transform(df, columns=["total_passengers"])

unwrap_value_or_error(result)

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765
4,HAWAIIAN AIRLINES,2017,92,8422734
5,EXPRESSJET AIRLINES,2017,785,11738812
6,SKYWEST AIRLINES,2017,917,24516354
7,AMERICAN AIRLINES,2017,4517,98017132
8,ALASKA AIRLINES,2017,658,18817924
9,SOUTHWEST AIRLINES,2017,6678,115988988


[32;10m2023-12-18 05:45:43,444 (data_cleansing.py:606) - INFO: [0mNumber of outliers detected: 8 in Feature total_passengers
[32;10m2023-12-18 05:45:43,445 (data_cleansing.py:610) - INFO: [0mProportion of outlier detected: 33.3%
  data.loc[outliers_mask, column] = data[column].median()


Unnamed: 0,airline,year,nb_bumped,total_passengers,total_passengers_isOutlier_Z_SCORE
0,DELTA AIR LINES,2017,679,20696653.5,1
1,VIRGIN AMERICA,2017,165,6090029.0,0
2,JETBLUE AIRWAYS,2017,1475,27255038.0,0
3,UNITED AIRLINES,2017,2067,20696653.5,1
4,HAWAIIAN AIRLINES,2017,92,8422734.0,0
5,EXPRESSJET AIRLINES,2017,785,11738812.0,0
6,SKYWEST AIRLINES,2017,917,24516354.0,0
7,AMERICAN AIRLINES,2017,4517,20696653.5,1
8,ALASKA AIRLINES,2017,658,18817924.0,0
9,SOUTHWEST AIRLINES,2017,6678,20696653.5,1


# Filtering Data


In [66]:
df = init_df_sample(2)
col: str = "year"
# display(df)
bf = pk.BasicFilteringProcessor(
    "in",
    case_sensitive=False,
    verbose=True,
)

# bf
res = bf.fit_transform(df, column=None)

unwrap_value_or_error(res)
# result_unpacked = result.unwrap()
# print(result_unpacked)
# bf
# display(df)


# bf.set_params(operator=pk.FilterOperator.EQUAL, value=50)
# display(bf.fit_transform(df, column="Age"))

'Column name must be provided'

# String Modifications


In [67]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }
# df = pk.DataFrameParser(data)


df = init_df_sample(2)

sop = pk.StringOperationsProcessor(
    "title",
    trim=True,
    remove_letters=False,
    remove_whitespace=True,
    remove_numbers=True,
    remove_punctuation=True,
    verbose=False,
)


display(df.head(3))
# result = sop.fit_transform(df, columns=["year"])
unwrap_value_or_error(result)

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038


Unnamed: 0,airline,year,nb_bumped,total_passengers,total_passengers_isOutlier_Z_SCORE
0,DELTA AIR LINES,2017,679,20696653.5,1
1,VIRGIN AMERICA,2017,165,6090029.0,0
2,JETBLUE AIRWAYS,2017,1475,27255038.0,0
3,UNITED AIRLINES,2017,2067,20696653.5,1
4,HAWAIIAN AIRLINES,2017,92,8422734.0,0
5,EXPRESSJET AIRLINES,2017,785,11738812.0,0
6,SKYWEST AIRLINES,2017,917,24516354.0,0
7,AMERICAN AIRLINES,2017,4517,20696653.5,1
8,ALASKA AIRLINES,2017,658,18817924.0,0
9,SOUTHWEST AIRLINES,2017,6678,20696653.5,1


# Data Cleanse as a Whole


In [68]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }

df = init_df_sample(2)

# dc = pk.DataCleanser(
#     missing_clean=True,
#     missing_strategy="median",
#     missing_indicator=True,
#     missing_fill_value=0,
#     outlier_clean=True,
#     outlier_method="z",
#     outlier_threshold=1.5,
#     outlier_indicator=True,
#     str_trim=True,
#     str_remove_whitespace=True,
#     str_remove_letters=True,
#     str_remove_numbers=True,
#     str_remove_punctuation=True,
#     verbose=False,
# )
props = {
    "removeOutliers": True,
    "outlierMethod": "z_score",
    "replaceNulls": False,
    "replaceNullWith": "mean",
    "fillValue": "",
    "modifyCase": "lower",
    "removeWhitespace": True,
    "removePunctuation": False,
    "removeNumbers": True,
    "removeLetters": False,
    "trim": False,
    "selectedColumns": [],
}

dc = pk.DataCleanser(  # TODO: add more options
    missing_clean=False,
    # missing_strategy=pk.MissingValueStrategy(props["replaceNullWith"]),
    # missing_fill_value=props["fillValue"],
    outlier_clean=props["removeOutliers"],
    outlier_method=pk.OutlierDetectionMethod(props["outlierMethod"]),
    outlier_indicator=True,
    str_operations=False,
    # str_remove_letters=props["removeLetters"],
    # str_remove_numbers=props["removeNumbers"],
    # str_remove_punctuation=props["removePunctuation"],
    # str_remove_whitespace=props["removeWhitespace"],
    # str_case_modifier_method=pk.CaseModifyingMethod.from_str(
    #     props["modifyCase"]
    verbose=True,
)

result = dc.fit_transform(df)
# result
unwrap_value_or_error(result)

[32;10m2023-12-18 05:45:43,538 (data_cleansing.py:986) - INFO: [0m> Outliers
[32;10m2023-12-18 05:45:43,541 (data_cleansing.py:1001) - INFO: [0m['airline', 'year', 'nb_bumped', 'total_passengers']
[32;10m2023-12-18 05:45:43,551 (data_cleansing.py:606) - INFO: [0mNumber of outliers detected: 4 in Feature nb_bumped
[32;10m2023-12-18 05:45:43,553 (data_cleansing.py:610) - INFO: [0mProportion of outlier detected: 16.7%
[32;10m2023-12-18 05:45:43,559 (data_cleansing.py:606) - INFO: [0mNumber of outliers detected: 8 in Feature total_passengers
[32;10m2023-12-18 05:45:43,562 (data_cleansing.py:610) - INFO: [0mProportion of outlier detected: 33.3%
  data.loc[outliers_mask, column] = data[column].median()
  data.loc[outliers_mask, column] = data[column].median()
  data.loc[outliers_mask, column] = data[column].median()


Unnamed: 0,airline,year,nb_bumped,total_passengers,year_isOutlier_Z_SCORE,nb_bumped_isOutlier_Z_SCORE,total_passengers_isOutlier_Z_SCORE
0,DELTA AIR LINES,2017.0,679.0,20696653.5,0,0,1
1,VIRGIN AMERICA,2017.0,165.0,6090029.0,0,0,0
2,JETBLUE AIRWAYS,2017.0,1475.0,27255038.0,0,0,0
3,UNITED AIRLINES,2017.0,2067.0,20696653.5,0,0,1
4,HAWAIIAN AIRLINES,2017.0,92.0,8422734.0,0,0,0
5,EXPRESSJET AIRLINES,2017.0,785.0,11738812.0,0,0,0
6,SKYWEST AIRLINES,2017.0,917.0,24516354.0,0,0,0
7,AMERICAN AIRLINES,2017.0,1167.5,20696653.5,0,1,1
8,ALASKA AIRLINES,2017.0,658.0,18817924.0,0,0,0
9,SOUTHWEST AIRLINES,2017.0,1167.5,20696653.5,0,1,1


# Encoding Features


In [69]:
df = init_df_sample(1)
enc = pk.EncodingProcessor(
    pk.EncodingStrategies.OneHotEncoder,
    verbose=True,
    sparse_output=True,
)
cols = ["Name"]
result = enc.fit_transform(df, columns=cols)
unwrap_value_or_error(result)

Unnamed: 0,Age,Credit,Name_Andrea,Name_John,Name_Linda,Name_Meg,Name_Peter,Name_Rose
0,,,0.0,1.0,0.0,0.0,0.0,0.0
1,200.0,400.0,1.0,0.0,0.0,0.0,0.0,0.0
2,25.0,,0.0,0.0,0.0,0.0,0.0,1.0
3,50.0,2000000.0,0.0,0.0,1.0,0.0,0.0,0.0
4,,1000000.0,0.0,0.0,0.0,0.0,1.0,0.0
5,50.0,,0.0,0.0,0.0,1.0,0.0,0.0


# Output Dataset


In [70]:
import platform

print(f"Running on {platform.system()}")

# cross platform
dfe = pk.DataFrameExporter(df, extension=pk.FileExtension.CSV)
dfe.export()

[32;10m2023-12-18 05:45:43,642 (output.py:87) - INFO: [0mExporting to /home/mghali/predikit_out/out.csv ...


Running on Linux


In [71]:
from predikit.preprocessing import FeatureSelection

data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [21, 23, 25, 50, 48, 50],
    "Credit": [74512, 400, 56132, 50, 1_000_000, 45121],
}

df = pk.DataFrameParser(data)
fs = FeatureSelection(exclude_dtypes=["object"], verbose=True)
# new_df = fs.fit_transform(df, columns=["Age"], dtypes=["object"])
# new_df = fs.fit_transform(df)
# new_df = fs.fit_transform(df, dtypes=["object"])
# new_df = fs.fit_transform(df, columns=["Age"])
# new_df = fs.fit_transform(df, columns=["Name"], dtypes=["number"])
result = fs.fit_transform(df, columns=["Credit", "Age"])
unwrap_value_or_error(result)

'This results in an empty data frame'

In [72]:
df = init_df_sample(4)
print(df.shape)
df = init_df_sample(5)
print(df.shape)

(1000000, 8)
(1000000, 7)
