In [None]:
import os
import sys

import numpy as np
import pandas as pd

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

import predikit as pk

# Test Load DataFrame


In [None]:
from io import BytesIO
from pathlib import Path


def load_show(path_or_buf, extension=None, n=3, label=None, **kwargs):
    """Parse ``path_or_buf`` with ``pk.DataFrameParser`` and display a summary.

    Parameters
    ----------
    path_or_buf
        Source handed to the parser unchanged.
    extension
        Forwarded to the parser's ``extension`` argument.
    n
        Row count passed to ``head`` for the final preview.
    label
        Optional tag; when truthy, a ``"From <label>"`` line is displayed
        before the summary.
    **kwargs
        Extra keyword arguments forwarded to ``pk.DataFrameParser``.
    """
    parsed = pk.DataFrameParser(
        path_or_buf=path_or_buf, extension=extension, verbose=True, **kwargs
    )

    if label:
        display(f"From {label}")

    # Each getter is called lazily, right before its line is displayed, so the
    # sequence of parser calls and display calls stays strictly interleaved.
    summary = (
        ("Columns", parsed.get_column_names),
        ("Columns type", parsed.get_column_types),
        ("Parsed column types", lambda: parsed.get_column_types(parsed=True)),
        ("Numeric columns", parsed.get_numeric_columns),
        ("Non numeric columns", parsed.get_non_numeric_columns),
    )
    for title, getter in summary:
        display(f"{title}: {getter()}")

    display(parsed.head(n))


# from a Buffered Input Stream
# An in-memory buffer carries no file name, so the format is given explicitly
# through `extension`.
f = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(f, extension="csv", label="BytesIO")

# from a csv file
# No `extension` is passed for the Path inputs below; presumably the parser
# infers the format from the file suffix (TODO confirm).
f = Path("./sample_data/airline_bumping.csv")
print(f)
load_show(f, n=4, label="csv")

# from a pickle file
f = Path("./sample_data/stations.pickle")
load_show(f, n=5, label="pickle")

# from a parquet file
# Note that the file name contains a space.
f = Path("./sample_data/Flights 1m.parquet")
load_show(f, n=3, label="parquet")

# from a dictionary
# Column-oriented dict: one list per column, with NaN marking missing values.
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}
load_show(data, n=3, label="dict")

# from a list of dictionaries
# Row-oriented records: one dict per row.
data = [
    {"Name": "John", "Age": np.nan, "Credit": np.nan},
    {"Name": "Andrea", "Age": 23, "Credit": 400},
    {"Name": "Rose", "Age": 25, "Credit": np.nan},
    {"Name": "Linda", "Age": 50, "Credit": 50},
    {"Name": "Peter", "Age": np.nan, "Credit": 200},
    {"Name": "Meg", "Age": 50, "Credit": np.nan},
]
load_show(data, label="list of dicts")

# from a dictionary of Series
# The two Series do not share the same index: "two" has an extra label "d".
data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
load_show(data, label="dict of Series")

# from a 2d array
# NOTE(review): NumPy promotes a nested list that mixes str and float to a
# string dtype, so every cell here (including np.nan) is stored as text,
# e.g. "nan" and "23". Confirm this is the intended input for the parser.
data = np.array(
    [
        ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        [np.nan, 23, 35, 50, np.nan, 50],
        [np.nan, 400, np.nan, 50, 200, np.nan],
    ]
)

# The array is built one row per field; transpose so each field is a column.
data = data.T
load_show(data, label="2d array", columns=["Name", "Age", "Credit"])


# TODO: add tests for JSON and Excel inputs

# Helper Functions


In [None]:
from result import Result


def unwrap_value_or_error(result: Result):
    """Return the payload of ``result``: its value when ok, otherwise its error."""
    return result.unwrap() if result.is_ok() else result.unwrap_err()


def init_df_sample(sample_number: int = 1):
    """Build a ``pk.DataFrameParser`` for one of the bundled samples.

    Parameters
    ----------
    sample_number
        Key of the sample to load:
        1 = small in-memory dict, 2 = ``airline_bumping.csv``,
        3 = ``stations.pickle``, 4 = ``Flights 1m.csv``,
        5 = ``Flights 1m.parquet``.

    Returns
    -------
    The object returned by ``pk.DataFrameParser(path_or_buf=...)``.

    Raises
    ------
    ValueError
        If ``sample_number`` is not one of the available sample keys.
    """
    data = {
        "Name": ["John", "Meg", "Rose", np.nan, "Peter", "Meg"],
        "Age": [np.nan, 10, 25, 50, 20, 50],
        "Credit": [np.nan, 400, np.nan, 2_000_000, 1_000_000, np.nan],
    }
    samples = {
        1: data,
        2: "./sample_data/airline_bumping.csv",
        3: "./sample_data/stations.pickle",
        4: "./sample_data/Flights 1m.csv",
        5: "./sample_data/Flights 1m.parquet",
    }

    # Validate against the real keys so the check and the message cannot drift
    # apart from the table above (0 used to slip through and hit a KeyError).
    if sample_number not in samples:
        valid = sorted(samples)
        raise ValueError(
            f"sample_number must be between {valid[0]} and {valid[-1]}"
        )

    return pk.DataFrameParser(path_or_buf=samples[sample_number])

# Data Cleansing


## Handling Missing Values


In [None]:
# Fill the missing values of the in-memory sample (sample 1).
df = init_df_sample(1)
display(df)

# Known issue: the MODE strategy does not fill every NaN.
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MODE)
result = mvp.fit_transform(df)

if result.is_err():
    # Include the underlying error so a failure here can be diagnosed instead
    # of only reporting that the step is required.
    raise ValueError(
        "Operation must be done for later operation (Outliers Detection): "
        f"{result.unwrap_err()!r}"
    )

df = result.unwrap()
df

## Handling Outliers


In [None]:
# Run the "z_score" outlier processor on the `total_passengers` column of
# the airline sample (sample 2).
df = init_df_sample(2)
display(df)
# The explicit threshold is left commented out, so whatever default
# OutliersProcessor defines is used.
op = pk.OutliersProcessor(
    "z_score",
    # threshold=3,
    verbose=True,
    add_indicator=True,
)
result = op.fit_transform(df, columns=["total_passengers"])

# Show the transformed value on success, or the error object on failure.
unwrap_value_or_error(result)

# Filtering Data


In [None]:
# Apply a "!=" filter with value "2015" to the `year` column of sample 2.
df = init_df_sample(2)
col: str = "year"
# display(df)
# NOTE(review): `value` is the string "2015"; confirm the processor compares
# it correctly against the dtype of the `year` column.
bf = pk.BasicFilteringProcessor(
    "!=",
    value="2015",
    case_sensitive=False,
    verbose=True,
)

# bf
display(df)
result = bf.fit_transform(df, column=col)

# Show the filtered value on success, or the error object on failure.
unwrap_value_or_error(result)
# result_unpacked = result.unwrap()
# print(result_unpacked)
# bf
# display(df)


# Alternative experiment, kept for reference: equality filter on "Age".
# bf.set_params(operator=pk.FilterOperator.EQUAL, value=50)
# display(bf.fit_transform(df, column="Age"))

# String Modifications


In [None]:
# Alternative in-memory sample with noisy strings, kept for reference:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }
# df = pk.DataFrameParser(data)


df = init_df_sample(2)

sop = pk.StringOperationsProcessor(
    "title",
    trim=True,
    remove_letters=False,
    remove_whitespace=True,
    remove_numbers=True,
    remove_punctuation=True,
    verbose=False,
)


display(df.head(3))
# Run the processor in this cell so the value shown below belongs to it.
# With the call disabled, `result` was a leftover from whichever cell ran
# last (or undefined on a fresh kernel).
result = sop.fit_transform(df, columns=["year"])
unwrap_value_or_error(result)

# Data Cleanse as a Whole


In [None]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }

# Run the combined DataCleanser on the airline sample (sample 2), configured
# from a camelCase `props` dict.
df = init_df_sample(2)

# Earlier direct-keyword configuration, kept for reference:
# dc = pk.DataCleanser(
#     missing_clean=True,
#     missing_strategy="median",
#     missing_indicator=True,
#     missing_fill_value=0,
#     outlier_clean=True,
#     outlier_method="z",
#     outlier_threshold=1.5,
#     outlier_indicator=True,
#     str_trim=True,
#     str_remove_whitespace=True,
#     str_remove_letters=True,
#     str_remove_numbers=True,
#     str_remove_punctuation=True,
#     verbose=False,
# )
# Earlier shape of the props payload, kept for reference:
# props = {
#     "removeOutliers": True,
#     "outlierMethod": "z_score",
#     "replaceNulls": False,
#     "replaceNullWith": "mean",
#     "fillValue": "",
#     "modifyCase": "lower",
#     "removeWhitespace": True,
#     "removePunctuation": False,
#     "removeNumbers": True,
#     "removeLetters": False,
#     "trim": False,
#     "selectedColumns": [],
# }


# Current props payload. Only the string operations are switched on; the
# missing-value and outlier steps are disabled.
# NOTE(review): "outlierThreshold", "strTrim" and "selectedColumns" are set
# here but never forwarded to DataCleanser below — confirm whether the
# constructor accepts matching arguments and wire them in if so.
props = {
    "missingClean": False,
    "missingStrategy": "median",
    "missingFillValue": "",
    "missingIndicator": False,
    "outlierClean": False,
    "outlierMethod": "z_score",
    "outlierThreshold": 3,
    "outlierIndicator": False,
    "strOperations": True,
    "strCaseModifierMethod": "lower",
    "strTrim": False,
    "strRemoveWhitespace": False,
    "strRemoveNumbers": False,
    "strRemoveLetters": False,
    "strRemovePunctuation": False,
    "selectedColumns": [],
}

# Map each camelCase prop onto the corresponding snake_case keyword.
dc = pk.DataCleanser(  # TODO: add more options
    missing_clean=props["missingClean"],
    missing_strategy=props["missingStrategy"],
    missing_fill_value=props["missingFillValue"],
    missing_indicator=props["missingIndicator"],
    outlier_clean=props["outlierClean"],
    outlier_method=props["outlierMethod"],
    outlier_indicator=props["outlierIndicator"],
    str_operations=props["strOperations"],
    str_remove_letters=props["strRemoveLetters"],
    str_remove_numbers=props["strRemoveNumbers"],
    str_remove_punctuation=props["strRemovePunctuation"],
    str_remove_whitespace=props["strRemoveWhitespace"],
    str_case_modifier_method=props["strCaseModifierMethod"],
    verbose=True,
)

result = dc.fit_transform(df)
# result
# Show the cleansed value on success, or the error object on failure.
unwrap_value_or_error(result)

# Encoding Features


In [None]:
# Encode the "Name" column of the in-memory sample (sample 1) with the
# OneHotEncoder strategy.
df = init_df_sample(1)
enc = pk.EncodingProcessor(
    pk.EncodingStrategies.OneHotEncoder,
    verbose=True,
    sparse_output=True,
)
cols = ["Name"]
result = enc.fit_transform(df, columns=cols)
# Show the encoded value on success, or the error object on failure.
# `df` itself is not reassigned, so it still holds the un-encoded sample.
unwrap_value_or_error(result)

# Output Dataset


In [None]:
import platform

# Report the host OS reported by the standard library.
print(f"Running on {platform.system()}")

# cross platform
# NOTE(review): `df` is whatever the last executed cell left behind. Running
# top to bottom, that is the un-encoded sample from the encoding cell, not a
# transformed result. No destination is passed to export(); confirm where the
# exporter writes by default.
dfe = pk.DataFrameExporter(df, extension=pk.FileExtension.CSV)
dfe.export()

In [None]:
from predikit.preprocessing import FeatureSelection

# Small all-present sample: one text column and two integer columns.
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [21, 23, 25, 50, 48, 50],
    "Credit": [74512, 400, 56132, 50, 1_000_000, 45121],
}

df = pk.DataFrameParser(data)
# The selector is configured to exclude the "object" dtype.
fs = FeatureSelection(exclude_dtypes=["object"], verbose=True)
# Other call shapes tried during development, kept for reference:
# new_df = fs.fit_transform(df, columns=["Age"], dtypes=["object"])
# new_df = fs.fit_transform(df)
# new_df = fs.fit_transform(df, dtypes=["object"])
# new_df = fs.fit_transform(df, columns=["Age"])
# new_df = fs.fit_transform(df, columns=["Name"], dtypes=["number"])
result = fs.fit_transform(df, columns=["Credit", "Age"])
# Show the selected value on success, or the error object on failure.
unwrap_value_or_error(result)

In [None]:
# Load only the first ten rows of the merged quotation workbook.
df = pd.read_excel(
    "./sample_data/merged_quotation_data.xlsx", header=0, index_col=None, nrows=10
)

# df.query("`priority level`.str.contains('high', case=False)")

# Split the preview by company name.
company = df["Company"]
cmp_ziegler = df.loc[company == "ZIEGLER FRANCE SA"]
cmp_bollore = df.loc[company == "BOLLORE LOGISTICS"]

# Row labels of each subset first, then the size of each subset.
for subset in (cmp_ziegler, cmp_bollore):
    print(subset.index)

for subset in (cmp_ziegler, cmp_bollore):
    print(len(subset.index))

df
# target = "DN NUMBER"
# res = df[cmp_ziegler[target] == cmp_bollore[target]]
# res