In [1]:
import os
import sys

import numpy as np
import pandas as pd

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

import predikit as pk

# Test Load DataFrame


In [2]:
from io import BytesIO
from pathlib import Path


def load_show(path_or_buf, extension=None, n=3, label=None, **kwargs):
    df = pk.DataFrameParser(
        path_or_buf=path_or_buf,
        extension=extension,
        verbose=True,
        **kwargs,
    )
    if label:
        display(f"From {label}")

    display(f"Columns: {df.get_column_names()}")
    display(f"Columns type: {df.get_column_types()}")
    display(f"Parsed column types: {df.get_column_types(parsed=True)}")
    display(f"Numeric columns: {df.get_numeric_columns()}")
    display(f"Non numeric columns: {df.get_non_numeric_columns()}")
    display(df.head(n))


# from a Buffered Input Stream
f = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(f, extension="csv", label="BytesIO")

# from a csv file
f = Path("./sample_data/airline_bumping.csv")
print(f)
load_show(f, n=4, label="csv")

# from a pickle file
f = Path("./sample_data/stations.pickle")
load_show(f, n=5, label="pickle")

# from a parquet file
f = Path("./sample_data/Flights 1m.parquet")
load_show(f, n=3, label="parquet")

# from a dictionary
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}
load_show(data, n=3, label="dict")

# from a list of dictionaries
data = [
    {"Name": "John", "Age": np.nan, "Credit": np.nan},
    {"Name": "Andrea", "Age": 23, "Credit": 400},
    {"Name": "Rose", "Age": 25, "Credit": np.nan},
    {"Name": "Linda", "Age": 50, "Credit": 50},
    {"Name": "Peter", "Age": np.nan, "Credit": 200},
    {"Name": "Meg", "Age": 50, "Credit": np.nan},
]
load_show(data, label="list of dicts")

# from a dictionary of Series
data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
load_show(data, label="dict of Series")

# from a 2d array
data = np.array(
    [
        ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        [np.nan, 23, 35, 50, np.nan, 50],
        [np.nan, 400, np.nan, 50, 200, np.nan],
    ]
)

data = data.T
load_show(data, label="2d array", columns=["Name", "Age", "Credit"])


# ToDo => add tests for JSON and EXCEL

[35;10m2024-02-22 20:51:46,233 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:46,235 (input.py:267) - DEBUG: [0mLoading DataFrame from <_io.BytesIO object at 0x000001E90A314AE0> ...
[35;10m2024-02-22 20:51:46,249 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\traitlets\config\application.py", line 1077, in launch_instance
    app.start()
  File "d:\Proje

DataFrame head:    a  b  c
0  1  2  3
1  4  5  6
2  7  8  9 


'From BytesIO'

"Columns: ['a', 'b', 'c']"

"Columns type: {'a': dtype('int64'), 'b': dtype('int64'), 'c': dtype('int64')}"

"Parsed column types: {'a': 'i', 'b': 'i', 'c': 'i'}"

"Numeric columns: ['a', 'b', 'c']"

'Non numeric columns: None'

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


[35;10m2024-02-22 20:51:46,551 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:46,553 (input.py:267) - DEBUG: [0mLoading DataFrame from sample_data\airline_bumping.csv ...
[35;10m2024-02-22 20:51:46,568 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen 

sample_data\airline_bumping.csv
DataFrame head:            airline  year  nb_bumped  total_passengers
0  DELTA AIR LINES  2017        679          99796155
1   VIRGIN AMERICA  2017        165           6090029
2  JETBLUE AIRWAYS  2017       1475          27255038 


'From csv'

"Columns: ['airline', 'year', 'nb_bumped', 'total_passengers']"

"Columns type: {'airline': dtype('O'), 'year': dtype('int64'), 'nb_bumped': dtype('int64'), 'total_passengers': dtype('int64')}"

"Parsed column types: {'airline': 'O', 'year': 'i', 'nb_bumped': 'i', 'total_passengers': 'i'}"

"Numeric columns: ['year', 'nb_bumped', 'total_passengers']"

"Non numeric columns: ['airline']"

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765


[35;10m2024-02-22 20:51:46,608 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:46,610 (input.py:267) - DEBUG: [0mLoading DataFrame from sample_data\stations.pickle ...
[35;10m2024-02-22 20:51:46,625 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runp

DataFrame head:   station_id        station_name                 location
0      40010  Austin-Forest Park  (41.870851, -87.776812)
1      40020         Harlem-Lake  (41.886848, -87.803176)
2      40030        Pulaski-Lake  (41.885412, -87.725404) 


'From pickle'

"Columns: ['station_id', 'station_name', 'location']"

"Columns type: {'station_id': dtype('O'), 'station_name': dtype('O'), 'location': dtype('O')}"

"Parsed column types: {'station_id': 'O', 'station_name': 'O', 'location': 'O'}"

'Numeric columns: None'

"Non numeric columns: ['station_id', 'station_name', 'location']"

Unnamed: 0,station_id,station_name,location
0,40010,Austin-Forest Park,"(41.870851, -87.776812)"
1,40020,Harlem-Lake,"(41.886848, -87.803176)"
2,40030,Pulaski-Lake,"(41.885412, -87.725404)"
3,40040,Quincy/Wells,"(41.878723, -87.63374)"
4,40050,Davis,"(42.04771, -87.683543)"


[35;10m2024-02-22 20:51:46,664 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:46,665 (input.py:267) - DEBUG: [0mLoading DataFrame from sample_data\Flights 1m.parquet ...
[35;10m2024-02-22 20:51:47,629 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen r

DataFrame head:       FL_DATE  DEP_DELAY  ARR_DELAY  AIR_TIME  DISTANCE   DEP_TIME   ARR_TIME
0  2006-01-01          5         19       350      2475   9.083333  12.483334
1  2006-01-02        167        216       343      2475  11.783334  15.766666
2  2006-01-03         -7         -2       344      2475   8.883333  12.133333 


'From parquet'

"Columns: ['FL_DATE', 'DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']"

"Columns type: {'FL_DATE': dtype('O'), 'DEP_DELAY': dtype('int16'), 'ARR_DELAY': dtype('int16'), 'AIR_TIME': dtype('int16'), 'DISTANCE': dtype('int16'), 'DEP_TIME': dtype('float32'), 'ARR_TIME': dtype('float32')}"

"Parsed column types: {'FL_DATE': 'O', 'DEP_DELAY': 'i', 'ARR_DELAY': 'i', 'AIR_TIME': 'i', 'DISTANCE': 'i', 'DEP_TIME': 'f', 'ARR_TIME': 'f'}"

"Numeric columns: ['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']"

"Non numeric columns: ['FL_DATE']"

Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5,19,350,2475,9.083333,12.483334
1,2006-01-02,167,216,343,2475,11.783334,15.766666
2,2006-01-03,-7,-2,344,2475,8.883333,12.133333


[35;10m2024-02-22 20:51:47,991 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:47,993 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\ipykernel_launch

DataFrame head:      Name   Age  Credit
0    John   NaN     NaN
1  Andrea  23.0   400.0
2    Rose  25.0     NaN 


'From dict'

"Columns: ['Name', 'Age', 'Credit']"

"Columns type: {'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}"

"Parsed column types: {'Name': 'O', 'Age': 'f', 'Credit': 'f'}"

"Numeric columns: ['Age', 'Credit']"

"Non numeric columns: ['Name']"

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


[35;10m2024-02-22 20:51:48,029 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:48,031 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\ipykernel_launch

DataFrame head:      Name   Age  Credit
0    John   NaN     NaN
1  Andrea  23.0   400.0
2    Rose  25.0     NaN 


'From list of dicts'

"Columns: ['Name', 'Age', 'Credit']"

"Columns type: {'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}"

"Parsed column types: {'Name': 'O', 'Age': 'f', 'Credit': 'f'}"

"Numeric columns: ['Age', 'Credit']"

"Non numeric columns: ['Name']"

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


[35;10m2024-02-22 20:51:48,071 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:48,077 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\ipykernel_launch

DataFrame head:    one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0 


'From dict of Series'

"Columns: ['one', 'two']"

"Columns type: {'one': dtype('float64'), 'two': dtype('float64')}"

"Parsed column types: {'one': 'f', 'two': 'f'}"

"Numeric columns: ['one', 'two']"

'Non numeric columns: None'

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


[35;10m2024-02-22 20:51:48,114 (input.py:164) - DEBUG: [0mStarting data ingestion process...
[35;10m2024-02-22 20:51:48,116 (input.py:184) - DEBUG: [0mâœ… Done! Data ingestion process completed. DataFrame is ready for use.
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2705' in position 48: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\ipykernel_launch

DataFrame head:      Name  Age Credit
0    John  nan    nan
1  Andrea   23    400
2    Rose   35    nan 


'From 2d array'

"Columns: ['Name', 'Age', 'Credit']"

"Columns type: {'Name': dtype('O'), 'Age': dtype('O'), 'Credit': dtype('O')}"

"Parsed column types: {'Name': 'O', 'Age': 'O', 'Credit': 'O'}"

'Numeric columns: None'

"Non numeric columns: ['Name', 'Age', 'Credit']"

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,35.0,


# Helper Functions


In [3]:
from result import Result


def unwrap_value_or_error(result: Result):
    if result.is_ok():
        return result.unwrap()
    return result.unwrap_err()


def init_df_sample(sample_number: int = 1):
    if sample_number < 0 or sample_number > 5:
        raise ValueError("sample_number must be between 0 and 4")

    data = {
        "Name": ["John", "Meg", "Rose", np.nan, "Peter", "Meg"],
        "Age": [np.nan, 10, 25, 50, 20, 50],
        "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
    }
    samples = {
        1: data,
        2: "./sample_data/airline_bumping.csv",
        3: "./sample_data/stations.pickle",
        4: "./sample_data/Flights 1m.csv",
        5: "./sample_data/Flights 1m.parquet",
    }

    return pk.DataFrameParser(path_or_buf=samples[sample_number])

# Data Cleansing


## Handling Missing Values


In [4]:
df = init_df_sample(1)
display(df)

# bug in MODE doesn't fill all NaNs
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MODE)
result = mvp.fit_transform(df)

if result.is_err():
    raise ValueError(
        "Operation must be done for later operation (Outliers Detection)"
    )

df = result.unwrap()
df

Unnamed: 0,Name,Age,Credit
0,John,,
1,Meg,10.0,400.0
2,Rose,25.0,
3,,50.0,2000000.0
4,Peter,20.0,1000000.0
5,Meg,50.0,


Unnamed: 0,Name,Age,Credit
0,John,50.0,400.0
1,Meg,10.0,400.0
2,Rose,25.0,400.0
3,Meg,50.0,2000000.0
4,Peter,20.0,1000000.0
5,Meg,50.0,400.0


## Handling Outliers


In [5]:
df = init_df_sample(2)
display(df)
op = pk.OutliersProcessor(
    "z_score",
    # threshold=3,
    verbose=True,
    add_indicator=True,
)
result = op.fit_transform(df, columns=["total_passengers"])

unwrap_value_or_error(result)

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765
4,HAWAIIAN AIRLINES,2017,92,8422734
5,EXPRESSJET AIRLINES,2017,785,11738812
6,SKYWEST AIRLINES,2017,917,24516354
7,AMERICAN AIRLINES,2017,4517,98017132
8,ALASKA AIRLINES,2017,658,18817924
9,SOUTHWEST AIRLINES,2017,6678,115988988


[35;10m2024-02-22 20:51:48,267 (data_cleansing.py:611) - DEBUG: [0mNumber of outliers detected: 8 in Feature total_passengers
[35;10m2024-02-22 20:51:48,269 (data_cleansing.py:615) - DEBUG: [0mProportion of outlier detected: 33.3%
  data.loc[outliers_mask, column] = data[column].median()


Unnamed: 0,airline,year,nb_bumped,total_passengers,total_passengers_isOutlier_Z_SCORE
0,DELTA AIR LINES,2017,679,20696653.5,1
1,VIRGIN AMERICA,2017,165,6090029.0,0
2,JETBLUE AIRWAYS,2017,1475,27255038.0,0
3,UNITED AIRLINES,2017,2067,20696653.5,1
4,HAWAIIAN AIRLINES,2017,92,8422734.0,0
5,EXPRESSJET AIRLINES,2017,785,11738812.0,0
6,SKYWEST AIRLINES,2017,917,24516354.0,0
7,AMERICAN AIRLINES,2017,4517,20696653.5,1
8,ALASKA AIRLINES,2017,658,18817924.0,0
9,SOUTHWEST AIRLINES,2017,6678,20696653.5,1


# Filtering Data


In [6]:
df = init_df_sample(2)
col: str = "year"
# display(df)
bf = pk.BasicFilteringProcessor(
    "!=",
    value="2016",
    case_sensitive=False,
    verbose=True,
)

# bf
display(df)
res = bf.fit_transform(df, column=col)

unwrap_value_or_error(res)
# result_unpacked = result.unwrap()
# print(result_unpacked)
# bf
# display(df)


# bf.set_params(operator=pk.FilterOperator.EQUAL, value=50)
# display(bf.fit_transform(df, column="Age"))

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765
4,HAWAIIAN AIRLINES,2017,92,8422734
5,EXPRESSJET AIRLINES,2017,785,11738812
6,SKYWEST AIRLINES,2017,917,24516354
7,AMERICAN AIRLINES,2017,4517,98017132
8,ALASKA AIRLINES,2017,658,18817924
9,SOUTHWEST AIRLINES,2017,6678,115988988


[35;10m2024-02-22 20:51:48,313 (data_filtering.py:106) - DEBUG: [0mFiltering data by => [`year` != 2016]


Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038
3,UNITED AIRLINES,2017,2067,70030765
4,HAWAIIAN AIRLINES,2017,92,8422734
5,EXPRESSJET AIRLINES,2017,785,11738812
6,SKYWEST AIRLINES,2017,917,24516354
7,AMERICAN AIRLINES,2017,4517,98017132
8,ALASKA AIRLINES,2017,658,18817924
9,SOUTHWEST AIRLINES,2017,6678,115988988


# String Modifications


In [7]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }
# df = pk.DataFrameParser(data)


df = init_df_sample(2)

sop = pk.StringOperationsProcessor(
    "title",
    trim=True,
    remove_letters=False,
    remove_whitespace=True,
    remove_numbers=True,
    remove_punctuation=True,
    verbose=False,
)


display(df.head(3))
# result = sop.fit_transform(df, columns=["year"])
unwrap_value_or_error(result)

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,DELTA AIR LINES,2017,679,99796155
1,VIRGIN AMERICA,2017,165,6090029
2,JETBLUE AIRWAYS,2017,1475,27255038


Unnamed: 0,airline,year,nb_bumped,total_passengers,total_passengers_isOutlier_Z_SCORE
0,DELTA AIR LINES,2017,679,20696653.5,1
1,VIRGIN AMERICA,2017,165,6090029.0,0
2,JETBLUE AIRWAYS,2017,1475,27255038.0,0
3,UNITED AIRLINES,2017,2067,20696653.5,1
4,HAWAIIAN AIRLINES,2017,92,8422734.0,0
5,EXPRESSJET AIRLINES,2017,785,11738812.0,0
6,SKYWEST AIRLINES,2017,917,24516354.0,0
7,AMERICAN AIRLINES,2017,4517,20696653.5,1
8,ALASKA AIRLINES,2017,658,18817924.0,0
9,SOUTHWEST AIRLINES,2017,6678,20696653.5,1


# Data Cleanse as a Whole


In [8]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }

df = init_df_sample(2)

# dc = pk.DataCleanser(
#     missing_clean=True,
#     missing_strategy="median",
#     missing_indicator=True,
#     missing_fill_value=0,
#     outlier_clean=True,
#     outlier_method="z",
#     outlier_threshold=1.5,
#     outlier_indicator=True,
#     str_trim=True,
#     str_remove_whitespace=True,
#     str_remove_letters=True,
#     str_remove_numbers=True,
#     str_remove_punctuation=True,
#     verbose=False,
# )
# props = {
#     "removeOutliers": True,
#     "outlierMethod": "z_score",
#     "replaceNulls": False,
#     "replaceNullWith": "mean",
#     "fillValue": "",
#     "modifyCase": "lower",
#     "removeWhitespace": True,
#     "removePunctuation": False,
#     "removeNumbers": True,
#     "removeLetters": False,
#     "trim": False,
#     "selectedColumns": [],
# }


props = {
    "missingClean": False,
    "missingStrategy": "median",
    "missingFillValue": "",
    "missingIndicator": False,
    "outlierClean": False,
    "outlierMethod": "z_score",
    "outlierThreshold": 3,
    "outlierIndicator": False,
    "strOperations": True,
    "strCaseModifierMethod": "lower",
    "strTrim": False,
    "strRemoveWhitespace": False,
    "strRemoveNumbers": False,
    "strRemoveLetters": False,
    "strRemovePunctuation": False,
    "selectedColumns": [],
}

dc = pk.DataCleanser(  # TODO: add more options
    missing_clean=props["missingClean"],
    missing_strategy=props["missingStrategy"],
    missing_fill_value=props["missingFillValue"],
    missing_indicator=props["missingIndicator"],
    outlier_clean=props["outlierClean"],
    outlier_method=props["outlierMethod"],
    outlier_indicator=props["outlierIndicator"],
    str_operations=props["strOperations"],
    str_remove_letters=props["strRemoveLetters"],
    str_remove_numbers=props["strRemoveNumbers"],
    str_remove_punctuation=props["strRemovePunctuation"],
    str_remove_whitespace=props["strRemoveWhitespace"],
    str_case_modifier_method=props["strCaseModifierMethod"],
    verbose=True,
)

result = dc.fit_transform(df)
# result
unwrap_value_or_error(result)

[35;10m2024-02-22 20:51:48,383 (data_cleansing.py:1025) - DEBUG: [0m> String Operations


Unnamed: 0,airline,year,nb_bumped,total_passengers
0,delta air lines,2017,679,99796155
1,virgin america,2017,165,6090029
2,jetblue airways,2017,1475,27255038
3,united airlines,2017,2067,70030765
4,hawaiian airlines,2017,92,8422734
5,expressjet airlines,2017,785,11738812
6,skywest airlines,2017,917,24516354
7,american airlines,2017,4517,98017132
8,alaska airlines,2017,658,18817924
9,southwest airlines,2017,6678,115988988


# Encoding Features


In [9]:
df = init_df_sample(1)
enc = pk.EncodingProcessor(
    pk.EncodingStrategies.OneHotEncoder,
    verbose=True,
    sparse_output=True,
)
cols = ["Name"]
result = enc.fit_transform(df, columns=cols)
unwrap_value_or_error(result)

Unnamed: 0,Age,Credit,Name_John,Name_Meg,Name_Peter,Name_Rose,Name_nan
0,,,1.0,0.0,0.0,0.0,0.0
1,10.0,400.0,0.0,1.0,0.0,0.0,0.0
2,25.0,,0.0,0.0,0.0,1.0,0.0
3,50.0,2000000.0,0.0,0.0,0.0,0.0,1.0
4,20.0,1000000.0,0.0,0.0,1.0,0.0,0.0
5,50.0,,0.0,1.0,0.0,0.0,0.0


# Output Dataset


In [10]:
import platform

print(f"Running on {platform.system()}")

# cross platform
dfe = pk.DataFrameExporter(df, extension=pk.FileExtension.CSV)
dfe.export()

[35;10m2024-02-22 20:51:48,447 (output.py:103) - DEBUG: [0mðŸš€ Exporting to C:\Users\mohanad\predikit_out\out.csv ...


Running on Windows


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\logging\__init__.py", line 1163, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\mohanad\AppData\Local\Programs\Python\Python312\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f680' in position 49: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "d:\Projects\Graduation Project\PrediKit\.venv\Lib\site-packages\traitlets\config\application.py", line 1077, in launch_instance
    app.start()
  File "d:\P

In [5]:
from pandas import DataFrame
class OneHotEncoder:
    def __init__(self, columns: list[str] = None, drop: str =None) -> None:
        self.columns: list[str] = columns
        self.encodings: dict[str, int] = {}
        self.drop: str = drop
        self.feature_names_out: list[str] = []

    def fit(self, df) -> None:
        if self.columns is None:
            self.columns = df.columns
        for column in self.columns:
            unique_values = df[column].unique()
            if self.drop == "first":
                unique_values = np.delete(unique_values, 0)  # Drop the first category
            elif self.drop == "if_binary" and len(unique_values) == 2:
                unique_values = unique_values[1:]  # Drop the first category if binary
            self.encodings[column] = {value: np.eye(len(unique_values))[i] for i, value in enumerate(unique_values)}
            self.feature_names_out.extend([f"{column}_{value}" for value in unique_values])

    def transform(self, df) -> DataFrame:
        df_encoded = df
        for column in self.columns:
            if column in df_encoded:  # Check if column exists after dropping (if_binary)
                for value, _ in self.encodings[column].items():
                    df_encoded[column + '_' + str(value)] = (df_encoded[column] == value).astype(int)
                df_encoded.drop(column, axis= 1, inplace = True)
        return df_encoded

    def fit_transform(self, df) -> DataFrame:
        self.fit(df)
        return self.transform(df)

    def get_features_names_out(self) -> list[str]:
        """
        Returns the names of the encoded features.

        This method should be called after fitting the encoder.

        Returns:
            list: A list of string, the names of the encoded features.
        """
        if len(self.feature_names_out) < 1:
            raise ValueError("Please fit the encoder before calling get_features_names_out.")
        return self.feature_names_out

In [1]:
import os
import sys

import numpy as np
import pandas as pd

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

import predikit as pk

In [2]:
from predikit.preprocessing import FeatureSelection, EncodingProcessor

data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "City": ["Texas", "Mexico", "Florida", "LA", "NY", "LA"],
    # "Name": ["John", "John", "John", "Linda", "Linda", "Linda"],
    "Age": [21, 23, 25, 50, 48, 50],
    "Credit": [74512, 400, 56132, 50, 1_000_000, 45121],
}

df = pk.DataFrameParser(data)
# fs = FeatureSelection(exclude_dtypes=["object"], verbose=True)
# # new_df = fs.fit_transform(df, columns=["Age"], dtypes=["object"])
# # new_df = fs.fit_transform(df)
# # new_df = fs.fit_transform(df, dtypes=["object"])
# # new_df = fs.fit_transform(df, columns=["Age"])
# # new_df = fs.fit_transform(df, columns=["Name"], dtypes=["number"])
# result = fs.fit_transform(df, columns=["Credit", "Age"])
# unwrap_value_or_error(result)

In [3]:
encoder = EncodingProcessor(strategy= 'SumEncoder',cols= ['City'] )
encoder.fit_transform(df)
display(df)



Unnamed: 0,Name,Age,Credit,intercept,City_0,City_1,City_2,City_3
0,John,21,74512,1,1.0,0.0,0.0,0.0
1,Andrea,23,400,1,0.0,1.0,0.0,0.0
2,Rose,25,56132,1,0.0,0.0,1.0,0.0
3,Linda,50,50,1,0.0,0.0,0.0,1.0
4,Peter,48,1000000,1,-1.0,-1.0,-1.0,-1.0
5,Meg,50,45121,1,0.0,0.0,0.0,1.0


In [10]:
encoder.get_feature_names_out()

['Name_Andrea', 'Name_Rose', 'Name_Linda', 'Name_Peter', 'Name_Meg']

In [11]:
from category_encoders import HelmertEncoder
encod = HelmertEncoder()
encoded_df = encod.fit_transform(df)
encoded_df



Unnamed: 0,intercept,Name_0,Name_1,Name_2,Name_3,Name_4,City_0,City_1,City_2,City_3,Age,Credit
0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,21,74512
1,1,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,23,400
2,1,0.0,2.0,-1.0,-1.0,-1.0,0.0,2.0,-1.0,-1.0,25,56132
3,1,0.0,0.0,3.0,-1.0,-1.0,0.0,0.0,3.0,-1.0,50,50
4,1,0.0,0.0,0.0,4.0,-1.0,0.0,0.0,0.0,4.0,48,1000000
5,1,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,-1.0,50,45121


In [10]:
from category_encoders import SumEncoder
encod = SumEncoder()
encoded_df = encod.fit_transform(df)
encoded_df



Unnamed: 0,intercept,Name_0,Name_1,Name_2,Name_3,Name_4,City_0,City_1,City_2,City_3,Age,Credit
0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,21,74512
1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,23,400
2,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,25,56132
3,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,50,50
4,1,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0,-1.0,-1.0,48,1000000
5,1,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,1.0,50,45121
