In [1]:
import os
import sys

import numpy as np
import pandas as pd

root = os.path.dirname(os.path.abspath("."))
sys.path.append(root)

import predikit as pk

# Test Load DataFrame


In [29]:
from io import BytesIO
from pathlib import Path


def load_show(path_or_buf, extension=None, n=3, label=None, **kwargs):
    df = pk.DataFrameParser(
        path_or_buf=path_or_buf,
        extension=extension,
        verbose=True,
        **kwargs,
    )
    if label:
        display(f"From {label}")

    display(f"Columns: {df.get_column_names()}")
    display(f"Columns type: {df.get_column_types()}")
    display(f"Parsed column types: {df.get_column_types(parsed=True)}")
    display(f"Numeric columns: {df.get_numeric_columns()}")
    display(f"Non numeric columns: {df.get_non_numeric_columns()}")
    display(df.head(n))


# from a Buffered Input Stream
f = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n7,8,9")
load_show(f, extension="csv", label="BytesIO")

# from a csv file
f = Path("./sample_data/airline_bumping.csv")
print(f)
load_show(f, n=4, label="csv")

# from a pickle file
f = Path("./sample_data/stations.pickle")
load_show(f, n=5, label="pickle")

# from a parquet file
f = Path("./sample_data/Flights 1m.parquet")
load_show(f, n=3, label="parquet")

# from a dictionary
data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [np.nan, 23, 25, 50, np.nan, 50],
    "Credit": [np.nan, 400, np.nan, 50, 200, np.nan],
}
load_show(data, n=3, label="dict")

# from a list of dictionaries
data = [
    {"Name": "John", "Age": np.nan, "Credit": np.nan},
    {"Name": "Andrea", "Age": 23, "Credit": 400},
    {"Name": "Rose", "Age": 25, "Credit": np.nan},
    {"Name": "Linda", "Age": 50, "Credit": 50},
    {"Name": "Peter", "Age": np.nan, "Credit": 200},
    {"Name": "Meg", "Age": 50, "Credit": np.nan},
]
load_show(data, label="list of dicts")

# from a dictionary of Series
data = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
}
load_show(data, label="dict of Series")

# from a 2d array
data = np.array(
    [
        ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
        [np.nan, 23, 35, 50, np.nan, 50],
        [np.nan, 400, np.nan, 50, 200, np.nan],
    ]
)

data = data.T
load_show(data, label="2d array", columns=["Name", "Age", "Credit"])


# ToDo => add tests for JSON and EXCEL

[32;10m2023-12-19 04:13:41,561 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:41,563 (input.py:266) - INFO: [0mLoading DataFrame from <_io.BytesIO object at 0x7f4cf1993c40> ...
[32;10m2023-12-19 04:13:41,565 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:41,566 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 3
							 _______
							| rows  | 3
[32;10m2023-12-19 04:13:41,568 (input.py:199) - INFO: [0mDataFrame size in memory: 0.20 KB 
[32;10m2023-12-19 04:13:41,571 (input.py:200) - INFO: [0mDataFrame dtypes
a    int64
b    int64
c    int64
dtype: object 


DataFrame head:    a  b  c
0  1  2  3
1  4  5  6
2  7  8  9 


'From BytesIO'

"Columns: ['a', 'b', 'c']"

"Columns type: {'a': dtype('int64'), 'b': dtype('int64'), 'c': dtype('int64')}"

"Parsed column types: {'a': 'i', 'b': 'i', 'c': 'i'}"

"Numeric columns: ['a', 'b', 'c']"

'Non numeric columns: None'

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


[32;10m2023-12-19 04:13:41,586 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:41,588 (input.py:266) - INFO: [0mLoading DataFrame from sample_data/airline_bumping.csv ...
[32;10m2023-12-19 04:13:41,591 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:41,592 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 24
							 _______
							| rows  | 4
[32;10m2023-12-19 04:13:41,595 (input.py:199) - INFO: [0mDataFrame size in memory: 2.38 KB 
[32;10m2023-12-19 04:13:41,597 (input.py:200) - INFO: [0mDataFrame dtypes
airline              object
year                float64
nb_bumped             int64
total_passengers      int64
dtype: object 


sample_data/airline_bumping.csv
DataFrame head:                                         airline    year  nb_bumped  \
0        12312312DELTA12@#@#!#!#!@ AIRLINES      2017.0        679   
1                         VIRGIN1234@#! AMERICA  2017.0        165   
2   #!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@  2017.0       1475   

   total_passengers  
0          99796155  
1           6090029  
2          27255038   


'From csv'

"Columns: ['airline', 'year', 'nb_bumped', 'total_passengers']"

"Columns type: {'airline': dtype('O'), 'year': dtype('float64'), 'nb_bumped': dtype('int64'), 'total_passengers': dtype('int64')}"

"Parsed column types: {'airline': 'O', 'year': 'f', 'nb_bumped': 'i', 'total_passengers': 'i'}"

"Numeric columns: ['year', 'nb_bumped', 'total_passengers']"

"Non numeric columns: ['airline']"

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,99796155
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038
3,UNITED AiRlINeS,,2067,70030765


[32;10m2023-12-19 04:13:41,617 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:41,618 (input.py:266) - INFO: [0mLoading DataFrame from sample_data/stations.pickle ...
[32;10m2023-12-19 04:13:41,621 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:41,622 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 144
							 _______
							| rows  | 3
[32;10m2023-12-19 04:13:41,625 (input.py:199) - INFO: [0mDataFrame size in memory: 27.41 KB 
[32;10m2023-12-19 04:13:41,627 (input.py:200) - INFO: [0mDataFrame dtypes
station_id      object
station_name    object
location        object
dtype: object 


DataFrame head:   station_id        station_name                 location
0      40010  Austin-Forest Park  (41.870851, -87.776812)
1      40020         Harlem-Lake  (41.886848, -87.803176)
2      40030        Pulaski-Lake  (41.885412, -87.725404) 


'From pickle'

"Columns: ['station_id', 'station_name', 'location']"

"Columns type: {'station_id': dtype('O'), 'station_name': dtype('O'), 'location': dtype('O')}"

"Parsed column types: {'station_id': 'O', 'station_name': 'O', 'location': 'O'}"

'Numeric columns: None'

"Non numeric columns: ['station_id', 'station_name', 'location']"

Unnamed: 0,station_id,station_name,location
0,40010,Austin-Forest Park,"(41.870851, -87.776812)"
1,40020,Harlem-Lake,"(41.886848, -87.803176)"
2,40030,Pulaski-Lake,"(41.885412, -87.725404)"
3,40040,Quincy/Wells,"(41.878723, -87.63374)"
4,40050,Davis,"(42.04771, -87.683543)"


[32;10m2023-12-19 04:13:41,644 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:41,645 (input.py:266) - INFO: [0mLoading DataFrame from sample_data/Flights 1m.parquet ...


[32;10m2023-12-19 04:13:41,724 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:41,725 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 1000000
							 _______
							| rows  | 7
[32;10m2023-12-19 04:13:41,988 (input.py:199) - INFO: [0mDataFrame size in memory: 54687.63 KB 
[32;10m2023-12-19 04:13:41,990 (input.py:200) - INFO: [0mDataFrame dtypes
FL_DATE       object
DEP_DELAY      int16
ARR_DELAY      int16
AIR_TIME       int16
DISTANCE       int16
DEP_TIME     float32
ARR_TIME     float32
dtype: object 


DataFrame head:       FL_DATE  DEP_DELAY  ARR_DELAY  AIR_TIME  DISTANCE   DEP_TIME   ARR_TIME
0  2006-01-01          5         19       350      2475   9.083333  12.483334
1  2006-01-02        167        216       343      2475  11.783334  15.766666
2  2006-01-03         -7         -2       344      2475   8.883333  12.133333 


'From parquet'

"Columns: ['FL_DATE', 'DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']"

"Columns type: {'FL_DATE': dtype('O'), 'DEP_DELAY': dtype('int16'), 'ARR_DELAY': dtype('int16'), 'AIR_TIME': dtype('int16'), 'DISTANCE': dtype('int16'), 'DEP_TIME': dtype('float32'), 'ARR_TIME': dtype('float32')}"

"Parsed column types: {'FL_DATE': 'O', 'DEP_DELAY': 'i', 'ARR_DELAY': 'i', 'AIR_TIME': 'i', 'DISTANCE': 'i', 'DEP_TIME': 'f', 'ARR_TIME': 'f'}"

"Numeric columns: ['DEP_DELAY', 'ARR_DELAY', 'AIR_TIME', 'DISTANCE', 'DEP_TIME', 'ARR_TIME']"

"Non numeric columns: ['FL_DATE']"

Unnamed: 0,FL_DATE,DEP_DELAY,ARR_DELAY,AIR_TIME,DISTANCE,DEP_TIME,ARR_TIME
0,2006-01-01,5,19,350,2475,9.083333,12.483334
1,2006-01-02,167,216,343,2475,11.783334,15.766666
2,2006-01-03,-7,-2,344,2475,8.883333,12.133333


[32;10m2023-12-19 04:13:42,067 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:42,069 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:42,070 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 6
							 _______
							| rows  | 3
[32;10m2023-12-19 04:13:42,073 (input.py:199) - INFO: [0mDataFrame size in memory: 0.54 KB 
[32;10m2023-12-19 04:13:42,075 (input.py:200) - INFO: [0mDataFrame dtypes
Name       object
Age       float64
Credit    float64
dtype: object 


DataFrame head:      Name   Age  Credit
0    John   NaN     NaN
1  Andrea  23.0   400.0
2    Rose  25.0     NaN 


'From dict'

"Columns: ['Name', 'Age', 'Credit']"

"Columns type: {'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}"

"Parsed column types: {'Name': 'O', 'Age': 'f', 'Credit': 'f'}"

"Numeric columns: ['Age', 'Credit']"

"Non numeric columns: ['Name']"

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


[32;10m2023-12-19 04:13:42,101 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:42,103 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:42,105 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 6
							 _______
							| rows  | 3
[32;10m2023-12-19 04:13:42,106 (input.py:199) - INFO: [0mDataFrame size in memory: 0.54 KB 
[32;10m2023-12-19 04:13:42,108 (input.py:200) - INFO: [0mDataFrame dtypes
Name       object
Age       float64
Credit    float64
dtype: object 


DataFrame head:      Name   Age  Credit
0    John   NaN     NaN
1  Andrea  23.0   400.0
2    Rose  25.0     NaN 


'From list of dicts'

"Columns: ['Name', 'Age', 'Credit']"

"Columns type: {'Name': dtype('O'), 'Age': dtype('float64'), 'Credit': dtype('float64')}"

"Parsed column types: {'Name': 'O', 'Age': 'f', 'Credit': 'f'}"

"Numeric columns: ['Age', 'Credit']"

"Non numeric columns: ['Name']"

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,25.0,


[32;10m2023-12-19 04:13:42,133 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:42,137 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:42,139 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 4
							 _______
							| rows  | 2
[32;10m2023-12-19 04:13:42,141 (input.py:199) - INFO: [0mDataFrame size in memory: 0.26 KB 
[32;10m2023-12-19 04:13:42,143 (input.py:200) - INFO: [0mDataFrame dtypes
one    float64
two    float64
dtype: object 


DataFrame head:    one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0 


'From dict of Series'

"Columns: ['one', 'two']"

"Columns type: {'one': dtype('float64'), 'two': dtype('float64')}"

"Parsed column types: {'one': 'f', 'two': 'f'}"

"Numeric columns: ['one', 'two']"

'Non numeric columns: None'

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0


[32;10m2023-12-19 04:13:42,169 (input.py:163) - INFO: [0mStarting data ingestion process ...
[32;10m2023-12-19 04:13:42,171 (input.py:183) - INFO: [0m✅ Done! Data ingestion process completed. DataFrame is ready for use.
[32;10m2023-12-19 04:13:42,172 (input.py:190) - INFO: [0mDataFrame Shape	 🔻
							 _______
							|columns| 6
							 _______
							| rows  | 3
[32;10m2023-12-19 04:13:42,173 (input.py:199) - INFO: [0mDataFrame size in memory: 1.05 KB 
[32;10m2023-12-19 04:13:42,176 (input.py:200) - INFO: [0mDataFrame dtypes
Name      object
Age       object
Credit    object
dtype: object 


DataFrame head:      Name  Age Credit
0    John  nan    nan
1  Andrea   23    400
2    Rose   35    nan 


'From 2d array'

"Columns: ['Name', 'Age', 'Credit']"

"Columns type: {'Name': dtype('O'), 'Age': dtype('O'), 'Credit': dtype('O')}"

"Parsed column types: {'Name': 'O', 'Age': 'O', 'Credit': 'O'}"

'Numeric columns: None'

"Non numeric columns: ['Name', 'Age', 'Credit']"

Unnamed: 0,Name,Age,Credit
0,John,,
1,Andrea,23.0,400.0
2,Rose,35.0,


# Helper Functions


In [30]:
from result import Result


def unwrap_value_or_error(result: Result):
    if result.is_ok():
        return result.unwrap()
    return result.unwrap_err()


def init_df_sample(sample_number: int = 1):
    if sample_number < 0 or sample_number > 5:
        raise ValueError("sample_number must be between 0 and 4")

    data = {
        "Name": ["John", "Meg", "Rose", np.nan, "Peter", "Meg"],
        "Age": [np.nan, 10, 25, 50, 20, 50],
        "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
    }
    samples = {
        1: data,
        2: "./sample_data/airline_bumping.csv",
        3: "./sample_data/stations.pickle",
        4: "./sample_data/Flights 1m.csv",
        5: "./sample_data/Flights 1m.parquet",
    }

    return pk.DataFrameParser(path_or_buf=samples[sample_number])

# Data Cleansing


## Handling Missing Values


In [31]:
df = init_df_sample(1)
display(df)

# bug in MODE doesn't fill all NaNs
mvp = pk.MissingValuesProcessor(strategy=pk.MissingValueStrategy.MODE)
result = mvp.fit_transform(df)

if result.is_err():
    raise ValueError(
        "Operation must be done for later operation (Outliers Detection)"
    )

df = result.unwrap()
df

Unnamed: 0,Name,Age,Credit
0,John,,
1,Meg,10.0,400.0
2,Rose,25.0,
3,,50.0,2000000.0
4,Peter,20.0,1000000.0
5,Meg,50.0,


Unnamed: 0,Name,Age,Credit
0,John,50.0,400.0
1,Meg,10.0,400.0
2,Rose,25.0,400.0
3,Meg,50.0,2000000.0
4,Peter,20.0,1000000.0
5,Meg,50.0,400.0


## Handling Outliers


In [32]:
df = init_df_sample(2)
display(df)
op = pk.OutliersProcessor(
    "z_score",
    # threshold=3,
    verbose=True,
    add_indicator=True,
)
result = op.fit_transform(df, columns=["total_passengers"])

unwrap_value_or_error(result)

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,99796155
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038
3,UNITED AiRlINeS,,2067,70030765
4,HAWAIIAN 13123123AIRLINES13214,2017.0,92,8422734
5,EXPRESSJET AIRLINES,2017.0,785,11738812
6,@SKYWEST AIRLINES,2017.0,917,24516354
7,AMERICAN AIRLINES,2017.0,4517,98017132
8,ALASKA 0AIRLINES,2017.0,658,18817924
9,#SOUTHWEST_ AIRLINES,2017.0,6678,115988988


[32;10m2023-12-19 04:13:42,298 (data_cleansing.py:611) - INFO: [0mNumber of outliers detected: 8 in Feature total_passengers
[32;10m2023-12-19 04:13:42,299 (data_cleansing.py:615) - INFO: [0mProportion of outlier detected: 33.3%
  data.loc[outliers_mask, column] = data[column].median()


Unnamed: 0,airline,year,nb_bumped,total_passengers,total_passengers_isOutlier_Z_SCORE
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,20696653.5,1
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029.0,0
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038.0,0
3,UNITED AiRlINeS,,2067,20696653.5,1
4,HAWAIIAN 13123123AIRLINES13214,2017.0,92,8422734.0,0
5,EXPRESSJET AIRLINES,2017.0,785,11738812.0,0
6,@SKYWEST AIRLINES,2017.0,917,24516354.0,0
7,AMERICAN AIRLINES,2017.0,4517,20696653.5,1
8,ALASKA 0AIRLINES,2017.0,658,18817924.0,0
9,#SOUTHWEST_ AIRLINES,2017.0,6678,20696653.5,1


# Filtering Data


In [33]:
df = init_df_sample(2)
col: str = "year"
# display(df)
bf = pk.BasicFilteringProcessor(
    "!=",
    value="2016",
    case_sensitive=False,
    verbose=True,
)

# bf
display(df)
res = bf.fit_transform(df, column=col)

unwrap_value_or_error(res)
# result_unpacked = result.unwrap()
# print(result_unpacked)
# bf
# display(df)


# bf.set_params(operator=pk.FilterOperator.EQUAL, value=50)
# display(bf.fit_transform(df, column="Age"))

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,99796155
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038
3,UNITED AiRlINeS,,2067,70030765
4,HAWAIIAN 13123123AIRLINES13214,2017.0,92,8422734
5,EXPRESSJET AIRLINES,2017.0,785,11738812
6,@SKYWEST AIRLINES,2017.0,917,24516354
7,AMERICAN AIRLINES,2017.0,4517,98017132
8,ALASKA 0AIRLINES,2017.0,658,18817924
9,#SOUTHWEST_ AIRLINES,2017.0,6678,115988988


[32;10m2023-12-19 04:13:42,345 (data_filtering.py:106) - INFO: [0mFiltering data by => [year != 2016]


Unnamed: 0,airline,year,nb_bumped,total_passengers
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,99796155
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038
3,UNITED AiRlINeS,,2067,70030765
4,HAWAIIAN 13123123AIRLINES13214,2017.0,92,8422734
5,EXPRESSJET AIRLINES,2017.0,785,11738812
6,@SKYWEST AIRLINES,2017.0,917,24516354
7,AMERICAN AIRLINES,2017.0,4517,98017132
8,ALASKA 0AIRLINES,2017.0,658,18817924
9,#SOUTHWEST_ AIRLINES,2017.0,6678,115988988


# String Modifications


In [34]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }
# df = pk.DataFrameParser(data)


df = init_df_sample(2)

sop = pk.StringOperationsProcessor(
    "title",
    trim=True,
    remove_letters=False,
    remove_whitespace=True,
    remove_numbers=True,
    remove_punctuation=True,
    verbose=False,
)


display(df.head(3))
# result = sop.fit_transform(df, columns=["year"])
unwrap_value_or_error(result)

Unnamed: 0,airline,year,nb_bumped,total_passengers
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,99796155
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038


Unnamed: 0,airline,year,nb_bumped,total_passengers,total_passengers_isOutlier_Z_SCORE
0,12312312DELTA12@#@#!#!#!@ AIRLINES,2017.0,679,20696653.5,1
1,VIRGIN1234@#! AMERICA,2017.0,165,6090029.0,0
2,#!#!@#@!#20J012312441ETBLUE A1I2R3WAYS@!@!@,2017.0,1475,27255038.0,0
3,UNITED AiRlINeS,,2067,20696653.5,1
4,HAWAIIAN 13123123AIRLINES13214,2017.0,92,8422734.0,0
5,EXPRESSJET AIRLINES,2017.0,785,11738812.0,0
6,@SKYWEST AIRLINES,2017.0,917,24516354.0,0
7,AMERICAN AIRLINES,2017.0,4517,20696653.5,1
8,ALASKA 0AIRLINES,2017.0,658,18817924.0,0
9,#SOUTHWEST_ AIRLINES,2017.0,6678,20696653.5,1


# Data Cleanse as a Whole


In [35]:
# data = {
#     "Name": [
#         "   John   .!,@  ",
#         "#A nd@rea$",
#         "Rose",
#         "L0ind9a",
#         "P!e1t@e_r",
#         "M+e - g$",
#     ],
#     "Age": [np.nan, 200, 25, 50, np.nan, 50],
#     "Credit": [np.nan, 400, np.nan, 200_00_00, 1_000_000, np.nan],
# }

df = init_df_sample(2)

# dc = pk.DataCleanser(
#     missing_clean=True,
#     missing_strategy="median",
#     missing_indicator=True,
#     missing_fill_value=0,
#     outlier_clean=True,
#     outlier_method="z",
#     outlier_threshold=1.5,
#     outlier_indicator=True,
#     str_trim=True,
#     str_remove_whitespace=True,
#     str_remove_letters=True,
#     str_remove_numbers=True,
#     str_remove_punctuation=True,
#     verbose=False,
# )
# props = {
#     "removeOutliers": True,
#     "outlierMethod": "z_score",
#     "replaceNulls": False,
#     "replaceNullWith": "mean",
#     "fillValue": "",
#     "modifyCase": "lower",
#     "removeWhitespace": True,
#     "removePunctuation": False,
#     "removeNumbers": True,
#     "removeLetters": False,
#     "trim": False,
#     "selectedColumns": [],
# }


props = {
    "missingClean": False,
    "missingStrategy": "median",
    "missingFillValue": "",
    "missingIndicator": False,
    "outlierClean": False,
    "outlierMethod": "z_score",
    "outlierThreshold": 3,
    "outlierIndicator": False,
    "strOperations": True,
    "strCaseModifierMethod": "lower",
    "strTrim": False,
    "strRemoveWhitespace": False,
    "strRemoveNumbers": False,
    "strRemoveLetters": False,
    "strRemovePunctuation": False,
    "selectedColumns": [],
}

dc = pk.DataCleanser(  # TODO: add more options
    missing_clean=props["missingClean"],
    missing_strategy=props["missingStrategy"],
    missing_fill_value=props["missingFillValue"],
    missing_indicator=props["missingIndicator"],
    outlier_clean=props["outlierClean"],
    outlier_method=props["outlierMethod"],
    outlier_indicator=props["outlierIndicator"],
    str_operations=props["strOperations"],
    str_remove_letters=props["strRemoveLetters"],
    str_remove_numbers=props["strRemoveNumbers"],
    str_remove_punctuation=props["strRemovePunctuation"],
    str_remove_whitespace=props["strRemoveWhitespace"],
    str_case_modifier_method=props["strCaseModifierMethod"],
    verbose=True,
)

result = dc.fit_transform(df)
# result
unwrap_value_or_error(result)

[32;10m2023-12-19 04:13:42,403 (data_cleansing.py:1025) - INFO: [0m> String Operations


Unnamed: 0,airline,year,nb_bumped,total_passengers
0,12312312delta12@#@#!#!#!@ airlines,2017.0,679,99796155
1,virgin1234@#! america,2017.0,165,6090029
2,#!#!@#@!#20j012312441etblue a1i2r3ways@!@!@,2017.0,1475,27255038
3,united airlines,,2067,70030765
4,hawaiian 13123123airlines13214,2017.0,92,8422734
5,expressjet airlines,2017.0,785,11738812
6,@skywest airlines,2017.0,917,24516354
7,american airlines,2017.0,4517,98017132
8,alaska 0airlines,2017.0,658,18817924
9,#southwest_ airlines,2017.0,6678,115988988


# Encoding Features


In [36]:
df = init_df_sample(1)
enc = pk.EncodingProcessor(
    pk.EncodingStrategies.OneHotEncoder,
    verbose=True,
    sparse_output=True,
)
cols = ["Name"]
result = enc.fit_transform(df, columns=cols)
unwrap_value_or_error(result)

Unnamed: 0,Age,Credit,Name_John,Name_Meg,Name_Peter,Name_Rose,Name_nan
0,,,1.0,0.0,0.0,0.0,0.0
1,10.0,400.0,0.0,1.0,0.0,0.0,0.0
2,25.0,,0.0,0.0,0.0,1.0,0.0
3,50.0,2000000.0,0.0,0.0,0.0,0.0,1.0
4,20.0,1000000.0,0.0,0.0,1.0,0.0,0.0
5,50.0,,0.0,1.0,0.0,0.0,0.0


# Output Dataset


In [37]:
import platform

print(f"Running on {platform.system()}")

# cross platform
dfe = pk.DataFrameExporter(df, extension=pk.FileExtension.CSV)
dfe.export()

[32;10m2023-12-19 04:13:42,468 (output.py:103) - INFO: [0m🚀 Exporting to /home/mghali/predikit_out/out.csv ...


Running on Linux


In [38]:
from predikit.preprocessing import FeatureSelection

data = {
    "Name": ["John", "Andrea", "Rose", "Linda", "Peter", "Meg"],
    "Age": [21, 23, 25, 50, 48, 50],
    "Credit": [74512, 400, 56132, 50, 1_000_000, 45121],
}

df = pk.DataFrameParser(data)
fs = FeatureSelection(exclude_dtypes=["object"], verbose=True)
# new_df = fs.fit_transform(df, columns=["Age"], dtypes=["object"])
# new_df = fs.fit_transform(df)
# new_df = fs.fit_transform(df, dtypes=["object"])
# new_df = fs.fit_transform(df, columns=["Age"])
# new_df = fs.fit_transform(df, columns=["Name"], dtypes=["number"])
result = fs.fit_transform(df, columns=["Credit", "Age"])
unwrap_value_or_error(result)

'This results in an empty data frame'

In [2]:
df = pd.read_csv('sample_data/penguins_size.csv')
df.dropna(inplace= True)
df.drop(df[df['sex'] == '.'].index, inplace= True)
df = pd.get_dummies(df,columns=['island', 'sex'])
df.drop(['island_Torgersen','sex_MALE'], axis= 1, inplace= True)
input_cols = df.iloc[:,1:]
target_col = df['species']
df

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,sex_FEMALE
0,Adelie,39.1,18.7,181.0,3750.0,False,False,False
1,Adelie,39.5,17.4,186.0,3800.0,False,False,True
2,Adelie,40.3,18.0,195.0,3250.0,False,False,True
4,Adelie,36.7,19.3,193.0,3450.0,False,False,True
5,Adelie,39.3,20.6,190.0,3650.0,False,False,False
...,...,...,...,...,...,...,...,...
338,Gentoo,47.2,13.7,214.0,4925.0,True,False,True
340,Gentoo,46.8,14.3,215.0,4850.0,True,False,True
341,Gentoo,50.4,15.7,222.0,5750.0,True,False,False
342,Gentoo,45.2,14.8,212.0,5200.0,True,False,True


In [3]:
from predikit import Classifiers
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_cols, target_col, test_size= 0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((233, 7), (100, 7), (233,), (100,))

In [21]:
model = Classifiers('LGBMClassifier' , verbose = -1)
model.fit(X_train, y_train)

In [22]:
model.score(X_test, y_test)

0.98