In [1]:
from pyspark.sql import SparkSession  # Import the SparkSession class

# Obtain the notebook's Spark session: getOrCreate() returns an already-running
# session if one exists, otherwise it starts a new one named "MyApp".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

# Show which Spark runtime this kernel is attached to.
print("Spark Version : " + spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/18 22:31:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/18 22:31:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark Version : 3.5.1


In [1]:

import numpy as np
import pandas as pd
from modeltrack.feature_engineering import TreeBinner, CutBinner, WOEEncoder, FeaturePipeline

# Small hand-written sample used to exercise the binners below.
# One tuple per row; Portuguese column names: idade = age, renda = income,
# sexo = sex. "target" is the 0/1 label passed to the binners' fit().
df = pd.DataFrame(
    [
        (22, 1500, "M", 0),
        (25, 1800, "F", 0),
        (45, 7000, "M", 1),
        (33, 2500, "F", 0),
        (40, 3000, "M", 1),
        (50, 10000, "M", 1),
        (60, 8000, "F", 0),
        (35, 1200, "M", 1),
    ],
    columns=["idade", "renda", "sexo", "target"],
)

# Per-column recipe: feature name -> list of transformer instances.
# Within this cell the dict is only referenced by the commented-out
# FeaturePipeline example; presumably the steps are applied in list order
# (binning first, then WOE encoding) — confirm against FeaturePipeline.
transformations = dict(
    idade=[TreeBinner(max_depth=2), WOEEncoder()],
    renda=[CutBinner(bins=[2000, 5000, 8000]), WOEEncoder()],
    sexo=[WOEEncoder()],
)



# --- TreeBinner on "idade": fit against the target, then label every row ---
tree_binner = TreeBinner(max_depth=2)

# The value returned by fit() is printed as the learned bin edges.
bins_idade = tree_binner.fit(df["idade"], df["target"])
print(">>> Bins idade (TreeBinner):", bins_idade)

# Heading and binned Series go out in one call; sep="\n" puts the Series on
# the line after the heading, exactly as two consecutive prints would.
idade_binned = tree_binner.transform(df["idade"])
print("\n>>> Idade binned (TreeBinner):", idade_binned, sep="\n")

# --- CutBinner on "renda": fixed, user-supplied cut points ---
cut_binner = CutBinner(bins=[2000, 5000, 8000])

# fit() receives the target as well; its return value is printed as the bins.
bins_renda = cut_binner.fit(df["renda"], df["target"])
print("\n>>> Bins renda (CutBinner):", bins_renda)

# Heading and binned Series go out in one call; sep="\n" puts the Series on
# the line after the heading, exactly as two consecutive prints would.
renda_binned = cut_binner.transform(df["renda"])
print("\n>>> Renda binned (CutBinner):", renda_binned, sep="\n")


# NOTE: end-to-end FeaturePipeline example, currently disabled (kept for reference).
# pipeline = FeaturePipeline(transformations)
# df_transformed = pipeline.fit_transform(df.drop(columns="target"), df["target"])

# print(">>> transformed df")
# print(df_transformed)
# print("\n>>> binning rules")
# print(pipeline.binning_rules)   # <-- the cut points computed during fit are stored here
# print("\n>>> woe maps")
# print(pipeline.woe_maps)        # <-- WOE maps (when generated)

>>> Bins idade (TreeBinner): [34.0, 55.0]

>>> Idade binned (TreeBinner):
0         <34
1         <34
2    [34, 55)
3         <34
4    [34, 55)
5    [34, 55)
6        >=55
7    [34, 55)
Name: idade, dtype: object

>>> Bins renda (CutBinner): [2000, 5000, 8000]

>>> Renda binned (CutBinner):
0           <2000
1           <2000
2    [5000, 8000)
3    [2000, 5000)
4    [2000, 5000)
5          >=8000
6          >=8000
7           <2000
Name: renda, dtype: object
