In [1]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer


df = pd.read_csv('all_penguins_clean.csv')
df = df.dropna() # drop all missing values

In [2]:
kbins = KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile')

columns = df[['Culmen Length (mm)']]
columns

Unnamed: 0,Culmen Length (mm)
0,39.1
1,39.5
2,40.3
4,36.7
5,39.3
...,...
338,47.2
340,46.8
341,50.4
342,45.2


In [3]:
kbins.fit(columns)
t = kbins.transform(columns)
print(t.shape)
print()

(334, 5)



In [4]:
t

array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [5]:
kbins.bin_edges_

array([array([32.1 , 38.6 , 42.02, 46.08, 49.5 , 59.6 ])], dtype=object)

# BONUS: set the strategy parameter to 'uniform' and see how the edges change

In [21]:
kbins = KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='uniform')
columns = df[['Culmen Length (mm)']]
kbins.fit(columns)
t = kbins.transform(columns)
print(t.shape)
print()

(334, 5)



In [22]:
t

array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [23]:
kbins.bin_edges_

array([array([32.1, 37.6, 43.1, 48.6, 54.1, 59.6])], dtype=object)

# BONUS: create nice labels

In [7]:
edges = kbins.bin_edges_[0].round(1)
labels = []
for i in range(len(edges)-1):
    edge1 = edges[i]
    edge2 = edges[i+1]
    labels.append(f"{edge1}_to_{edge2}")

# create a DataFrame

In [8]:
df_bins = pd.DataFrame(t, columns=labels)
print(df_bins)

     32.1_to_38.6  38.6_to_42.0  42.0_to_46.1  46.1_to_49.5  49.5_to_59.6
0             0.0           1.0           0.0           0.0           0.0
1             0.0           1.0           0.0           0.0           0.0
2             0.0           1.0           0.0           0.0           0.0
3             1.0           0.0           0.0           0.0           0.0
4             0.0           1.0           0.0           0.0           0.0
..            ...           ...           ...           ...           ...
329           0.0           0.0           0.0           1.0           0.0
330           0.0           0.0           0.0           1.0           0.0
331           0.0           0.0           0.0           0.0           1.0
332           0.0           0.0           1.0           0.0           0.0
333           0.0           0.0           0.0           0.0           1.0

[334 rows x 5 columns]


In [12]:
import numpy as np

In [16]:
np.unique(t,axis=0)

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [15]:
t

array([[0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])