Discretization

In [1]:
import sklearn.datasets as ds
import sklearn.preprocessing as pre
import pandas as pd


##### Create a small synthetic data set

# ds.make_classification() builds synthetic data for a classification task.
# Arguments used here:
#   - n_samples = 10   -> 10 data objects (rows)
#   - n_features = 3   -> 3 features (columns)
#   - n_redundant = 0  -> no redundant features, i.e. no feature is generated as a
#                         linear combination of the informative features
#   - n_classes = 2    -> the target variable has 2 classes
# The function returns the input features X and the target variable y as numpy arrays.
# Note: no random_state is passed, so every run draws different values.
X, y = ds.make_classification(
    n_samples=10,
    n_features=3,
    n_redundant=0,
    n_classes=2,
)

In [2]:
# Show the input features X first, then the target variable y on the following line(s).
# y is a 1-D vector that numpy prints as a row. Flip it mentally and read it from top to
# bottom (classic column-vector notation).
print(X, y, sep="\n")


[[-1.31052477  1.23216817  2.06710146]
 [ 0.690775   -1.45231791 -0.71977971]
 [ 0.07999455 -1.48120813 -0.54973229]
 [ 1.35113191  0.92269922  0.24769704]
 [ 0.52201864  1.3824162  -1.23908557]
 [-0.83868029 -1.22070972  2.02198089]
 [-1.87530192  1.89870344 -2.07542685]
 [ 1.26600403 -0.82558901 -1.32058733]
 [-0.88849825 -0.82111895  0.22266962]
 [-1.40298476  1.38543511  0.74887052]]
[1 0 0 1 1 0 1 0 0 1]


In [3]:
# X and y are numpy arrays.
# Note: a notebook cell only displays the value of its LAST expression,
# so only the result of type(y) appears in the output of this cell.
type(X)
type(y)


numpy.ndarray

In [4]:
# The underlying data types are numerical:
#   - X holds floating-point feature values (float64)
#   - y holds integer class labels (int64)
# Only the last expression (y.dtype) is displayed in the output of this cell.
X.dtype
y.dtype

dtype('int64')

In [7]:
##### Equal Width Binning (EWB)

# Tool of choice: pre.KBinsDiscretizer()
#
#   The "initialize-fit-transform" pattern
#       Data preprocessing in scikit-learn usually follows the same three steps,
#       summarized as "initialize-fit-transform". The same pattern is shared by many
#       preprocessing classes, such as scalers, encoders, and transformers.
#
#   Step 1 - Initialize
#       Build a KBinsDiscretizer() instance and configure it via its parameters:
#       - n_bins = 3           -> we want 3 intervals (bins)
#       - strategy = 'uniform' -> Equal Width Binning: all bins have the same width (e.g., 0-10, 10-20, ...)
#       - encode = 'ordinal'   -> the bin identifiers are encoded as integer values
#       - 3 bins imply 4 bin edges (edges = bins + 1)
ewb = pre.KBinsDiscretizer(
    n_bins=3,
    strategy='uniform',
    encode='ordinal',
)
#
#   Step 2 - Fit
#       Compute the bin edges from the requested number of bins (3) and the chosen strategy ('uniform').
#       The result is stored in the attribute 'bin_edges_' of 'ewb'.
ewb.fit(X)
print(ewb.bin_edges_)
#       Each of the 3 features gets its own 4 bin edges (and therefore 3 bins):
#       X holds 3 features, and ewb.fit() treats each of them separately.
#
#   Step 3 - Transform
#       Replace every original numerical value by the identifier of the bin it falls into.
X_ewb = ewb.transform(X)

print(X_ewb)

[array([-1.87530192, -0.79982398,  0.27565397,  1.35113191])
 array([-1.48120813, -0.35457094,  0.77206625,  1.89870344])
 array([-2.07542685, -0.69458408,  0.68625869,  2.06710146])]
[[0. 2. 2.]
 [2. 0. 0.]
 [1. 0. 1.]
 [2. 2. 1.]
 [2. 2. 0.]
 [0. 0. 2.]
 [0. 2. 0.]
 [2. 0. 0.]
 [0. 0. 1.]
 [0. 2. 2.]]


In [8]:
##### Equal Frequency Binning (EFB)
#   As before, we go through the "initialize-fit-transform" steps.
#
#   Step 1 - Initialize
#       Build a KBinsDiscretizer() instance and configure it via its parameters:
#       - n_bins = 3            -> we want 3 intervals (bins)
#       - strategy = 'quantile' -> Equal Frequency Binning: (roughly) the same number of observations per bin
#       - encode = 'ordinal'    -> the bin identifiers are encoded as integer values (the bins are enumerated)
efb = pre.KBinsDiscretizer(
    n_bins=3,
    strategy='quantile',
    encode='ordinal',
)
#
#   Step 2 - Fit
#       Compute the bin edges.
efb.fit(X)
print(efb.bin_edges_)
#   Step 3 - Transform
#       Replace every value by the identifier of its bin.
X_efb = efb.transform(X)

print(X_efb)

[array([-1.87530192, -0.88849825,  0.52201864,  1.35113191])
 array([-1.48120813, -0.82558901,  1.23216817,  1.89870344])
 array([-2.07542685, -0.71977971,  0.24769704,  2.06710146])]
[[0. 1. 2.]
 [2. 0. 0.]
 [1. 0. 1.]
 [2. 1. 1.]
 [1. 2. 0.]
 [1. 0. 2.]
 [0. 2. 0.]
 [2. 1. 0.]
 [1. 1. 1.]
 [0. 2. 2.]]


In [None]:
##### Are the results on a categorical scale?


# The KBinsDiscretizer encodes the bin identifiers as floats 0., 1., 2.:
type(X_ewb) # numpy.ndarray
X_ewb.dtype # dtype('float64')
# This means that the data type is still numerical.
# The KBins*Discretizer* only *discretizes* the numerical values, it does not make them categorical!

# Note:
#   - Discrete variables with a numerical data type can be on either a categorical or a numerical scale!
#   - Which of the two applies depends on the semantics of the attribute, because the semantics tell us
#     which mathematical operations make sense.
#   - Example: 'age'
#       - The data type of 'age' (measured in years) is integer, thus numerical.
#       - The scale level of 'age' is ratio, because it makes sense to calculate age differences and fractions.
#         Thus, the scale level is also numerical.
#       - Both data type and scale level are numerical.
#   - Example: 'age class (EWB)'
#       - Assume we have 10 age classes of equal length.
#       - Assume the classes are encoded as integers: 1 = [1,10], 2 = [11,20], 3 = [21,30], ..., 10 = [91,100]
#       - The data type of 'age class (EWB)' is integer, thus numerical.
#       - The scale level of 'age class (EWB)' is interval, because it makes sense to calculate differences of age classes.
#         Thus, the scale level is also numerical.
#       - Both data type and scale level are numerical.
#   - Example: 'age class (EFB)'
#       - Assume we have 4 age classes of equal frequency and unequal length.
#       - Assume the classes are encoded as integers: 1 = [1,20], 2 = [21,25], 3 = [26,40], 4 = [41,100].
#       - The data type of 'age class (EFB)' is integer, thus numerical.
#       - The scale level of 'age class (EFB)' is ordinal: it does NOT make sense to calculate differences of age classes,
#         but it makes sense to put them into the above order.
#         Thus, the scale level is categorical.
#       - The data type is numerical while the scale level is categorical: the two do not match.

# Important note:
#   - Algorithms cannot know the semantics of attributes. Thus, they infer the scale level from the data type.
#   - If scale level and data type don't match, you might get nonsensical results!

In [9]:
##### How can we make sure an algorithm infers a categorical scale after discretization?

# One way is to turn the bin identifiers into strings:
X_ewb_cat = X_ewb.astype(str)

# For nicer strings (e.g. '0' rather than '0.0'), cast to integer first and to string afterwards.
# This overwrites the result of the previous statement.
X_ewb_cat = (
    X_ewb
    .astype(int)
    .astype(str)
)

# If a data frame is needed, wrap the ndarray with the pandas constructor pd.DataFrame():
X_ewb_cat_df = pd.DataFrame(data=X_ewb_cat)

# Did it work?
type(X_ewb_cat_df)  # It is a pandas DataFrame now (not displayed, because it is not the last expression)
X_ewb_cat_df.info()  # All columns have dtype 'object' (strings), i.e. they are treated as categorical

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       10 non-null     object
 1   1       10 non-null     object
 2   2       10 non-null     object
dtypes: object(3)
memory usage: 372.0+ bytes


In [None]:
# We can assign new bin labels.
# Step 1: look up the old labels. We only check the first feature, because all features
# share the same bin labels. The column is selected by position (.iloc) rather than by
# name, so this line keeps working after the columns have been renamed further down
# (e.g. when the cell is run a second time).
X_ewb_cat_df.iloc[:, 0].unique()
# Step 2: define a dictionary that maps the old values to the new values:
rename_mapping = {
    '2': 'High',
    '1': 'Medium',
    '0': 'Low',
}
# Step 3: use the pandas method replace() to rename them.
# replace() returns a new data frame, so the result is assigned back.
X_ewb_cat_df = X_ewb_cat_df.replace(rename_mapping)

# We can also change the feature names
X_ewb_cat_df.columns = ["Temperature", "Income", "Mood"]

print(X_ewb_cat_df)