# MODULE discretize.py

## Imports

In [1]:
from sme import discretize
import pandas as pd
import numpy as np

## Functions

### discretize

Discretization is the process of transforming a continuous-valued variable into a discrete one by creating a set of contiguous intervals (or equivalently a set of cutpoints) that spans the range of the variable's values. 

There are different discretization methods, for example:

- Equal interval width discretization: all bins have equal width.
- Equal frequency discretization: all bins contain the same number of numerical values.
- k-means discretization: applies classical k-means clustering to the one-dimensional continuous data, with k equal to the number of bins.

In [2]:
# Display the signature and docstring of discretize.discretize.
help(discretize.discretize)

Help on function discretize in module sme.discretize:

discretize(df, num_bins, method='frequency')
    Description:
        Discretize 
    
    Parameters:
        x (continuous columns): array or dataframe
        method (string): "interval" (equal interval width), 
                        "frequency" (equal frequency), 
                        "cluster" (k-means clustering).
        num.bins (int): number of intervals.
    Returns:
        An Interval object that contains two attributes:
            - discretized (list): result of x discretization
            - discretized_with_interval (list): interval ranges
            - interval (dict): interval ranges with id




## Tests

In [3]:
# Sample inputs shared by every test cell below.
# Flat list of floats.
v_example_1 = [11.5, 10.2, 1.2, 0.5, 5.3, 20.5, 8.4]
# Flat list of ints containing a repeated value (16).
v_example_2 = [0, 4, 12, 16, 16, 18, 24, 26, 28]
# 3x3 DataFrame input.
df_example_3 = pd.DataFrame([[0, 16, 24], [4, 16, 26], [12, 18, 28]])
# 3x3 matrix input. Built with np.array instead of pd.array: pd.array is
# documented to accept only 1-dimensional data, so feeding it nested lists
# relies on undocumented, version-dependent behaviour.
# NOTE(review): assumes discretize accepts a NumPy ndarray as its "array"
# input -- confirm against the sme.discretize implementation.
matrix_example_4 = np.array([[0, 16, 24], [4, 16, 26], [12, 18, 28]])

### Test 1: Equal Interval Width Discretization

In [4]:
# Discretize v_example_1 with method="interval" (equal interval width) into 4 bins,
# then print the interval map, the per-value (interval, id) pairs and the ids.
ew_discretize_example1 = discretize.discretize(v_example_1, num_bins=4, method="interval")
print(
    "Intervalos\n",
    ew_discretize_example1.intervals,
    "\nResultado con Intervalos\n",
    ew_discretize_example1.discretized_with_interval,
    "\nResultado\n",
    ew_discretize_example1.discretized,
)


Intervalos
 {'(10.5, 15.5]': 'I3', '(5.5, 10.5]': 'I2', '(-Inf, 5.5]': 'I1', '(15.5, Inf]': 'I4'} 
Resultado con Intervalos
 [('(10.5, 15.5]', 'I3'), ('(5.5, 10.5]', 'I2'), ('(-Inf, 5.5]', 'I1'), ('(-Inf, 5.5]', 'I1'), ('(-Inf, 5.5]', 'I1'), ('(15.5, Inf]', 'I4'), ('(5.5, 10.5]', 'I2')] 
Resultado
 [['I3' 'I2' 'I1' 'I1' 'I1' 'I4' 'I2']]


In [5]:
# Discretize v_example_2 with method="interval" (equal interval width) into 3 bins,
# then print the interval map, the per-value (interval, id) pairs and the ids.
ew_discretize_example2 = discretize.discretize(v_example_2, num_bins=3, method="interval")
print(
    "Intervalos\n",
    ew_discretize_example2.intervals,
    "\nResultado con Intervalos\n",
    ew_discretize_example2.discretized_with_interval,
    "\nResultado\n",
    ew_discretize_example2.discretized,
)


Intervalos
 {'(-Inf, 9.33]': 'I1', '(9.33, 18.67]': 'I2', '(18.67, Inf]': 'I3'} 
Resultado con Intervalos
 [('(-Inf, 9.33]', 'I1'), ('(-Inf, 9.33]', 'I1'), ('(9.33, 18.67]', 'I2'), ('(9.33, 18.67]', 'I2'), ('(9.33, 18.67]', 'I2'), ('(9.33, 18.67]', 'I2'), ('(18.67, Inf]', 'I3'), ('(18.67, Inf]', 'I3'), ('(18.67, Inf]', 'I3')] 
Resultado
 [['I1' 'I1' 'I2' 'I2' 'I2' 'I2' 'I3' 'I3' 'I3']]


In [6]:
# Discretize the DataFrame df_example_3 with method="interval" (equal interval
# width) into 3 bins, then print the interval map, the (interval, id) pairs and the ids.
ew_discretize_example3 = discretize.discretize(df_example_3, num_bins=3, method="interval")
print(
    "Intervalos\n",
    ew_discretize_example3.intervals,
    "\nResultado con Intervalos\n",
    ew_discretize_example3.discretized_with_interval,
    "\nResultado\n",
    ew_discretize_example3.discretized,
)


Intervalos
 {'(-Inf, 9.33]': 'I1', '(9.33, 18.67]': 'I2', '(18.67, Inf]': 'I3'} 
Resultado con Intervalos
 [('(-Inf, 9.33]', 'I1'), ('(9.33, 18.67]', 'I2'), ('(18.67, Inf]', 'I3'), ('(-Inf, 9.33]', 'I1'), ('(9.33, 18.67]', 'I2'), ('(18.67, Inf]', 'I3'), ('(9.33, 18.67]', 'I2'), ('(9.33, 18.67]', 'I2'), ('(18.67, Inf]', 'I3')] 
Resultado
 [['I1' 'I2' 'I3']
 ['I1' 'I2' 'I3']
 ['I2' 'I2' 'I3']]


In [7]:
# Discretize matrix_example_4 with method="interval" (equal interval width) into
# 2 bins, then print the interval map, the (interval, id) pairs and the ids.
ew_discretize_example4 = discretize.discretize(matrix_example_4, num_bins=2, method="interval")
print(
    "Intervalos\n",
    ew_discretize_example4.intervals,
    "\nResultado con Intervalos\n",
    ew_discretize_example4.discretized_with_interval,
    "\nResultado\n",
    ew_discretize_example4.discretized,
)

Intervalos
 {'(-Inf, 14.0]': 'I1', '(14.0, Inf]': 'I2'} 
Resultado con Intervalos
 [('(-Inf, 14.0]', 'I1'), ('(14.0, Inf]', 'I2'), ('(14.0, Inf]', 'I2'), ('(-Inf, 14.0]', 'I1'), ('(14.0, Inf]', 'I2'), ('(14.0, Inf]', 'I2'), ('(-Inf, 14.0]', 'I1'), ('(14.0, Inf]', 'I2'), ('(14.0, Inf]', 'I2')] 
Resultado
 [['I1' 'I2' 'I2']
 ['I1' 'I2' 'I2']
 ['I1' 'I2' 'I2']]


### Test 2: Equal Frequency Discretization

In [8]:
# Discretize v_example_1 with method="frequency" (equal frequency) into 4 bins,
# then print the interval map, the per-value (interval, id) pairs and the ids.
ef_discretize_example1 = discretize.discretize(v_example_1, num_bins=4, method="frequency")
print(
    "Intervalos\n",
    ef_discretize_example1.intervals,
    "\nResultado con Intervalos\n",
    ef_discretize_example1.discretized_with_interval,
    "\nResultado\n",
    ef_discretize_example1.discretized,
)


Intervalos
 {'(8.4, 11.5]': 'I3', '(-Inf, 1.2]': 'I1', '(1.2, 8.4]': 'I2', '(11.5, Inf]': 'I4'} 
Resultado con Intervalos
 [('(8.4, 11.5]', 'I3'), ('(8.4, 11.5]', 'I3'), ('(-Inf, 1.2]', 'I1'), ('(-Inf, 1.2]', 'I1'), ('(1.2, 8.4]', 'I2'), ('(11.5, Inf]', 'I4'), ('(1.2, 8.4]', 'I2')] 
Resultado
 [['I3' 'I3' 'I1' 'I1' 'I2' 'I4' 'I2']]


In [9]:
# Discretize v_example_2 with method="frequency" (equal frequency) into 3 bins,
# then print the interval map, the per-value (interval, id) pairs and the ids.
ef_discretize_example2 = discretize.discretize(v_example_2, num_bins=3, method="frequency")
print(
    "Intervalos\n",
    ef_discretize_example2.intervals,
    "\nResultado con Intervalos\n",
    ef_discretize_example2.discretized_with_interval,
    "\nResultado\n",
    ef_discretize_example2.discretized,
)


Intervalos
 {'(-Inf, 12]': 'I1', '(12, 18]': 'I2', '(18, Inf]': 'I3'} 
Resultado con Intervalos
 [('(-Inf, 12]', 'I1'), ('(-Inf, 12]', 'I1'), ('(-Inf, 12]', 'I1'), ('(12, 18]', 'I2'), ('(12, 18]', 'I2'), ('(12, 18]', 'I2'), ('(18, Inf]', 'I3'), ('(18, Inf]', 'I3'), ('(18, Inf]', 'I3')] 
Resultado
 [['I1' 'I1' 'I1' 'I2' 'I2' 'I2' 'I3' 'I3' 'I3']]


In [10]:
# Discretize the DataFrame df_example_3 with method="frequency" (equal frequency)
# into 3 bins, then print the interval map, the (interval, id) pairs and the ids.
ef_discretize_example3 = discretize.discretize(df_example_3, num_bins=3, method="frequency")
print(
    "Intervalos\n",
    ef_discretize_example3.intervals,
    "\nResultado con Intervalos\n",
    ef_discretize_example3.discretized_with_interval,
    "\nResultado\n",
    ef_discretize_example3.discretized,
)


Intervalos
 {'(-Inf, 12]': 'I1', '(12, 18]': 'I2', '(18, Inf]': 'I3'} 
Resultado con Intervalos
 [('(-Inf, 12]', 'I1'), ('(12, 18]', 'I2'), ('(18, Inf]', 'I3'), ('(-Inf, 12]', 'I1'), ('(12, 18]', 'I2'), ('(18, Inf]', 'I3'), ('(-Inf, 12]', 'I1'), ('(12, 18]', 'I2'), ('(18, Inf]', 'I3')] 
Resultado
 [['I1' 'I2' 'I3']
 ['I1' 'I2' 'I3']
 ['I1' 'I2' 'I3']]


In [11]:
# Discretize matrix_example_4 with method="frequency" (equal frequency) into
# 2 bins, then print the interval map, the (interval, id) pairs and the ids.
ef_discretize_example4 = discretize.discretize(matrix_example_4, num_bins=2, method="frequency")
print(
    "Intervalos\n",
    ef_discretize_example4.intervals,
    "\nResultado con Intervalos\n",
    ef_discretize_example4.discretized_with_interval,
    "\nResultado\n",
    ef_discretize_example4.discretized,
)

Intervalos
 {'(-Inf, 16]': 'I1', '(16, Inf]': 'I2'} 
Resultado con Intervalos
 [('(-Inf, 16]', 'I1'), ('(-Inf, 16]', 'I1'), ('(16, Inf]', 'I2'), ('(-Inf, 16]', 'I1'), ('(-Inf, 16]', 'I1'), ('(16, Inf]', 'I2'), ('(-Inf, 16]', 'I1'), ('(16, Inf]', 'I2'), ('(16, Inf]', 'I2')] 
Resultado
 [['I1' 'I1' 'I2']
 ['I1' 'I1' 'I2']
 ['I1' 'I2' 'I2']]


### Test 3: Clustering Discretization

In [12]:
# Discretize v_example_1 with method="clustering" into 4 bins, then print the
# interval map, the per-value (interval, id) pairs and the ids.
clustering_discretize_example1 = discretize.discretize(v_example_1, num_bins=4, method="clustering")
print(
    "Intervalos\n",
    clustering_discretize_example1.intervals,
    "\nResultado con Intervalos\n",
    clustering_discretize_example1.discretized_with_interval,
    "\nResultado\n",
    clustering_discretize_example1.discretized,
)


Intervalos
 {'(10.03, Inf]': 'I4', '(0.85, 5.3]': 'I2', '(-Inf, 0.85]': 'I1', '(5.3, 10.03]': 'I3'} 
Resultado con Intervalos
 [('(10.03, Inf]', 'I4'), ('(10.03, Inf]', 'I4'), ('(0.85, 5.3]', 'I2'), ('(-Inf, 0.85]', 'I1'), ('(0.85, 5.3]', 'I2'), ('(10.03, Inf]', 'I4'), ('(5.3, 10.03]', 'I3')] 
Resultado
 [['I4' 'I4' 'I2' 'I1' 'I2' 'I4' 'I3']]


In [13]:
# Discretize v_example_2 with method="clustering" into 3 bins, then print the
# interval map, the per-value (interval, id) pairs and the ids.
clustering_discretize_example2 = discretize.discretize(v_example_2, num_bins=3, method="clustering")
print(
    "Intervalos\n",
    clustering_discretize_example2.intervals,
    "\nResultado con Intervalos\n",
    clustering_discretize_example2.discretized_with_interval,
    "\nResultado\n",
    clustering_discretize_example2.discretized,
)


Intervalos
 {'(-Inf, 2.0]': 'I1', '(2.0, 15.5]': 'I2', '(15.5, Inf]': 'I3'} 
Resultado con Intervalos
 [('(-Inf, 2.0]', 'I1'), ('(2.0, 15.5]', 'I2'), ('(2.0, 15.5]', 'I2'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3')] 
Resultado
 [['I1' 'I2' 'I2' 'I3' 'I3' 'I3' 'I3' 'I3' 'I3']]


In [14]:
# Discretize the DataFrame df_example_3 with method="clustering" into 3 bins,
# then print the interval map, the (interval, id) pairs and the ids.
clustering_discretize_example3 = discretize.discretize(df_example_3, num_bins=3, method="clustering")
print(
    "Intervalos\n",
    clustering_discretize_example3.intervals,
    "\nResultado con Intervalos\n",
    clustering_discretize_example3.discretized_with_interval,
    "\nResultado\n",
    clustering_discretize_example3.discretized,
)


Intervalos
 {'(-Inf, 2.0]': 'I1', '(15.5, Inf]': 'I3', '(2.0, 15.5]': 'I2'} 
Resultado con Intervalos
 [('(-Inf, 2.0]', 'I1'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3'), ('(2.0, 15.5]', 'I2'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3'), ('(2.0, 15.5]', 'I2'), ('(15.5, Inf]', 'I3'), ('(15.5, Inf]', 'I3')] 
Resultado
 [['I1' 'I3' 'I3']
 ['I2' 'I3' 'I3']
 ['I2' 'I3' 'I3']]


In [15]:
# Discretize matrix_example_4 with method="clustering" into 2 bins, then print
# the interval map, the (interval, id) pairs and the ids.
clustering_discretize_example4 = discretize.discretize(matrix_example_4, num_bins=2, method="clustering")
print(
    "Intervalos\n",
    clustering_discretize_example4.intervals,
    "\nResultado con Intervalos\n",
    clustering_discretize_example4.discretized_with_interval,
    "\nResultado\n",
    clustering_discretize_example4.discretized,
)


Intervalos
 {'(-Inf, 5.33]': 'I1', '(5.33, Inf]': 'I2'} 
Resultado con Intervalos
 [('(-Inf, 5.33]', 'I1'), ('(5.33, Inf]', 'I2'), ('(5.33, Inf]', 'I2'), ('(-Inf, 5.33]', 'I1'), ('(5.33, Inf]', 'I2'), ('(5.33, Inf]', 'I2'), ('(5.33, Inf]', 'I2'), ('(5.33, Inf]', 'I2'), ('(5.33, Inf]', 'I2')] 
Resultado
 [['I1' 'I2' 'I2']
 ['I1' 'I2' 'I2']
 ['I2' 'I2' 'I2']]
