In [1]:
import pandas as pd
import numpy as np

# Discretization of Continuous Data

In [2]:
# Sample ages that the cells below sort into bins.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]

In [3]:
# Bin edges: each consecutive pair of values delimits one interval
# (18-25, 25-35, 35-60, 60-100).
bins = [18, 25, 35, 60, 100]

In [8]:
# Assign every age to its interval. By default pd.cut closes each
# interval on the right, e.g. (18, 25] contains 25 but not 18.
cats = pd.cut(ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [9]:
# Integer code per age: the position of its interval within cats.categories.
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [10]:
# The bin intervals themselves, as an IntervalIndex in ascending order.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [11]:
# Number of ages per bin, most populated bin first.
# The top-level pd.value_counts() is deprecated (since pandas 2.1), so wrap
# the Categorical in a Series and call its value_counts() method instead.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

# To make the bins closed on the left instead of on the right (the default), pass right=False

In [13]:
# right=False flips the closed side: intervals become [left, right),
# so an age equal to a bin edge moves up into the next bin.
cats2 = pd.cut(ages, bins=bins, right=False)
cats2

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [14]:
# Number of ages per left-closed bin, most populated bin first.
# The top-level pd.value_counts() is deprecated (since pandas 2.1), so wrap
# the Categorical in a Series and call its value_counts() method instead.
pd.Series(cats2).value_counts()

[25, 35)     4
[18, 25)     4
[35, 60)     3
[60, 100)    1
dtype: int64

# Setting our own bin names

In [16]:
# One label per bin, listed in the same order as the bin intervals.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

# labels= replaces the interval notation with these names in the result.
cats3 = pd.cut(ages, bins=bins, labels=group_names)
cats3

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [17]:
# Number of ages per named group, most populated group first.
# The top-level pd.value_counts() is deprecated (since pandas 2.1), so wrap
# the Categorical in a Series and call its value_counts() method instead.
pd.Series(cats3).value_counts()

Youth         5
MiddleAged    3
YoungAdult    3
Senior        1
dtype: int64

# Passing an integer as bins creates that many equal-width bins spanning the range of the data

In [18]:
# Seed the generator so the sample (and therefore the bin edges shown
# below) is identical on every Restart & Run All.
rng = np.random.default_rng(seed=42)
data = rng.standard_normal(20)

# An integer `bins` asks for that many equal-width bins over the data's
# range; precision=2 limits the displayed bin edges to two decimals.
pd.cut(data, 4, precision=2)

[(-1.74, -0.49], (-1.74, -0.49], (-0.49, 0.76], (-0.49, 0.76], (-0.49, 0.76], ..., (-0.49, 0.76], (-0.49, 0.76], (-0.49, 0.76], (-1.74, -0.49], (-0.49, 0.76]]
Length: 20
Categories (4, interval[float64]): [(-1.74, -0.49] < (-0.49, 0.76] < (0.76, 2.01] < (2.01, 3.26]]

# Quantile cut - creates bins from sample quantiles, so that each bin holds (approximately) the same number of data points

In [19]:
# Seed the generator so the sample (and therefore the quartile edges shown
# below) is identical on every Restart & Run All.
# NOTE: this cell rebinds `data` and `cats`, which earlier cells also
# define; re-running those earlier cells afterwards changes what the
# following cells see.
rng = np.random.default_rng(seed=42)
data = rng.standard_normal(1000)

# q=4 cuts at the 25%, 50% and 75% sample quantiles, i.e. into quartiles.
cats = pd.qcut(data, 4)
cats

[(0.0208, 0.732], (-0.651, 0.0208], (0.732, 3.331], (0.732, 3.331], (0.0208, 0.732], ..., (-0.651, 0.0208], (0.0208, 0.732], (-2.952, -0.651], (-0.651, 0.0208], (0.732, 3.331]]
Length: 1000
Categories (4, interval[float64]): [(-2.952, -0.651] < (-0.651, 0.0208] < (0.0208, 0.732] < (0.732, 3.331]]

In [20]:
# Number of samples per quantile bin; a quartile cut of 1000 points
# puts 250 in each bin.
# The top-level pd.value_counts() is deprecated (since pandas 2.1), so wrap
# the Categorical in a Series and call its value_counts() method instead.
pd.Series(cats).value_counts()

(0.732, 3.331]      250
(0.0208, 0.732]     250
(-0.651, 0.0208]    250
(-2.952, -0.651]    250
dtype: int64

In [21]:
# Explicit quantile edges (values between 0 and 1) instead of a bin count:
# bottom 10%, 10-50%, 50-90% and top 10% of the sample.
quantile_edges = [0, 0.1, 0.5, 0.9, 1]
pd.qcut(data, quantile_edges)

[(0.0208, 1.291], (-1.188, 0.0208], (0.0208, 1.291], (1.291, 3.331], (0.0208, 1.291], ..., (-1.188, 0.0208], (0.0208, 1.291], (-2.952, -1.188], (-1.188, 0.0208], (1.291, 3.331]]
Length: 1000
Categories (4, interval[float64]): [(-2.952, -1.188] < (-1.188, 0.0208] < (0.0208, 1.291] < (1.291, 3.331]]