In [2]:
import pandas as pd
import numpy as np
from numpy import nan as NA

'''
Continuous data is often discretized or otherwise separated into “bins” for analysis.
Suppose you have data about a group of people in a study, and you want to group
them into discrete age buckets.
Let’s divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older.
To do so, use cut, a function in pandas.
'''

# Ages of the people in the study.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges; by default each interval is open on the left and closed on the
# right, i.e. (18, 25], (25, 35], (35, 60], (60, 100].
bins = [18, 25, 35, 60, 100]
# Assign every age to the interval that contains it.
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [6]:
# Integer code of the bin each age was assigned to (its position in cats.categories).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
# The interval bins themselves, exposed as the categories of the result.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [8]:
# Count how many ages fall into each bin, most frequent bin first.
# The top-level pd.value_counts is deprecated (pandas 2.1+), so the counts are
# taken through a Series instead.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [9]:
# Intervals are closed on the right by default; pass right=False to close
# the left edge instead, giving [18, 26), [26, 36), [36, 61), [61, 100).
pd.cut(x=ages, bins=[18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [12]:
# Replace the default interval labels with readable names, one per bin
# and listed in bin order.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(x=ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [19]:
# If you pass an integer number of bins to cut instead of explicit bin edges, it
# computes equal-length bins based on the minimum and maximum values in the data.
# Here some uniformly distributed data is chopped into fourths.
# The precision=4 option sets the precision of the displayed bin edges to four digits.
data = np.random.random(24)
pd.cut(data, 4, precision=4)

[(0.7639, 0.987], (0.09347, 0.3175], (0.09347, 0.3175], (0.09347, 0.3175], (0.3175, 0.5407], ..., (0.09347, 0.3175], (0.5407, 0.7639], (0.3175, 0.5407], (0.7639, 0.987], (0.3175, 0.5407]]
Length: 24
Categories (4, interval[float64]): [(0.09347, 0.3175] < (0.3175, 0.5407] < (0.5407, 0.7639] < (0.7639, 0.987]]

In [7]:
# A closely related function, qcut, bins the data based on sample quantiles.
# Depending on the distribution of the data, cut will usually not give each bin
# the same number of data points. Because qcut uses sample quantiles instead,
# the resulting bins are roughly equal in size.
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
# The top-level pd.value_counts is deprecated (pandas 2.1+); count via a Series.
pd.Series(cats).value_counts()

(0.708, 3.333]       250
(-0.00542, 0.708]    250
(-0.71, -0.00542]    250
(-2.673, -0.71]      250
dtype: int64

In [8]:
# As with cut, you can pass your own quantiles (numbers between 0 and 1, inclusive).
cats = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
# The top-level pd.value_counts is deprecated (pandas 2.1+); count via a Series.
pd.Series(cats).value_counts()

(-0.00542, 1.406]     400
(-1.242, -0.00542]    400
(1.406, 3.333]        100
(-2.673, -1.242]      100
dtype: int64