# Pandas cut()

This is a notebook for the Medium article [All Pandas cut() you should know for transforming numerical data into categorical data](https://bindichen.medium.com/all-pandas-cut-you-should-know-for-transforming-numerical-data-into-categorical-data-1370cf7f4c4f)

Please check out the article for instructions.

**License**: [BSD 2-Clause](https://opensource.org/licenses/BSD-2-Clause)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Ten sample ages; every later cell bins this single 'age' column.
df = pd.DataFrame(
    data={
        'age': [2, 67, 40, 32, 4, 15, 82, 99, 26, 30],
    },
)

In [3]:
df

Unnamed: 0,age
0,2
1,67
2,40
3,32
4,4
5,15
6,82
7,99
8,26
9,30


## 1. Discretizing into equal-sized bins with `bins=int`

In [4]:
# Passing an integer splits the observed range of `age` into that many
# equal-width intervals (three here).
df['age_group'] = pd.cut(df['age'], bins=3)

In [5]:
df

Unnamed: 0,age,age_group
0,2,"(1.903, 34.333]"
1,67,"(66.667, 99.0]"
2,40,"(34.333, 66.667]"
3,32,"(1.903, 34.333]"
4,4,"(1.903, 34.333]"
5,15,"(1.903, 34.333]"
6,82,"(66.667, 99.0]"
7,99,"(66.667, 99.0]"
8,26,"(1.903, 34.333]"
9,30,"(1.903, 34.333]"


In [6]:
df['age_group']

0     (1.903, 34.333]
1      (66.667, 99.0]
2    (34.333, 66.667]
3     (1.903, 34.333]
4     (1.903, 34.333]
5     (1.903, 34.333]
6      (66.667, 99.0]
7      (66.667, 99.0]
8     (1.903, 34.333]
9     (1.903, 34.333]
Name: age_group, dtype: category
Categories (3, interval[float64]): [(1.903, 34.333] < (34.333, 66.667] < (66.667, 99.0]]

## 2. Adding custom bins

In [7]:
# Explicit edges give the intervals (0, 12], (12, 19], (19, 61], (61, 100].
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 12, 19, 61, 100],
)

In [8]:
df

Unnamed: 0,age,age_group
0,2,"(0, 12]"
1,67,"(61, 100]"
2,40,"(19, 61]"
3,32,"(19, 61]"
4,4,"(0, 12]"
5,15,"(12, 19]"
6,82,"(61, 100]"
7,99,"(61, 100]"
8,26,"(19, 61]"
9,30,"(19, 61]"


In [9]:
df['age_group']

0      (0, 12]
1    (61, 100]
2     (19, 61]
3     (19, 61]
4      (0, 12]
5     (12, 19]
6    (61, 100]
7    (61, 100]
8     (19, 61]
9     (19, 61]
Name: age_group, dtype: category
Categories (4, interval[int64]): [(0, 12] < (12, 19] < (19, 61] < (61, 100]]

In [10]:
# Order the rows by their bin; the result is displayed, `df` itself is unchanged.
df.sort_values(by='age_group')

Unnamed: 0,age,age_group
0,2,"(0, 12]"
4,4,"(0, 12]"
5,15,"(12, 19]"
2,40,"(19, 61]"
3,32,"(19, 61]"
8,26,"(19, 61]"
9,30,"(19, 61]"
1,67,"(61, 100]"
6,82,"(61, 100]"
7,99,"(61, 100]"


In [11]:
# Count the rows in each bin, then list the bins in category order
# (sort_index) instead of by frequency.
df['age_group'].value_counts().sort_index()

(0, 12]      2
(12, 19]     1
(19, 61]     4
(61, 100]    3
Name: age_group, dtype: int64

## 3. Adding labels to bins

In [12]:
# Five edges define four intervals, so exactly four labels are supplied.
bins = [0, 12, 19, 61, 100]
labels = ['<12', 'Teen', 'Adult', 'Older']

In [13]:
# Name each bin with its label instead of showing the interval itself.
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)
df

Unnamed: 0,age,age_group
0,2,<12
1,67,Older
2,40,Adult
3,32,Adult
4,4,<12
5,15,Teen
6,82,Older
7,99,Older
8,26,Adult
9,30,Adult


In [14]:
df['age_group']

0      <12
1    Older
2    Adult
3    Adult
4      <12
5     Teen
6    Older
7    Older
8    Adult
9    Adult
Name: age_group, dtype: category
Categories (4, object): ['<12' < 'Teen' < 'Adult' < 'Older']

In [15]:
# Rows are ordered by bin position ('<12' < 'Teen' < 'Adult' < 'Older'),
# not alphabetically by label.
df.sort_values(by='age_group')

Unnamed: 0,age,age_group
0,2,<12
4,4,<12
5,15,Teen
2,40,Adult
3,32,Adult
8,26,Adult
9,30,Adult
1,67,Older
6,82,Older
7,99,Older


In [16]:
# Count the rows under each label, then list the labels in category order
# (sort_index) instead of by frequency.
df['age_group'].value_counts().sort_index()

<12      2
Teen     1
Adult    4
Older    3
Name: age_group, dtype: int64

## 4. Configuring leftmost edge or rightmost edge with `right=False`

In [17]:
# right=False closes each bin on the left and opens it on the right,
# e.g. [0, 12) instead of (0, 12].
pd.cut(
    df['age'],
    bins=[0, 12, 19, 61, 100],
    right=False,
)

0      [0, 12)
1    [61, 100)
2     [19, 61)
3     [19, 61)
4      [0, 12)
5     [12, 19)
6    [61, 100)
7    [61, 100)
8     [19, 61)
9     [19, 61)
Name: age, dtype: category
Categories (4, interval[int64]): [[0, 12) < [12, 19) < [19, 61) < [61, 100)]

## 5. Including the lowest value with `include_lowest=True`

In [18]:
# The first edge equals the smallest age (2). Bins are open on the left by
# default, so that row matches no interval and its age_group is missing.
df['age_group'] = pd.cut(
    df['age'],
    bins=[2, 12, 19, 61, 100],
)
df

Unnamed: 0,age,age_group
0,2,
1,67,"(61.0, 100.0]"
2,40,"(19.0, 61.0]"
3,32,"(19.0, 61.0]"
4,4,"(2.0, 12.0]"
5,15,"(12.0, 19.0]"
6,82,"(61.0, 100.0]"
7,99,"(61.0, 100.0]"
8,26,"(19.0, 61.0]"
9,30,"(19.0, 61.0]"


In [19]:
# include_lowest=True makes the first interval include its left edge, so the
# row with age 2 is now placed in the first bin.
df['age_group'] = pd.cut(df['age'], bins=[2, 12, 19, 61, 100], include_lowest=True)
df

Unnamed: 0,age,age_group
0,2,"(1.999, 12.0]"
1,67,"(61.0, 100.0]"
2,40,"(19.0, 61.0]"
3,32,"(19.0, 61.0]"
4,4,"(1.999, 12.0]"
5,15,"(12.0, 19.0]"
6,82,"(61.0, 100.0]"
7,99,"(61.0, 100.0]"
8,26,"(19.0, 61.0]"
9,30,"(19.0, 61.0]"


## 6. Passing an IntervalIndex to bins 
Passing an IntervalIndex for bins results in those categories exactly. Notice that values not covered by the IntervalIndex are set to NaN; in the example below, age 15 falls in the gap between 12 and 19 and therefore gets no group.

In [21]:
# Three right-closed intervals; there is deliberately no interval between
# 12 and 19.
bins = pd.IntervalIndex.from_tuples(
    [
        (0, 12),
        (19, 61),
        (61, 100),
    ],
)

In [22]:
bins

IntervalIndex([(0, 12], (19, 61], (61, 100]],
              closed='right',
              dtype='interval[int64]')

In [23]:
# Use the IntervalIndex as the bins; ages that fall in none of its intervals
# get a missing age_group.
df['age_group'] = pd.cut(df['age'], bins=bins)

In [24]:
df

Unnamed: 0,age,age_group
0,2,"(0.0, 12.0]"
1,67,"(61.0, 100.0]"
2,40,"(19.0, 61.0]"
3,32,"(19.0, 61.0]"
4,4,"(0.0, 12.0]"
5,15,
6,82,"(61.0, 100.0]"
7,99,"(61.0, 100.0]"
8,26,"(19.0, 61.0]"
9,30,"(19.0, 61.0]"


## 7. Returning bins with `retbins=True`

In [25]:
# retbins=True additionally returns the computed bin edges. It is useful when
# `bins` is passed as a single number rather than as explicit edges.
result, bins = pd.cut(df['age'], bins=4, retbins=True)

In [26]:
bins

array([ 1.903, 26.25 , 50.5  , 74.75 , 99.   ])

## 8. Creating unordered categories with `ordered=False`

New in version 1.1.0.

`ordered=False` will result in unordered categories when labels are passed. This parameter can be used to allow non-unique labels: 


In [27]:
# With ordered=False the labelled categories carry no ordering
# (they are listed without '<' between them).
age_edges_kwargs = None  # placeholder removed below; see single call
del age_edges_kwargs
df['age_group'] = pd.cut(
    df['age'],
    ordered=False,
    labels=['<12', 'Teen', 'Adult', 'Older'],
    bins=[0, 12, 19, 61, 100],
)

In [28]:
df['age_group']

0      <12
1    Older
2    Adult
3    Adult
4      <12
5     Teen
6    Older
7    Older
8    Adult
9    Adult
Name: age_group, dtype: category
Categories (4, object): ['<12', 'Teen', 'Adult', 'Older']