# Pandas Custom Sort

This is a notebook for the Medium article [How to do a Custom Sort on Pandas DataFrame](https://bindichen.medium.com/how-to-do-a-custom-sort-on-pandas-dataframe-ac18e7ea5320).

Please check out the article for instructions.

**License**: [BSD 2-Clause](https://opensource.org/licenses/BSD-2-Clause)

In [2]:
import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

In [3]:
# Toy inventory: one row per garment, with the sizes deliberately listed
# out of their natural order so each sorting approach is easy to compare.
df = pd.DataFrame(
    [
        (1001, 'S'),
        (1002, 'XL'),
        (1003, 'M'),
        (1004, 'XS'),
        (1005, 'L'),
        (1006, 'S'),
    ],
    columns=['cloth_id', 'size'],
)

In [4]:
# Display the frame as constructed (rows in insertion order).
df

Unnamed: 0,cloth_id,size
0,1001,S
1,1002,XL
2,1003,M
3,1004,XS
4,1005,L
5,1006,S


## 1. Take a look at the problem

In [5]:
# A plain sort compares the size labels as strings, so the rows come back
# alphabetically (L, M, S, S, XL, XS) rather than in garment-size order.
df.sort_values('size')

Unnamed: 0,cloth_id,size
4,1005,L
2,1003,M
0,1001,S
5,1006,S
1,1002,XL
3,1004,XS


## 2. Create a new column for custom sorting

In [6]:
# Desired ordering, smallest to largest: each label's row position is its rank.
df_mapping = pd.DataFrame(
    ['XS', 'S', 'M', 'L', 'XL'],
    columns=['size'],
)

# Turn the positional index into a regular 'index' column, then key the
# table by size label so it can be used as a label -> rank lookup.
sort_mapping = df_mapping.reset_index()
sort_mapping = sort_mapping.set_index('size')

In [7]:
# Inspect the lookup table: size labels form the index and the 'index'
# column holds each label's rank (XS=0 ... XL=4).
sort_mapping

Unnamed: 0_level_0,index
size,Unnamed: 1_level_1
XS,0
S,1
M,2
L,3
XL,4


In [8]:
# Create a new column size_num: look up each row's size label in
# sort_mapping['index'] to get its numeric rank (XS=0 ... XL=4).
# This adds the column to df in place.
df['size_num'] = df['size'].map(sort_mapping['index'])
df

Unnamed: 0,cloth_id,size,size_num
0,1001,S,1
1,1002,XL,4
2,1003,M,2
3,1004,XS,0
4,1005,L,3
5,1006,S,1


In [9]:
# Sorting on the numeric helper column gives the intended
# XS < S < M < L < XL order.
df.sort_values('size_num')

Unnamed: 0,cloth_id,size,size_num
3,1004,XS,0
0,1001,S,1
5,1006,S,1
2,1003,M,2
4,1005,L,3
1,1002,XL,4


## 3. Cast data to category type with orderedness using `CategoricalDtype`

In [10]:
# Rebuild the raw frame so this section starts without the size_num helper
# column added in section 2. CategoricalDtype is imported once, in the
# notebook's first code cell, so it is not re-imported here.
df = pd.DataFrame({
    'cloth_id': [1001, 1002, 1003, 1004, 1005, 1006],
    'size': ['S', 'XL', 'M', 'XS', 'L', 'S'],
})

In [11]:
# Ordered categorical dtype: list position defines rank,
# i.e. XS < S < M < L < XL.
cat_size_order = CategoricalDtype(categories=['XS', 'S', 'M', 'L', 'XL'], ordered=True)

In [12]:
# Replace the plain-string 'size' column with the ordered categorical
# dtype cat_size_order, so comparisons and sorting use the category order.
df['size'] = df['size'].astype(cat_size_order)

In [13]:
# Inspect the column: the dtype is now category and the repr lists the
# category order.
df['size']

0     S
1    XL
2     M
3    XS
4     L
5     S
Name: size, dtype: category
Categories (5, object): [XS < S < M < L < XL]

In [14]:
# With the ordered categorical dtype, sort_values follows the declared
# category order (XS, S, M, L, XL) instead of alphabetical order.
df.sort_values('size')

Unnamed: 0,cloth_id,size
3,1004,XS
0,1001,S
5,1006,S
2,1003,M
4,1005,L
1,1002,XL


### 3.1 View category codes property with the `Series.cat` accessor

In [15]:
# Rebuild the example data and its ordered dtype so the frame holds only the
# two original columns. CategoricalDtype is imported once, in the notebook's
# first code cell, so it is not re-imported here.
df = pd.DataFrame({
    'cloth_id': [1001, 1002, 1003, 1004, 1005, 1006],
    'size': ['S', 'XL', 'M', 'XS', 'L', 'S'],
})

# Categories are listed from smallest to largest; ordered=True makes that
# list position meaningful for comparisons and sorting.
cat_size_order = CategoricalDtype(
    ['XS', 'S', 'M', 'L', 'XL'],
    ordered=True,
)
df['size'] = df['size'].astype(cat_size_order)

# Last expression of the cell, so it is displayed: rows in category order.
df.sort_values('size')

Unnamed: 0,cloth_id,size
3,1004,XS
0,1001,S
5,1006,S
2,1003,M
4,1005,L
1,1002,XL


In [16]:
# .cat.codes gives each value's integer position in the category list
# (XS=0 ... XL=4); store it alongside the labels as a regular column.
df['codes'] = df['size'].cat.codes

In [17]:
# Show the frame with the new 'codes' column next to 'size'.
df

Unnamed: 0,cloth_id,size,codes
0,1001,S,1
1,1002,XL,4
2,1003,M,2
3,1004,XS,0
4,1005,L,3
5,1006,S,1


In [18]:
# Summarise column dtypes: 'size' is stored as category and 'codes' as int8.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   cloth_id  6 non-null      int64   
 1   size      6 non-null      category
 2   codes     6 non-null      int8    
dtypes: category(1), int64(1), int8(1)
memory usage: 388.0 bytes


## 4. Sort by multiple variables

In [19]:
# Order data for the multi-key example: 'month' and 'day_of_week' are plain
# strings at this point. CategoricalDtype is imported once, in the notebook's
# first code cell, and is not needed by this cell, so it is not re-imported.
df = pd.DataFrame({
    'order_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
    'customer_id': [10, 12, 12, 12, 10, 10, 10],
    'month': ['Feb', 'Jan', 'Jan', 'Feb', 'Feb', 'Jan', 'Feb'],
    'day_of_week': ['Mon', 'Wed', 'Sun', 'Tue', 'Sat', 'Mon', 'Thu'],
})

df

Unnamed: 0,order_id,customer_id,month,day_of_week
0,1001,10,Feb,Mon
1,1002,12,Jan,Wed
2,1003,12,Jan,Sun
3,1004,12,Feb,Tue
4,1005,10,Feb,Sat
5,1006,10,Jan,Mon
6,1007,10,Feb,Thu


In [20]:
# Ordered dtypes: categories are listed in calendar order so that sorting
# follows the calendar rather than the alphabet.
cat_day_of_week = CategoricalDtype(
    ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
    ordered=True,
)
# List every month: astype() converts any label missing from the categories
# to NaN, so a partial list would silently lose data for the later months.
cat_month = CategoricalDtype(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ordered=True,
)

In [21]:
# Cast both text columns to their ordered categorical dtypes
# (cat_day_of_week, cat_month) so sorting uses the declared category order
# instead of alphabetical order.
df['day_of_week'] = df['day_of_week'].astype(cat_day_of_week)
df['month'] = df['month'].astype(cat_month)

In [22]:
# Sort by month first, then by day of week within each month; both keys
# use their categorical order (Jan before Feb, Mon ... Sun).
df.sort_values(['month', 'day_of_week'])

Unnamed: 0,order_id,customer_id,month,day_of_week
5,1006,10,Jan,Mon
1,1002,12,Jan,Wed
2,1003,12,Jan,Sun
0,1001,10,Feb,Mon
3,1004,12,Feb,Tue
6,1007,10,Feb,Thu
4,1005,10,Feb,Sat


In [23]:
# Mixed keys: numeric customer_id first, then the two categorical columns
# as tie-breakers within each customer.
df.sort_values(['customer_id', 'month', 'day_of_week'])

Unnamed: 0,order_id,customer_id,month,day_of_week
5,1006,10,Jan,Mon
0,1001,10,Feb,Mon
6,1007,10,Feb,Thu
4,1005,10,Feb,Sat
1,1002,12,Jan,Wed
2,1003,12,Jan,Sun
3,1004,12,Feb,Tue
