In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 12.1 Categorical Data

## Background and Motivation

In [3]:
# Eight fruit labels: the four-item apple/orange pattern repeated twice.
v = pd.Series(['apple', 'orange', 'apple', 'orange'] * 2)

In [4]:
v

0     apple
1    orange
2     apple
3    orange
4     apple
5    orange
6     apple
7    orange
dtype: object

In [5]:
pd.unique(v)

array(['apple', 'orange'], dtype=object)

In [6]:
# Count how often each distinct label occurs. The Series method is used because
# the top-level pd.value_counts function is deprecated in pandas 2.1 and later.
v.value_counts()

apple     4
orange    4
dtype: int64

Many data systems (for data warehousing, statistical computing, or other uses) have
developed specialized approaches for representing data with repeated values for more
efficient storage and computation. In data warehousing, a best practice is to use so-called dimension tables containing the distinct values and storing the primary observations as integer keys referencing the dimension table:

In [9]:
# Integer keys (0 and 1) standing in for the labels.
# Note: this rebinds `v`, which held the fruit strings in the earlier cells.
v = pd.Series([0, 1, 0, 0] * 2)
v

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [10]:
# Dimension table of distinct values: position 0 is 'apple', position 1 is 'orange'.
dim = pd.Series(['apple', 'orange'])
dim

0     apple
1    orange
dtype: object

In [11]:
# Use the integer keys in `v` as positions into `dim` to restore the original labels.
dim.take(v)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

1. This representation as integers is called the categorical or dictionary-encoded representation.
2. The array of distinct values can be called the categories, dictionary, or levels of the data.
3. The integer values that reference the categories are called the category codes or simply codes.

## Categorical Type in pandas

In [12]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [13]:
N = len(fruits)

In [14]:
# Demo frame with one row per basket. No random seed has been set before this
# cell, so the 'count' and 'weight' values differ from run to run.
df = pd.DataFrame(
    {
        'fruit': fruits,
        'basket_id': np.arange(N),
        'count': np.random.randint(3, 15, size=N),
        'weight': np.random.uniform(0, 4, size=N),
    }
)

In [15]:
df

Unnamed: 0,fruit,basket_id,count,weight
0,apple,0,13,2.17324
1,orange,1,12,2.200334
2,apple,2,6,2.514472
3,apple,3,7,2.368415
4,apple,4,3,0.045039
5,orange,5,9,2.752779
6,apple,6,6,3.931494
7,apple,7,10,3.498762


In [17]:
cat_fruit = df['fruit'].astype('category')

In [18]:
cat_fruit

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [19]:
cat_fruit.dtype

CategoricalDtype(categories=['apple', 'orange'], ordered=False)

In [22]:
c = cat_fruit.values
c

[apple, orange, apple, apple, apple, orange, apple, apple]
Categories (2, object): [apple, orange]

In [23]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [24]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [26]:
df['fruit'] = df['fruit'].astype('category')

In [29]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

You can also create pandas.Categorical directly from other types of Python sequences:

In [33]:
f = pd.Categorical(['apple', 'orange', 'apple', 'grape']*2)
f

[apple, orange, apple, grape, apple, orange, apple, grape]
Categories (3, object): [apple, grape, orange]

In [34]:
f.categories

Index(['apple', 'grape', 'orange'], dtype='object')

In [35]:
f.codes

array([0, 2, 0, 1, 0, 2, 0, 1], dtype=int8)

In [36]:
# Show the ordered form of the categorical (apple < grape < orange).
# The result is only displayed here; it is not assigned back to `f`.
f.as_ordered()

[apple, orange, apple, grape, apple, orange, apple, grape]
Categories (3, object): [apple < grape < orange]

Categorical data need not be strings, even though I have only shown string examples. A categorical array can consist of any immutable value types.

## Computations with Categoricals

In [37]:
np.random.seed(12345)

In [38]:
draws = np.random.randn(1000)

In [39]:
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [40]:
bins = pd.qcut(draws, 5)

In [41]:
bins

[(-0.302, 0.237], (0.237, 0.828], (-0.867, -0.302], (-0.867, -0.302], (0.828, 3.928], ..., (-0.302, 0.237], (-0.302, 0.237], (-2.9499999999999997, -0.867], (0.237, 0.828], (0.828, 3.928]]
Length: 1000
Categories (5, interval[float64]): [(-2.9499999999999997, -0.867] < (-0.867, -0.302] < (-0.302, 0.237] < (0.237, 0.828] < (0.828, 3.928]]

In [47]:
# Same five-way quantile cut as before, but with short names in place of the
# interval labels.
bins = pd.qcut(
    draws,
    q=5,
    labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5'],
)

In [48]:
bins

[Q3, Q4, Q2, Q2, Q5, ..., Q3, Q3, Q1, Q4, Q5]
Length: 1000
Categories (5, object): [Q1 < Q2 < Q3 < Q4 < Q5]

In [49]:
bins.categories

Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5'], dtype='object')

In [51]:
bins.codes[:10]

array([2, 3, 1, 1, 4, 4, 2, 3, 3, 4], dtype=int8)

The labeled bins categorical does not contain information about the bin edges in the
data, so we can use groupby to extract some summary statistics:

In [54]:
# Wrap the labeled bins in a named Series so they can be used as a groupby key.
# NOTE(review): pd.qcut(draws, 5) yields five bins (quintiles), so the name
# 'quartiles' is a misnomer; it is kept because it appears in the displayed outputs.
bins = pd.Series(bins, name='quartiles')

In [55]:
bins

0      Q3
1      Q4
2      Q2
3      Q2
4      Q5
       ..
995    Q3
996    Q3
997    Q1
998    Q4
999    Q5
Name: quartiles, Length: 1000, dtype: category
Categories (5, object): [Q1 < Q2 < Q3 < Q4 < Q5]

In [61]:
# Summarize the raw draws inside each labeled bin: the row count plus the
# smallest and largest value, which approximates the bin edges that the labels
# no longer carry.
# observed=False is stated explicitly: it keeps every category in the result and
# avoids the FutureWarning newer pandas versions emit when grouping by a
# categorical key without specifying it.
results = (
    pd.Series(draws)
    .groupby(bins, observed=False)
    .agg(['count', 'min', 'max'])
    .reset_index()
)
results

Unnamed: 0,quartiles,count,min,max
0,Q1,200,-2.949343,-0.868731
1,Q2,200,-0.867136,-0.302335
2,Q3,200,-0.30135,0.236361
3,Q4,200,0.237372,0.827283
4,Q5,200,0.830254,3.927528


### Better performance with categorical

In [72]:
# Row count for the memory and speed comparison below (ten million).
N = 10_000_000

In [75]:
draws = pd.Series(np.random.randn(N))

In [76]:
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N//4))

In [79]:
category = labels.astype('category')

In [81]:
# NOTE(review): the figure shown below works out to 8 bytes per element plus the
# index, so the string payloads do not appear to be counted. memory_usage(deep=True)
# would presumably report a larger number for these object-dtype labels — confirm.
labels.memory_usage()

80000128

In [82]:
category.memory_usage()

10000320

In [85]:
%time _ = labels.astype('category')

Wall time: 571 ms


## Categorical Methods

In [86]:
s = pd.Series(['a', 'b', 'a', 'c']*2)

In [88]:
s = s.astype('category')

In [89]:
s

0    a
1    b
2    a
3    c
4    a
5    b
6    a
7    c
dtype: category
Categories (3, object): [a, b, c]

In [91]:
s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [92]:
s.cat.codes

0    0
1    1
2    0
3    2
4    0
5    1
6    0
7    2
dtype: int8

In [93]:
actual_cat = list('abcde')

In [95]:
s2 = s.cat.set_categories(actual_cat)

In [96]:
s2

0    a
1    b
2    a
3    c
4    a
5    b
6    a
7    c
dtype: category
Categories (5, object): [a, b, c, d, e]

In [97]:
s.value_counts()

a    4
c    2
b    2
dtype: int64

In [98]:
s2.value_counts()

a    4
c    2
b    2
e    0
d    0
dtype: int64

In [101]:
s3 = s[s.isin(['a', 'b'])]

In [102]:
s3

0    a
1    b
2    a
4    a
5    b
6    a
dtype: category
Categories (3, object): [a, b, c]

In [103]:
s3.cat.remove_unused_categories()

0    a
1    b
2    a
4    a
5    b
6    a
dtype: category
Categories (2, object): [a, b]

*See Table 12-1 for a listing of available categorical methods.*

![Table 12-1. Categorical methods for Series in pandas](Img/12.1.png)

### Creating dummy variables for modeling

In [104]:
# Categorical Series cycling through the letters a-d twice: build the plain
# Series first, then convert it to the category dtype.
s = pd.Series(list('abcd') * 2).astype('category')

In [105]:
s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [108]:
# One indicator column per category. The dtype is pinned to uint8 so the result
# shows 0/1 flags on every pandas version; from pandas 2.0 the default dtype is
# bool, which would display True/False instead.
pd.get_dummies(s, dtype='uint8')

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


# 12.2 Advanced GroupBy Use