In [1]:
import numpy as np
import pandas as pd

In [2]:
# Low-cardinality data: a small set of distinct values repeated many times.
# (numpy and pandas are already imported in the first cell.)
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)
values


0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [3]:
# Count how many times each distinct value occurs.
values.value_counts()

apple     6
orange    2
Name: count, dtype: int64

In [4]:
# List the distinct values present in the Series.
values.unique()

array(['apple', 'orange'], dtype=object)

In [7]:
# Dictionary-encoded form of the same data: a small lookup table of the
# distinct labels (`dim`) plus integer codes that point into it (`values`).
dim = pd.Series(['apple', 'orange'])
values = pd.Series([0, 1, 0, 0] * 2)
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [6]:
# Display the lookup table of distinct labels.
dim

0     apple
1    orange
dtype: object

In [8]:
# Map each integer code back to its label (0 -> apple, 1 -> orange).
dim.take(values)
# This representation is called the categorical or dictionary-encoded representation.
# The array of distinct strings -> categories / levels / dictionary.
# The integers that reference them -> category codes / codes.

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

### Categorical type in pandas

In [9]:
# Build a small demo frame whose 'fruit' column holds only a few distinct
# strings, making it a natural candidate for integer-based categorical encoding.
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
# Use a seeded local generator so the random columns are identical on every
# Restart & Run All (the unseeded global RNG gave different data each run).
rng = np.random.default_rng(12345)
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': rng.integers(3, 15, size=N),   # ints in [3, 15)
                   'weight': rng.uniform(0, 4, size=N)},   # floats in [0, 4)
                  columns=['basket_id', 'fruit', 'count', 'weight'])
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,6,3.958598
1,1,orange,9,2.980955
2,2,apple,8,0.167359
3,3,apple,14,1.322723
4,4,apple,9,3.682312
5,5,orange,13,3.596509
6,6,apple,10,2.209675
7,7,apple,5,3.210849


In [10]:
# Convert the string 'fruit' column into pandas' categorical dtype.
fruit_cat = df.fruit.astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [13]:
# The `.values` of a categorical Series is a pandas Categorical object;
# show both the object and its type.
c = fruit_cat.values
print(c, type(c), sep='\n')

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']
<class 'pandas.core.arrays.categorical.Categorical'>


In [17]:
# A Categorical exposes its integer codes and its category labels
# through the `codes` and `categories` attributes.
print(c.codes, c.categories, sep='\n')

[0 1 0 0 0 1 0 0]
Index(['apple', 'orange'], dtype='object')


In [18]:
# Replace the string 'fruit' column on the frame with its categorical form.
df['fruit'] = df.fruit.astype('category')
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [21]:
# Creating a Categorical from other Python objects
# 1) From a sequence that contains duplicates.
# When no categories are supplied they are inferred from the data and stored
# in sorted order, so the codes follow that order.
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
my_categories


['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [26]:
# 2) From data that is already encoded: integer codes plus the list of
#    categories they index into (the categories must be unique).
codes = [0, 1, 0, 1] * 3
categories = ['male', 'female']
gender_cats = pd.Categorical.from_codes(codes=codes, categories=categories)
gender_cats

['male', 'female', 'male', 'female', 'male', ..., 'female', 'male', 'female', 'male', 'female']
Length: 12
Categories (2, object): ['male', 'female']

In [25]:
# ordered=True marks the categories as having a meaningful order. That order
# is the one given in `categories` (here male < female), not alphabetical.
pd.Categorical.from_codes(codes, categories, ordered=True)

['male', 'female', 'male', 'female', 'male', ..., 'female', 'male', 'female', 'male', 'female']
Length: 12
Categories (2, object): ['male' < 'female']

In [27]:
# An unordered Categorical can be converted to an ordered one with as_ordered().
gender_cats.as_ordered()
# Note: categories need not be strings; a categorical array can hold any
# immutable value type.

['male', 'female', 'male', 'female', 'male', ..., 'female', 'male', 'female', 'male', 'female']
Length: 12
Categories (2, object): ['male' < 'female']

### Categorical Methods

In [28]:
# A categorical Series exposes categorical-specific attributes and methods
# through its `.cat` accessor.
s = pd.Series(['a', 'b', 'c', 'd'] * 2)
cat_s = s.astype('category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [29]:
# Integer code assigned to each element of the categorical Series.
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [30]:
# The category labels that the codes refer to.
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [38]:
# Suppose the true set of categories is larger than what the data happens to
# contain; set_categories() replaces the category list with the one supplied.
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2


0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [36]:
# Compare the tallies before and after widening the category set:
# value_counts() reports every category, including ones with no occurrences.
print(cat_s.value_counts(), cat_s2.value_counts(), sep='\n')

a    2
b    2
c    2
d    2
Name: count, dtype: int64
a    2
b    2
c    2
d    2
e    0
Name: count, dtype: int64


In [40]:
# Removing unused categories: filtering rows keeps the full category list,
# so remove_unused_categories() trims it to the labels still present.
cat_s3 = cat_s[cat_s.isin(["a", "b"])]
print(cat_s3)
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']


0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

### One-Hot Encoding

In [42]:
# One-hot encoding (dummy variables): each category becomes its own
# indicator column.
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2).astype('category')
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,True,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,True,False,False,False
5,False,True,False,False
6,False,False,True,False
7,False,False,False,True


In [47]:
# Not from the book: one-hot encode only the 'fruit' column of the frame.
# The resulting indicator columns are named with a 'fruit_' prefix.
pd.get_dummies(df,columns=["fruit"])

Unnamed: 0,basket_id,count,weight,fruit_apple,fruit_orange
0,0,6,3.958598,True,False
1,1,9,2.980955,False,True
2,2,8,0.167359,True,False
3,3,14,1.322723,True,False
4,4,9,3.682312,True,False
5,5,13,3.596509,False,True
6,6,10,2.209675,True,False
7,7,5,3.210849,True,False
