In [71]:
# NOTE(review): this cell uses `pd`, but the only `import pandas as pd` lines in the
# notebook appear in later cells, so it depends on state from an earlier session.
# Build a Series with dtype 'category'; the categories are inferred from the values.
s = pd.Series(['one', 'two', 'four', 'five'], dtype= 'category')
s

0     one
1     two
2    four
3    five
dtype: category
Categories (4, object): ['five', 'four', 'one', 'two']

In [73]:
# Restrict the categories to 'one' and 'five'; the values outside the new set
# ('two' and 'four') are shown as NaN in the recorded output.
s=s.cat.set_categories(['one','five'])
s

0     one
1     NaN
2     NaN
3    five
dtype: category
Categories (2, object): ['one', 'five']

In [60]:
# NOTE(review): `metals` is only assigned in later cells of this notebook, so this
# cell relies on leftover kernel state when the notebook is run top to bottom.
# Remove the 'gold' category; the recorded output shows the former 'gold' entry as NaN.
metals.remove_categories(['gold'])

['bronze', NaN, 'silver', 'bronze']
Categories (2, object): ['bronze' < 'silver']

In [68]:
# NOTE(review): `metals` is only assigned in later cells of this notebook.
# Drop categories that no value uses; the recorded output still lists all three,
# since bronze, silver and gold each occur in the data.
metals.remove_unused_categories()

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

#  <font color=red> Module_04_類別資料</font>

## 分類的背景、動機、好處

In [2]:
import pandas as pd
import numpy as np

# Eight fruit names stored as plain strings: the 4-item pattern repeated twice.
values = pd.Series(2 * ['apple', 'orange', 'apple', 'apple'])
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [None]:
values.unique() # the top-level pd.unique() function can be used as well

In [None]:
# Count how often each distinct value occurs.
# NOTE(review): the original note suggested the top-level pd.value_counts() function as an
# alternative; it is deprecated in recent pandas releases — confirm before relying on it.
values.value_counts()

---

In [None]:
# Integer codes in place of strings: the pattern [0, 1, 0, 0] repeated twice.
values = pd.Series([0, 1, 0, 0]*2)
values

In [None]:
# Dimension table.
# It tells us that code 0 corresponds to 'apple' and code 1 to 'orange'.
dim = pd.Series(['apple', 'orange']) 
dim

In [None]:
# Restore the labels from the codes with the .take() method.
dim.take(values)

---

In [None]:
# Number of labels to generate for the memory/time comparison (ten million).
N = 10_000_000

In [None]:
# Repeat the 4-label list N // 4 times, which yields 4 * (N // 4) string labels.
labels = pd.Series(['foo', 'bar', 'baz', 'qux']*(N//4))
labels

In [None]:
# Convert the string labels to the 'category' dtype.
categories = labels.astype('category')
categories

In [None]:
# Memory footprint of the plain string Series.
# deep=True makes pandas measure the Python string objects themselves; without it an
# object-dtype Series only reports the size of its array of references.
labels.memory_usage(deep=True)

In [None]:
# Memory footprint after conversion: the categorical Series consumes far less memory.
# deep=True also counts the memory held by the category labels themselves.
categories.memory_usage(deep=True)

In [None]:
%timeit _ = labels.astype('category') # the cost of converting to 'category' is time

## 建立類別物件

In [None]:
# A Categorical object is used to represent a categorical variable.
lmh_values = ['low', 'high', 'medium', 'medium', 'high']
lmh_cat = pd.Categorical(lmh_values)
lmh_cat 

In [None]:
# Show the type of lmh_cat (a Categorical object).
type(lmh_cat)

In [None]:
lmh_cat.categories # the .categories attribute shows which categories exist

In [None]:
# The category order may not be the one you want.
lmh_cat.codes

---

In [None]:
# Use the `categories` parameter to fix the order you want.
# The `ordered` parameter can additionally declare that the categories are ranked.
lmh_cat = pd.Categorical(lmh_values, categories = ['low', 'medium', 'high']) 
lmh_cat

In [None]:
# Integer code of each value under the current category order.
lmh_cat.codes

---

In [None]:
# Sorting categorical data follows the category codes (i.e. the category order),
# whereas values stored as plain strings would be sorted lexicographically.
lmh_cat.sort_values()

In [None]:
# The Categorical must be ordered before methods such as max() and min() can be used;
# from here on the categories have a rank order.
lmh_cat = lmh_cat.as_ordered() 
lmh_cat

In [None]:
# Smallest value according to the category order.
lmh_cat.min()

In [None]:
# Largest value according to the category order.
lmh_cat.max()

---

In [None]:
# The raw list of strings that the Categorical examples were built from.
lmh_values

In [None]:
# Plain Series built from the raw strings (no category dtype requested).
s = pd.Series(lmh_values)
s

In [None]:
# Stored as plain strings, .sort_values() orders the values lexicographically,
# not by the low/medium/high meaning.
s.sort_values()

---

In [None]:
# Create a Series whose elements are categorical data, i.e. backed by the Categorical
# object introduced above.
# Instead of passing dtype, a Categorical object could be given directly as the data.
cat_series = pd.Series(lmh_values, dtype = 'category') 
cat_series

In [None]:
# The values backing the categorical Series.
cat_series.values

In [None]:
# One way to change the category order of a Series: cast it to an ordered CategoricalDtype.
cut_dtype = pd.CategoricalDtype(['low', 'medium', 'high'], ordered = True)
cat_series = cat_series.astype(cut_dtype)
cat_series    

In [None]:
# This is the result we actually want.
cat_series.sort_values()

---

In [None]:
# Go back to the original categorical Series (categories inferred from the data).
cat_series = pd.Series(lmh_values, dtype = 'category')
cat_series

In [None]:
# Note that this is still a Series; it merely holds categorical data,
# so the methods and attributes of the Categorical object cannot be used on it directly.
# The solution is the Series' .cat accessor.
cat_series

In [None]:
cat_series.cat # this object gives access to the attributes of the underlying Categorical

In [None]:
# Categories, read through the .cat accessor.
cat_series.cat.categories

In [None]:
# Integer codes, read through the .cat accessor.
cat_series.cat.codes

In [None]:
# Another way to change the category order of a Series:
# use the Categorical's .set_categories() method through the .cat accessor.
cat_series = cat_series.cat.set_categories(['low', 'medium', 'high']) 
cat_series

In [None]:
# Integer codes under the current category order.
cat_series.cat.codes

---

In [3]:
# Seed NumPy's global RNG so the five random values are reproducible.
np.random.seed(123456)
values = np.random.randint(0, 100, 5)  # five random integers, 0 <= value < 100
bins = pd.DataFrame({'Value': values})
bins

Unnamed: 0,Value
0,65
1,49
2,56
3,43
4,43


In [None]:
# cut() splits a set of data into discrete intervals.

bins['Group'] = pd.cut(bins['Value'].values, bins = range(0, 110, 10)) # pd.cut() returns a Categorical object
bins

In [5]:
# Note that the data is divided into 10 buckets.
bins.Group

0    (60, 70]
1    (40, 50]
2    (50, 60]
3    (40, 50]
4    (40, 50]
Name: Group, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [13]:
# Note that categorical data lists a category even when its count is 0.
bins.Group.value_counts()

(40, 50]     3
(50, 60]     1
(60, 70]     1
(0, 10]      0
(10, 20]     0
(20, 30]     0
(30, 40]     0
(70, 80]     0
(80, 90]     0
(90, 100]    0
Name: Group, dtype: int64

In [8]:
# Sort the rows by their interval bucket, highest bucket first.
bins.sort_values(by = 'Group', ascending = False)

Unnamed: 0,Value,Group
0,65,"(60, 70]"
2,56,"(50, 60]"
1,49,"(40, 50]"
3,43,"(40, 50]"
4,43,"(40, 50]"


---

In [19]:
# Build an ordered Categorical: the category list also defines the rank order.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(
    metal_values,
    dtype=pd.CategoricalDtype(categories=metal_categories, ordered=True),
)
metals

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [20]:
# Build a second Categorical whose values are those of `metals` in reverse order,
# using the same categories and ordering.
metals_reversed_values = pd.Categorical(metals[::-1],
                         categories = metal_categories,
                         ordered = True)
metals_reversed_values

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [21]:
# Being ordered lets different Categorical objects be compared with each other;
# the comparison is between their code values.
metals < metals_reversed_values 

array([False, False,  True, False])

In [22]:
# Integer codes of `metals`.
metals.codes

array([0, 2, 1, 0], dtype=int8)

In [23]:
# Integer codes of the reversed Categorical.
metals_reversed_values.codes

array([0, 1, 2, 0], dtype=int8)

---

In [24]:
# The category list used by the metal examples.
metal_categories

['bronze', 'silver', 'gold']

In [26]:
pd.Categorical(['bronze', 'copper'], categories = metal_categories) # this trick filters out invalid category values when the Categorical is created

['bronze', NaN]
Categories (3, object): ['bronze', 'silver', 'gold']

---

In [27]:
# Category labels, and integer codes that refer to them by position.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]

In [29]:
# Build a Categorical directly from the integer codes and the category list.
my_cat = pd.Categorical.from_codes(codes = codes, categories = categories)
my_cat

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [32]:
# If you forget the method above, a list comprehension easily restores the labels,
# which can then be passed to pd.Categorical() to build the Categorical.
tmp = [categories[i] for i in codes]
tmp

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']

In [33]:
# Build the Categorical from the restored labels.
pd.Categorical(tmp, categories = categories)

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

## 重新命名類別

In [35]:
import pandas as pd

# Unordered Categorical over the letters a, b and c.
cat = pd.Categorical(list('abca'), categories=list('abc'))
cat

['a', 'b', 'c', 'a']
Categories (3, object): ['a', 'b', 'c']

In [36]:
# Rename the categories. Assigning to `cat.categories` is not supported by current
# pandas, so rebind `cat` to the renamed copy returned by rename_categories().
cat = cat.rename_categories(['bronze', 'silver', 'gold'])
cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze', 'silver', 'gold']

In [37]:
cat.rename_categories(['x', 'y', 'z']) # not in-place: the renamed Categorical is returned

['x', 'y', 'z', 'x']
Categories (3, object): ['x', 'y', 'z']

In [38]:
# Display `cat` again to check its current categories.
cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze', 'silver', 'gold']

## 附加新類別

In [None]:
 # 建立有序的類別物件
import pandas as pd

# Ordered Categorical: ordered=True ranks the categories in the listed order.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(metal_values, 
                        categories = metal_categories, 
                        ordered = True)
metals

In [61]:
# Add 'platinum' as an extra category; the result is stored under a new name.
with_platinum = metals.add_categories(['platinum'])
with_platinum

['bronze', 'gold', 'silver', 'bronze']
Categories (4, object): ['bronze' < 'silver' < 'gold' < 'platinum']

## 移除類別

In [62]:
# Remove the 'bronze' category; the recorded output shows the former 'bronze' values as NaN.
no_bronze = metals.remove_categories(['bronze'])
no_bronze

[NaN, 'gold', 'silver', NaN]
Categories (2, object): ['silver' < 'gold']

## 移除未使用的類別

In [66]:
# Categorical that carries the extra 'platinum' category.
with_platinum

['bronze', 'gold', 'silver', 'bronze']
Categories (4, object): ['bronze' < 'silver' < 'gold' < 'platinum']

In [67]:
# Drop the categories that no value uses.
with_platinum.remove_unused_categories()

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

## 設定類別

In [69]:
# Categorical Series; no category list is given, so the categories come from the values.
s = pd.Series(['one', 'two', 'four', 'five']).astype('category')
s

0     one
1     two
2    four
3    five
dtype: category
Categories (4, object): ['five', 'four', 'one', 'two']

In [70]:
# Keep only 'one' and 'four' as categories; the recorded output shows the other values as NaN.
s = s.cat.set_categories(['one', 'four'])
s

0     one
1     NaN
2    four
3     NaN
dtype: category
Categories (2, object): ['one', 'four']

## 類別物件的敘述性資訊

In [74]:
# The ordered metals Categorical whose descriptive information is inspected next.
metals

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [75]:
metals.describe() # descriptive information of the Categorical object

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
bronze,2,0.5
silver,1,0.25
gold,1,0.25


In [76]:
# Buckets with a count of 0 would be displayed as well.
metals.value_counts()

bronze    2
silver    1
gold      1
dtype: int64

In [77]:
# Smallest value according to the category order.
metals.min()

'bronze'

In [78]:
# Largest value according to the category order.
metals.max()

'gold'

In [79]:
# Most frequent value(s). Categorical.mode() was deprecated and later removed from pandas,
# so compute the mode through a Series and take .array to get a Categorical back.
pd.Series(metals).mode().array

['bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

## 學校成績轉換

In [80]:
# Ten students with seeded random integer grades, 50 <= grade < 101.
np.random.seed(123456)
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Katina', 'Alissa', 'Marc',
]
grades = np.random.randint(low=50, high=101, size=len(names))
scores = pd.DataFrame(dict(Name=names, Grade=grades))
scores

Unnamed: 0,Name,Grade
0,Ivana,51
1,Norris,92
2,Ruth,100
3,Lane,99
4,Skye,93
5,Sol,97
6,Dylan,93
7,Katina,77
8,Alissa,82
9,Marc,73


In [81]:
# Bin edges and letter labels: 14 edges delimit 13 bins, one per letter grade.
score_bins = [0, 59, 62, 66, 69, 72, 76, 79, 82, 86, 89, 92, 99, 100]
letter_grades = [
    'F',
    'D-', 'D', 'D+',
    'C-', 'C', 'C+',
    'B-', 'B', 'B+',
    'A-', 'A', 'A+',
]

In [82]:
# Bin each numeric grade with pd.cut(), labelling the bins with the letter grades,
# then store the result as the 'Letter' column.
letter_cats = pd.cut(scores.Grade, score_bins, labels = letter_grades)
scores['Letter'] = letter_cats
scores

Unnamed: 0,Name,Grade,Letter
0,Ivana,51,F
1,Norris,92,A-
2,Ruth,100,A+
3,Lane,99,A
4,Skye,93,A
5,Sol,97,A
6,Dylan,93,A
7,Katina,77,C+
8,Alissa,82,B-
9,Marc,73,C


In [85]:
# The 'Letter' column on its own.
scores.Letter

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Letter, dtype: category
Categories (13, object): ['F' < 'D-' < 'D' < 'D+' ... 'B+' < 'A-' < 'A' < 'A+']

In [86]:
# This is the Series .describe() method; what it shows differs slightly from the
# Categorical object's .describe() method.
scores.Letter.describe()

count     10
unique     7
top        A
freq       4
Name: Letter, dtype: object

In [87]:
# Categories that never occur are shown too, with a count of 0.
scores.Letter.value_counts()

A     4
F     1
C     1
C+    1
B-    1
A-    1
A+    1
D-    0
D     0
D+    0
C-    0
B     0
B+    0
Name: Letter, dtype: int64

In [88]:
# Sort the rows by the 'Letter' column in descending order.
scores.sort_values(by = ['Letter'], ascending = False )

Unnamed: 0,Name,Grade,Letter
2,Ruth,100,A+
3,Lane,99,A
4,Skye,93,A
5,Sol,97,A
6,Dylan,93,A
1,Norris,92,A-
8,Alissa,82,B-
7,Katina,77,C+
9,Marc,73,C
0,Ivana,51,F


## 綜合應用

In [None]:
import pandas as pd 
import numpy as np

# Small demo frame: one row per basket with a fruit name, a random count and a random weight.
# Seed NumPy's global RNG first so the random columns are identical on every run.
np.random.seed(12345)
fruits = ['apple', 'orange', 'apple', 'apple']*2
N = len(fruits)
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': np.random.randint(3, 15, size = N),
                   'weight': np.random.uniform(0, 4, size= N)},
                   columns = ['basket_id', 'fruit', 'count', 'weight']) # `columns` determines the column order
df

In [None]:
# Convert the 'fruit' column to the 'category' dtype.
fruit_cat = df['fruit'].astype('category')
fruit_cat

In [None]:
c = fruit_cat.values # the values are not a NumPy array but a pandas.Categorical instance
type(c)

In [None]:
# Categories of the Categorical.
c.categories

In [None]:
# Integer codes of the Categorical.
c.codes

In [None]:
# The type of the returned object depends on which data type's .take() method is called.
c.categories.take(c.codes)

---

In [None]:
# Seeded sample of 1000 draws from np.random.randn; show the first five.
np.random.seed(12345)
draws = np.random.randn(1000)
draws[:5]

In [None]:
# Split the draws into 4 quantile-based bins with pd.qcut().
bins = pd.qcut(draws, 4)
bins

In [None]:
# Descriptive summary of the quantile bins.
bins.describe()

In [None]:
# Same 4-way quantile split, now with the labels Q1..Q4.
bins = pd.qcut(draws, 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])
bins

In [None]:
# Descriptive summary of the labelled quantile bins.
bins.describe()

In [None]:
# Integer codes of the first 10 elements.
bins.codes[:10]

In [None]:
# Wrap the bins in a Series named 'quantile'; note that this rebinds `bins` to a Series.
bins = pd.Series(bins, name = 'quantile')
bins

In [None]:
# Per-quantile summary of the draws: count, min and max of each group.
# `observed` is passed explicitly because grouping by a categorical key without it
# triggers a FutureWarning in recent pandas; False keeps every category in the result.
results = pd.Series(draws).groupby(bins, observed=False).agg(['count', 'min', 'max'])
results

In [None]:
# Move the group labels out of the index into a regular column.
results = results.reset_index()
results

In [None]:
# The 'quantile' column of the summary table.
results['quantile']

---

In [None]:
# Categorical Series holding the pattern a, b, c, d twice.
s_cat = pd.Series(['a', 'b', 'c', 'd']*2, dtype = 'category')
s_cat

In [None]:
# Integer codes of the categorical Series.
s_cat.cat.codes

In [None]:
# Create dummy variables for modelling
# (one-hot encoding);
# the result can be compared with s_cat.cat.codes.


pd.get_dummies(s_cat) 