# 分类数据 categories

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load only the columns this notebook works with.
df = pd.read_csv(
    './data/learn_pandas.csv',
    usecols=['Grade', 'Name', 'Gender', 'Height', 'Weight'],
)

In [8]:
# Ask the Series for its dtype directly instead of going through the
# underlying NumPy array (`.values`); for this string column it is object ('O').
df['Grade'].dtype

dtype('O')

In [11]:
# Keep the original object-dtype column and a categorical version of it
# side by side so the two representations can be compared below.
s = df['Grade']
s_cat = s.astype('category')

In [12]:
s

0       Freshman
1       Freshman
2         Senior
3      Sophomore
4      Sophomore
         ...    
195       Junior
196       Senior
197       Senior
198       Senior
199    Sophomore
Name: Grade, Length: 200, dtype: object

In [13]:
s_cat

0       Freshman
1       Freshman
2         Senior
3      Sophomore
4      Sophomore
         ...    
195       Junior
196       Senior
197       Senior
198       Senior
199    Sophomore
Name: Grade, Length: 200, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']

## category 的属性：codes、ordered、categories

In [15]:
s_cat.cat.codes

0      0
1      0
2      2
3      3
4      3
      ..
195    1
196    2
197    2
198    2
199    3
Length: 200, dtype: int8

In [16]:
s_cat.cat.ordered

False

In [17]:
s_cat.cat.categories

Index(['Freshman', 'Junior', 'Senior', 'Sophomore'], dtype='object')

# categories 增加，删除，修改

In [18]:
# add_categories returns a new Series with the extra (unused) categories appended;
# s_cat itself is not modified and still has its original four categories afterwards.
# NOTE(review): "Aluminum" is presumably a typo for "Alumni" — confirm before renaming,
# since the displayed output below depends on this literal.
s_cat.cat.add_categories(["Graduate","Aluminum","Staff"])

0       Freshman
1       Freshman
2         Senior
3      Sophomore
4      Sophomore
         ...    
195       Junior
196       Senior
197       Senior
198       Senior
199    Sophomore
Name: Grade, Length: 200, dtype: category
Categories (7, object): ['Freshman', 'Junior', 'Senior', 'Sophomore', 'Graduate', 'Aluminum', 'Staff']

## 删除类别：使用 remove_categories 或者 remove_unused_categories

## 使用 rename_categories 重命名类别

In [21]:
# Map every existing category label to a new one; the result is a new Series
# and the category order follows the original categories, not the dict order.
s_cat.cat.rename_categories(
    {
        "Freshman": "菜鸟",
        "Sophomore": "新人",
        "Junior": "骨干",
        "Senior": "老油条",
    }
)

0       菜鸟
1       菜鸟
2      老油条
3       新人
4       新人
      ... 
195     骨干
196    老油条
197    老油条
198    老油条
199     新人
Name: Grade, Length: 200, dtype: category
Categories (4, object): ['菜鸟', '骨干', '老油条', '新人']

# 为分类排序

In [24]:
# Rebuild the unordered categorical from the raw column before ordering it.
s_cat = df["Grade"].astype("category")

In [25]:
# Give the grades a meaningful order (lowest to highest year) and mark the
# categorical as ordered so that <, <=, >, >= comparisons become available.
s_cat.cat.reorder_categories(
    ["Freshman", "Sophomore", "Junior", "Senior"],
    ordered=True,
)

0       Freshman
1       Freshman
2         Senior
3      Sophomore
4      Sophomore
         ...    
195       Junior
196       Senior
197       Senior
198       Senior
199    Sophomore
Name: Grade, Length: 200, dtype: category
Categories (4, object): ['Freshman' < 'Sophomore' < 'Junior' < 'Senior']

## 如果你想再次调整顺序，先调用 cat.as_unordered()，
## 再调用 cat.reorder_categories()

In [29]:
# (rows, columns) of the loaded frame. `shape` is already a tuple, so the
# extra `[:]` slice only made a redundant copy.
df.shape

(200, 5)

## 比较 

##     == 
##     != 
##     <= 
##     >=

In [30]:
# Ordered version of the grade column, used for the comparison examples below.
ordered_cat = s_cat.cat.reorder_categories(
    ["Freshman", "Sophomore", "Junior", "Senior"],
    ordered=True,
)

In [35]:
ordered_cat == "Junior"

0      False
1      False
2      False
3      False
4      False
       ...  
195     True
196    False
197    False
198    False
199    False
Name: Grade, Length: 200, dtype: bool

In [36]:
ordered_cat != "Junior"

0       True
1       True
2       True
3       True
4       True
       ...  
195    False
196     True
197     True
198     True
199     True
Name: Grade, Length: 200, dtype: bool

In [37]:
ordered_cat <= "Junior"

0       True
1       True
2      False
3       True
4       True
       ...  
195     True
196    False
197    False
198    False
199     True
Name: Grade, Length: 200, dtype: bool

In [38]:
ordered_cat >= "Junior"

0      False
1      False
2       True
3      False
4      False
       ...  
195     True
196     True
197     True
198     True
199    False
Name: Grade, Length: 200, dtype: bool

# 使用 cut, qcut 进行区间构造

In [42]:
# Twenty integers 1, 6, 11, ..., 96 used as sample data for cut / qcut.
ser1 = pd.Series(np.arange(1, 100, 5))

In [43]:
ser1

0      1
1      6
2     11
3     16
4     21
5     26
6     31
7     36
8     41
9     46
10    51
11    56
12    61
13    66
14    71
15    76
16    81
17    86
18    91
19    96
dtype: int32

In [44]:
# Split the value range of ser1 into 3 equal-width intervals.
pd.cut(ser1, bins=3)

0      (0.905, 32.667]
1      (0.905, 32.667]
2      (0.905, 32.667]
3      (0.905, 32.667]
4      (0.905, 32.667]
5      (0.905, 32.667]
6      (0.905, 32.667]
7     (32.667, 64.333]
8     (32.667, 64.333]
9     (32.667, 64.333]
10    (32.667, 64.333]
11    (32.667, 64.333]
12    (32.667, 64.333]
13      (64.333, 96.0]
14      (64.333, 96.0]
15      (64.333, 96.0]
16      (64.333, 96.0]
17      (64.333, 96.0]
18      (64.333, 96.0]
19      (64.333, 96.0]
dtype: category
Categories (3, interval[float64, right]): [(0.905, 32.667] < (32.667, 64.333] < (64.333, 96.0]]

In [45]:
# Same 3 equal-width bins, but shown with custom labels instead of intervals.
pd.cut(ser1, bins=3, labels=["中杯", "大杯", "超大杯"])

0      中杯
1      中杯
2      中杯
3      中杯
4      中杯
5      中杯
6      中杯
7      大杯
8      大杯
9      大杯
10     大杯
11     大杯
12     大杯
13    超大杯
14    超大杯
15    超大杯
16    超大杯
17    超大杯
18    超大杯
19    超大杯
dtype: category
Categories (3, object): ['中杯' < '大杯' < '超大杯']

In [46]:
# Quantile-based binning: 3 bins holding roughly the same number of values.
pd.qcut(ser1, q=3, labels=["大杯", "超大杯", "巨大杯"])

0      大杯
1      大杯
2      大杯
3      大杯
4      大杯
5      大杯
6      大杯
7     超大杯
8     超大杯
9     超大杯
10    超大杯
11    超大杯
12    超大杯
13    巨大杯
14    巨大杯
15    巨大杯
16    巨大杯
17    巨大杯
18    巨大杯
19    巨大杯
dtype: category
Categories (3, object): ['大杯' < '超大杯' < '巨大杯']

# 一般区间构造

# pd.Interval() 的 closed 参数决定是否包含左右边界：right、left、both、neither

In [48]:
# Closed on both sides: [-1, 1]
my_interval = pd.Interval(left=-1, right=1, closed="both")

In [49]:
-1 in my_interval

True

In [50]:
1 in my_interval

True

In [51]:
# Open on the left, closed on the right: (-5, 5]
sec_interval = pd.Interval(left=-5, right=5, closed="right")

In [52]:
-5 in sec_interval

False

In [53]:
5 in sec_interval

True

In [54]:
sec_interval.overlaps(my_interval)

True

### interval from breaks

In [55]:
# Consecutive break points become adjacent intervals, each closed on both sides.
pd.IntervalIndex.from_breaks([1, 2, 3, 4, 5], closed="both")

IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], dtype='interval[int64, both]')

### interval from arrays

In [56]:
# Pair up left and right endpoints element-wise; intervals are closed on the left.
pd.IntervalIndex.from_arrays(
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
    closed="left",
)

IntervalIndex([[1, 6), [2, 7), [3, 8), [4, 9), [5, 10)], dtype='interval[int64, left]')

### interval from tuples

In [59]:
# Each (left, right) tuple becomes one interval, closed on the right.
pd.IntervalIndex.from_tuples([(1, 3), (5, 9), (1, 10)], closed="right")

IntervalIndex([(1, 3], (5, 9], (1, 10]], dtype='interval[int64, right]')

### interval range

In [60]:
# Five equal-length intervals between 1 and 5 (endpoints come out as floats).
pd.interval_range(start=1, end=5, periods=5)

IntervalIndex([(1.0, 1.8], (1.8, 2.6], (2.6, 3.4000000000000004], (3.4000000000000004, 4.2], (4.2, 5.0]], dtype='interval[float64, right]')

### interval 的属性

In [61]:
# Sample IntervalIndex used to demonstrate the left / right / mid attributes.
intervals = pd.IntervalIndex.from_tuples(
    [(1, 3), (5, 9), (1, 10)],
    closed="right",
)

In [62]:
intervals.left

Int64Index([1, 5, 1], dtype='int64')

In [63]:
intervals.right

Int64Index([3, 9, 10], dtype='int64')

In [64]:
intervals.mid

Float64Index([2.0, 7.0, 5.5], dtype='float64')