# categorical Data

In [1]:
import pandas as pd
import numpy as np


In [2]:
s1 = pd.Series(['apple','orange','apple'] * 2)
s1.unique()

array(['apple', 'orange'], dtype=object)

In [4]:
s1.value_counts() # 或者
pd.value_counts(s1)

apple     4
orange    2
dtype: int64

In [17]:
s2 = pd.Series(['apple','orange'])
s3 = pd.Series([0,1,0,1,1]) 
s4 = s2.take(s3) # 用take方法

In [7]:
fruits = ['apple','orange','apple','apple'] * 2
num = len(fruits)
df1 = pd.DataFrame({'fruits':fruits,
                    'basket_id':np.arange(num),
                    'count':np.random.randint(3,15,size = num),
                    'columns':np.random.uniform(0,4,size = num)})

In [48]:
s2 = df1['fruits'].astype('category')
s2

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]

In [10]:
type(s2.values)

pandas.core.categorical.Categorical

In [12]:
s2.values.codes # 和s3的作用很像

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [13]:
s2.values.categories

Index(['apple', 'orange'], dtype='object')

In [22]:
s5 = pd.Categorical(['foo','bar','baz','foo','bar']) # 还有一个方法
s5

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

In [21]:
pd.Categorical.from_codes(codes = [0,1,2,1,2,2,0],
                          categories = ['foo','bar','baz'],
                          ordered = True) # ordered 是否有序

[foo, bar, baz, bar, baz, baz, foo]
Categories (3, object): [foo < bar < baz]

In [23]:
s5.as_ordered() # 和上面ordered  = True 的结果一样

[foo, bar, baz, foo, bar]
Categories (3, object): [bar < baz < foo]

In [25]:
np.random.seed(123)
s6 = np.random.normal(0,1,size = 1000)
bins = pd.qcut(s6,4)
bins

[(-3.232, -0.685], (0.669, 3.572], (-0.0412, 0.669], (-3.232, -0.685], (-0.685, -0.0412], ..., (-0.0412, 0.669], (0.669, 3.572], (-3.232, -0.685], (-0.0412, 0.669], (-3.232, -0.685]]
Length: 1000
Categories (4, interval[float64]): [(-3.232, -0.685] < (-0.685, -0.0412] < (-0.0412, 0.669] < (0.669, 3.572]]

In [26]:
bins = pd.qcut(s6,4,labels = ['Q1','Q2','Q3','Q4'])
bins

[Q1, Q4, Q3, Q1, Q2, ..., Q3, Q4, Q1, Q3, Q1]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [28]:
bins.codes[:10]

array([0, 3, 2, 0, 1, 3, 0, 1, 3, 0], dtype=int8)

In [29]:
bins.categories

Index(['Q1', 'Q2', 'Q3', 'Q4'], dtype='object')

In [33]:
#bins = pd.Series(bins,name = 'quartile')
result = pd.Series(s6).groupby(bins).aggregate(['count','min','max'])
result.reset_index()['index']

0    Q1
1    Q2
2    Q3
3    Q4
Name: index, dtype: category
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [39]:
N = 10 ** 6
draws2 = np.random.randn(N)
labels = pd.Series(['foo','bar','baz','qux'] * (N // 4))

In [40]:
labels.memory_usage()

8000080

In [41]:
labels.astype('category').memory_usage() 
# 为什么它内存使用的少？我的猜想是：
# category由两部分组成，一部分是三个字符串，另一部分全部都是数字
# 就像书上说的：
# underlying algorithms use the integer-based codes array instead of 
# an array of strings

1000272

In [54]:
type(s2.cat.codes) # 生成的仍然是series
s2.values.codesn # 生成的是array

pandas.core.series.Series

In [59]:
actual_categories = ['apple','orange','c','d','e']
s2.cat.set_categories(actual_categories).value_counts() # 虽然结果没有变，但是一用这个

apple     6
orange    2
e         0
d         0
c         0
dtype: int64

In [62]:
s4 = s2[s2.isin(['apple'])]
s4

0    apple
2    apple
3    apple
4    apple
6    apple
7    apple
Name: fruits, dtype: category
Categories (2, object): [apple, orange]

In [63]:
s4.cat.remove_unused_categories()

0    apple
2    apple
3    apple
4    apple
6    apple
7    apple
dtype: category
Categories (1, object): [apple]

In [68]:
pd.get_dummies(s2)

Unnamed: 0,apple,orange
0,1,0
1,0,1
2,1,0
3,1,0
4,1,0
5,0,1
6,1,0
7,1,0


# advanced groupby use

In [70]:
df2 = pd.DataFrame({'key':['a','b','c'] * 4,
                    'values':np.arange(12)})

In [71]:
group = df2.groupby('key')

In [72]:
group.mean()

Unnamed: 0_level_0,values
key,Unnamed: 1_level_1
a,4.5
b,5.5
c,6.5


In [73]:
group.transform(lambda x :x.mean()) # 直接用下面的方法：

Unnamed: 0,values
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [76]:
group.transform('std')

Unnamed: 0,values
0,3.872983
1,3.872983
2,3.872983
3,3.872983
4,3.872983
5,3.872983
6,3.872983
7,3.872983
8,3.872983
9,3.872983


In [78]:
group.transform(lambda x: x* 2)

Unnamed: 0,values
0,0
1,2
2,4
3,6
4,8
5,10
6,12
7,14
8,16
9,18


In [79]:
group.transform(lambda x : x.rank(ascending = False))

Unnamed: 0,values
0,4.0
1,4.0
2,4.0
3,3.0
4,3.0
5,3.0
6,2.0
7,2.0
8,2.0
9,1.0


In [81]:
def normalize(x):
    return (x - x.mean*()) / x.std()

group.transform(normalize) # 这里和书上一样，但是报错了

TypeError: Transform function invalid for data types

In [82]:
group.apply(normalize)

TypeError: can't multiply sequence by non-int of type 'method'

In [91]:
df2[['values']] - group.transform('mean') # 这时用df2['values']不行，因为

Unnamed: 0,values
0,-4.5
1,-4.5
2,-4.5
3,-1.5
4,-1.5
5,-1.5
6,1.5
7,1.5
8,1.5
9,4.5


In [93]:
(df2[['values']] - group.transform('mean')) / group.transform('std')

Unnamed: 0,values
0,-1.161895
1,-1.161895
2,-1.161895
3,-0.387298
4,-0.387298
5,-0.387298
6,0.387298
7,0.387298
8,0.387298
9,1.161895


In [94]:
N2 = 15
time = pd.date_range('2018-5-7 11:00',freq = '1min',periods = N2)
df3 = pd.DataFrame({'time':time,'values':np.arange(N2)})
df3

Unnamed: 0,time,values
0,2018-05-07 11:00:00,0
1,2018-05-07 11:01:00,1
2,2018-05-07 11:02:00,2
3,2018-05-07 11:03:00,3
4,2018-05-07 11:04:00,4
5,2018-05-07 11:05:00,5
6,2018-05-07 11:06:00,6
7,2018-05-07 11:07:00,7
8,2018-05-07 11:08:00,8
9,2018-05-07 11:09:00,9


In [97]:
df3.set_index('time').resample('5min').count()

Unnamed: 0_level_0,values
time,Unnamed: 1_level_1
2018-05-07 11:00:00,5
2018-05-07 11:05:00,5
2018-05-07 11:10:00,5


In [125]:
df4 = pd.DataFrame({'time':time.repeat(3),
                   'key':np.tile(['a','b','c'],N2),
                   'values':np.arange(N2 * 3)})

In [104]:
df4.set_index('time').groupby(['key',pd.TimeGrouper('5min')]).sum().reset_index()
# 用TimeGrouper时，索引必须时时间

Unnamed: 0,key,time,values
0,a,2018-05-07 11:00:00,30
1,a,2018-05-07 11:05:00,105
2,a,2018-05-07 11:10:00,180
3,b,2018-05-07 11:00:00,35
4,b,2018-05-07 11:05:00,110
5,b,2018-05-07 11:10:00,185
6,c,2018-05-07 11:00:00,40
7,c,2018-05-07 11:05:00,115
8,c,2018-05-07 11:10:00,190


In [119]:
# df4 = df4[df4['values'] < 23]
df4['demeaned'] = df4['values'] - df4['values'].mean()
df4.groupby('key').demeaned.std()

key
a    13.416408
b    13.416408
c    13.416408
Name: demeaned, dtype: float64

In [127]:
df5 = df4.copy()
# df5['k'] = 1
# df5 = df4.assign(k = 1)
# df5.assign()

In [130]:
df5.assign(demeaned = df5['values'] - df5['values'].mean()).groupby('key').demeaned.std()  
# demeaned 这个变量时临时存在的，如下：

key
a    13.416408
b    13.416408
c    13.416408
Name: demeaned, dtype: float64

In [133]:
df5.sample(4)

Unnamed: 0,key,time,values
3,a,2018-05-07 11:01:00,3
11,c,2018-05-07 11:03:00,11
41,c,2018-05-07 11:13:00,41
13,b,2018-05-07 11:04:00,13


In [141]:
df4[lambda x :x['values'] < 100] \
.assign(demeaned = lambda x:x['values'] - x['values'].mean())\
.groupby('key').demeaned.std()    # 一口气读完  好长

key
a    13.416408
b    13.416408
c    13.416408
Name: demeaned, dtype: float64

In [144]:
def group_demean(df,by,cols):
    result = df.copy()
    g = result.groupby(by)
    for c in cols:
        result[c] = df[c] - g[c].transform('mean')
    return result

df4[df4['values'] < 100].pipe(group_demean,['key'],['values'])

Unnamed: 0,key,time,values
0,a,2018-05-07 11:00:00,-21
1,b,2018-05-07 11:00:00,-21
2,c,2018-05-07 11:00:00,-21
3,a,2018-05-07 11:01:00,-18
4,b,2018-05-07 11:01:00,-18
5,c,2018-05-07 11:01:00,-18
6,a,2018-05-07 11:02:00,-15
7,b,2018-05-07 11:02:00,-15
8,c,2018-05-07 11:02:00,-15
9,a,2018-05-07 11:03:00,-12
