In [2]:
import pandas as pd
import numpy as np


## Series
Series是一种类似于一维数组的对象，组成：
一组数据（各种NumPy数据类型）
一组与之对应的索引（数据标签）
索引(index)在左，数据(values)在右
索引是自动创建的
### 1、通过list构建Series
ser_obj = pd.Series(range(10,30))
不指定索引的话，默认从0开始
范例
print(ser_obj)
print(ser_obj.head(3)) #打印前三行数据
print(type(ser_obj)) #打印数据类型
### 2、 通过dict构建Series
范例
dict = {1000:"hello",2000:"world",3000:"!"}
ser_obj = pd.Series(dict)
print(ser_obj)

In [3]:
ser_obj = pd.Series(range(10,20))
print(ser_obj)
print(type(ser_obj))
print(ser_obj.head(3)) #打印前三行数据
print(ser_obj.tail(3)) #打印后三行数据

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
<class 'pandas.core.series.Series'>
0    10
1    11
2    12
dtype: int64
7    17
8    18
9    19
dtype: int64


In [6]:
dict = {1000:"hello",2000:"world",3000:"!"}
ser_obj = pd.Series(dict)
print(ser_obj)

1000    hello
2000    world
3000        !
dtype: object


### 1. 通过ndarray构建DataFrame
范例


In [11]:
arr_obj = np.random.rand(3,4)
df_obj = pd.DataFrame(arr_obj)
print(df_obj)
print(type(df_obj))
print(df_obj.head(2)) #看前两行

          0         1         2         3
0  0.456275  0.391126  0.633056  0.080891
1  0.544013  0.039482  0.538399  0.824807
2  0.225624  0.489394  0.161644  0.377619
<class 'pandas.core.frame.DataFrame'>
          0         1         2         3
0  0.456275  0.391126  0.633056  0.080891
1  0.544013  0.039482  0.538399  0.824807


### 2. 通过dict构建DataFrame

In [12]:
dict = {
    "A":1,
    "B":pd.Timestamp("20171212"),
    "C":pd.Series(range(10,14),dtype="float64"),
    "D":["python","java","c++","c"],
    "E":np.array([3] *4,dtype="int32"),
    "F":"Qiku"
}
df_obj = pd.DataFrame(dict)
print(df_obj)


   A          B     C       D  E     F
0  1 2017-12-12  10.0  python  3  Qiku
1  1 2017-12-12  11.0    java  3  Qiku
2  1 2017-12-12  12.0     c++  3  Qiku
3  1 2017-12-12  13.0       c  3  Qiku


In [13]:
#dataframe数据访问
#通过索引，先列后行
print(df_obj['D'][0])
#查找指定元素，先找列，在找行
print(df_obj.D[2])

python
c++


## 加载外部csv文件

In [14]:
df = pd.read_csv('data/douban.csv')
print(df)

       title  score                                  info
0     肖申克的救赎    9.6                               希望让人自由。
1       霸王别姬    9.6                                 风华绝代。
2    这个杀手不太冷    9.4                       怪蜀黍和小萝莉不得不说的故事。
3       阿甘正传    9.4                             一部美国近现代史。
4       美丽人生    9.5                                最美的谎言。
..       ...    ...                                   ...
245     荒岛余生    8.5                              一个人的独角戏。
246       勇士    8.9                     热血沸腾，相当完美的娱乐拳击大餐。
247       枪火    8.7  一群演技精湛的戏骨，奉献出一个精致的黑帮小品，成就杜琪峰群戏的巅峰之作。
248     迁徙的鸟    9.1                                最美的飞翔。
249    攻壳机动队    8.9                    上承《银翼杀手》，下启《黑客帝国》。

[250 rows x 3 columns]


In [3]:
df = pd.read_csv('data/douban.csv',usecols=['title','score'])
print(df)

       title  score
0     肖申克的救赎    9.6
1       霸王别姬    9.6
2    这个杀手不太冷    9.4
3       阿甘正传    9.4
4       美丽人生    9.5
..       ...    ...
245     荒岛余生    8.5
246       勇士    8.9
247       枪火    8.7
248     迁徙的鸟    9.1
249    攻壳机动队    8.9

[250 rows x 2 columns]


## Series索引

In [None]:
1. index 指定行索引名
不指定索引的话，默认从0开始

### 1. index 指定行索引名

In [6]:
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
print(ser_obj.head())

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [7]:
# 利用属性修改索引
ser_obj.index = (['A', 'B', 'C', 'D', 'E'])
print(ser_obj)

A    0
B    1
C    2
D    3
E    4
dtype: int64


### 2. 行索引

In [19]:
#行索引
print(ser_obj['b'])
print(ser_obj[2])

1
2


### 3. 切片索引

In [23]:
print(ser_obj[1:3])
print(ser_obj['B':'D'])

B    1
C    2
dtype: int64
B    1
C    2
D    3
dtype: int64


### 4.不连续索引

In [24]:
print(ser_obj[[0, 2, 4]])
print(ser_obj[['A', 'E']])

A    0
C    2
E    4
dtype: int64
A    0
E    4
dtype: int64


### 5. 布尔索引

In [25]:
ser_bool = ser_obj > 2
print(ser_bool)
print(ser_obj[ser_bool])
print(ser_obj[ser_obj > 2])

A    False
B    False
C    False
D     True
E     True
dtype: bool
D    3
E    4
dtype: int64
D    3
E    4
dtype: int64


## DataFrame索引

### 1. columns 指定列索引名

In [9]:
df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])#  指定列索引
print(df_obj.head())

          a         b         c         d
0  0.443089 -1.252251 -0.068372  0.693918
1  0.150004 -0.555430  0.876956  0.584483
2  0.571691  1.870179 -0.993798 -0.117185
3  0.747551  0.024265 -0.025589  1.452942
4  1.031956 -0.789453 -1.482885  1.127424


In [28]:
df_obj.index = (['A', 'B', 'C', 'D', 'E'])#  指定行索引
print(df_obj.head())

          a         b         c         d
A  0.505765 -0.245012 -0.725554  0.873485
B -0.245129 -0.171176 -0.699629  0.797298
C -0.549784 -0.567473  0.362150  0.888390
D -0.131546  0.707825 -0.194642 -0.892236
E -0.189285  0.948868 -0.044856  0.356000


### 2. 列索引

In [31]:
print(df_obj['a']) # 返回Series类型
print(type(df_obj['a']))
print(df_obj['a'][0])

A    0.505765
B   -0.245129
C   -0.549784
D   -0.131546
E   -0.189285
Name: a, dtype: float64
<class 'pandas.core.series.Series'>
0.5057649127830904


### 3. 不连续索引

In [38]:
print(df_obj)
print(df_obj[['a','c']])
# print(df_obj[['a':'c']])# error
# print(df_obj[[1, 3]]) # error
print(df_obj['a'][1:3])
print(df_obj[['a','c']][1:3])


          a         b         c         d
A  0.505765 -0.245012 -0.725554  0.873485
B -0.245129 -0.171176 -0.699629  0.797298
C -0.549784 -0.567473  0.362150  0.888390
D -0.131546  0.707825 -0.194642 -0.892236
E -0.189285  0.948868 -0.044856  0.356000
          a         c
A  0.505765 -0.725554
B -0.245129 -0.699629
C -0.549784  0.362150
D -0.131546 -0.194642
E -0.189285 -0.044856
B   -0.245129
C   -0.549784
Name: a, dtype: float64
          a         c
B -0.245129 -0.699629
C -0.549784  0.362150


### 布尔索引

In [50]:

print(df_obj)

print(df_obj[df_obj['a'] > 0])

          a         b         c         d
A  0.505765 -0.245012 -0.725554  0.873485
B -0.245129 -0.171176 -0.699629  0.797298
C -0.549784 -0.567473  0.362150  0.888390
D -0.131546  0.707825 -0.194642 -0.892236
E -0.189285  0.948868 -0.044856  0.356000
          a         b         c         d
A  0.505765 -0.245012 -0.725554  0.873485


## 高级索引：标签、位置和混合

### 1. loc 标签索引

In [44]:
#Series
print(ser_obj)
print(ser_obj['B':'D'])
print(ser_obj.loc['B':'D']) # 等效


A    0
B    1
C    2
D    3
E    4
dtype: int64
B    1
C    2
D    3
dtype: int64
B    1
C    2
D    3
dtype: int64


In [53]:
# DataFrame 不能直接切片，可以通过loc来做切片
#DataFrame
df_obj.index = (range(5))#  指定行索引

print(df_obj)
print(df_obj['a'])
#第一个参数索引行，第二个参数是列
print(df_obj.loc[0:2, 'a'])
print(df_obj.loc[1:3,['b','c']])
print(df_obj.loc[1:3,'b':'e'])



          a         b         c         d
0  0.505765 -0.245012 -0.725554  0.873485
1 -0.245129 -0.171176 -0.699629  0.797298
2 -0.549784 -0.567473  0.362150  0.888390
3 -0.131546  0.707825 -0.194642 -0.892236
4 -0.189285  0.948868 -0.044856  0.356000
0    0.505765
1   -0.245129
2   -0.549784
3   -0.131546
4   -0.189285
Name: a, dtype: float64
0    0.505765
1   -0.245129
2   -0.549784
Name: a, dtype: float64
          b         c
1 -0.171176 -0.699629
2 -0.567473  0.362150
3  0.707825 -0.194642
          b         c         d
1 -0.171176 -0.699629  0.797298
2 -0.567473  0.362150  0.888390
3  0.707825 -0.194642 -0.892236


### 2. iloc 位置索引

In [14]:
#Series
print(ser_obj)
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])
# print(ser_obj.iloc['B':'D']) # error基于位置索引，不能使用编号

A    0
B    1
C    2
D    3
E    4
dtype: int64
B    1
C    2
dtype: int64
B    1
C    2
dtype: int64


In [13]:
# DataFrame
print(df_obj)
print(df_obj.iloc[0:2, 0])
# print(df_obj[0:2, 0]) # error
print(df_obj.iloc[0:2, 0:2])

          a         b         c         d
0  0.443089 -1.252251 -0.068372  0.693918
1  0.150004 -0.555430  0.876956  0.584483
2  0.571691  1.870179 -0.993798 -0.117185
3  0.747551  0.024265 -0.025589  1.452942
4  1.031956 -0.789453 -1.482885  1.127424
0    0.443089
1    0.150004
Name: a, dtype: float64
          a         b
0  0.443089 -1.252251
1  0.150004 -0.555430


### 3. ix 标签与位置混合索引

In [19]:
#Series
print(ser_obj)
print(ser_obj.ix[1:3])
# print(ser_obj.ix['B':'D'])

A    0
B    1
C    2
D    3
E    4
dtype: int64


AttributeError: 'Series' object has no attribute 'ix'

In [23]:
#DataFrame
print(df_obj)
print(df_obj.loc[0:2, 'a'])
# print(df_obj.ix[0:2, 'a'])

          a         b         c         d
0  0.443089 -1.252251 -0.068372  0.693918
1  0.150004 -0.555430  0.876956  0.584483
2  0.571691  1.870179 -0.993798 -0.117185
3  0.747551  0.024265 -0.025589  1.452942
4  1.031956 -0.789453 -1.482885  1.127424
0    0.443089
1    0.150004
2    0.571691
Name: a, dtype: float64


## 分组和聚合

In [24]:
# 范例
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randn(8),
            'data2': np.random.randn(8)}
df_obj = pd.DataFrame(dict_obj)
print(df_obj)
#分组操作，如果对整个数据集进行分组，groupby参数直接指定列名即可
grouped = df_obj.groupby("key2")
print(grouped) 

  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F8BC333A20>


In [26]:
result =grouped.sum()
print(result)


          data1     data2
key2                     
one    5.593232  0.326011
three  1.942064  1.152670
two    3.495245 -0.621303


In [30]:
print(df_obj)
grouped2 = df_obj["data2"].groupby(df_obj["key1"])
grouped2.mean() #求平均值 


  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296


key1
a    0.232373
b   -0.101496
Name: data2, dtype: float64

In [29]:
print(df_obj)
grouped1 = df_obj.groupby('key1') 
print(grouped1.mean())
print(grouped1.size())

  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296
         data1     data2
key1                    
a     1.320974  0.232373
b     1.475224 -0.101496
key1
a    5
b    3
dtype: int64


In [31]:
print(df_obj.describe())

          data1     data2
count  8.000000  8.000000
mean   1.378818  0.107172
std    0.473161  0.588755
min    0.712416 -0.805121
25%    1.066126 -0.137474
50%    1.391414  0.056308
75%    1.826228  0.242505
max    1.921876  1.287296


### 自定义分组

In [33]:
print(df_obj)
self_def_key =[0,0,1,2,3,3,4,5] # 定义索引和分组

print(df_obj.groupby(self_def_key).size())



  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296
0    2
1    1
2    1
3    2
4    1
5    1
dtype: int64


In [35]:
print(df_obj)
## 定义索引和分组
self_def_key =['group1','group2','group1','group3','group3','group3','group2','group3'] 
grouped3 = df_obj.groupby(self_def_key)
print(df_obj.groupby(self_def_key).size())
print(df_obj.groupby(self_def_key).max())

  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296
group1    2
group2    2
group3    4
dtype: int64
       key1 key2     data1     data2
group1    a  two  1.816777  0.136459
group2    b  one  1.921876  0.213395
group3    b  two  1.440253  1.287296


### GroupBy对象支持迭代操作

In [37]:
#单层分组 - 范例
self_def_key =['group1','group2','group1','group3','group3','group3','group2','group3'] 
grouped3 = df_obj.groupby(self_def_key)
for group_name, group_data in grouped3:
    print(group_name)
    print(group_data)

group1
  key1 key2     data1     data2
0    a  one  1.816777  0.136459
2    a  two  0.712416 -0.805121
group2
  key1 key2     data1     data2
1    b  one  1.921876 -0.023843
6    a  one  1.854579  0.213395
group3
  key1   key2     data1     data2
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
7    a  three  0.780843  1.287296


In [38]:
# 多层分组
# 可以指定多个列，索引顺序按列表里的参数顺序来决定
# 范例
#按多个列多层分组
print(df_obj)
grouped2 = df_obj.groupby(['key1', 'key2'])
print(grouped2.size())

#多层分组，根据key1 和 key2
for group_name, group_data in grouped2:
    print(group_name)
    print(group_data)


  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
('a', 'one')
  key1 key2     data1     data2
0    a  one  1.816777  0.136459
6    a  one  1.854579  0.213395
('a', 'three')
  key1   key2     data1     data2
7    a  three  0.780843  1.287296
('a', 'two')
  key1 key2     data1     data2
2    a  two  0.712416 -0.805121
4    a  two  1.440253  0.329836
('b', 'one')
  key1 key2     data1     data2
1    b  one  1.921876 -0.023843
('b', 'three')
  key1   key2    data1     data2
3    b  three  1.16122 -0.134626
('b', 'two')
  key1 key2     data1     data2
5    b  two  1.342576 -0.146018


In [39]:
### GroupBy对象可以转换成列表
print(list(grouped2))


[(('a', 'one'),   key1 key2     data1     data2
0    a  one  1.816777  0.136459
6    a  one  1.854579  0.213395), (('a', 'three'),   key1   key2     data1     data2
7    a  three  0.780843  1.287296), (('a', 'two'),   key1 key2     data1     data2
2    a  two  0.712416 -0.805121
4    a  two  1.440253  0.329836), (('b', 'one'),   key1 key2     data1     data2
1    b  one  1.921876 -0.023843), (('b', 'three'),   key1   key2    data1     data2
3    b  three  1.16122 -0.134626), (('b', 'two'),   key1 key2     data1     data2
5    b  two  1.342576 -0.146018)]


### 按列分组、数据类型分组

In [42]:
print(df_obj)
print(df_obj.dtypes) #显示有两个float64类型，两个object类型 
#按数据类型分组 
print(df_obj.groupby(df_obj.dtypes, axis=1).size())
print(df_obj.groupby(df_obj.dtypes, axis=1).sum())

  key1   key2     data1     data2
0    a    one  1.816777  0.136459
1    b    one  1.921876 -0.023843
2    a    two  0.712416 -0.805121
3    b  three  1.161220 -0.134626
4    a    two  1.440253  0.329836
5    b    two  1.342576 -0.146018
6    a    one  1.854579  0.213395
7    a  three  0.780843  1.287296
key1      object
key2      object
data1    float64
data2    float64
dtype: object
float64    2
object     2
dtype: int64
    float64  object
0  1.953236    aone
1  1.898033    bone
2 -0.092706    atwo
3  1.026594  bthree
4  1.770089    atwo
5  1.196559    btwo
6  2.067974    aone
7  2.068140  athree


### 通过字典分组

In [52]:
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
df_obj2.loc['B', 1:4] = np.NaN
print(df_obj2)

mapping_dict = {'a':'Python', 'b':'Python', 'c':'Java', 'd':'C', 'e':'Java'}
# print(df_obj2.groupby(mapping_dict, axis=1).size())
print(df_obj2.groupby(mapping_dict, axis=1).count()) # 非NaN的个数
print(df_obj2.groupby(mapping_dict, axis=1).sum())


   a    b    c    d  e
A  1  2.0  1.0  6.0  9
B  8  NaN  NaN  NaN  1
C  9  9.0  7.0  9.0  4
D  1  3.0  2.0  2.0  1
E  1  7.0  3.0  5.0  9
   C  Java  Python
A  1     2       2
B  0     1       1
C  1     2       2
D  1     2       2
E  1     2       2
     C  Java  Python
A  6.0  10.0     3.0
B  0.0   1.0     8.0
C  9.0  11.0    18.0
D  2.0   3.0     4.0
E  5.0  12.0     8.0


In [48]:
print(df_obj2)

mapping_dict = {'A':'Python', 'B':'Python', 'C':'Java', 'D':'C', 'E':'Java'}
print(df_obj2.groupby(mapping_dict, axis=0).size())
print(df_obj2.groupby(mapping_dict, axis=0).count()) # 非NaN的个数
print(df_obj2.groupby(mapping_dict, axis=0).sum())

   a    b    c    d  e
A  3  6.0  2.0  2.0  9
B  9  NaN  NaN  NaN  8
C  3  3.0  9.0  6.0  5
D  8  5.0  5.0  4.0  5
E  7  6.0  3.0  8.0  3
C         1
Java      2
Python    2
dtype: int64
        a  b  c  d  e
C       1  1  1  1  1
Java    2  2  2  2  2
Python  2  1  1  1  2
         a    b     c     d   e
C        8  5.0   5.0   4.0   5
Java    10  9.0  12.0  14.0   8
Python  12  6.0   2.0   2.0  17


## 聚合

In [9]:
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'], 
            'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 
            'data1': np.random.randint(1,10, 8), 
            'data2': np.random.randint(1,10, 8)} 
df_obj5 = pd.DataFrame(dict_obj) 
print(df_obj5)
print(type(df_obj5))
print(df_obj5.groupby('key1').sum())
print(df_obj5.groupby('key1').max())
print(df_obj5.groupby('key1').min())
print(df_obj5.groupby('key1').mean())
print(df_obj5.groupby('key1').size())
print(df_obj5.groupby('key1').count())
print(df_obj5.groupby('key1').describe())


  key1   key2  data1  data2
0    a    one      1      7
1    b    one      7      1
2    a    two      1      7
3    b  three      7      5
4    a    two      8      6
5    b    two      5      1
6    a    one      5      6
7    a  three      1      1
<class 'pandas.core.frame.DataFrame'>
      data1  data2
key1              
a        16     27
b        19      7
     key2  data1  data2
key1                   
a     two      8      7
b     two      7      5
     key2  data1  data2
key1                   
a     one      1      1
b     one      5      1
         data1     data2
key1                    
a     3.200000  5.400000
b     6.333333  2.333333
key1
a    5
b    3
dtype: int64
      key2  data1  data2
key1                    
a        5      5      5
b        3      3      3
     data1                                              data2            \
     count      mean       std  min  25%  50%  75%  max count      mean   
key1                                                        

### 自定义聚合函数

In [12]:
print(df_obj5)
def func(num): #最大值和最小值的差
    return num.max() - num.min() 
print(df_obj5.groupby("key2").agg(func))
print(df_obj5.groupby("key2").agg(lambda df:df.max()-df.min()))

  key1   key2  data1  data2
0    a    one      1      7
1    b    one      7      1
2    a    two      1      7
3    b  three      7      5
4    a    two      8      6
5    b    two      5      1
6    a    one      5      6
7    a  three      1      1
       data1  data2
key2               
one        6      6
three      6      4
two        7      6
       data1  data2
key2               
one        6      6
three      6      4
two        7      6


### 多个聚合函数的调用

In [13]:
print(df_obj5)

print(df_obj5.groupby("key2").agg(['sum','mean','max',func]))


  key1   key2  data1  data2
0    a    one      1      7
1    b    one      7      1
2    a    two      1      7
3    b  three      7      5
4    a    two      8      6
5    b    two      5      1
6    a    one      5      6
7    a  three      1      1
      data1                    data2                   
        sum      mean max func   sum      mean max func
key2                                                   
one      13  4.333333   7    6    14  4.666667   7    6
three     8  4.000000   7    6     6  3.000000   5    4
two      14  4.666667   8    7    14  4.666667   7    6


### 对不同列使用不同聚合函数


In [16]:
print(df_obj5)
dict_mapping = {'data1':'mean','data2':'sum'}
print(df_obj5.groupby('key1').agg(dict_mapping))
dict_mapping = {'data1':'mean','data2':['sum','mean',func]}
print(df_obj5.groupby('key1').agg(dict_mapping))

  key1   key2  data1  data2
0    a    one      1      7
1    b    one      7      1
2    a    two      1      7
3    b  three      7      5
4    a    two      8      6
5    b    two      5      1
6    a    one      5      6
7    a  three      1      1
         data1  data2
key1                 
a     3.200000     27
b     6.333333      7
         data1 data2               
          mean   sum      mean func
key1                               
a     3.200000    27  5.400000    6
b     6.333333     7  2.333333    4


In [18]:
#按key1分组后，计算data1，data2的统计信息并附加到原始表格中，并添加表头前缀
k1_sum = df_obj5.groupby('key1').sum().add_prefix('sum_')
print(k1_sum)


      sum_data1  sum_data2
key1                      
a            16         27
b            19          7


# Pandas的排序

## 1. 索引排序

### Series

In [20]:
ser_obj2 = pd.Series(range(10,15),index=np.random.randint(5,size=5))
print(ser_obj2)

4    10
1    11
3    12
4    13
1    14
dtype: int64


In [22]:
print(ser_obj2.sort_index()) # 默认升序排序
print(ser_obj2.sort_index(ascending=False))

1    11
1    14
3    12
4    10
4    13
dtype: int64
4    10
4    13
3    12
1    11
1    14
dtype: int64


### DataFrame

In [28]:
#对DataFrame操作时注意轴方向，默认列，axis = 1 为行
df_obj = pd.DataFrame(np.random.randn(3, 5), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(5, size=5))
print(df_obj )
# 列索引降序排序
df_obj_isort = df_obj.sort_index(axis=1, ascending=False)
print(df_obj_isort)
# 行索引升序排序 （默认就是这种排序）等同 df_obj.sort_index()
df_obj_isort = df_obj.sort_index(axis=0)
print(df_obj_isort)

          3         2         2         3         1
1  2.051069 -1.662379 -0.564714 -0.251901 -1.176514
0  0.428163  1.042522 -0.907377  0.709390  0.761151
2  1.551054  0.334315 -0.240035 -1.043763  1.487060
          3         3         2         2         1
1  2.051069 -0.251901 -1.662379 -0.564714 -1.176514
0  0.428163  0.709390  1.042522 -0.907377  0.761151
2  1.551054 -1.043763  0.334315 -0.240035  1.487060
          3         2         2         3         1
0  0.428163  1.042522 -0.907377  0.709390  0.761151
1  2.051069 -1.662379 -0.564714 -0.251901 -1.176514
2  1.551054  0.334315 -0.240035 -1.043763  1.487060


## 2.按值排序

sort_values(by='column name')
根据某个唯一的列名进行排序，如果有其他相同列名则报错


### Series对象 


In [29]:
#创建随机Series对象
ser_obj = pd.Series(np.random.randint(10,20,size= 10))
print(ser_obj)
print(ser_obj.sort_values()) #默认升序
print(ser_obj.sort_values(ascending = False)) #降序

0    18
1    12
2    10
3    17
4    17
5    13
6    16
7    11
8    12
9    14
dtype: int32
2    10
7    11
1    12
8    12
5    13
9    14
6    16
3    17
4    17
0    18
dtype: int32
0    18
4    17
3    17
6    16
9    14
5    13
8    12
1    12
7    11
2    10
dtype: int32


###  DataFrame对象


In [31]:
# sort_values(by='column name') by:'列/行名'
#如果根据某一个行名/列名来排序时，
# 要保证没有其他相同的行名/列名，axis 指定排序的轴方向
df4 = pd.DataFrame(np.random.randn(3, 5), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(5, size=5))
print(df4)


          3         3         0         2         1
0  0.057544 -0.572330 -1.329857  0.933553  0.077042
0 -0.249510  0.701981 -0.352544 -0.294493 -0.261373
1 -1.669200  1.218625 -0.685515  0.517565 -0.233256


In [32]:
print(df4.sort_values(by=0)) # 列名必须唯一
print(df4.sort_values(by=1,axis=1))

          3         3         0         2         1
0  0.057544 -0.572330 -1.329857  0.933553  0.077042
1 -1.669200  1.218625 -0.685515  0.517565 -0.233256
0 -0.249510  0.701981 -0.352544 -0.294493 -0.261373
          3         0         1         2         3
0  0.057544 -1.329857  0.077042  0.933553 -0.572330
0 -0.249510 -0.352544 -0.261373 -0.294493  0.701981
1 -1.669200 -0.685515 -0.233256  0.517565  1.218625


# 数据清洗
## 处理缺失数据

In [33]:
df_obj = pd.DataFrame([
    [1,2,np.nan,np.nan],
    [np.nan,3,4,np.nan],
    list(range(4))
]
)
# 如果某一列中有一个数是Nan或者浮点数，其他数都转换为浮点数
print(df_obj)


     0  1    2    3
0  1.0  2  NaN  NaN
1  NaN  3  4.0  NaN
2  0.0  1  2.0  3.0


In [35]:
# 判断缺失值
print(df_obj.isnull())

       0      1      2      3
0  False  False   True   True
1   True  False  False   True
2  False  False  False  False


### 删除缺失值

In [38]:
print(df_obj)
# 删除缺失值所在的行
print(df_obj.dropna()) 
# 删除缺失值所在的列
print(df_obj.dropna(axis=1)) 

     0  1    2    3
0  1.0  2  NaN  NaN
1  NaN  3  4.0  NaN
2  0.0  1  2.0  3.0
     0  1    2    3
2  0.0  1  2.0  3.0
   1
0  2
1  3
2  1


### 填充缺失数据

In [39]:
print(df_obj)
print(df_obj.fillna(0))


     0  1    2    3
0  1.0  2  NaN  NaN
1  NaN  3  4.0  NaN
2  0.0  1  2.0  3.0
     0  1    2    3
0  1.0  2  0.0  0.0
1  0.0  3  4.0  0.0
2  0.0  1  2.0  3.0


## 数据去重

In [40]:
df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4, 
                       'data2' : np.random.randint(0, 4, 8)})
print(df_obj)


  data1  data2
0     a      0
1     a      3
2     a      2
3     a      0
4     b      0
5     b      3
6     b      0
7     b      1


In [41]:
#是否有重复行，和之前的行进行对比判断，如果有重复，则返回true，否则 false
print(df_obj.duplicated())

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7    False
dtype: bool


## 过滤重复行

In [42]:
print(df_obj)
# 去重重复行，每一列都相同
print(df_obj.drop_duplicates())


  data1  data2
0     a      0
1     a      3
2     a      2
3     a      0
4     b      0
5     b      3
6     b      0
7     b      1
  data1  data2
0     a      0
1     a      3
2     a      2
4     b      0
5     b      3
7     b      1


In [43]:
print(df_obj)
# 去重重复行，指定列相同
print(df_obj.drop_duplicates("data2")) 


  data1  data2
0     a      0
1     a      3
2     a      2
3     a      0
4     b      0
5     b      3
6     b      0
7     b      1
  data1  data2
0     a      0
1     a      3
2     a      2
7     b      1


## 根据map传入的函数对每行或每列进行转换


In [44]:
# Series根据map传入的函数对每行或每列进行转换
# 范例
ser_obj = pd.Series(np.random.randint(0,10,10))
print(ser_obj) 
print(ser_obj.map(lambda x : x ** 2))

0    6
1    9
2    7
3    2
4    9
5    7
6    9
7    5
8    8
9    2
dtype: int32
0    36
1    81
2    49
3     4
4    81
5    49
6    81
7    25
8    64
9     4
dtype: int64


## 数据替换


In [47]:
# replace根据值的内容进行替换
# 接收两个参数，第一个参数的值替换为第二个参数的值
ser_obj = pd.Series(np.random.randint(0,10,10))
print(ser_obj)
# 0    1
# 1    1
# 2    7
# 3    3
# 4    0
# 5    6
# 6    2
# 7    4
# 8    6
# 9    8
# dtype: int32

0    1
1    1
2    7
3    3
4    0
5    6
6    2
7    4
8    6
9    8
dtype: int32


In [48]:
print(ser_obj.replace(5,100)) #把所有5替换为100
print(ser_obj.replace([4,5],100)) # 多个值替换一个值
print(ser_obj.replace([4, 7], [-100, -200]))# 多个值替换多个值

0    1
1    1
2    7
3    3
4    0
5    6
6    2
7    4
8    6
9    8
dtype: int32
0      1
1      1
2      7
3      3
4      0
5      6
6      2
7    100
8      6
9      8
dtype: int32
0      1
1      1
2   -200
3      3
4      0
5      6
6      2
7   -100
8      6
9      8
dtype: int64


# Pandas的函数应用
## 1. 可直接使用NumPy的函数

In [49]:
# 在numpy里，可以对数组里的每一个元素进行操作的函数称为
# ufunc 通用函数（universal function）
# 在Pandas里可以直接使用Numpy的ufunc
# 范例
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)
print(np.abs(df)) # 求绝对值


          0         1         2         3
0 -1.362358 -2.295209  0.312203 -1.414287
1 -0.733339 -1.263318  0.000027  0.119743
2  0.854483 -1.919220 -0.623614 -0.903995
3 -0.847397  0.269649  1.397216 -1.086830
4 -1.151375 -2.398277 -2.828165  0.042131
          0         1         2         3
0  1.362358  2.295209  0.312203  1.414287
1  0.733339  1.263318  0.000027  0.119743
2  0.854483  1.919220  0.623614  0.903995
3  0.847397  0.269649  1.397216  1.086830
4  1.151375  2.398277  2.828165  0.042131


## 2. 通过apply将函数应用到列或行上

In [50]:
# axis参数可以指定轴向，默认值为0，方向是列，值为1：方向是行
# 范例
f = lambda x : x.max()
print(df)
print(df.apply(lambda x : x.max()))
print(df.apply(lambda x : x.max(), axis=1))


          0         1         2         3
0 -1.362358 -2.295209  0.312203 -1.414287
1 -0.733339 -1.263318  0.000027  0.119743
2  0.854483 -1.919220 -0.623614 -0.903995
3 -0.847397  0.269649  1.397216 -1.086830
4 -1.151375 -2.398277 -2.828165  0.042131
0    0.854483
1    0.269649
2    1.397216
3    0.119743
dtype: float64
0    0.312203
1    0.119743
2    0.854483
3    1.397216
4    0.042131
dtype: float64



## 3. 通过applymap将函数应用到每个数据上，只用于DataFrame

In [53]:
print(df)
f1 = lambda x : '%.2f' % x
print(df.applymap(f1))
f2 = lambda x: x+x
print(df.applymap(f2))

# DataFrame对象可以用apply()和applymap()
# apply()应用于行，列，可以通过axis来指定

          0         1         2         3
0 -1.362358 -2.295209  0.312203 -1.414287
1 -0.733339 -1.263318  0.000027  0.119743
2  0.854483 -1.919220 -0.623614 -0.903995
3 -0.847397  0.269649  1.397216 -1.086830
4 -1.151375 -2.398277 -2.828165  0.042131
       0      1      2      3
0  -1.36  -2.30   0.31  -1.41
1  -0.73  -1.26   0.00   0.12
2   0.85  -1.92  -0.62  -0.90
3  -0.85   0.27   1.40  -1.09
4  -1.15  -2.40  -2.83   0.04
          0         1         2         3
0 -2.724716 -4.590418  0.624406 -2.828573
1 -1.466678 -2.526637  0.000055  0.239486
2  1.708967 -3.838439 -1.247229 -1.807990
3 -1.694794  0.539299  2.794432 -2.173660
4 -2.302750 -4.796555 -5.656330  0.084262
