In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd

### Pandas 数据生成

In [3]:
s = pd.Series([1, 2, 3, np.nan, 5, 6])
print("\n 生成序列：\n", s)


 生成序列：
 0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
5    6.0
dtype: float64


In [4]:
date = pd.date_range('20190520', periods=6)
print("\n 生成连续日期：\n", date)


 生成连续日期：
 DatetimeIndex(['2019-05-20', '2019-05-21', '2019-05-22', '2019-05-23',
               '2019-05-24', '2019-05-25'],
              dtype='datetime64[ns]', freq='D')


In [5]:
df = pd.DataFrame(np.arange(12).reshape(3, 4))
print("\n 生成随机二维矩阵并设置默认行列标题：\n", df)


 生成随机二维矩阵并设置默认行列标题：
    0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [6]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=date, columns=['a', 'b', 'c', 'd'])
print("\n 生成随机二维矩阵并设置行列标题：\n", df1)


 生成随机二维矩阵并设置行列标题：
                    a         b         c         d
2019-05-20  0.113994  0.098927 -1.126848  1.453379
2019-05-21  0.905360 -0.733587  0.693714  1.245601
2019-05-22 -0.152723 -0.329269 -1.125867 -0.723611
2019-05-23  0.390933 -0.166435  0.294369 -1.120636
2019-05-24  1.958263  1.018412 -0.493184 -0.340827
2019-05-25  0.394073  0.000110 -0.310317 -0.367956


In [7]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})

print("\n 生成指定二维矩阵并设置行列标题：\n", df2)
print("\n 行标题：\n", df2.index)
print("\n 列标题：\n", df2.columns)b
print("\n 值信息：\n", df2.values)
print("\n 数据描述：\n", df2.describe())
print("\n 转置：\n", df2.T)
print("\n 按行逆序排列：\n", df2.sort_index(axis=1, ascending=False))
print("\n 按某一列的值排序：\n", df2.sort_values(by='B'))


 生成指定二维矩阵并设置行列标题：
      A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo

 行标题：
 Int64Index([0, 1, 2, 3], dtype='int64')

 列标题：
 Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

 值信息：
 [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

 数据描述：
          A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0

 转置：
                      0                    1                    2  \
A                    1                    1                    1   
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
C                    1  

### Pandas 数据选择

In [9]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

In [10]:
print("df:", df)

df:                    A         B         C         D
2013-01-01  0.384278  0.563643 -0.064643  1.242331
2013-01-02  0.291294 -0.243147 -0.824511  1.285801
2013-01-03  0.595584  0.080917  0.843154  0.506105
2013-01-04  0.714111 -2.690325  0.698047 -0.138562
2013-01-05 -0.148739 -1.431588 -1.250028 -1.111583
2013-01-06  0.692900  1.264165 -0.415206 -0.757147


#### 根据行列选择

In [11]:
print("\n 选择某一列：\n", df['A'], df.A)
print("\n 选择某一行：\n", df[0:3], df['20130102':'20130104'])


 选择某一列：
 2013-01-01    0.384278
2013-01-02    0.291294
2013-01-03    0.595584
2013-01-04    0.714111
2013-01-05   -0.148739
2013-01-06    0.692900
Freq: D, Name: A, dtype: float64 2013-01-01    0.384278
2013-01-02    0.291294
2013-01-03    0.595584
2013-01-04    0.714111
2013-01-05   -0.148739
2013-01-06    0.692900
Freq: D, Name: A, dtype: float64

 选择某一行：
                    A         B         C         D
2013-01-01  0.384278  0.563643 -0.064643  1.242331
2013-01-02  0.291294 -0.243147 -0.824511  1.285801
2013-01-03  0.595584  0.080917  0.843154  0.506105                    A         B         C         D
2013-01-02  0.291294 -0.243147 -0.824511  1.285801
2013-01-03  0.595584  0.080917  0.843154  0.506105
2013-01-04  0.714111 -2.690325  0.698047 -0.138562


#### 根据标签选择 select by label: loc 

In [12]:
print("\n 以标签的名义选择：\n", df.loc['20130102'])
print("\n 选择所有行 某两列：\n", df.loc[:, ['A', 'B']])
print("\n 选择某一行 某两列：\n", df.loc['20130102', ['A', 'B']])


 以标签的名义选择：
 A    0.291294
B   -0.243147
C   -0.824511
D    1.285801
Name: 2013-01-02 00:00:00, dtype: float64

 选择所有行 某两列：
                    A         B
2013-01-01  0.384278  0.563643
2013-01-02  0.291294 -0.243147
2013-01-03  0.595584  0.080917
2013-01-04  0.714111 -2.690325
2013-01-05 -0.148739 -1.431588
2013-01-06  0.692900  1.264165

 选择某一行 某两列：
 A    0.291294
B   -0.243147
Name: 2013-01-02 00:00:00, dtype: float64


#### 根据位置选择 select by position: iloc 

In [13]:
print("\n 选择某一行数据：\n", df.iloc[3])
print("\n 选择某一行数据某一位：\n", df.iloc[3, 1])
print("\n 切片，选择行列某范围数据：\n", df.iloc[3:5, 0:2])
print("\n 逐个筛选：\n", df.iloc[[1, 2, 4], [0, 2]])


 选择某一行数据：
 A    0.714111
B   -2.690325
C    0.698047
D   -0.138562
Name: 2013-01-04 00:00:00, dtype: float64

 选择某一行数据某一位：
 -2.690324689161776

 切片，选择行列某范围数据：
                    A         B
2013-01-04  0.714111 -2.690325
2013-01-05 -0.148739 -1.431588

 逐个筛选：
                    A         C
2013-01-02  0.291294 -0.824511
2013-01-03  0.595584  0.843154
2013-01-05 -0.148739 -1.250028


#### 位置和标签混合筛选 mixed selection: ix 

In [15]:
print("\n 位置和标签混合筛选：\n", df.ix[:3, ['A', 'C']])


 位置和标签混合筛选：
                    A         C
2013-01-01  0.384278 -0.064643
2013-01-02  0.291294 -0.824511
2013-01-03  0.595584  0.843154


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


#### 比较筛选 Boolean indexing

In [16]:
print("\n 比较筛选：\n", df[df.A > 0])


 比较筛选：
                    A         B         C         D
2013-01-01  0.384278  0.563643 -0.064643  1.242331
2013-01-02  0.291294 -0.243147 -0.824511  1.285801
2013-01-03  0.595584  0.080917  0.843154  0.506105
2013-01-04  0.714111 -2.690325  0.698047 -0.138562
2013-01-06  0.692900  1.264165 -0.415206 -0.757147


### Pandas 设置值

In [17]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

                   A         B         C         D
2013-01-01  0.193269  0.321133  0.564039  0.792025
2013-01-02  0.260250  2.231436 -0.007299 -0.804777
2013-01-03  0.668056  1.713257  0.733534 -1.470216
2013-01-04  0.702817  0.111031 -0.643776 -1.352658
2013-01-05 -0.828930  0.766173  0.269518  0.751904
2013-01-06  0.697877  0.438005 -0.554430  0.468898


#### 根据位置赋值

In [18]:
df.iloc[2, 2] = 1111
print(df)

                   A         B            C         D
2013-01-01  0.193269  0.321133     0.564039  0.792025
2013-01-02  0.260250  2.231436    -0.007299 -0.804777
2013-01-03  0.668056  1.713257  1111.000000 -1.470216
2013-01-04  0.702817  0.111031    -0.643776 -1.352658
2013-01-05 -0.828930  0.766173     0.269518  0.751904
2013-01-06  0.697877  0.438005    -0.554430  0.468898


#### 根据标签赋值

In [19]:
df.loc['2013-01-03', 'D'] = 2222
print(df)

                   A         B            C            D
2013-01-01  0.193269  0.321133     0.564039     0.792025
2013-01-02  0.260250  2.231436    -0.007299    -0.804777
2013-01-03  0.668056  1.713257  1111.000000  2222.000000
2013-01-04  0.702817  0.111031    -0.643776    -1.352658
2013-01-05 -0.828930  0.766173     0.269518     0.751904
2013-01-06  0.697877  0.438005    -0.554430     0.468898


#### 根据运算赋值

In [None]:
df[df.A > 0] = 0  # A列大于0所在的行，所有数值都设置为0
df.A[df.A > 0] = 0  # A列大于0的值设置为0
print(df)

#### 添加空列

In [20]:
df['F'] = np.nan
print(df)

                   A         B            C            D   F
2013-01-01  0.193269  0.321133     0.564039     0.792025 NaN
2013-01-02  0.260250  2.231436    -0.007299    -0.804777 NaN
2013-01-03  0.668056  1.713257  1111.000000  2222.000000 NaN
2013-01-04  0.702817  0.111031    -0.643776    -1.352658 NaN
2013-01-05 -0.828930  0.766173     0.269518     0.751904 NaN
2013-01-06  0.697877  0.438005    -0.554430     0.468898 NaN


#### 添加非空列

In [21]:
df['G'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
print(df)

                   A         B            C            D   F  G
2013-01-01  0.193269  0.321133     0.564039     0.792025 NaN  1
2013-01-02  0.260250  2.231436    -0.007299    -0.804777 NaN  2
2013-01-03  0.668056  1.713257  1111.000000  2222.000000 NaN  3
2013-01-04  0.702817  0.111031    -0.643776    -1.352658 NaN  4
2013-01-05 -0.828930  0.766173     0.269518     0.751904 NaN  5
2013-01-06  0.697877  0.438005    -0.554430     0.468898 NaN  6


### Pandas 处理丢失数据

In [23]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23


#### 按行丢弃数据

In [24]:
print(df.dropna(axis=0, how='any'))  # 一行中有任何一个数据为nan则丢弃一行
print(df.dropna(axis=0, how='all'))  # 一行所有数据为nan则丢弃一行

             A     B     C   D
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23
             A     B     C   D
2013-01-01   0   NaN   2.0   3
2013-01-02   4   5.0   NaN   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23


#### nan数据置为0

In [25]:
print(df.fillna(value=0))  

             A     B     C   D
2013-01-01   0   0.0   2.0   3
2013-01-02   4   5.0   0.0   7
2013-01-03   8   9.0  10.0  11
2013-01-04  12  13.0  14.0  15
2013-01-05  16  17.0  18.0  19
2013-01-06  20  21.0  22.0  23


#### 判断哪一个有缺失值

In [26]:
print(pd.isnull(df))  

                A      B      C      D
2013-01-01  False   True  False  False
2013-01-02  False  False   True  False
2013-01-03  False  False  False  False
2013-01-04  False  False  False  False
2013-01-05  False  False  False  False
2013-01-06  False  False  False  False


#### 判断是否有缺失值

In [27]:
print(np.any(pd.isnull(df))==True)  

True
