# 对官方十分钟入门pandas的学习
[10 Minutes to pandas (official documentation)](http://pandas.pydata.org/pandas-docs/stable/10min.html)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [58]:
# Report the installed pandas version (API details differ between releases).
pd.__version__

'0.20.3'

In [2]:
# Build a Series from a plain list; np.nan marks a missing entry,
# which is why the resulting dtype is float64 rather than an integer type.
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
# Display the Series; the np.nan entry is rendered as NaN.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
# Six consecutive calendar days starting 2018-02-22, used below as the row index.
dates = pd.date_range(start='20180222', periods=6)
dates

DatetimeIndex(['2018-02-22', '2018-02-23', '2018-02-24', '2018-02-25',
               '2018-02-26', '2018-02-27'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# DataFrame: a 6x4 table of standard-normal draws, indexed by `dates`, columns A-D.
# Seed NumPy's global RNG first so that Restart & Run All reproduces the same numbers.
# NOTE: the outputs saved in this notebook were generated before the seed was added;
# re-run the notebook to refresh them.
np.random.seed(0)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2018-02-22,-0.606309,-0.993584,-0.23117,-0.074665
2018-02-23,1.382348,-0.669545,-0.571891,0.836428
2018-02-24,-1.659021,1.594004,0.080914,-1.150539
2018-02-25,-0.516984,-0.607327,-0.111983,0.589295
2018-02-26,-0.696158,0.694692,-1.293918,-0.460379
2018-02-27,0.290481,-0.270417,-1.10742,-0.269404


In [19]:
# Selecting data from a DataFrame
## Selecting columns
### By column label (works when the DataFrame has named columns); returns a Series
df["A"]

2018-02-22   -0.606309
2018-02-23    1.382348
2018-02-24   -1.659021
2018-02-25   -0.516984
2018-02-26   -0.696158
2018-02-27    0.290481
Freq: D, Name: A, dtype: float64

In [24]:
### Select the first column by position (position 0, zero-based)
df.iloc[:, 0]

2018-02-22   -0.606309
2018-02-23    1.382348
2018-02-24   -1.659021
2018-02-25   -0.516984
2018-02-26   -0.696158
2018-02-27    0.290481
Freq: D, Name: A, dtype: float64

In [25]:
### Select the first through third columns (positions 0, 1, 2; the slice stop 3 is excluded)
df.iloc[:, 0:3]

Unnamed: 0,A,B,C
2018-02-22,-0.606309,-0.993584,-0.23117
2018-02-23,1.382348,-0.669545,-0.571891
2018-02-24,-1.659021,1.594004,0.080914
2018-02-25,-0.516984,-0.607327,-0.111983
2018-02-26,-0.696158,0.694692,-1.293918
2018-02-27,0.290481,-0.270417,-1.10742


In [27]:
### Select the first and fourth columns by passing a list of positions
df.iloc[:, [0, 3]]

Unnamed: 0,A,D
2018-02-22,-0.606309,-0.074665
2018-02-23,1.382348,0.836428
2018-02-24,-1.659021,-1.150539
2018-02-25,-0.516984,0.589295
2018-02-26,-0.696158,-0.460379
2018-02-27,0.290481,-0.269404


In [26]:
## Selecting rows
### Select the third row (position 2, zero-based); returns a Series indexed by column name
df.iloc[2]

A   -1.659021
B    1.594004
C    0.080914
D   -1.150539
Name: 2018-02-24 00:00:00, dtype: float64

In [28]:
### Select the first through third rows (positions 0, 1, 2; the slice stop 3 is excluded)
df.iloc[0:3, :]

Unnamed: 0,A,B,C,D
2018-02-22,-0.606309,-0.993584,-0.23117,-0.074665
2018-02-23,1.382348,-0.669545,-0.571891,0.836428
2018-02-24,-1.659021,1.594004,0.080914,-1.150539


In [29]:
### Select the first and third rows by passing a list of positions
df.iloc[[0, 2], :]

Unnamed: 0,A,B,C,D
2018-02-22,-0.606309,-0.993584,-0.23117,-0.074665
2018-02-24,-1.659021,1.594004,0.080914,-1.150539


In [31]:
## Selecting a single value
### Scalar at the second row, second column (positions 1, 1)
df.iat[1, 1]

-0.66954450071749294

In [40]:
## Filtering
### Filter by value: keep rows where column A is greater than -0.5 and less than 1
# Each comparison yields a boolean Series; `&` combines them element-wise.
in_range = (df['A'] > -0.5) & (df['A'] < 1)
df[in_range]

Unnamed: 0,A,B,C,D
2018-02-27,0.290481,-0.270417,-1.10742,-0.269404


In [42]:
### Filtering on string values
### Using isin
### Keep the rows whose E value is exactly 'two' or 'four'
# (isin tests membership in the list, not substring containment)
df2 = df.assign(E=['One', 'one', 'two', 'three', 'four', 'three'])
df2[df2.E.isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2018-02-24,-1.659021,1.594004,0.080914,-1.150539,two
2018-02-26,-0.696158,0.694692,-1.293918,-0.460379,four


## 缺失值的处理

In [52]:
# Build a frame with missing data: keep the first four dates and add a new
# column E, which reindex fills with NaN because df has no such column.
columns_with_e = df.columns.tolist() + ['E']
df1 = df.reindex(index=dates[:4], columns=columns_with_e)
# Position 4 is the new column E; set its first three rows and leave the fourth as NaN.
df1.iloc[0:3, 4] = 1
df1

Unnamed: 0,A,B,C,D,E
2018-02-22,-0.606309,-0.993584,-0.23117,-0.074665,1.0
2018-02-23,1.382348,-0.669545,-0.571891,0.836428,1.0
2018-02-24,-1.659021,1.594004,0.080914,-1.150539,1.0
2018-02-25,-0.516984,-0.607327,-0.111983,0.589295,


In [53]:
### Drop every row that contains at least one missing value (returns a new frame; df1 is unchanged)
df1.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,E
2018-02-22,-0.606309,-0.993584,-0.23117,-0.074665,1.0
2018-02-23,1.382348,-0.669545,-0.571891,0.836428,1.0
2018-02-24,-1.659021,1.594004,0.080914,-1.150539,1.0


In [54]:
### Fill missing values with a constant (returns a new frame; df1 is unchanged)
df1.fillna(5)

Unnamed: 0,A,B,C,D,E
2018-02-22,-0.606309,-0.993584,-0.23117,-0.074665,1.0
2018-02-23,1.382348,-0.669545,-0.571891,0.836428,1.0
2018-02-24,-1.659021,1.594004,0.080914,-1.150539,1.0
2018-02-25,-0.516984,-0.607327,-0.111983,0.589295,5.0


In [56]:
### Boolean mask of missing values: True where the value is NaN, False otherwise
df1.isnull()

Unnamed: 0,A,B,C,D,E
2018-02-22,False,False,False,False,False
2018-02-23,False,False,False,False,False
2018-02-24,False,False,False,False,False
2018-02-25,False,False,False,False,True
