### pandas Python Data Analysis Library

#### 数据分析三剑客  pandas numpy matplotlib

Series 是一种类似一维数组的对象， 
- index : 相关数据索引标签
- values : 一组数据

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
from pandas import Series, DataFrame

In [15]:
s1 = Series([1, 2, 3, 4])

In [16]:
s1

0    1
1    2
2    3
3    4
dtype: int64

In [17]:
s1.index  # 索引

RangeIndex(start=0, stop=4, step=1)

In [19]:
s1.values  # 所有的值， 是一个数组

array([1, 2, 3, 4], dtype=int64)

In [20]:
s2 = Series(np.random.randint(1, 100, size=(2, 4)))

Exception: Data must be 1-dimensional

### 通过 index 设置索引

- 指定索引，注意索引的个数要和值的个数是对应的

In [29]:
s2 =Series([1, 2, 3], index=['a', 'b', 'c'])

### 由字典来创建

In [23]:
s = {'a': 1, 'b': 2, 'c': 3}
ss = Series(s)

In [24]:
type(ss)

pandas.core.series.Series

In [43]:
s3 = Series([150, 140, 120, 123], index=["语文","数学","英语","综合"])

### 索引和切片

(1) 显示索引
- 使用 index 中的元素作为索引
- 使用 .loc 作为索引

In [31]:
s2

a    1
b    2
c    3
dtype: int64

In [32]:
s2.loc["a"]

1

In [34]:
s2["b"]

2

In [35]:
s2.loc[["a", "b"]]

a    1
b    2
dtype: int64

In [36]:
s2.loc[["c"]]

c    3
dtype: int64

In [37]:
s2[["b"]]

b    2
dtype: int64

### 隐式索引，获取元素

In [38]:
s2.iloc[0]  # positional access; s2 is labelled 'a'/'b'/'c', so an integer key must go through .iloc

1

In [40]:
s2.iloc[[0]]  # indexlocation  通过索引号定位元素

a    1
dtype: int64

### 切片

In [44]:
s3

语文    150
数学    140
英语    120
综合    123
dtype: int64

In [45]:
s3[0:2]

语文    150
数学    140
dtype: int64

In [46]:
s3.loc["数学"]

140

In [47]:
s3["数学"]

140

In [49]:
s3[1]  # NOTE(review): integer key on a string-labelled index is treated as a position here; newer pandas is believed to deprecate this — confirm, and prefer s3.iloc[1]

140

In [50]:
s3.iloc[1]

140

### 切片
### 常用的属性和方法

shape size index values
- shape 形状，返回一个元组；Series 是一维的，例如 `(4,)`
- size 元素的个数
- index  索引
- values 值

In [52]:
s1.shape

(4,)

### head()  tail()

- head()  取前面几个值  默认值是 5
- tail() 取后面的几个值  默认是5

In [54]:
s1

0    1
1    2
2    3
3    4
dtype: int64

In [55]:
s1.head()
s1.tail()

0    1
1    2
2    3
3    4
dtype: int64

### numpy 数组中如果有 None 或 nan，就无法直接求和得到有效结果

#### Series 的 sum() 会忽略 None 和 nan 进行求和

- isnull() 内部元素是 nan 则返回 True 
- notnull()  不是 nan 返回 True

In [69]:
n1 = np.array([1, 2, 3, 4, 45, np.nan, None])


In [70]:
s3 = Series([1, 2, 3, 4, 45, np.nan, None])

In [71]:
s3.sum()

55.0

In [72]:
s3

0     1.0
1     2.0
2     3.0
3     4.0
4    45.0
5     NaN
6     NaN
dtype: float64

In [73]:
s3.isnull()

0    False
1    False
2    False
3    False
4    False
5     True
6     True
dtype: bool

### 对数据进行清洗， 剔除空值


In [75]:
s3

0     1.0
1     2.0
2     3.0
3     4.0
4    45.0
5     NaN
6     NaN
dtype: float64

In [77]:
condition = s3.isnull()

In [82]:
s3[condition] = 0

In [83]:
s3

0     1.0
1     2.0
2     3.0
3     4.0
4    45.0
5     0.0
6     0.0
dtype: float64

### name 属性

In [89]:
s3 = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"], name="a")

In [90]:
s4 = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"], name="b")

In [91]:
s3

a    1
b    2
c    3
d    4
e    5
Name: a, dtype: int64

In [92]:
s4

a    1
b    2
c    3
d    4
e    5
Name: b, dtype: int64

In [93]:
s4 + 50

a    51
b    52
c    53
d    54
e    55
Name: b, dtype: int64

In [94]:
s4 * 2

a     2
b     4
c     6
d     8
e    10
Name: b, dtype: int64

### 运算

- 是根据对应的索引来查找对应的位置进行运算

In [96]:
s4 + s3

a     2
b     4
c     6
d     8
e    10
dtype: int64

In [97]:
s3

a    1
b    2
c    3
d    4
e    5
Name: a, dtype: int64

In [98]:
s1

0    1
1    2
2    3
3    4
dtype: int64

In [99]:
s1 + s3

0   NaN
1   NaN
2   NaN
3   NaN
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
dtype: float64

In [101]:
s1.add(s3, fill_value=0)

0    1.0
1    2.0
2    3.0
3    4.0
a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
dtype: float64

### 作业

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame

In [7]:
# Length-10 vector of zeros whose element at position 4 is set to one.
ndarray1 = np.zeros(10)
ndarray1[4] = 1
ndarray1

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

In [10]:
ndarray2 = np.arange(10, 50)
ndarray2[::-1]

array([49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
       32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
       15, 14, 13, 12, 11, 10])

In [20]:
# Random 10x10 matrix; its extrema are evaluated here but, not being the last
# expression of the cell, they are not displayed.
ndarray4 = np.random.random((10, 10))
ndarray4.max()
ndarray4.min()

# 10x10 zero matrix with a border of ones: first/last row, then first/last column.
ndarray5 = np.zeros((10, 10))
ndarray5[0] = ndarray5[-1] = 1
ndarray5[:, 0] = ndarray5[:, -1] = 1

ndarray5

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [21]:
nd = np.random.randint(1, 100, size=(5, 5))

nd

array([[78, 86, 89, 79, 90],
       [90,  6,  3, 99, 44],
       [78, 84, 92, 54, 30],
       [57, 48, 48, 92, 19],
       [96,  6, 42, 98,  9]])

In [24]:
nd[[0, -1]]

array([[78, 86, 89, 79, 90],
       [96,  6, 42, 98,  9]])

In [23]:
nd[[0, -1], [0, -1]]

array([78,  9])

In [29]:
nd[[0, -1], [0, -1]]

array([78,  9])

In [33]:
# Broadcasting: a length-5 row of zeros plus a 5x1 column holding 0..4 yields a
# 5x5 matrix whose row i is filled with the value i.
n1 = np.zeros(5)
n2 = np.arange(5)
n1 + n2[:, np.newaxis]

array([[0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1.],
       [2., 2., 2., 2., 2.],
       [3., 3., 3., 3., 3.],
       [4., 4., 4., 4., 4.]])

In [36]:
np.full(shape=(5, 5), fill_value=(0,1,2,3,4))

array([[0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4]])

In [42]:
ndarray8 = np.random.randint(0, 100, 10)
ndarray8

array([65, 88, 48, 31, 33, 80, 71, 71, 34, 14])

In [40]:
np.sort(ndarray8)

array([ 1,  2, 13, 23, 26, 32, 40, 44, 56, 71])

In [46]:
index = np.argmax(ndarray8)
index

1

In [48]:
ndarray8[index] = 0
ndarray8

array([65,  0, 48, 31, 33, 80, 71, 71, 34, 14])

In [49]:
ndarray10 = np.random.randint(0, 100, size=(5, 5))
ndarray10

array([[82, 11, 57, 26,  4],
       [68, 42,  1, 12, 46],
       [90, 71, 18, 62, 88],
       [72, 47, 15, 25, 66],
       [47, 92, 33, 61, 59]])

In [54]:
ndarray10[:, 2]

array([57,  1, 18, 15, 33])

In [55]:
# argsort returns the row positions that would sort column 2 in ascending order.

index2 = ndarray10[:, 2].argsort()
index2

array([1, 3, 2, 4, 0], dtype=int64)

In [56]:
ndarray10[index2]

array([[68, 42,  1, 12, 46],
       [72, 47, 15, 25, 66],
       [90, 71, 18, 62, 88],
       [47, 92, 33, 61, 59],
       [82, 11, 57, 26,  4]])

### DataFrame

DataFrame 是一个表格型的数据结构， 既有行索引，也有列索引

- 行索引： index
- 列索引： columns
- 值： values(numpy的二维数组)

In [57]:
from pandas import DataFrame

In [84]:
# A single-column frame built from a flat list, then a 5x2 frame of random
# integers with row labels a-e and two named columns.
df = DataFrame([1, 2, 3, 4, 5])
ndarray = np.random.randint(low=0, high=150, size=(5, 2))
df2 = DataFrame(
    ndarray,
    index=["a", "b", "c", "d", "e"],
    columns=["first", "second"],
)


AttributeError: module 'numpy' has no attribute 'float8'

In [85]:
df2.values

array([[ 61, 105],
       [ 80, 149],
       [117, 115],
       [131,   4],
       [ 40,  14]])

In [86]:
df2.index
df2.shape

(5, 2)

In [96]:
df2.columns
"语文 数学 英语 理综".split()
"张 里".split()

['张', '里']

In [97]:
# 4x2 frame of random integers in [0, 150): rows are labelled by `index`,
# columns by `col`.
data = np.random.randint(0, 150, size=(4, 2))
index = "语文 数学 英语 理综".split()
col = "张 里".split()
df = DataFrame(data=data, index=index, columns=col)

### 列索引， 行切片

### 对列进行索引

- `df.columns_name ` 
- `df["columns_name"]`
- 获取的一列是一个 Series 对象

### 对行进行索引

- `df.loc["index_name"]`
- `df.iloc[index]`
- loc 方法可以传入行的名称和列的名称来定位元素， 但是必须行元素在列元素的前面

### 终极方法
- `df.values[0, 0]`
- df.values 得到的是一个数组，通过索引操作获取对应的元素

### 切片
- `df[0:2]` 获取的是行

### 列切片

- `df.iloc[:, 0:2]`
- 需要通过 iloc[]  进行对列的切片

### DataFrame 的运算

- \+ 对内部所有的values都加 对应的数值
- df["columns_name"] += 1

### DataFrame 之间做运算

- 加法：行和列都对应的 对应的元素相加
    - 不对应的位置相加为 NaN

In [103]:
# Two random integer frames sharing row labels "1".."5"; df2 carries one extra
# column ("E") that df1 does not have.
df1 = DataFrame(
    data=np.random.randint(low=0, high=150, size=(5, 4)),
    index=["1", "2", "3", "4", "5"],
    columns=["A", "B", "C", "D"],
)
df2 = DataFrame(
    data=np.random.randint(low=0, high=150, size=(5, 5)),
    index=["1", "2", "3", "4", "5"],
    columns=["A", "B", "C", "D", "E"],
)

In [104]:
display(df1, df2)

Unnamed: 0,A,B,C,D
1,8,0,27,27
2,117,142,112,68
3,111,103,85,17
4,34,105,65,60
5,20,44,144,106


Unnamed: 0,A,B,C,D,E
1,48,137,9,20,51
2,53,111,126,19,0
3,133,81,26,143,73
4,71,97,81,55,47
5,77,124,41,140,43


In [105]:
df1 + df2

Unnamed: 0,A,B,C,D,E
1,56,137,36,47,
2,170,253,238,87,
3,244,184,111,160,
4,105,202,146,115,
5,97,168,185,246,


### DataFrame 和 Series 运算

- 广播机制
- Series 和 DataFrame 的索引对应， 对应位置相加

- 默认 是Series的索引 和 DataFrame 的列索引("columns")进行相加
- 可以通过 axis 参数指定按行索引（index）进行相加
- `s2.add(s1, axis="index")`

In [110]:
from pandas import DataFrame

data = np.random.randint(50, 150, size=(3, 4))
index = ["张三", "李四", "王五"]
columns = ["语文","数学", "英语","Python"]
df1 = DataFrame(data=data, index=index, columns=columns)

In [113]:
data = np.random.randint(50, 150, size=(3, 4))
index = ["张三", "李四", "王五"]
columns = ["语文","数学", "英语","Python"]
df2 = DataFrame(data=data, index=index, columns=columns)

In [115]:
display(df1, df2)

Unnamed: 0,语文,数学,英语,Python
张三,107,95,93,103
李四,125,56,54,86
王五,64,110,144,141


Unnamed: 0,语文,数学,英语,Python
张三,62,64,71,80
李四,51,97,114,68
王五,112,83,69,104


In [116]:
df1 + df2

Unnamed: 0,语文,数学,英语,Python
张三,169,159,164,183
李四,176,153,168,154
王五,176,193,213,245


In [118]:
(df1 + df2) / 2

Unnamed: 0,语文,数学,英语,Python
张三,84.5,79.5,82.0,91.5
李四,88.0,76.5,84.0,77.0
王五,88.0,96.5,106.5,122.5


In [120]:
display(df1, df2)

Unnamed: 0,语文,数学,英语,Python
张三,107,95,93,103
李四,125,56,54,86
王五,64,110,144,141


Unnamed: 0,语文,数学,英语,Python
张三,62,64,71,80
李四,51,97,114,68
王五,112,83,69,104


In [123]:
# Set one cell with a single .loc[row, column] call. The chained form
# df1.loc["张三"].loc["数学"] = 0 assigns into an intermediate Series and is not
# guaranteed to modify df1.
df1.loc["张三", "数学"] = 0
df1

Unnamed: 0,语文,数学,英语,Python
张三,107,0,93,103
李四,125,56,54,86
王五,64,110,144,141


In [124]:
df1.loc["李四"] += 100
df1

Unnamed: 0,语文,数学,英语,Python
张三,107,0,93,103
李四,225,156,154,186
王五,64,110,144,141


In [125]:
df1 += 10
df1 

Unnamed: 0,语文,数学,英语,Python
张三,117,10,103,113
李四,235,166,164,196
王五,74,120,154,151


In [132]:
dates = pd.date_range(start="20130101", end="20130202")
dates

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-06 08:00:00',
               '2013-01-11 16:00:00', '2013-01-17 00:00:00',
               '2013-01-22 08:00:00', '2013-01-27 16:00:00',
               '2013-02-02 00:00:00'],
              dtype='datetime64[ns]', freq=None)

In [135]:
dates = pd.date_range("20130101", periods=6) # periods 取几个
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [136]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("abcd"))

In [137]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-2.370559,0.592836
2013-01-04,-0.190139,0.830868,2.148413,-0.182896
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [138]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-2.370559,0.592836
2013-01-04,-0.190139,0.830868,2.148413,-0.182896
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [139]:
df.values

array([[ 2.17625451e+00,  2.95262433e-01,  2.28860279e-01,
         2.26367207e+00],
       [ 1.75313800e-01, -8.89545369e-01, -7.38617573e-04,
         1.21630512e+00],
       [ 7.90816162e-01, -8.38169090e-01, -2.37055856e+00,
         5.92835555e-01],
       [-1.90138989e-01,  8.30867670e-01,  2.14841341e+00,
        -1.82896449e-01],
       [ 6.05591194e-01, -8.78522094e-01, -4.63230848e-01,
        -1.44436880e+00],
       [-2.01424133e-01, -1.32097000e+00,  6.79868484e-01,
         6.36110663e-01]])

In [140]:
df.describe()

Unnamed: 0,a,b,c,d
count,6.0,6.0,6.0,6.0
mean,0.559402,-0.466846,0.037102,0.51361
std,0.889729,0.834318,1.48028,1.256709
min,-0.201424,-1.32097,-2.370559,-1.444369
25%,-0.098776,-0.88679,-0.347608,0.011037
50%,0.390452,-0.858346,0.114061,0.614473
75%,0.74451,0.011905,0.567116,1.071257
max,2.176255,0.830868,2.148413,2.263672


In [142]:
display(df, df.T)

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-2.370559,0.592836
2013-01-04,-0.190139,0.830868,2.148413,-0.182896
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
a,2.176255,0.175314,0.790816,-0.190139,0.605591,-0.201424
b,0.295262,-0.889545,-0.838169,0.830868,-0.878522,-1.32097
c,0.22886,-0.000739,-2.370559,2.148413,-0.463231,0.679868
d,2.263672,1.216305,0.592836,-0.182896,-1.444369,0.636111


In [146]:
df.sort_index(axis=1, ascending=True) # ascending False 倒序

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-2.370559,0.592836
2013-01-04,-0.190139,0.830868,2.148413,-0.182896
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [149]:
df.sort_values(by="b")   # 按某列的值进行 小 到大排序

Unnamed: 0,a,b,c,d
2013-01-06,-0.201424,-1.32097,0.679868,0.636111
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-03,0.790816,-0.838169,-2.370559,0.592836
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-04,-0.190139,0.830868,2.148413,-0.182896


In [150]:
df.loc[dates[0]]

a    2.176255
b    0.295262
c    0.228860
d    2.263672
Name: 2013-01-01 00:00:00, dtype: float64

In [153]:
df.a.values

array([ 2.17625451,  0.1753138 ,  0.79081616, -0.19013899,  0.60559119,
       -0.20142413])

In [155]:
df.loc[:,["a", "b"]].values

array([[ 2.17625451,  0.29526243],
       [ 0.1753138 , -0.88954537],
       [ 0.79081616, -0.83816909],
       [-0.19013899,  0.83086767],
       [ 0.60559119, -0.87852209],
       [-0.20142413, -1.32097   ]])

In [157]:
df.loc[dates[0]:dates[2], ["a", "b"]]

Unnamed: 0,a,b
2013-01-01,2.176255,0.295262
2013-01-02,0.175314,-0.889545
2013-01-03,0.790816,-0.838169


In [160]:
df.loc[dates[0], ["a", "b"]]

a    2.176255
b    0.295262
Name: 2013-01-01 00:00:00, dtype: float64

In [163]:
df.at[dates[0], "a"]

2.176254511689243

## 处理数据丢失

- None: Python 自带的数据类型，不能参与到任何的计算中, 直接报异常
- np.nan(NaN) : float类型， 能参与计算， 但结果是 NaN

In [165]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

### np.nan(NaN)

- 数组直接运算会得到 nan
- 使用 `np.nan*()` 会将nan 当做 0 处理
- Series 和 DataFrame 默认将 nan 当做 0 处理

In [178]:
ndarray1 = np.array([1, 2, 3 ,4, np.nan])
ndarray1

array([ 1.,  2.,  3.,  4., nan])

In [179]:
np.sum(ndarray1)

nan

In [180]:
np.nansum(ndarray1)

10.0

In [181]:
s1 = Series([1, 2, 3, np.nan])
s1.sum()

6.0

In [182]:
# DataFrame.sum() skips NaN just like Series.sum(): the column total is 6.0, not nan.
df1 = DataFrame(data=[1, 2, 3, np.nan])
df1.sum(axis=0)

0    6.0
dtype: float64

### pandas 中 的None 和 np.nan

- pandas 中的 None 和 np.nan 都视为 NaN

In [183]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-2.370559,0.592836
2013-01-04,-0.190139,0.830868,2.148413,-0.182896
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [192]:
# Assign through a single indexer on df itself. The chained forms
# (df.iloc[2].loc["c"] = ..., df["d"].iloc[3] = ...) write to an intermediate
# object and only reach df when pandas happens to return a view.
df.loc[df.index[2], "c"] = None    # None is stored as NaN in a float column
df.loc[df.index[3], "a"] = np.nan
df.loc[df.index[3], "d"] = None
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,,0.830868,2.148413,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [196]:
df.isnull()  # NaN 返回的是 True

Unnamed: 0,a,b,c,d
2013-01-01,False,False,False,False
2013-01-02,False,False,False,False
2013-01-03,False,False,True,False
2013-01-04,True,False,False,True
2013-01-05,False,False,False,False
2013-01-06,False,False,False,False


In [198]:
df.notnull()  # 不是空值返回True

Unnamed: 0,a,b,c,d
2013-01-01,True,True,True,True
2013-01-02,True,True,True,True
2013-01-03,True,True,False,True
2013-01-04,False,True,True,False
2013-01-05,True,True,True,True
2013-01-06,True,True,True,True


In [202]:
df.isnull()

Unnamed: 0,a,b,c,d
2013-01-01,False,False,False,False
2013-01-02,False,False,False,False
2013-01-03,False,False,True,False
2013-01-04,True,False,False,True
2013-01-05,False,False,False,False
2013-01-06,False,False,False,False


In [203]:
df.isnull().any() # 只要有 True 就是True 

a     True
b    False
c     True
d     True
dtype: bool

In [205]:
df.isnull().any(axis=1)

2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04     True
2013-01-05    False
2013-01-06    False
Freq: D, dtype: bool

### dropna() 

- 将有空值的 行进行删除  默认的 axis 是 0， 改为 1 会对列进行处理
- how 表示怎么处理， 默认是 any 有一个空的就删除
- all 全部是空的才删除


In [212]:
df.dropna()
df.dropna(axis=0, how="all")

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,,0.830868,2.148413,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [214]:
# Blank out columns b and c of the fourth row with one indexer per cell;
# chained df["b"].iloc[3] = None is not guaranteed to write back into df.
df.loc[df.index[3], "b"] = None
df.loc[df.index[3], "c"] = None

In [217]:
df.dropna(axis=0, how="all")

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


### 空值填充 fillna()

- 遇到空值设置为我们指定的值 value = 
- 用空值所处位置前面或后面的值进行填充：method="ffill"（又名 "pad"，取前一个有效值）/ "bfill"（取后一个有效值）

In [219]:
df.fillna(value=0)

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,0.0,0.592836
2013-01-04,0.0,0.0,0.0,0.0
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [221]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,,,,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [222]:
df.bfill()  # backward fill along axis=0 (the default): each NaN takes the next valid value below it in the same column

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-0.463231,0.592836
2013-01-04,0.605591,-0.878522,-0.463231,-1.444369
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [225]:
df.ffill(axis=1)  # axis=1 fills within each row: a NaN takes the last valid value to its left

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,-0.838169,0.592836
2013-01-04,,,,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [226]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,,,,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,0.679868,0.636111


In [227]:
# Single-indexer assignment; chained df["c"].iloc[5] = ... may write to a temporary copy.
df.loc[df.index[5], "c"] = np.nan

In [228]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,,,,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,,0.636111


In [232]:
df.bfill(limit=1)  # limit caps how many consecutive NaNs are filled from one valid value

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,0.605591,-0.878522,-0.463231,-1.444369
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,,0.636111


In [233]:
df

Unnamed: 0,a,b,c,d
2013-01-01,2.176255,0.295262,0.22886,2.263672
2013-01-02,0.175314,-0.889545,-0.000739,1.216305
2013-01-03,0.790816,-0.838169,,0.592836
2013-01-04,,,,
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,,0.636111


In [235]:
df.iloc[3:5, 1:2]

Unnamed: 0,b
2013-01-04,
2013-01-05,-0.878522


In [236]:
df.iloc[3]

a   NaN
b   NaN
c   NaN
d   NaN
Name: 2013-01-04 00:00:00, dtype: float64

In [237]:
df.iloc[[1, 2, 3], [0, 2]]

Unnamed: 0,a,c
2013-01-02,0.175314,-0.000739
2013-01-03,0.790816,
2013-01-04,,


In [238]:
df.iloc[:, 1:3]

Unnamed: 0,b,c
2013-01-01,0.295262,0.22886
2013-01-02,-0.889545,-0.000739
2013-01-03,-0.838169,
2013-01-04,,
2013-01-05,-0.878522,-0.463231
2013-01-06,-1.32097,


In [239]:
df.iloc[1,1]

-0.8895453692451164

In [243]:
df.iat[1, 1] # iat 和 iloc 用法相同，但功能没有 iloc 强大

-0.8895453692451164

In [249]:
df[df < 0]

Unnamed: 0,a,b,c,d
2013-01-01,,,,
2013-01-02,,-0.889545,-0.000739,
2013-01-03,,-0.838169,,
2013-01-04,,,,
2013-01-05,,-0.878522,-0.463231,-1.444369
2013-01-06,-0.201424,-1.32097,,


In [250]:
df2 = df.copy()


In [251]:
df2["E"] = ["one","two","three","four","five","six"]

In [252]:
df2

Unnamed: 0,a,b,c,d,E
2013-01-01,2.176255,0.295262,0.22886,2.263672,one
2013-01-02,0.175314,-0.889545,-0.000739,1.216305,two
2013-01-03,0.790816,-0.838169,,0.592836,three
2013-01-04,,,,,four
2013-01-05,0.605591,-0.878522,-0.463231,-1.444369,five
2013-01-06,-0.201424,-1.32097,,0.636111,six


In [253]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,a,b,c,d,E
2013-01-02,0.175314,-0.889545,-0.000739,1.216305,two
2013-01-04,,,,,four
