In [2]:
import pandas as pd
import numpy as np

In [3]:
x = pd.Series([1, 2, np.nan, 3, 4, 5, 6, np.nan, 8])
x

0    1.0
1    2.0
2    NaN
3    3.0
4    4.0
5    5.0
6    6.0
7    NaN
8    8.0
dtype: float64

In [4]:
# 是否存在缺失值
x.hasnans

True

In [5]:
x.values.dtype

dtype('float64')

In [6]:
# 填充缺失值，填充为平均值
x.fillna(x.mean())

0    1.000000
1    2.000000
2    4.142857
3    3.000000
4    4.000000
5    5.000000
6    6.000000
7    4.142857
8    8.000000
dtype: float64

In [7]:
df3 = pd.DataFrame({'A': [5, 3, None, 4],
                    'B': [None, 2, 4, 3],
                    'C': [4, 3, 8, 5],
                    'D': [5, 4, 2, None]})
df3

Unnamed: 0,A,B,C,D
0,5.0,,4,5.0
1,3.0,2.0,3,4.0
2,,4.0,8,2.0
3,4.0,3.0,5,


In [8]:
# 查看是否有缺失值
df3.isnull()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,False,False,False
2,True,False,False,False
3,False,False,False,True


In [9]:
# 查看每行有多少个缺失值
df3.isnull().sum()

A    1
B    1
C    0
D    1
dtype: int64

In [10]:
# 用上一行填充
df3.ffill()

Unnamed: 0,A,B,C,D
0,5.0,,4,5.0
1,3.0,2.0,3,4.0
2,3.0,4.0,8,2.0
3,4.0,3.0,5,2.0


In [11]:
# 用上一列填充
df3.ffill(axis=1)

Unnamed: 0,A,B,C,D
0,5.0,5.0,4.0,5.0
1,3.0,2.0,3.0,4.0
2,,4.0,8.0,2.0
3,4.0,3.0,5.0,5.0


In [12]:
df3.info

<bound method DataFrame.info of      A    B  C    D
0  5.0  NaN  4  5.0
1  3.0  2.0  3  4.0
2  NaN  4.0  8  2.0
3  4.0  3.0  5  NaN>

In [13]:
# 删除缺失行
df3.dropna()

Unnamed: 0,A,B,C,D
1,3.0,2.0,3,4.0


In [14]:
df3.dropna(axis=1)

Unnamed: 0,C
0,4
1,3
2,8
3,5


In [15]:
df3

Unnamed: 0,A,B,C,D
0,5.0,,4,5.0
1,3.0,2.0,3,4.0
2,,4.0,8,2.0
3,4.0,3.0,5,


In [36]:
# datafram 可以连接 Series ，但要series index= 属性名
# Can only append a Series if ignore_index=True or if the Series has a name
# 除此以外，还要指定 ignore_index=True 或  Series has a name
# 前者将索引依次向下加一个，后者将name 当成索引名
a = pd.Series([4, 3, 5, None], index=list('ABCD'), name='ha')
print(a.T)
df4 = df3.append(a)
df4

A    4.0
B    3.0
C    5.0
D    NaN
Name: ha, dtype: float64


Unnamed: 0,A,B,C,D
0,5.0,,4.0,5.0
1,3.0,2.0,3.0,4.0
2,,4.0,8.0,2.0
3,4.0,3.0,5.0,
ha,4.0,3.0,5.0,


In [34]:
# 重复值处理
df4.drop_duplicates()

Unnamed: 0,A,B,C,D
0,5.0,,4.0,5.0
1,3.0,2.0,3.0,4.0
2,,4.0,8.0,2.0
3,4.0,3.0,5.0,


In [35]:
list('abcd')

['a', 'b', 'c', 'd']