# 数据清洗与准备

## 7.1、处理缺失值

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_rows=10

In [4]:
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])
print(string_data)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


In [5]:
string_data[0]=None
print(string_data)

0         None
1    artichoke
2          NaN
3      avocado
dtype: object


In [6]:
print(string_data.isnull())        #可以发现，None和NaN都被认为是NA，not available

0     True
1    False
2     True
3    False
dtype: bool


### 1.1、过滤缺失值

In [3]:
#利用dropna来过滤缺失值
from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [4]:
data[data.notnull()]    #与上面等价

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
#对于DataFrame，dropna默认情况下会删除包含缺失值的行
data=pd.DataFrame([[1,6.5,3],[1,NA,NA],
                  [NA,NA,NA],[NA,6.5,3]])
data1=data.dropna()
print(data)
print(data1)

#利用参数how可以实现仅删除所有值都是NA的行
print(data.dropna(how='all'))

#利用参数axis可以实现删除NA存在的列
print(data.dropna(axis=1,how='all'))

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [13]:
#利用参数thresh实现保留一定数量的观察值的行
data2=pd.DataFrame(np.random.randn(7,3))
data2.iloc[:4,1]=NA
data2.iloc[:2,2]=NA
print(data2.dropna(thresh=2))
print(data2.dropna(thresh=1))

          0         1         2
2 -0.671067       NaN  1.284127
3 -0.594093       NaN -2.444752
4  0.514992 -0.601770 -0.837686
5  0.028874 -0.900232 -1.162911
6 -0.774774  0.501525  0.433749
          0         1         2
0  0.233087       NaN       NaN
1 -0.902660       NaN       NaN
2 -0.671067       NaN  1.284127
3 -0.594093       NaN -2.444752
4  0.514992 -0.601770 -0.837686
5  0.028874 -0.900232 -1.162911
6 -0.774774  0.501525  0.433749


### 1.2、补全缺失值

In [18]:
#利用fillna来替代缺失值
data2=pd.DataFrame(np.random.randn(7,3))
data2.iloc[:4,1]=NA
data2.iloc[:2,2]=NA
print(data2.fillna(0))

#调用fillna时使用字典，可以使得不同列可以设定不同的填充值
print(data2.fillna({1:0.5,2:0}))

#调用fillna时，利用参数inplace可以使得修改原数据
_=data2.fillna(0,inplace=True)
print(data2)

          0         1         2
0 -0.804293  0.000000  0.000000
1 -1.220486  0.000000  0.000000
2  0.100599  0.000000 -0.103862
3 -1.122888  0.000000 -0.231285
4 -1.647301 -0.043402 -2.505836
5 -0.831211 -1.375719  1.157523
6  2.953561  0.996073  0.656459
          0         1         2
0 -0.804293  0.500000  0.000000
1 -1.220486  0.500000  0.000000
2  0.100599  0.500000 -0.103862
3 -1.122888  0.500000 -0.231285
4 -1.647301 -0.043402 -2.505836
5 -0.831211 -1.375719  1.157523
6  2.953561  0.996073  0.656459
          0         1         2
0 -0.804293  0.000000  0.000000
1 -1.220486  0.000000  0.000000
2  0.100599  0.000000 -0.103862
3 -1.122888  0.000000 -0.231285
4 -1.647301 -0.043402 -2.505836
5 -0.831211 -1.375719  1.157523
6  2.953561  0.996073  0.656459


In [19]:
#fillna拥有很多参数可用
#利用method参数可用选择前向填充或后向填充
data3=pd.DataFrame(np.random.randn(6,3))
data3.iloc[2:,1]=NA
data3.iloc[4:,2]=NA
print(data3)
print(data3.fillna(method='ffill'))              #后向填充
print(data3.fillna(method='ffill',limit=2))      #后向填充，并设置填充长度最大为2

          0         1         2
0  0.264246  0.243948 -0.362505
1 -0.387553 -0.619872 -1.531149
2 -1.321191       NaN -0.318601
3  1.069199       NaN  0.290524
4  0.226386       NaN       NaN
5 -0.829300       NaN       NaN
          0         1         2
0  0.264246  0.243948 -0.362505
1 -0.387553 -0.619872 -1.531149
2 -1.321191 -0.619872 -0.318601
3  1.069199 -0.619872  0.290524
4  0.226386 -0.619872  0.290524
5 -0.829300 -0.619872  0.290524
          0         1         2
0  0.264246  0.243948 -0.362505
1 -0.387553 -0.619872 -1.531149
2 -1.321191 -0.619872 -0.318601
3  1.069199 -0.619872  0.290524
4  0.226386       NaN  0.290524
5 -0.829300       NaN  0.290524


## 2、数据转换

### 2.1、删除重复值

In [20]:
#DataFrame中会出现重复行
data=pd.DataFrame({'k1':['one','two']*3+['two'],
                  'k2':[1,1,2,3,3,4,4]})
print(data)

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4


In [22]:
print(data.duplicated())      #返回某行是否有重复行

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool


In [23]:
print(data.drop_duplicates())   #返回没有重复行的内容

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4


In [24]:
#可用根据某一列的内容去除重复值
print(data.drop_duplicates(['k1']))      #根据'k1'列，删去有重复的行

    k1  k2
0  one   1
1  two   1


In [28]:
#keep参数可设置保留重复行的最后一行
data['va']=range(7)
print(data)
print(data.drop_duplicates(['k1','k2'],keep='last'))

    k1  k2  va
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6
    k1  k2  va
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6


### 1.2.2、使用函数或映射进行数据转换

In [32]:
data=pd.DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
                  'ounces':[4,3,12,6,7.5,8,3,5,6]})
print(data)

meat_to_animal={
    'bacon':'pig',
    'pulled pork':'cow',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
lower_food=data.food.str.lower()
data['animal']=lower_food.map(meat_to_animal)          #Series的map中可以是函数，也可以是包含映射关系的字典型对象
print(data)


data['animal']=data['food'].map(lambda x: meat_to_animal[x.lower()])
print(data)
'''
两个实现效果接近
'''

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     cow
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     cow
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon


### 2.3、替代值

In [36]:
#fillna是替换的特殊情况，替换缺失值。replace提供了一种更加灵活的替换
data=pd.Series([1,-999,2,-999,-1000,3])
print(data.replace(-999,NA))
print(data.replace([-999,-1000],NA))                #可以将多个值进行替换
print(data.replace([-999,-1000],[NA,500]))          #可以将不同值替换成不同值
print(data.replace({-999:433,-1000:344}))           #可以用字典

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
0      1.0
1      NaN
2      2.0
3      NaN
4    500.0
5      3.0
dtype: float64
0      1
1    433
2      2
3    433
4    344
5      3
dtype: int64


### 2.4、重命名轴索引

In [41]:
#轴标签也可以像Series中的值一样进行类似的使用函数或映射的转换
data=pd.DataFrame(np.arange(12).reshape((3,4)),
                 index=['Ohio','Colorado','New York'],
                 columns=['one','two','three','four'])
print(data.index.map(lambda x:x[:4].upper()))
data.index=data.index.map(lambda x:x[:4].upper())
print(data)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11


In [44]:
#可以使用rename改变轴上标签
data=pd.DataFrame(np.arange(12).reshape((3,4)),
                 index=['Ohio','Colorado','New York'],
                 columns=['one','two','three','four'])
print(data.rename(index=str.title,columns=str.upper))
print(data.rename(index={'Ohio':'INDIANA'},
                 columns={'three':'peekaboo'}))              #可以传入字典，进行定向转换
data.rename(index={'Ohio':'INDIANA'},inplace=True)           #传入inplace参数可以修改原数据
print(data)

          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
          one  two  peekaboo  four
INDIANA     0    1         2     3
Colorado    4    5         6     7
New York    8    9        10    11
          one  two  three  four
INDIANA     0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11


### 2.5、离散化和分箱

In [2]:
#连续值的离散化，或者分成几个部分，可以使用cut函数
ages=[18,20,22,25,27,21,23,37,31,61,45,41,32]
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)
print(cats)
'''
返回的是ages中的每一个值所处的分区
bins中即所要划分的分区：(18,25],(25,35],(35,60],(65,100].这些范围含左不含右，因此第一个18给出的分区为NaN
'''

[NaN, (18.0, 25.0], (18.0, 25.0], (18.0, 25.0], (25.0, 35.0], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 13
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]


'\n返回的是ages中的每一个值所处的分区\nbins中即所要划分的分区：(18,25],(25,35],(35,60],(65,100]\n'