# 第七章

检验 windows 下的文件读取方法

In [1]:
pwd

'F:\\Python\\Notebook\\Python数据分析'

In [2]:
# Print the raw CSV with plain Python instead of the Windows-only `!type`
# shell command, so the cell runs on any OS. Forward slashes are accepted
# as path separators on Windows as well.
with open("examples/ex1.csv", encoding="utf-8") as csv_file:
    # end="" avoids adding an extra newline after the file's own content.
    print(csv_file.read(), end="")

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [3]:
import pandas as pd

# Forward slashes work on Windows and POSIX alike. The original
# "examples\ex1.csv" relied on "\e" being an invalid escape sequence that
# Python happens to keep verbatim (a SyntaxWarning on recent versions).
file = "examples/ex1.csv"

# read_csv already returns a DataFrame, so wrapping the result in
# pd.DataFrame(...) again is unnecessary; `obj` is kept as an alias.
data = pd.read_csv(file)
obj = data
obj

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## 一、Handling Missing Data 处理缺失数据

查看缺失值：isnull



In [4]:
import numpy as np
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [5]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# None is Python's built-in null constant (it is not a function).
# Assigning it to the element at index 0 makes pandas treat that entry
# as missing, which the following isnull() call reports as True.
string_data[0] = None

In [8]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 1. Filtering Missing Data 过滤缺失数据

In [9]:
from numpy import nan as NA

In [10]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [11]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [13]:
# Boolean-mask filtering: keeps the entries where notnull() is True,
# which yields the same result as data.dropna() above.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data = pd.DataFrame([[1.0, 6.5, 3.0], [1.0, NA, NA],
                   [NA, NA, NA], [NA, 6.5, 3.0]])

dropna 默认删除所有包含缺失值（NA）的行

In [15]:
cleaned = data.dropna()

In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


参数 `how = 'all'` 表示只删除全部都是 NA 的行

In [18]:
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


删除列 使用 axis = 1

In [19]:
data[4] = NA

In [20]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
data.dropna(axis = 1,how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


仅保留至少包含指定数量非缺失值的行（使用 `thresh` 参数）

In [22]:
# 7x3 frame filled by np.random.randn.
# NOTE(review): no random seed is set in the visible cells, so these values
# change on every run and the recorded output below will not reproduce.
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.878589,-0.739774,2.063944
1,0.616102,0.907718,-1.631115
2,-0.544798,-1.354601,1.072065
3,-1.47077,-0.241647,-0.690339
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


In [23]:
df.iloc[:4, 1] = NA

In [24]:
df.iloc[:2, 2] = NA

In [25]:
df

Unnamed: 0,0,1,2
0,-0.878589,,
1,0.616102,,
2,-0.544798,,1.072065
3,-1.47077,,-0.690339
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


In [26]:
df.dropna()

Unnamed: 0,0,1,2
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


In [27]:
# thresh=2 keeps only the rows that have at least two non-NA values,
# so rows 0 and 1 (one non-NA value each) are dropped.
df.dropna(thresh = 2)

Unnamed: 0,0,1,2
2,-0.544798,,1.072065
3,-1.47077,,-0.690339
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


### 2. Filling In Missing Data 填充缺失数据

使用 fillna 将缺失数据填充为 0

In [28]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.878589,0.0,0.0
1,0.616102,0.0,0.0
2,-0.544798,0.0,1.072065
3,-1.47077,0.0,-0.690339
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


使用字典为不同的列指定不同的填充值

In [29]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,-0.878589,0.5,0.0
1,0.616102,0.5,0.0
2,-0.544798,0.5,1.072065
3,-1.47077,0.5,-0.690339
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


In [30]:
# Rebind the name instead of using inplace=True: fillna(inplace=True)
# returns None (hence the old throwaway `_ =`) and mutates a frame that
# earlier cells have already displayed. Every NA in df becomes 0.
df = df.fillna(0)

In [31]:
df

Unnamed: 0,0,1,2
0,-0.878589,0.0,0.0
1,0.616102,0.0,0.0
2,-0.544798,0.0,1.072065
3,-1.47077,0.0,-0.690339
4,0.862887,-0.824684,0.277868
5,-0.348799,1.369401,0.01425
6,1.167614,-0.64194,-0.192408


In [32]:
df = pd.DataFrame(np.random.randn(6, 3))

In [33]:
df.iloc[2:, 1] = NA

In [34]:
df.iloc[4:, 2] = NA

In [35]:
df

Unnamed: 0,0,1,2
0,-0.946443,0.709868,0.279179
1,-1.794342,0.210019,-0.518011
2,-2.1688,,-0.585664
3,0.302907,,-0.710417
4,0.212693,,
5,1.526024,,


In [36]:
# Forward-fill: propagate the last valid value down each column.
# fillna(method='ffill') is deprecated in recent pandas releases;
# DataFrame.ffill() is the direct replacement and gives the same result.
df.ffill()

Unnamed: 0,0,1,2
0,-0.946443,0.709868,0.279179
1,-1.794342,0.210019,-0.518011
2,-2.1688,0.210019,-0.585664
3,0.302907,0.210019,-0.710417
4,0.212693,0.210019,-0.710417
5,1.526024,0.210019,-0.710417


In [37]:
# Forward-fill, but fill at most 2 consecutive NA values in each gap;
# any NA beyond that limit stays missing.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(limit = 2)

Unnamed: 0,0,1,2
0,-0.946443,0.709868,0.279179
1,-1.794342,0.210019,-0.518011
2,-2.1688,0.210019,-0.585664
3,0.302907,0.210019,-0.710417
4,0.212693,,-0.710417
5,1.526024,,-0.710417


In [38]:
# NOTE(review): `3,5` is two separate elements (3 and 5), which makes this a
# six-element Series. The earlier Series example used 3.5 — confirm whether
# 3.5 was intended here; the recorded outputs below reflect 3 and 5.
data = pd.Series([1.0, NA, 3,5, NA, 7])
data

0    1.0
1    NaN
2    3.0
3    5.0
4    NaN
5    7.0
dtype: float64

In [39]:
data.fillna(data.mean())

0    1.0
1    4.0
2    3.0
3    5.0
4    4.0
5    7.0
dtype: float64

## 二、重复值处理

In [34]:
import pandas as pd
data = pd.DataFrame({'k1': ['cat', 'dog','dog', 'tiger','tiger', 'panda', 'panda'],
                     'k2': [1, 2, 2, 3 ,5, 4, 4]})
data

Unnamed: 0,k1,k2
0,cat,1
1,dog,2
2,dog,2
3,tiger,3
4,tiger,5
5,panda,4
6,panda,4


### 1.1 判断是否是重复值

In [35]:
# .duplicated() flags every row that repeats an earlier row
# (all columns are compared); the first occurrence stays False.
data.duplicated()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
dtype: bool

In [36]:
# Select the rows flagged as duplicates, i.e. rows whose values in
# every column match an earlier row.
data[data.duplicated()]

Unnamed: 0,k1,k2
2,dog,2
6,panda,4


In [38]:
# Check duplicates on a single column: select the rows whose 'k2' value
# already appeared in an earlier row.
data[data.duplicated('k2')]

Unnamed: 0,k1,k2
2,dog,2
6,panda,4


In [40]:
data[data.duplicated('k1')]

Unnamed: 0,k1,k2
2,dog,2
4,tiger,5
6,panda,4


### 1.2 筛选出重复的值

    1. 采用 drop_duplicates 对数据去两次重：一次将重复数据全部去除（keep=False），记为 data1；另一次将重复数据保留一个（keep='first'），记为 data2；
    2. 求 data1 和 data2 的差集即可：pd.concat([data2, data1]).drop_duplicates(keep=False)（pandas 2.0 起 DataFrame.append 已被移除，需改用 pd.concat）

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [22]:
# NOTE(review): the output recorded below matches the 'one'/'two' frame that
# is only built in a later cell, not the animal frame defined above, so this
# cell appears to depend on out-of-order execution — confirm and reorder.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [16]:
# Deduplicate on column 'k1' only, keeping the first row seen for each
# distinct 'k1' value.
data2 = data.drop_duplicates(['k1'], keep = 'first')
data2

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [19]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# frames the same way. drop_duplicates(keep=False) then discards every row
# that occurs more than once in the stacked frame.
# NOTE(review): `data1` is not defined in any visible cell — it presumably
# came from a cell that was deleted or run out of order; define it before
# this cell so the notebook survives Restart & Run All.
pd.concat([data2, data1]).drop_duplicates(keep = False)

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2


### 1.3. 剔除重复数据

In [40]:
import pandas as pd

In [41]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3 ,3, 4, 4]})

In [42]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [43]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [44]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [45]:
data['v1'] = range(7)

In [46]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [47]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [48]:
# Deduplicate on the ('k1', 'k2') pair and keep the last occurrence:
# of the two ('two', 4) rows, row 6 is retained instead of row 5.
data.drop_duplicates(['k1', 'k2'], keep = 'last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 使用函数或者 map 转换数据

## 替换值

### 4. 索引重命名

使用 `rename`方法

In [52]:
import pandas as pd
import numpy as np
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                   index = ['Ohio', 'Colorado', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])

In [55]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [57]:
data.rename(index={'Ohio':'INDIANA'},
           columns = {'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### 5.  离散化和分组  Discretization and Binning

In [58]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [59]:
bins = [18, 25, 35, 60, 100]

In [60]:
# Assign each age to the interval between consecutive bin edges; as the
# output below shows, intervals are open on the left and closed on the
# right, e.g. (18, 25].
cats = pd.cut(ages, bins)

In [61]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

## 删除和拟合异常值 Outlier

## Permutation and Random sampling 随机重排列和随机采样

## 计算指示器和虚拟变量

# 三、String Manipulation 字符串处理

## 字符串对象方法

## 正则表达式

## Vectorized String Function in pandas  字符串向量化函数