# 数据清理

In [1]:
import pandas as pd
import numpy as np

## 用stack清理变量值作为列名

In [6]:
# Load the fruit table; the first CSV column (the state names) becomes the row index.
state_fruit = pd.read_csv(
    'state_fruit.csv',
    index_col=0,
)
state_fruit

Unnamed: 0,Apple,Orange,Banana
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190


In [7]:
# stack() moves every column name into a vertical, innermost level of the
# row index, so the wide frame becomes one long Series keyed by (state, fruit).
state_fruit.stack()

Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [8]:
# reset_index() turns both index levels of the stacked Series back into
# ordinary columns, which gives a DataFrame again.
state_fruit_tidy = (
    state_fruit
    .stack()
    .reset_index()
)
state_fruit_tidy

Unnamed: 0,level_0,level_1,0
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [9]:
# Replace the auto-generated column labels (level_0, level_1, 0) with
# descriptive names.
state_fruit_tidy.columns = ['state', 'fruit', 'weight']
state_fruit_tidy

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [10]:
# rename_axis can also be used to give each level of the row index a name.
(
    state_fruit
    .stack()
    .rename_axis(['state', 'fruit'])
)

state    fruit 
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [12]:
# Inspect the resulting MultiIndex to confirm the level names were applied.
(
    state_fruit
    .stack()
    .rename_axis(['state', 'fruit'])
    .index
)

MultiIndex([(  'Texas',  'Apple'),
            (  'Texas', 'Orange'),
            (  'Texas', 'Banana'),
            ('Arizona',  'Apple'),
            ('Arizona', 'Orange'),
            ('Arizona', 'Banana'),
            ('Florida',  'Apple'),
            ('Florida', 'Orange'),
            ('Florida', 'Banana')],
           names=['state', 'fruit'])

In [11]:
# reset_index() converts the current multi-level index (state and fruit) into
# ordinary columns and regenerates a default integer index. Passing
# name='weight' puts the stacked values into a new column named 'weight'.
(
    state_fruit
    .stack()
    .rename_axis(['state', 'fruit'])
    .reset_index(name='weight')
)

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [13]:
# Load the second variant of the table; no index_col is given, so State is
# read as an ordinary column and the frame gets a default integer index.
state_fruit2 = pd.read_csv(
    'state_fruit2.csv',
)
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [14]:
# State is not in the row index here, so stack() folds every column name
# (State included) into one long Series.
state_fruit2.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [15]:
# Make State the row index first and then stack; the result is similar to
# the one obtained from state_fruit earlier.
(
    state_fruit2
    .set_index('State')
    .stack()
)

State          
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

## 用melt清理变量值作为列名

In [16]:
# Reload the CSV so the melt examples start from a freshly read frame.
# State stays an ordinary column because no index_col is given.
state_fruit2 = pd.read_csv('state_fruit2.csv')
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [17]:
# Use melt, passing the columns to id_vars and value_vars. melt turns the
# original column names into a 'variable' column and the original cell
# values into a 'value' column.
state_fruit2.melt(
    id_vars=['State'],
    value_vars=['Apple', 'Orange', 'Banana'],
)

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [18]:
# Assign an arbitrary row index (a, b, c) named 'letter'. Note that this
# modifies state_fruit2 in place.
state_fruit2.index = pd.Index(list('abc'), name='letter')
state_fruit2

letter,State,Apple,Orange,Banana
a,Texas,12,10,40
b,Arizona,9,7,12
c,Florida,0,14,190


In [19]:
# var_name / value_name replace the default 'variable' / 'value' column
# labels. The custom letter index is not carried into the melted result.
state_fruit2.melt(
    id_vars=['State'],
    value_vars=['Apple', 'Orange', 'Banana'],
    var_name='Fruit',
    value_name='Weight',
)

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [20]:
# Calling melt() with no arguments puts every value into one column and the
# column label it came from into another.
state_fruit2.melt()

Unnamed: 0,variable,value
0,State,Texas
1,State,Arizona
2,State,Florida
3,Apple,12
4,Apple,9
5,Apple,0
6,Orange,10
7,Orange,7
8,Orange,14
9,Banana,40
10,Banana,12
11,Banana,190


In [21]:
# To designate the identifier variable, passing id_vars alone is enough;
# all remaining columns are melted.
state_fruit2.melt(id_vars='State')

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190
