# DataFrame Manipulation

### DataFrame Indexing
d.f.的索引可透過 loc 和 iloc 達成:

In [3]:
import numpy as np
import pandas as pd

In [3]:
# 將d.f.當作二維陣列，使用 loc & iloc
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
popu = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'popu':popu})
data

Unnamed: 0,area,popu
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [14]:
# 指定名稱使用loc
# 注意 loc 為 explicit表示
data.loc[:'New York',['popu']]

Unnamed: 0,popu
California,38332521
Texas,26448193
New York,19651127


In [13]:
# 注意popu欄位有無中括號在顯示上的差異(一個是Series，一個是d.f.)
data.loc[:'New York','popu'] , type(data.loc[:'New York','popu']) , type(data.loc[:'New York',['popu']])

(California    38332521
 Texas         26448193
 New York      19651127
 Name: popu, dtype: int64,
 pandas.core.series.Series,
 pandas.core.frame.DataFrame)

In [18]:
# iloc為implicit表示，所以不包含end point
data.iloc[2:4,0:1] , type(data.iloc[2:4,0:1])

(            area
 New York  141297
 Florida   170312,
 pandas.core.frame.DataFrame)

In [19]:
# 注意給範圍和單一value所回傳的型別差異(給範圍回傳d.f.，給單一值回傳Series)
data.iloc[2:4,0] , type(data.iloc[2:4,0])

(New York    141297
 Florida     170312
 Name: area, dtype: int64,
 pandas.core.series.Series)

In [20]:
# 作用在row的特殊索引
data[data.area>170000]

Unnamed: 0,area,popu
California,423967,38332521
Texas,695662,26448193
Florida,170312,19552860


In [21]:
data[1:3]

Unnamed: 0,area,popu
Texas,695662,26448193
New York,141297,19651127


In [25]:
# 注意欄位往前搜尋要給-1，否則預設向下搜尋
data['Texas':'California']

Unnamed: 0,area,popu


In [26]:
data['Texas':'California':-1]

Unnamed: 0,area,popu
Texas,695662,26448193
California,423967,38332521


In [30]:
# 使用mask手法用true/false值來搜尋 (注意長度要給對，否則會報錯)
data[ [True,False,True,False,False] ]

Unnamed: 0,area,popu
California,423967,38332521
New York,141297,19651127


### Data Sampling(資料抽樣)
根據 Pandas官方API文件: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html  
> `.sample([n=None | frac=None], replace=False, weights=None, random_state=None, axis=None, ignore_index=False)`  

其中:  
n/frac表示要抽出幾筆/幾%的資料，不能與相互共存  
replace代表是否要抽出放回，預設為false  
weights為每一筆資料(row)的權重，越大代表取出機率越高  
random_state為亂數基底(seed)  
axis為{0 or ‘index’, 1 or ‘columns’, None}其中一種，預設為None，對於Series和DataFrame來說，抽樣是取row  
ignore_index預設為False，如果為True則回傳index為預設(即0,1,2,3,...)

In [32]:
df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},
                  index=['falcon', 'dog', 'spider', 'fish'])
df

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [39]:
# 指定欄位抽取
df['num_legs'].sample(n=3, random_state=1)

fish      0
spider    8
falcon    2
Name: num_legs, dtype: int64

In [38]:
# 指定取後放回且抽出50%的資料筆數
df.sample(frac=0.5, replace=True, random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
dog,4,0,2
fish,0,0,8


In [40]:
# 指定抽出200%資料
df.sample(frac=2, replace=True, random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
dog,4,0,2
fish,0,0,8
falcon,2,2,10
falcon,2,2,10
fish,0,0,8
dog,4,0,2
fish,0,0,8
dog,4,0,2


In [41]:
# 指定特定欄位當作權重
df.sample(n=2, weights='num_specimen_seen', random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
fish,0,0,8


In [43]:
# 測試/訓練資料切割
train = df.sample(frac=.7)
test = df.drop(train.index)
print(f's1 :\n{df}\n\n'
      f'train :\n{train}\n\n'
      f'test :\n{test}')

s1 :
        num_legs  num_wings  num_specimen_seen
falcon         2          2                 10
dog            4          0                  2
spider         8          0                  1
fish           0          0                  8

train :
        num_legs  num_wings  num_specimen_seen
falcon         2          2                 10
spider         8          0                  1
dog            4          0                  2

test :
      num_legs  num_wings  num_specimen_seen
fish         0          0                  8


In [44]:
s1 = pd.DataFrame(np.arange(50).reshape(-1, 5),  
                   columns=['Ohio', 'Texas', 'California', 'illinos', 'New York'])
s1

Unnamed: 0,Ohio,Texas,California,illinos,New York
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [45]:
# 使用scikit learn套件做切割
from sklearn.model_selection import train_test_split
y = s1['New York']
# s1.drop(['New York'],axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(s1.iloc[:,:-1], y, test_size=0.3, random_state=42)
print(f'X_train :\n{X_train}\n\n'
      f'y_train :\n{y_train}\n\n'
      f'X_test :\n{X_test}\n\n'
      f'y_test :\n{y_test}')

X_train :
   Ohio  Texas  California  illinos
0     0      1           2        3
7    35     36          37       38
2    10     11          12       13
9    45     46          47       48
4    20     21          22       23
3    15     16          17       18
6    30     31          32       33

y_train :
0     4
7    39
2    14
9    49
4    24
3    19
6    34
Name: New York, dtype: int32

X_test :
   Ohio  Texas  California  illinos
8    40     41          42       43
1     5      6           7        8
5    25     26          27       28

y_test :
8    44
1     9
5    29
Name: New York, dtype: int32


In [46]:
type(s1.iloc[:,:-1])

pandas.core.frame.DataFrame

In [48]:
type(X_train)

pandas.core.frame.DataFrame

### Hierarchical Indexing (階層式索引)
假設我們的index以tuple的方式成對儲存在list內，那麼就可以利用MultiIndex來做方便的索引!

##### 傳統做法 vs Pandas作法

In [10]:
index = [('A',2000),('A',2010),('N',2000),('N',2010),('T',2000),('T',2010)]
popu = [2500,3500,4000,6000,1800,7200]
pop = pd.Series(popu , index=index)
pop

(A, 2000)    2500
(A, 2010)    3500
(N, 2000)    4000
(N, 2010)    6000
(T, 2000)    1800
(T, 2010)    7200
dtype: int64

In [11]:
# 可透過fancy index
pop[ ('A',2010):('T',2000)]

(A, 2010)    3500
(N, 2000)    4000
(N, 2010)    6000
(T, 2000)    1800
dtype: int64

In [12]:
# 但如果只要找2010的資料，那麼作法就會很棘手
pop[ [i for i in pop.index if i[1]==2010] ]

(A, 2010)    3500
(N, 2010)    6000
(T, 2010)    7200
dtype: int64

In [13]:
# Pandas提供多重 Index 索引功能
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('A', 2000),
            ('A', 2010),
            ('N', 2000),
            ('N', 2010),
            ('T', 2000),
            ('T', 2010)],
           )

In [14]:
# 使用新的索引套用在之前的Series上面
# 注意整個inddex排序變化
pop = pop.reindex(index)
pop

A  2000    2500
   2010    3500
N  2000    4000
   2010    6000
T  2000    1800
   2010    7200
dtype: int64

In [15]:
# 此時使用一般的切片寫法就能輕鬆取得想要的資料
pop[:,2010] , type(pop[:,2010])

(A    3500
 N    6000
 T    7200
 dtype: int64,
 pandas.core.series.Series)

In [16]:
# 在一維Series上使用二階索引，可透過unstack()函數來達成DataFrame的轉換(把其中一層索引轉成Column)
# 套用這個想法，每一個額外的index階層都能表示資料的另一個維度
pop = pd.DataFrame({'total':pop , 'under18':[1425,2017,688,947,1125,3038]})
pop

Unnamed: 0,Unnamed: 1,total,under18
A,2000,2500,1425
A,2010,3500,2017
N,2000,4000,688
N,2010,6000,947
T,2000,1800,1125
T,2010,7200,3038


In [18]:
f_u18 = pop['under18'] / pop['total']
f_u18

A  2000    0.570000
   2010    0.576286
N  2000    0.172000
   2010    0.157833
T  2000    0.625000
   2010    0.421944
dtype: float64

In [20]:
# 從最右邊的Index轉換成Columns
f_u18.unstack()

Unnamed: 0,2000,2010
A,0.57,0.576286
N,0.172,0.157833
T,0.625,0.421944


### 建立Multi Index
透過input多維list，可讓pd自動生成多重索引(透過Series或者DataFrame的constructor完成!)  
建立MultiIndex的物件型別有以下幾種:
1. .from_arrays(array)
2. .from_tuples(tuples_indide_list) 
3. .from_product(list1 , list2 , ...)

In [17]:
# 使用array
df = pd.DataFrame(np.random.rand(4,2) , 
                 index = [ ['a','a','b','b'] , [1,2,1,2] ],
                 columns = ['data1' , 'data2']
                 )
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.266746,0.829699
a,2,0.435036,0.524915
b,1,0.572001,0.116113
b,2,0.078122,0.929028


In [18]:
# 使用 tuples in a list
# 如果傳入以tuple為key的dict，則pd會自動識別
data = { ('A',2000):10000,
        ('A',2010):20000,
        ('N',2000):30000,
        ('N',2010):500000,
        ('T',2000):150000,
        ('T',2010):160000 }
pd.Series(data)

A  2000     10000
   2010     20000
N  2000     30000
   2010    500000
T  2000    150000
   2010    160000
dtype: int64

In [19]:
# unique索引值 (笛卡爾座標)
pd.MultiIndex.from_product([ ['x','y'] , [1,2] ])

MultiIndex([('x', 1),
            ('x', 2),
            ('y', 1),
            ('y', 2)],
           )

### Row的多重階層命名

In [22]:
pop

Unnamed: 0,Unnamed: 1,total,under18
A,2000,2500,1425
A,2010,3500,2017
N,2000,4000,688
N,2010,6000,947
T,2000,1800,1125
T,2010,7200,3038


In [24]:
pop.index.names = ['Type','Year']
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
Type,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2000,2500,1425
A,2010,3500,2017
N,2000,4000,688
N,2010,6000,947
T,2000,1800,1125
T,2010,7200,3038


### Column 的多重階層命名

In [25]:
index = pd.MultiIndex.from_product( [[2013,2014],[1,2]] , names=['year','visit'] )
columns = pd.MultiIndex.from_product( [ ['Bob','Guido','Sue'],['HR','Temp'] ] , names = ['subject','type'] )
data = np.round(np.random.randn(4,6),1)
data[:,::2] *= 10
data += 38

health_data = pd.DataFrame(data,index=index , columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,35.0,38.7,42.0,38.5,28.0,39.3
2013,2,47.0,38.4,35.0,38.4,42.0,37.7
2014,1,34.0,39.3,47.0,36.8,46.0,38.5
2014,2,34.0,36.5,36.0,37.4,41.0,39.2


### MultiIndex Slicing
要對d.f.使用多重索引，最好使用Pandas內建的Indexslice物件! 

In [26]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,42.0,38.5
2013,2,35.0,38.4
2014,1,47.0,36.8
2014,2,36.0,37.4


In [27]:
health_data['Guido','HR']

year  visit
2013  1        42.0
      2        35.0
2014  1        47.0
      2        36.0
Name: (Guido, HR), dtype: float64

In [48]:
health_data['Guido'][['HR']]

Unnamed: 0_level_0,type,HR
year,visit,Unnamed: 2_level_1
2013,1,42.0
2013,2,35.0
2014,1,47.0
2014,2,36.0


In [32]:
health_data['Guido'].HR[2013,1]

42.0

In [49]:
# 使用 int 的 iloc
health_data.iloc[:2,:2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,35.0,38.7
2013,2,47.0,38.4


In [51]:
# 使用 string 的 loc
health_data.loc[:,('Bob','Temp')]

year  visit
2013  1        38.7
      2        38.4
2014  1        39.3
      2        36.5
Name: (Bob, Temp), dtype: float64

In [62]:
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
Type,Yr,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2000,10000,1425
A,2010,20000,2017
N,2000,30000,688
N,2010,500000,947
T,2000,150000,1125
T,2010,160000,3038


In [65]:
# 使用Indexslice物件對d.f.做索引
idx = pd.IndexSlice
pop.loc[ idx['N',2010] , idx['under18'] ]

947

In [68]:
pop.loc[ idx[ ['A','T'],:] , idx[['under18']] ] , type(pop.loc[ idx[ ['A','T'],:] , idx[['under18']] ])

Unnamed: 0_level_0,Unnamed: 1_level_0,under18
Type,Yr,Unnamed: 2_level_1
A,2000,1425
A,2010,2017
T,2000,1125
T,2010,3038


In [69]:
pop.loc[ idx[ ['A','T'],:] , idx['under18'] ] , type(pop.loc[ idx[ ['A','T'],:] , idx['under18'] ])

(Type  Yr  
 A     2000    1425
       2010    2017
 T     2000    1125
       2010    3038
 Name: under18, dtype: int64,
 pandas.core.series.Series)

### Re-Index (重新排列索引)
在使用MultiIndex需要注意以下:
1. 未排序的索引使用MultiIndex操作會報錯
2. reindex()函數可以重設index，超出資料筆數的index會給予NaN值作對應

In [72]:
# 建立原物件
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [71]:
# reindex遇到超出範圍的會自動補NaN
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])  # e NaN
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [80]:
index = pd.MultiIndex.from_product([['a','c','b'] , [1,2]])
data = pd.Series(np.random.rand(6) , index=index)
data.index.names = ['AAA','BBB']
data

AAA  BBB
a    1      0.940780
     2      0.297738
c    1      0.600395
     2      0.980403
b    1      0.283563
     2      0.167472
dtype: float64

In [77]:
# 因為 MultiIndex還沒排序，所以以下指令會報錯
data['a':'b']

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [81]:
# 將index排序後，就不會報錯啦!
data = data.sort_index()
data

AAA  BBB
a    1      0.940780
     2      0.297738
b    1      0.283563
     2      0.167472
c    1      0.600395
     2      0.980403
dtype: float64

In [82]:
data['a':'b']

AAA  BBB
a    1      0.940780
     2      0.297738
b    1      0.283563
     2      0.167472
dtype: float64

### 索引的stack() 與 unstack()
可透過level引數指定要將哪一個 index 轉為column!

In [83]:
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
Type,Yr,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2000,10000,1425
A,2010,20000,2017
N,2000,30000,688
N,2010,500000,947
T,2000,150000,1125
T,2010,160000,3038


In [85]:
pop.unstack(level=0)

Unnamed: 0_level_0,total,total,total,under18,under18,under18
Type,A,N,T,A,N,T
Yr,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2000,10000,30000,150000,1425,688,1125
2010,20000,500000,160000,2017,947,3038


In [86]:
pop.unstack(level=1)

Unnamed: 0_level_0,total,total,under18,under18
Yr,2000,2010,2000,2010
Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,10000,20000,1425,2017
N,30000,500000,688,947
T,150000,160000,1125,3038


In [88]:
# 被unstack()可以用相反操作還原資料
pop.unstack(level=1).stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
Type,Yr,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2000,10000,1425
A,2010,20000,2017
N,2000,30000,688
N,2010,500000,947
T,2000,150000,1125
T,2010,160000,3038


### Set & Reset Index(索引設定與重設)
> .reset_index('IndexName') : 設定指定索引名稱成為欄位  
> .set_index(['colName1' , 'colName2' , ...]) : 設定指定欄位成為索引 

In [89]:
pop

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
Type,Yr,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2000,10000,1425
A,2010,20000,2017
N,2000,30000,688
N,2010,500000,947
T,2000,150000,1125
T,2010,160000,3038


In [95]:
a = pop.reset_index(['Type' , 'Yr'])
a

Unnamed: 0,Type,Yr,total,under18
0,A,2000,10000,1425
1,A,2010,20000,2017
2,N,2000,30000,688
3,N,2010,500000,947
4,T,2000,150000,1125
5,T,2010,160000,3038


In [96]:
a.set_index(['Type' , 'Yr'])

Unnamed: 0_level_0,Unnamed: 1_level_0,total,under18
Type,Yr,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2000,10000,1425
A,2010,20000,2017
N,2000,30000,688
N,2010,500000,947
T,2000,150000,1125
T,2010,160000,3038


### 多重索引aggregation
此項功能可以用groupby完全取代!

In [97]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,35.0,38.7,42.0,38.5,28.0,39.3
2013,2,47.0,38.4,35.0,38.4,42.0,37.7
2014,1,34.0,39.3,47.0,36.8,46.0,38.5
2014,2,34.0,36.5,36.0,37.4,41.0,39.2


In [98]:
health_data.mean(level='year')

  health_data.mean(level='year')


subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,41.0,38.55,38.5,38.45,35.0,38.5
2014,34.0,37.9,41.5,37.1,43.5,38.85


In [99]:
health_data.mean(axis=1,level='subject')

  health_data.mean(axis=1,level='subject')


Unnamed: 0_level_0,subject,Bob,Guido,Sue
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,1,36.85,40.25,33.65
2013,2,42.7,36.7,39.85
2014,1,36.65,41.9,42.25
2014,2,35.25,36.7,40.1
