# DataFrame Manipulation

### DataFrame Indexing
d.f.的索引可透過 loc 和 iloc 達成:

In [2]:
import numpy as np
import pandas as pd

In [3]:
# 將d.f.當作二維陣列，使用 loc & iloc
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
popu = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'popu':popu})
data

Unnamed: 0,area,popu
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [14]:
# 指定名稱使用loc
# 注意 loc 為 explicit表示
data.loc[:'New York',['popu']]

Unnamed: 0,popu
California,38332521
Texas,26448193
New York,19651127


In [13]:
# 注意popu欄位有無中括號在顯示上的差異(一個是Series，一個是d.f.)
data.loc[:'New York','popu'] , type(data.loc[:'New York','popu']) , type(data.loc[:'New York',['popu']])

(California    38332521
 Texas         26448193
 New York      19651127
 Name: popu, dtype: int64,
 pandas.core.series.Series,
 pandas.core.frame.DataFrame)

In [18]:
# iloc為implicit表示，所以不包含end point
data.iloc[2:4,0:1] , type(data.iloc[2:4,0:1])

(            area
 New York  141297
 Florida   170312,
 pandas.core.frame.DataFrame)

In [19]:
# 注意給範圍和單一value所回傳的型別差異(給範圍回傳d.f.，給單一值回傳Series)
data.iloc[2:4,0] , type(data.iloc[2:4,0])

(New York    141297
 Florida     170312
 Name: area, dtype: int64,
 pandas.core.series.Series)

In [20]:
# 作用在row的特殊索引
data[data.area>170000]

Unnamed: 0,area,popu
California,423967,38332521
Texas,695662,26448193
Florida,170312,19552860


In [21]:
data[1:3]

Unnamed: 0,area,popu
Texas,695662,26448193
New York,141297,19651127


In [25]:
# 注意欄位往前搜尋要給-1，否則預設向下搜尋
data['Texas':'California']

Unnamed: 0,area,popu


In [26]:
data['Texas':'California':-1]

Unnamed: 0,area,popu
Texas,695662,26448193
California,423967,38332521


In [30]:
# 使用mask手法用true/false值來搜尋 (注意長度要給對，否則會報錯)
data[ [True,False,True,False,False] ]

Unnamed: 0,area,popu
California,423967,38332521
New York,141297,19651127


### Data Sampling(資料抽樣)
根據 Pandas官方API文件: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html  
> `.sample([n=None | frac=None], replace=False, weights=None, random_state=None, axis=None, ignore_index=False)`  

其中:  
n/frac表示要抽出幾筆/幾%的資料，不能與相互共存  
replace代表是否要抽出放回，預設為false  
weights為每一筆資料(row)的權重，越大代表取出機率越高  
random_state為亂數基底(seed)  
axis為{0 or ‘index’, 1 or ‘columns’, None}其中一種，預設為None，對於Series和DataFrame來說，抽樣是取row  
ignore_index預設為False，如果為True則回傳index為預設(即0,1,2,3,...)

In [32]:
df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},
                  index=['falcon', 'dog', 'spider', 'fish'])
df

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [39]:
# 指定欄位抽取
df['num_legs'].sample(n=3, random_state=1)

fish      0
spider    8
falcon    2
Name: num_legs, dtype: int64

In [38]:
# 指定取後放回且抽出50%的資料筆數
df.sample(frac=0.5, replace=True, random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
dog,4,0,2
fish,0,0,8


In [40]:
# 指定抽出200%資料
df.sample(frac=2, replace=True, random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
dog,4,0,2
fish,0,0,8
falcon,2,2,10
falcon,2,2,10
fish,0,0,8
dog,4,0,2
fish,0,0,8
dog,4,0,2


In [41]:
# 指定特定欄位當作權重
df.sample(n=2, weights='num_specimen_seen', random_state=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
fish,0,0,8


In [43]:
# 測試/訓練資料切割
train = df.sample(frac=.7)
test = df.drop(train.index)
print(f's1 :\n{df}\n\n'
      f'train :\n{train}\n\n'
      f'test :\n{test}')

s1 :
        num_legs  num_wings  num_specimen_seen
falcon         2          2                 10
dog            4          0                  2
spider         8          0                  1
fish           0          0                  8

train :
        num_legs  num_wings  num_specimen_seen
falcon         2          2                 10
spider         8          0                  1
dog            4          0                  2

test :
      num_legs  num_wings  num_specimen_seen
fish         0          0                  8


In [44]:
s1 = pd.DataFrame(np.arange(50).reshape(-1, 5),  
                   columns=['Ohio', 'Texas', 'California', 'illinos', 'New York'])
s1

Unnamed: 0,Ohio,Texas,California,illinos,New York
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24
5,25,26,27,28,29
6,30,31,32,33,34
7,35,36,37,38,39
8,40,41,42,43,44
9,45,46,47,48,49


In [45]:
# 使用scikit learn套件做切割
from sklearn.model_selection import train_test_split
y = s1['New York']
# s1.drop(['New York'],axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(s1.iloc[:,:-1], y, test_size=0.3, random_state=42)
print(f'X_train :\n{X_train}\n\n'
      f'y_train :\n{y_train}\n\n'
      f'X_test :\n{X_test}\n\n'
      f'y_test :\n{y_test}')

X_train :
   Ohio  Texas  California  illinos
0     0      1           2        3
7    35     36          37       38
2    10     11          12       13
9    45     46          47       48
4    20     21          22       23
3    15     16          17       18
6    30     31          32       33

y_train :
0     4
7    39
2    14
9    49
4    24
3    19
6    34
Name: New York, dtype: int32

X_test :
   Ohio  Texas  California  illinos
8    40     41          42       43
1     5      6           7        8
5    25     26          27       28

y_test :
8    44
1     9
5    29
Name: New York, dtype: int32


In [46]:
type(s1.iloc[:,:-1])

pandas.core.frame.DataFrame

In [48]:
type(X_train)

pandas.core.frame.DataFrame