# Mod10 DataFrame Indexing and Selection

## Data Selection in DataFrame

### DataFrame as a dictionary

In [1]:
import numpy as np
import pandas as pd

In [2]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [5]:
data['density'] = data['pop'] / data['area']      # 透過直接給值增加一欄資料
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [4]:
print(data['area'],end="\n------------------\n")   #dictionary-style indexing
print(data.area)                                   #attribute-style access

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
------------------
California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64


In [5]:
data.area is data['area']

True

In [6]:
# 因為DataFrame有一個方法pop()，所以會造成衝突。建議全部使用data["pop"]來取值
data.pop is data['pop']

False

### DataFrame as two-dimensional array
<details>
    <summary><b>dataframe結構圖</b></summary>
    <img src='./img/creating_dataframe1.png'>
</details>

In [4]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


transpose the full ``DataFrame`` to swap rows and columns:

passing a single index to an array accesses a row:

In [5]:
data.values[0,1]                                   # 如果要取其中一個值，必須先用.values把dataframe轉成array，或是用loc、iloc去取

38332521.0

In [10]:
display(data["area"])                              # index只能用column-name
print(type(data["area"]))                          # 用一個中括號取出來的是series
display(data[["area"]])                           
print(type(data[["area"]]))                        # 用兩個中括號取出來的是dataframe

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

<class 'pandas.core.series.Series'>


Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


<class 'pandas.core.frame.DataFrame'>


In [14]:
data[['area','pop']]                       # fancy-index

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [20]:
display(data['Florida':'Illinois'])        # slice只能用row-name或序位
display(data[3:])

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [16]:
df = pd.DataFrame([{'x':i+1, 'y':i*2} for i in range(4)]); display(df)
df[1:3]                                    # # DataFrame的index是數字時，參數slice是序位

Unnamed: 0,x,y
0,1,0
1,2,2
2,3,4
3,4,6


Unnamed: 0,x,y
1,2,2
2,3,4


#### masking in Dataframe

In [41]:
data[data.density > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


In [42]:
data[(data['pop'] > 20000000) & (data['area'] > 500000)]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874


In [78]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


### Indexers: loc, iloc

In [11]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [13]:
print(data.loc['Illinois', 'pop'],end="\n--------------------\n")
print(data.loc['Texas'],end="\n--------------------\n")
print(data.loc[:'Texas', ['area','pop']],end="\n--------------------\n")
print(data.at['Texas','area'],end="\n--------------------\n")

12882135
--------------------
area       6.956620e+05
pop        2.644819e+07
density    3.801874e+01
Name: Texas, dtype: float64
--------------------
              area       pop
California  423967  38332521
Texas       695662  26448193
--------------------
695662
--------------------


In [25]:
print(data.iloc[4,1],end="\n--------------------\n")
print(data.iloc[1],end="\n--------------------\n")
print(data.iloc[:2, :2],end="\n--------------------\n")
print(data.iat[4,0],end="\n--------------------\n")

12882135
--------------------
area       6.956620e+05
pop        2.644819e+07
density    3.801874e+01
Name: Texas, dtype: float64
--------------------
              area       pop
California  423967  38332521
Texas       695662  26448193
--------------------
149995
--------------------


In [28]:
display(data.loc[data.density > 100, ['pop', 'density']])
display(data.loc[data.density > 100, 'pop':'density'])

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [27]:
display(data.iloc[list(data.density > 100), [0,2]])    # 用iloc()一定要轉換成list  
display(data.iloc[list(data.density > 100), 0:3])      # 用iloc()一定要轉換成list  

Unnamed: 0,area,density
New York,141297,139.076746
Florida,170312,114.806121


Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


to set or modify values

## Lab

<b>有一個 DataFrame df，取得如下的資料:
* 'two' 整欄的資料
* 'three' 與 'one' 這兩欄的資料
</b>

In [29]:
np.random.seed(51)
df = pd.DataFrame(np.random.randint(11,70,size=20).reshape((4, 5)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four', 'five'])
df

Unnamed: 0,one,two,three,four,five
Ohio,68,48,43,20,16
Colorado,27,32,41,63,52
Utah,48,69,57,48,41
New York,39,31,47,46,38


In [30]:
df["two"]

Ohio        48
Colorado    32
Utah        69
New York    31
Name: two, dtype: int32

In [58]:
df[["three","one"]]

Unnamed: 0,three,one
Ohio,43,68
Colorado,41,27
Utah,57,48
New York,47,39


<b>透過 loc 以及 iloc 取得 'three' 與 'one' 這兩欄的 'Ohio' 與 'Utah' 的資料</b>

In [42]:
df.loc[["Ohio","Utah"],["three","one"]]

Unnamed: 0,three,one
Ohio,43,68
Utah,57,48


In [43]:
df.iloc[[0,0],[2,1]]

Unnamed: 0,three,two
Ohio,43,48
Ohio,43,48


<b>透過 at 以及 iat 取得 'Utah' 的 'three' 的資料</b>

In [48]:
df.at["Utah","three"]

57

In [49]:
df.iat[2,2]

57