## Data Selection

In [27]:
import pandas as pd

In [28]:
# Load the course roster from the Excel workbook; the path is relative,
# so it resolves against the notebook's working directory.
df = pd.read_excel("../data/course_students.xlsx")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


## Index

In [29]:
# Give the row index a name. This only labels the existing 0..3 index;
# it does not move the "user_id" column into the index.
# NOTE(review): the index name now equals an existing column label, which
# pandas treats as ambiguous in label-based calls such as sort_values or
# groupby. If the user ids were meant to be the index, confirm whether
# df.set_index("user_id") was intended instead.
df.index.name = "user_id"
df

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


## Selecting Data by label

In [30]:
# Select 1 value - Scalar (one row label + one column label)
df.loc[0, "name"]

'Mark'

In [31]:
# Select 2 values - Series (list of row labels + one column label)
df.loc[[0, 2], "age"]

user_id
0    55
2    41
Name: age, dtype: int64

In [32]:
# Select 1 column - Series (":" keeps every row)
df.loc[:, "country"]

user_id
0      Italy
1        USA
2        USA
3    Germany
Name: country, dtype: object

In [33]:
# Select 1 row - Series (the row label becomes the Series name,
# and the column labels become its index)
df.loc[1, :]

user_id         1000
name            John
age               33
country          USA
score            6.7
continent    America
Name: 1, dtype: object

In [34]:
# Select 1 column in a list - DataFrame
# (a list selector keeps the result two-dimensional, even with one label)
df.loc[:, ["country"]]

Unnamed: 0_level_0,country
user_id,Unnamed: 1_level_1
0,Italy
1,USA
2,USA
3,Germany


In [35]:
# Select 1 row in a list - DataFrame
# (a list selector keeps the result two-dimensional, even with one label)
df.loc[[1], :]

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1000,John,33,USA,6.7,America


In [36]:
# Select multiple columns by passing a list of labels - DataFrame
df.loc[:, ["name", "country"]]

Unnamed: 0_level_0,name,country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Mark,Italy
1,John,USA
2,Tim,USA
3,Jenny,Germany


In [37]:
# Select range of columns - DataFrame
# (label slices include BOTH endpoints: name, age and country are returned)
df.loc[:, "name":"country"]

Unnamed: 0_level_0,name,age,country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Mark,55,Italy
1,John,33,USA
2,Tim,41,USA
3,Jenny,12,Germany


## Selecting Data by position

In [38]:
# Select 1 value - Scalar (row position 0, column position 1)
# Positions are 0-based and the index is not counted as a column,
# so column position 1 is "name".
df.iloc[0, 1]

'Mark'

In [39]:
# Select 1 column - Series (column position 1 is "name")
df.iloc[:, 1]

user_id
0     Mark
1     John
2      Tim
3    Jenny
Name: name, dtype: object

In [40]:
# Select 1 row - Series (row position 1)
df.iloc[1, :]

user_id         1000
name            John
age               33
country          USA
score            6.7
continent    America
Name: 1, dtype: object

In [41]:
# Select 1 column in a list - DataFrame
# (a list of positions keeps the result two-dimensional)
df.iloc[:, [1]]

Unnamed: 0_level_0,name
user_id,Unnamed: 1_level_1
0,Mark
1,John
2,Tim
3,Jenny


In [42]:
# Select 1 row in a list - DataFrame
# (a list of positions keeps the result two-dimensional)
df.iloc[[1], :]

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1000,John,33,USA,6.7,America


In [43]:
# Select multiple columns - DataFrame
# (columns come back in the order listed: age, then name)
df.iloc[:, [2, 1]]

Unnamed: 0_level_0,age,name
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,55,Mark
1,33,John
2,41,Tim
3,12,Jenny


In [44]:
# Select multiple rows - DataFrame
# (rows come back in the order listed: position 3, then position 1)
df.iloc[[3, 1], :]

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,1003,Jenny,12,Germany,9.0,Europe
1,1000,John,33,USA,6.7,America


In [45]:
# Select range of columns - DataFrame
# (positional slices EXCLUDE the end: 1:3 returns positions 1 and 2)
df.iloc[:, 1:3]

Unnamed: 0_level_0,name,age
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Mark,55
1,John,33
2,Tim,41
3,Jenny,12


In [46]:
# Select the first 3 columns - DataFrame
# (omitting the slice start begins at position 0)
df.iloc[:, :3]

Unnamed: 0_level_0,user_id,name,age
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1001,Mark,55
1,1000,John,33
2,1002,Tim,41
3,1003,Jenny,12


In [47]:
# Select range of rows - DataFrame
# (end position 3 is excluded: rows at positions 1 and 2 are returned)
df.iloc[1:3, :]

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America


In [48]:
# Select the first 3 rows - DataFrame
# (omitting the slice start begins at position 0)
df.iloc[:3, :]

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America


## Selecting Data by boolean indexing

In [49]:
# create a boolean series: True for rows where age > 40 AND country is "USA".
# Each comparison is parenthesised because & binds more tightly than > and ==.
tf = (df["age"] > 40) & (df["country"] == "USA")

In [50]:
# select the rows where the boolean series is True, keeping all columns
df.loc[tf, :]

Unnamed: 0_level_0,user_id,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,1002,Tim,41,USA,3.9,America


## Selecting Data by using a MultiIndex

In [52]:
# Build a two-level (continent, country) MultiIndex and sort it, as one
# chained expression. The columns promoted to index levels no longer
# appear among the data columns.
df_multi = df.set_index(["continent", "country"]).sort_index()
df_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,name,age,score
continent,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
America,USA,1000,John,33,6.7
America,USA,1002,Tim,41,3.9
Europe,Germany,1003,Jenny,12,9.0
Europe,Italy,1001,Mark,55,4.5


In [53]:
# select by first-level index value Europe
# (the result is indexed by the remaining "country" level)
df_multi.loc["Europe", :]

Unnamed: 0_level_0,user_id,name,age,score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Germany,1003,Jenny,12,9.0
Italy,1001,Mark,55,4.5


In [55]:
# select by both index level values, passed as a tuple: (continent, country)
df_multi.loc[("Europe", "Germany"), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,name,age,score
continent,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Europe,Germany,1003,Jenny,12,9.0
