[Reference](https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b90)

In [1]:
import numpy as np
import pandas as pd

# One tuple per employee:
# (id, name, surname, division, salary, telephone, contract type)
employee_records = [
    ('128', 'Patrick', 'Miller', 'Sales', 30000, '7366578', 'permanent'),
    ('478', 'Amanda', 'Torres', 'IT', 54000, '7366444', 'temporary'),
    ('257', 'Antonella', 'Brown', 'IT', 80000, '7366120', 'temporary'),
    ('299', 'Eduard', 'Iglesias', 'Sales', 79000, '7366574', 'permanent'),
    ('175', 'John', 'Wright', 'Marketing', 15000, '7366113', 'internship'),
    ('328', 'Alejandra', 'Campos', 'Engineering', 18000, '7366117', 'internship'),
    ('099', 'Layton', 'Platt', 'Engineering', 30000, '7366777', 'permanent'),
    ('457', 'Melanie', 'Cavill', 'Sales', 35000, '7366579', 'temporary'),
    ('144', 'David', 'Lange', 'Engineering', 45000, '7366441', 'permanent'),
    ('222', 'Lewis', 'Bellow', 'Sales', 30500, '7366440', 'permanent'),
]

# Transpose the records into one list per field (ids and phone numbers stay strings).
id_number, name, surname, division, salary, telephone, type_contract = (
    list(field_values) for field_values in zip(*employee_records)
)

# Employee data frame: one column per field, indexed by the employee id.
column_data = {
    'name': name,
    'surname': surname,
    'division': division,
    'salary': salary,
    'telephone': telephone,
    'type_contract': type_contract,
}
df_employees = pd.DataFrame(column_data, index=id_number)

df_employees

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
175,John,Wright,Marketing,15000,7366113,internship
328,Alejandra,Campos,Engineering,18000,7366117,internship
099,Layton,Platt,Engineering,30000,7366777,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary
144,David,Lange,Engineering,45000,7366441,permanent
222,Lewis,Bellow,Sales,30500,7366440,permanent


# Selecting a single column by label

In [4]:
# Two equivalent ways to pull out one column: attribute access and bracket access.
# Note: `salary` rebinds the name that held the plain list in the setup cell.
salary = df_employees.salary
salary_2 = df_employees['salary']

# both forms give back a pandas Series
print(type(salary))
print(type(salary_2))

# display the selected column
salary

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


128    30000
478    54000
257    80000
299    79000
175    15000
328    18000
099    30000
457    35000
144    45000
222    30500
Name: salary, dtype: int64

In [5]:
# Bracket access with a single label returns the column as a Series.
df_employees['salary']

128    30000
478    54000
257    80000
299    79000
175    15000
328    18000
099    30000
457    35000
144    45000
222    30500
Name: salary, dtype: int64

In [6]:
# A list containing one label yields a one-column DataFrame rather than a Series.
df_employees[['salary']]

Unnamed: 0,salary
128,30000
478,54000
257,80000
299,79000
175,15000
328,18000
099,30000
457,35000
144,45000
222,30500


# Selecting multiple columns by label

In [7]:
# Pass a list of column labels to get a DataFrame with just those columns.
df_employees[['division', 'salary']]

Unnamed: 0,division,salary
128,Sales,30000
478,IT,54000
257,IT,80000
299,Sales,79000
175,Marketing,15000
328,Engineering,18000
099,Engineering,30000
457,Sales,35000
144,Engineering,45000
222,Sales,30500


# Selecting columns by data type

In [8]:
# numpy (np) comes from the notebook's top import cell, so this cell no longer
# carries its own import.

# select numeric columns - dtype given as a numpy object
numeric_inputs = df_employees.select_dtypes(include=np.number)

# check selected columns with the .columns attribute
# (a bare expression in the middle of a cell is not displayed; its value is noted below)
numeric_inputs.columns
# Index(['salary'], dtype='object')

# the method returns a DataFrame object
print(type(numeric_inputs))
# <class 'pandas.core.frame.DataFrame'>

# select numeric columns - dtype given as a string
numeric_inputs_2 = df_employees.select_dtypes(include='number')

# check selected columns with the .columns attribute (again not displayed mid-cell)
numeric_inputs_2.columns
# Index(['salary'], dtype='object')

# the method returns a DataFrame object
print(type(numeric_inputs_2))
# <class 'pandas.core.frame.DataFrame'>

# visualize the data frame (last expression of the cell, so it is rendered)
numeric_inputs

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,salary
128,30000
478,54000
257,80000
299,79000
175,15000
328,18000
099,30000
457,35000
144,45000
222,30500


In [9]:
# Summary of the frame: index entries, per-column non-null counts and dtypes, memory usage.
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 128 to 222
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           10 non-null     object
 1   surname        10 non-null     object
 2   division       10 non-null     object
 3   salary         10 non-null     int64 
 4   telephone      10 non-null     object
 5   type_contract  10 non-null     object
dtypes: int64(1), object(5)
memory usage: 880.0+ bytes


In [10]:
# Per-column dtypes: only 'salary' is numeric (int64); the other columns are object.
df_employees.dtypes

name             object
surname          object
division         object
salary            int64
telephone        object
type_contract    object
dtype: object

# Selecting a single row by label

In [11]:
# Row labels are the employee ids stored as strings (note the leading zero in '099').
df_employees.index

Index(['128', '478', '257', '299', '175', '328', '099', '457', '144', '222'], dtype='object')

In [12]:
# .loc with a single row label returns that row as a Series.
df_employees.loc['478']

name                Amanda
surname             Torres
division                IT
salary               54000
telephone          7366444
type_contract    temporary
Name: 478, dtype: object

In [13]:
# Wrapping the label in a list returns a one-row DataFrame instead of a Series.
df_employees.loc[['478']]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
478,Amanda,Torres,IT,54000,7366444,temporary


# Selecting multiple rows by label

In [14]:
# A list of row labels selects the matching rows as a DataFrame.
df_employees.loc[['478', '222']]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
478,Amanda,Torres,IT,54000,7366444,temporary
222,Lewis,Bellow,Sales,30500,7366440,permanent


# Selecting a single row by position

In [15]:
# .iloc is position-based: 0 is the first row (employee '128'), returned as a Series.
df_employees.iloc[0]

name               Patrick
surname             Miller
division             Sales
salary               30000
telephone          7366578
type_contract    permanent
Name: 128, dtype: object

In [16]:
# Negative positions count from the end: -1 is the last row (employee '222').
df_employees.iloc[-1]

name                 Lewis
surname             Bellow
division             Sales
salary               30500
telephone          7366440
type_contract    permanent
Name: 222, dtype: object

In [17]:
# Putting the position in a list returns the last row as a one-row DataFrame.
df_employees.iloc[[-1]]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
222,Lewis,Bellow,Sales,30500,7366440,permanent


In [18]:
# (rows, columns): 10 rows, so the valid row positions for .iloc are 0 through 9.
df_employees.shape

(10, 6)

In [19]:
# Position 10 is one past the last row (the frame has 10 rows, positions 0-9),
# so .iloc raises IndexError. The error is the point of this cell: catch it and
# print it so that "Restart & Run All" can continue past the demonstration.
try:
    df_employees.iloc[10]
except IndexError as exc:
    print('IndexError:', exc)

IndexError: single positional indexer is out-of-bounds

# Selecting multiple rows by position

In [20]:
# A list of integer positions selects those rows (here the first five).
df_employees.iloc[[0, 1, 2, 3, 4]]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
175,John,Wright,Marketing,15000,7366113,internship


In [21]:
# Position slice: the stop position 5 is excluded, so this returns rows 0-4.
df_employees.iloc[0:5]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
175,John,Wright,Marketing,15000,7366113,internship


# Selecting rows and columns simultaneously

In [22]:
# [row position, column position] -> scalar: second row, fourth column ('salary').
df_employees.iloc[1, 3]

54000

In [23]:
# Label-based form of the same lookup: row '478', column 'salary'.
df_employees.loc['478', 'salary']

54000

In [24]:
# One row position plus a list of column positions -> Series (name, surname, salary).
df_employees.iloc[1, [0, 1, 3]]

name       Amanda
surname    Torres
salary      54000
Name: 478, dtype: object

In [25]:
# Label-based form: one row label plus a list of column labels -> Series.
df_employees.loc['478', ['name', 'surname', 'salary']]

name       Amanda
surname    Torres
salary      54000
Name: 478, dtype: object

In [26]:
# Lists on both axes -> DataFrame: rows at positions 1 and 9, columns at positions 0, 1 and 3.
df_employees.iloc[[1, 9], [0, 1, 3]]

Unnamed: 0,name,surname,salary
478,Amanda,Torres,54000
222,Lewis,Bellow,30500


In [27]:
# Label-based form: lists of row labels and column labels -> DataFrame.
df_employees.loc[['478', '222'], ['name', 'surname', 'salary']]

Unnamed: 0,name,surname,salary
478,Amanda,Torres,54000
222,Lewis,Bellow,30500


In [28]:
# Position slice :4 stops before position 4, so the first four rows are returned.
df_employees.iloc[:4, [0, 1, 3]]

Unnamed: 0,name,surname,salary
128,Patrick,Miller,30000
478,Amanda,Torres,54000
257,Antonella,Brown,80000
299,Eduard,Iglesias,79000


In [29]:
# Label slices include the end label: rows from the start up to and including '299'.
df_employees.loc[:'299', ['name', 'surname', 'salary']]

Unnamed: 0,name,surname,salary
128,Patrick,Miller,30000
478,Amanda,Torres,54000
257,Antonella,Brown,80000
299,Eduard,Iglesias,79000


# Selecting a scalar value using the .at[] and .iat[] indexers

In [30]:
# .iat: scalar access by integer position (row 1, column 3).
df_employees.iat[1, 3]

54000

In [31]:
# .at: scalar access by label (row '478', column 'salary').
df_employees.at['478', 'salary']

54000

In [32]:
# Time the label-based scalar lookup through .loc (the next cell times .at).
%timeit df_employees.loc['478', 'salary']

The slowest run took 16.05 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 7.38 µs per loop


In [33]:
# Time the same label-based scalar lookup through .at.
%timeit df_employees.at['478', 'salary']

The slowest run took 27.07 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 4.33 µs per loop


In [34]:
# Time the position-based scalar lookup through .iloc (the next cell times .iat).
%timeit df_employees.iloc[1, 3]

The slowest run took 16.32 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 7.55 µs per loop


In [35]:
# Time the same position-based scalar lookup through .iat.
%timeit df_employees.iat[1, 3]

The slowest run took 13.05 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 4.98 µs per loop


In [36]:
# .at is a scalar accessor: giving it a list of column labels fails (the recorded
# run shows TypeError). The error is the point of this cell, so catch it and print
# it to let "Restart & Run All" continue; .loc handles the multi-column case.
# NOTE(review): other pandas versions may raise a different exception type here — confirm.
try:
    df_employees.at['478', ['name', 'surname', 'salary']]
except TypeError as exc:
    print('TypeError:', exc)

TypeError: ignored

# Selecting rows using Boolean selection

In [37]:
# Element-wise comparison produces a boolean Series (a mask) with the same index.
df_employees['salary'] > 45000

128    False
478     True
257     True
299     True
175    False
328    False
099    False
457    False
144    False
222    False
Name: salary, dtype: bool

In [38]:
# Indexing with the boolean mask keeps only the rows where it is True.
df_employees[df_employees['salary'] > 45000]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent


In [39]:
# Combine conditions with & (and); each comparison is wrapped in its own parentheses.
df_employees[(df_employees['salary'] > 45000) & (df_employees['type_contract'] == 'permanent')]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
299,Eduard,Iglesias,Sales,79000,7366574,permanent


In [40]:
# | (or) keeps rows that satisfy either condition.
df_employees[(df_employees['type_contract'] == 'temporary') | (df_employees['type_contract'] == 'permanent')]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
099,Layton,Platt,Engineering,30000,7366777,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary
144,David,Lange,Engineering,45000,7366441,permanent
222,Lewis,Bellow,Sales,30500,7366440,permanent


In [41]:
# .isin(...) tests membership in a list of values; here it selects the same rows as the | filter.
df_employees[df_employees['type_contract'].isin(['temporary', 'permanent'])]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
099,Layton,Platt,Engineering,30000,7366777,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary
144,David,Lange,Engineering,45000,7366441,permanent
222,Lewis,Bellow,Sales,30500,7366440,permanent


In [42]:
# Time the two equivalent filters; in the recorded run the isin version is the faster one.
# execution time with the logical operator |
%timeit df_employees[(df_employees['type_contract'] == 'temporary') | (df_employees['type_contract'] == 'permanent')]

# execution time with the isin method
%timeit df_employees[df_employees['type_contract'].isin(['temporary', 'permanent'])]

1000 loops, best of 3: 680 µs per loop
1000 loops, best of 3: 476 µs per loop


In [43]:
# between(left, right) includes both endpoints: salaries of exactly 30000 and 80000 are kept.
df_employees[df_employees['salary'].between(30000, 80000)]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
099,Layton,Platt,Engineering,30000,7366777,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary
144,David,Lange,Engineering,45000,7366441,permanent
222,Lewis,Bellow,Sales,30500,7366440,permanent


In [44]:
# Exclude both endpoints, so salaries of exactly 30000 and 80000 are dropped.
# `inclusive` takes a string ('both', 'neither', 'left', 'right') since pandas 1.3;
# the earlier boolean form (inclusive=False) is rejected by pandas 2.x.
df_employees[df_employees['salary'].between(30000, 80000, inclusive='neither')]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
478,Amanda,Torres,IT,54000,7366444,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary
144,David,Lange,Engineering,45000,7366441,permanent
222,Lewis,Bellow,Sales,30500,7366440,permanent


In [45]:
# The same inclusive range written as two explicit comparisons combined with &.
df_employees[(df_employees['salary']>=30000) & (df_employees['salary']<=80000)]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
299,Eduard,Iglesias,Sales,79000,7366574,permanent
099,Layton,Platt,Engineering,30000,7366777,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary
144,David,Lange,Engineering,45000,7366441,permanent
222,Lewis,Bellow,Sales,30500,7366440,permanent


In [46]:
# .str.contains keeps the rows whose telephone string contains '57'.
df_employees[df_employees['telephone'].str.contains('57')]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
128,Patrick,Miller,Sales,30000,7366578,permanent
299,Eduard,Iglesias,Sales,79000,7366574,permanent
457,Melanie,Cavill,Sales,35000,7366579,temporary


In [47]:
# .str.startswith keeps the rows whose name begins with 'A'.
df_employees[df_employees['name'].str.startswith('A')]

Unnamed: 0,name,surname,division,salary,telephone,type_contract
478,Amanda,Torres,IT,54000,7366444,temporary
257,Antonella,Brown,IT,80000,7366120,temporary
328,Alejandra,Campos,Engineering,18000,7366117,internship
