In [1]:
import pandas as pd
import numpy as np

  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)


# Filtering DataFrames

## reducing memory

Whenever importing a data set, it’s important to consider whether each column stores
its data in the most optimal type. The “best” data type is the one that consumes the
least memory or provides the most utility

Let's convert each column to the proper datatype

In [5]:
employees = pd.read_csv('/home/diego/Documents/Data/employees.csv',
            parse_dates=['Start Date'])
employees

  employees = pd.read_csv('/home/diego/Documents/Data/employees.csv',


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [6]:
employees.info()
# we observe that mgmt values are strings, but they are True or False so let's convert them into Booleans

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        933 non-null    object        
 5   Team        957 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 47.1+ KB


In [9]:
# with the astype method 
employees = employees.astype({'Mgmt': bool})

In [11]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 40.2+ KB


In [12]:
# now we will cast the Salary column to integers
employees = employees.astype({'Salary': int})


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer: Error while type casting for column 'Salary'

We need to change the NaN datatypes to be able to use the astype method, we use `fillna()` method to replace the nan values to another we specify

In [15]:
employees = employees.fillna({'Salary': 0})

In [19]:
employees

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0.0,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


Now converting the values with the `astype()`

In [23]:
employees = employees.astype({'Salary': int})

In [24]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(3)
memory usage: 40.2+ KB


In [31]:
# casting the Gender and Team columns into categorical datatype because they only have two possible options in a total of 1001 rows
employees = employees.astype({'Gender': 'category', 'Team': 'category'})

In [36]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int64         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](1), int64(1), object(1)
memory usage: 27.0+ KB


# filtering with conditions

With the following syntax we compare each value of a Series with the other value

`Series == value` -> Series of booleans

In [38]:
employees['First Name'] == 'Maria'

0       False
1       False
2        True
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: First Name, Length: 1001, dtype: bool

In [42]:
# To extract the rows that meets the criteria, the following permits us to do
marias = employees['First Name'] == 'Maria'  # this returns a Series of Booleans
employees[marias]

pandas.core.series.Series

What if we want to extract a subset of employees who
are not on the Finance team?

In [48]:
not_finance = employees.Team != 'Finance'
employees[not_finance]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


What if we want to retrieve all the managers in the company?

In [53]:
# because Mgmt values are booleans is not necessary to do this:
# managers = employees['Mgmt'] == True
# employees[managers]

# instead just past the colum
employees[employees.Mgmt]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,112769,True,Finance
993,Tina,Female,1997-05-15,56450,True,Engineering
994,George,Male,2013-06-21,98874,True,Marketing
999,Albert,Male,2012-05-15,129949,True,Sales


Salary values greater than $100,000

In [55]:
gt_100k = employees.Salary > 100_000
employees[gt_100k]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
9,Frances,Female,2002-08-08,139852,True,Business Dev
...,...,...,...,...,...,...
990,Robin,Female,1987-07-24,100765,True,IT
991,Rose,Female,2002-08-25,134505,True,Marketing
992,Anthony,Male,2011-10-16,112769,True,Finance
995,Henry,,2014-11-23,132483,False,Distribution


Suppose that we want to find all female employees who work on the business development team.

In [58]:
is_female = employees.Gender == 'Female'
bd =  employees.Team == 'Business Dev'

In [63]:
employees[is_female & bd]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
33,Jean,Female,1993-12-18,119082,False,Business Dev
36,Rachel,Female,2009-02-16,142032,False,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
61,Denise,Female,2001-11-06,106862,False,Business Dev
66,Nancy,Female,2012-12-15,125250,True,Business Dev
92,Linda,Female,2000-05-25,119009,True,Business Dev
111,Bonnie,Female,1999-12-17,42153,True,Business Dev
114,Ashley,Female,2002-08-04,58698,True,Business Dev
118,Andrea,Female,2012-01-12,120204,False,Business Dev


We want to identify all employees with a Salary below $40,000 or a Start
Date after January 1, 2015.

In [66]:
salary_lt40_000 = employees.Salary < 40_000
date_after_2015 = employees['Start Date'] > '2015-01-01'

In [67]:
date_after_2015

0       False
1       False
2       False
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: Start Date, Length: 1001, dtype: bool

In [69]:
employees[salary_lt40_000 | date_after_2015]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
7,,Female,2015-07-20,45906,True,Finance
15,Lillian,Female,2016-06-05,59414,False,Product
25,,Male,2012-10-08,37076,True,IT
26,Craig,Male,2000-02-27,37598,True,Marketing
...,...,...,...,...,...,...
958,Gloria,Female,1987-10-24,39833,False,Engineering
964,Bruce,Male,1980-05-07,35802,True,Sales
967,Thomas,Male,2016-03-12,105681,False,Engineering
989,Justin,,1991-02-10,38344,False,Legal


## inverser ~

In [70]:
series_booleans = pd.Series(data=[True, False, True])
series_booleans

0     True
1    False
2     True
dtype: bool

In [71]:
# inverting the values
~series_booleans

0    False
1     True
2    False
dtype: bool

## isin method

What if we want to isolate the employees who belong to either the Sales, Legal, or Marketing team?

In [73]:
teams_interest = ['Sales', 'Legal', 'Marketing']
filter_team = employees.Team.isin(teams_interest)
employees[filter_team]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
5,Dennis,Male,1987-04-18,115163,False,Legal
11,Julie,Female,1997-10-26,102508,True,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
20,Lois,,1995-04-22,64714,True,Legal
...,...,...,...,...,...,...
986,Donna,Female,1982-11-26,82871,False,Marketing
989,Justin,,1991-02-10,38344,False,Legal
991,Rose,Female,2002-08-25,134505,True,Marketing
994,George,Male,2013-06-21,98874,True,Marketing


between method

In [82]:
bt_8_9 = employees.Salary.between(80_000, 90_000)
employees[bt_8_9]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
19,Donna,Female,2010-07-22,81014,False,Product
31,Joyce,,2005-02-20,88657,False,Product
35,Theresa,Female,2006-10-10,85182,False,Sales
45,Roger,Male,1980-04-17,88010,True,Sales
54,Sara,Female,2007-08-15,83677,False,Engineering
...,...,...,...,...,...,...
930,Nancy,Female,2001-09-10,85213,True,Marketing
956,Beverly,Female,1986-10-17,80838,False,Engineering
963,Ann,Female,1994-09-23,89443,True,Sales
985,Stephen,,1983-07-10,85668,False,Legal


## isnull and notnull methods

In [83]:
employees.Team.isnull()

0       False
1        True
2       False
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000     True
Name: Team, Length: 1001, dtype: bool

## dropna method

Remove thw row if a single NaN value is found

In [85]:
# importing again 
employees = pd.read_csv('/home/diego/Documents/Data/employees.csv', parse_dates=['Start Date'])
employees

  employees = pd.read_csv('/home/diego/Documents/Data/employees.csv', parse_dates=['Start Date'])


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [88]:
employees.dropna()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


# Duplicated values