In [1]:
import pandas as pd
import datetime as dt

# 3. DataFrames II: Filter Df

### Table of Contents

1. This Module's Dataset + Memory Optimization
2. Filter a DataFrame Based on a Condition
3. Filter with More than One Condition (AND)
4. Filter with More than One Condition (OR)
5. The isin Method
6. The isnull and notnull Methods
7. The between Method
8. The duplicated Method
9. The drop_duplicates Method # alternative for duplicated() method
10. The unique and nunique Methods


## 1. This Module's Dataset + Memory Optimization

- The `pd.to_datetime` method converts a Series to hold datetime values.
- The `format` parameter informs pandas of the format that the times are stored in.
- We pass symbols designating the segments of the string. For example, `%m` means "month" and `%d` means "day."
- The `dt` attribute reveals an object with many datetime-related attributes and methods.
- The `dt.time` attribute extracts only the time from each value in a datetime Series.
- Use the `astype` method to convert the values in a Series to another type.
- The `parse_dates` parameter of `read_csv` is an alternate way to parse strings as datetimes.


In [2]:
employees = pd.read_csv("employees.csv")
employees

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [3]:
employees.info()
# Pandas labels 6 columns as 'object'
# this is an umbrella term: Pandas gives this label as soon as it encounters a string character
# therefore, best to specify the exact datatype of a column to be able to access appropriate methods for this datatype

# a) datatime
# for example, column 'Start Date' is a date, but Pandas labels it as an 'object' which in practice means that it treats it as a string
# however, the methods that are appropriate to handle dates can not be applied to a string object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [4]:
# to change a column to date datatype: use pd.to_datetime()

pd.to_datetime(employees["Start Date"]) # Pandas in this case is able to read date format automatically
pd.to_datetime(employees["Start Date"], format="%m/%d/%Y") # but recommended to explicitly specify exact date format

# Common symbols used in date formats:
# - `%Y` : 4-digit year (e.g., 2023)
# - `%y` : 2-digit year (e.g., 23 for 2023)
# - `%m` : 2-digit month (e.g., 01 for January)
# - `%d` : 2-digit day of the month (e.g., 31)
# - `%H` : 2-digit hour (24-hour clock, e.g., 14 for 2 PM)
# - `%I` : 2-digit hour (12-hour clock, e.g., 02 for 2 PM)
# - `%M` : 2-digit minute (e.g., 59)
# - `%S` : 2-digit second (e.g., 45)
# - `%p` : AM/PM (e.g., PM)
# - `%B` : Full month name (e.g., January)
# - `%b` : Abbreviated month name (e.g., Jan)
# - `%A` : Full weekday name (e.g., Monday)
# - `%a` : Abbreviated weekday name (e.g., Mon)

# Example usage:
# pd.to_datetime(employees["Start Date"], format="%d-%b-%Y")  # e.g., 31-Jan-2023

0     1993-08-06
1     1996-03-31
2     1993-04-23
3     2005-03-04
4     1998-01-24
         ...    
995   2014-11-23
996   1984-01-31
997   2013-05-20
998   2013-04-20
999   2012-05-15
Name: Start Date, Length: 1000, dtype: datetime64[ns]

In [5]:
# overwrite original dataframe with updated datatype information
employees["Start Date"] = pd.to_datetime(employees["Start Date"], format="%m/%d/%Y")

In [6]:
# do the same for the other datetime columns

pd.to_datetime(employees["Last Login Time"], format="%H:%M %p")

0     1900-01-01 12:42:00
1     1900-01-01 06:53:00
2     1900-01-01 11:17:00
3     1900-01-01 01:00:00
4     1900-01-01 04:47:00
              ...        
995   1900-01-01 06:09:00
996   1900-01-01 06:30:00
997   1900-01-01 12:39:00
998   1900-01-01 04:45:00
999   1900-01-01 06:24:00
Name: Last Login Time, Length: 1000, dtype: datetime64[ns]

In [7]:
# however, this returns BOTH date AND time
# if we want to isolate the time from the date in this datetime object
# we can use attribute ".dt", which returns an accessor object
# that itself has attributes to extract date OR time:
# date: dt.date
# time: dt.time
pd.to_datetime(employees["Last Login Time"], format="%H:%M %p").dt.date
pd.to_datetime(employees["Last Login Time"], format="%H:%M %p").dt.time

0      12:42:00
1      06:53:00
2      11:17:00
3      01:00:00
4      04:47:00
         ...   
995    06:09:00
996    06:30:00
997    12:39:00
998    04:45:00
999    06:24:00
Name: Last Login Time, Length: 1000, dtype: object

In [8]:
# we again overwrite the original column with this updated datatype column
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format="%H:%M %p").dt.time

In [9]:
# b) boolean
# the column 'Senior Management' is also labeled 'object', but is in fact a boolean
# change with astype() method

employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Senior Management"]

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996    False
997    False
998    False
999     True
Name: Senior Management, Length: 1000, dtype: bool

In [10]:
# c) category
# the 'Gender' column only has 2 unique values for a 1000 rows
# ideal to turn into a category to save memory space

employees["Gender"] = employees["Gender"].astype("category")
employees["Gender"]

0        Male
1        Male
2      Female
3        Male
4        Male
        ...  
995       NaN
996      Male
997      Male
998      Male
999      Male
Name: Gender, Length: 1000, dtype: category
Categories (2, object): ['Female', 'Male']

In [11]:
# now, look back at dataframe after all conversions:

employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 49.1+ KB


## 2. Filter a DataFrame Based on a Condition

- Pandas needs a Series of Booleans to perform a filter: True = keep, False = drop.
- Pass the Boolean Series inside square brackets after the DataFrame.
- We can generate a Boolean Series using a wide variety of operations: `==`, `!=`, `<`, `>`, `in``, ...


In [12]:
# load dataset with correct datatype metadata

employees = pd.read_csv("employees.csv")
employees["Gender"] = employees["Gender"].astype("category")
employees["Start Date"] = pd.to_datetime(employees["Start Date"], format="%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format="%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [13]:
# we want to keep Male employees
# we make a boolean Series
employees["Gender"] == "Male" # compare each Series value with string "Male"

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [14]:
# to extract only the Male employees use boolean series itself in subsetting
employees[employees["Gender"] == "Male"]

# if this is confusing: assign boolean Series to a variable
male_employees = employees["Gender"] == "Male"
employees[male_employees]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,05:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [15]:
# in case of Senior Management, we already have boolean Series, so we can simply pass the column in itself
employees[employees["Senior Management"]]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,03:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,05:47:00,98874,4.479,True,Marketing


In [16]:
# with datetime objects: simply put between parentheses

employees["Start Date"] < "1985-01-01"

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996     True
997    False
998    False
999    False
Name: Start Date, Length: 1000, dtype: bool

In [17]:
# all employees that started before 1st January 1985
employees[employees["Start Date"] < "1985-01-01"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,06:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,08:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,10:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,08:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,07:04:00,82871,17.999,False,Marketing


In [18]:
# with time not as easy as with date: will not work to compare with string
# need for datetime library, abbreviated with dt

# dt.time(12,0,0) to instantiate time class; pass in arguments: 12:00:00
# this allows us to create a conditional time object against which to compare the column values in "Last Login Time"

# all employees who logged in after noon
# boolean Series:
employees["Last Login Time"] < dt.time(12,0,0)
# subsetting
employees[employees["Last Login Time"] > dt.time(12,0,0)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
34,Jerry,Male,2004-01-10,12:56:00,95734,19.096,False,Client Services
49,Chris,,1980-01-24,12:13:00,113590,3.055,False,Sales
61,Denise,Female,2001-11-06,12:03:00,106862,3.699,False,Business Development
76,Margaret,Female,1988-09-10,12:42:00,131604,7.353,True,Distribution
...,...,...,...,...,...,...,...,...
945,Gerald,,1989-04-15,12:44:00,93712,17.426,True,Distribution
956,Beverly,Female,1986-10-17,12:51:00,80838,8.115,False,Engineering
962,Jonathan,Male,2013-08-21,12:45:00,121797,16.923,False,Product
980,Kimberly,Female,2013-01-26,12:57:00,46233,8.862,True,Engineering


## 3. Filter with More than One Condition (AND)

- Add the `&` operator in between two Boolean Series to filter by multiple conditions.
- We can assign the Series to variables to make the syntax more readable.


In [19]:
# load dataset with correct datatype metadata

employees = pd.read_csv("employees.csv")
employees["Gender"] = employees["Gender"].astype("category")
employees["Start Date"] = pd.to_datetime(employees["Start Date"], format="%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format="%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [20]:
# female employees who work in marketing
# 2 criteria to be met

employees[(employees["Gender"] == "Female") & (employees["Team"] == "Marketing")]

# or:
is_female = employees["Gender"] == "Female"
is_in_marketing = employees["Team"] == "Marketing"

employees[is_female & is_in_marketing]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
43,Marilyn,Female,1980-12-07,03:16:00,73524,5.207,True,Marketing
62,,Female,2007-06-12,05:25:00,58112,19.414,True,Marketing
98,Tina,Female,2016-06-16,07:47:00,100705,16.961,True,Marketing
140,Shirley,Female,1981-02-28,01:23:00,113850,1.854,False,Marketing
158,Norma,Female,1999-02-28,08:45:00,114412,8.756,True,Marketing
201,Kimberly,Female,1997-07-15,05:57:00,36643,7.953,False,Marketing
220,,Female,1991-06-17,12:49:00,71945,5.56,True,Marketing
305,Margaret,Female,1993-02-06,01:05:00,125220,3.733,False,Marketing
319,Jacqueline,Female,1981-11-25,03:01:00,145988,18.243,False,Marketing
331,Evelyn,Female,1983-09-03,01:58:00,36759,17.269,True,Marketing


## 4. Filter with More than One Condition (OR)

- Use the `|` operator in between two Boolean Series to filter by **either** condition.


In [21]:
# female employees who work in marketing
# 2 criteria to be met

employees[(employees["Gender"] == "Female") | (employees["Team"] == "Marketing")]

# or:
is_female = employees["Gender"] == "Female"
is_in_marketing = employees["Team"] == "Marketing"

employees[is_female | is_in_marketing]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
...,...,...,...,...,...,...,...,...
988,Alice,Female,2004-10-05,09:34:00,47638,11.209,False,Human Resources
990,Robin,Female,1987-07-24,01:35:00,100765,10.982,True,Client Services
991,Rose,Female,2002-08-25,05:12:00,134505,11.051,True,Marketing
993,Tina,Female,1997-05-15,03:53:00,56450,19.040,True,Engineering


## 5. The `isin` Method

- The `isin` Series method accepts a collection object like a list, tuple, or Series.
- The method returns `True` for a row if its value is found in the collection.


In [22]:
# a more convenient way to test multiple OR conditions : .isin()
# make a boolean Series
employees["Team"].isin(["Legal","Sales","Product"])

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997     True
998    False
999     True
Name: Team, Length: 1000, dtype: bool

In [23]:
# take boolean Series to subset Df
employees[employees["Team"].isin(["Legal","Sales","Product"])]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,03:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,11:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,05:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,08:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,04:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product


## 6. The `isnull` and `notnull` Methods

- The `isnull` method returns `True` for `NaN` values in a Series.
- The `notnull` method returns `True` for present values in a Series.


In [24]:
# works in same way as filtering on condition: both methods return a Boolean Series, which can be used to subset the Df
employees[employees["Team"].isnull()]
employees[employees["Team"].notnull()]
employees[(employees["First Name"].isnull()) & (employees["Team"].notnull())]# we can also mix

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
25,,Male,2012-10-08,01:12:00,37076,18.576,True,Client Services
39,,Male,2016-01-29,02:33:00,122173,7.797,True,Client Services
51,,,2011-12-17,08:29:00,41126,14.009,True,Sales
62,,Female,2007-06-12,05:25:00,58112,19.414,True,Marketing
116,,Male,1991-06-22,08:58:00,76189,18.988,True,Legal
149,,Female,2014-08-17,02:00:00,86230,8.578,True,Distribution
157,,Female,2005-07-27,08:32:00,79536,14.443,True,Product
165,,Female,2014-03-23,01:28:00,59148,9.061,True,Legal
166,,Female,1991-07-09,06:52:00,42341,7.014,True,Sales


## 7. The `between` Method

- The `between` method returns `True` if a Series value is found within its range.


In [25]:
# again returns a Boolean Series
# all employees who have wage between 60.000$ and 70.000$
employees["Salary"].between(60000,70000) # start & end value inclusive

0      False
1       True
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998     True
999    False
Name: Salary, Length: 1000, dtype: bool

In [26]:
# pass in Boolean Series in subsetting to filter
employees[employees["Salary"].between(60000,70000)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,07:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
965,Catherine,Female,1989-09-25,01:31:00,68164,18.393,False,Client Services
970,Alice,Female,1988-09-03,08:54:00,63571,15.397,True,Product
974,Harry,Male,2011-08-30,06:31:00,67656,16.455,True,Client Services
978,Sean,Male,1983-01-17,02:23:00,66146,11.178,False,Human Resources


## 8. The `duplicated` Method

- The `duplicated` method returns `True` if a Series value is a duplicate.
- Pandas will mark one occurrence of a repeated value as a non-duplicate.
- Use the `keep` parameter to designate whether the first or last occurrence of a repeated value should be considered the "non-duplicate."
- Pass `False` to the `keep` parameter to mark all occurrences of repeated values as duplicates.
- Use the tilde symbol (`~`) to invert a Series's values. Trues will become Falses, and Falses will become Trues.


In [27]:
# again a Boolean Series is made
employees["First Name"].duplicated() # first time Pandas encounters a value it considers it not a duplicate
# next time(s) Pandas encounters the same value it considers it a duplicate
# Pandas runs from top to bottom

# use this to filter and keep first occurence
employees[employees["First Name"].duplicated()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
23,,Male,2012-06-14,04:19:00,125792,5.042,True,
25,,Male,2012-10-08,01:12:00,37076,18.576,True,Client Services
32,,Male,1998-08-21,02:27:00,122340,6.417,True,
34,Jerry,Male,2004-01-10,12:56:00,95734,19.096,False,Client Services
39,,Male,2016-01-29,02:33:00,122173,7.797,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [28]:
employees[employees["First Name"].duplicated(keep = "last")] # we can also ask Pandas to reverse order in which it traverses Df
employees[employees["First Name"].duplicated(keep = False)] # we can also ask to keep any value that has a duplicate
employees[~employees["First Name"].duplicated(keep = False)] # tilde symbol inverts values: we only keep the unique values

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,10:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,03:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,08:48:00,92242,15.407,False,Legal


## 9. The `drop_duplicates` Method # alternative for duplicated() method

- The `drop_duplicates` method deletes rows with duplicate values.
- By default, it will remove a row if **all** of its values are shared with another row.
- The `subset` parameter configures the columns to look for duplicate values within.
- Pass a list to the `subset` parameter to look for duplicates across multiple columns.


In [29]:
employees.drop_duplicates() # Pandas only considers rows duplicates if ALL values are same
employees.drop_duplicates(subset = "Team") # we can also subset Df to look for duplicate values in only 1 or more columns
# Pandas considers all but the first occurence of row as duplicate and will drop it

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,01:08:00,112807,17.492,True,Human Resources
13,Gary,Male,2008-01-27,11:40:00,109831,5.831,False,Sales


In [30]:
employees.drop_duplicates(subset = "Team", keep = "first") # traverse top-down
employees.drop_duplicates(subset = "Team", keep = "first") # traverse bottom-up
employees.drop_duplicates(subset = "First Name", keep = False) # remove all duplicates: these are all unique values

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,10:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,03:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,08:48:00,92242,15.407,False,Legal


In [31]:
employees.drop_duplicates(subset = ["Senior Management", "Team"]).sort_values("Team") 
# will look for any unique combination of value for Senior Management and value for Team
# only rows that have duplicate values in both columns will be removed

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
18,Diana,Female,1981-10-23,10:27:00,132940,19.082,False,Client Services
60,Paula,,2005-11-23,02:01:00,48866,4.271,False,Distribution
40,Michael,Male,2008-10-10,11:25:00,99283,2.665,True,Distribution
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
54,Sara,Female,2007-08-15,09:23:00,83677,8.999,False,Engineering
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance


## 10. The `unique` and `nunique` Methods

- The `unique` method on a Series returns a collection of its unique values. The method does not exist on a DataFrame.
- The `nunique` method returns a count of the number of unique values in the Series/DataFrame.
- The `dropna` parameter configures whether to include or exclude missing (`NaN`) values.


In [32]:
employees["Gender"].unique() # shows you all unique values in a column

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [33]:
employees["Gender"].nunique() # will give count of unique values

2

In [34]:
employees["Gender"].nunique(dropna = False) # if you want to include NaN values

3

In [35]:
employees.nunique() 

First Name           200
Gender                 2
Start Date           972
Last Login Time      542
Salary               995
Bonus %              971
Senior Management      2
Team                  10
dtype: int64