### Introduction

In [28]:
import pandas as pd

In [29]:
df = pd.read_csv('employees.csv')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [30]:
df.info()  # object is pandas lingo for string

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [31]:
df['Start Date']  = pd.to_datetime(df['Start Date']) 
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])
df.head(3)
# converting string(obj) to datetime obj. Helpful for operations on date-time objects

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance


In [32]:
df['Senior Management'] = df['Senior Management'].astype('bool') # converting string to bool
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance


In [33]:
df['Gender'].value_counts()
df['Gender'].nunique()

2

In [34]:
df['Gender'] = df['Gender'].astype('category') # converting to category col to save memory
df.head(2)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,


In [35]:
df.info() # less memory consumed after conversion to category col

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


In [36]:
df = pd.read_csv('employees.csv', parse_dates = ['Start Date', 'Last Login Time']) # converts to datetime when reading csv
#df['Start Date']  = pd.to_datetime(df['Start Date']) 
#df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance


### Filter a DataFrame Based on a Condition

In [37]:
df['Gender'] == 'Male'

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [38]:
df[df['Gender'] == 'Male'] # returns only rows with Gender as Male, checks for the condition

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2023-03-01 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2023-03-01 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-03-01 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2023-03-01 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2023-03-01 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2023-03-01 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2023-03-01 16:45:00,60500,11.985,False,Business Development


In [39]:
# Alt syntax, using Mask

mask = df['Team'] == 'Finance' # stroing in a variable(mask) based on a condition
df[mask]                       # returns rows with Team as Finance

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-03-01 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2023-03-01 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2023-03-01 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2023-03-01 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2023-03-01 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2023-03-01 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2023-03-01 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2023-03-01 08:35:00,112769,11.625,True,Finance


In [40]:
df[df['Senior Management']] # no need to check it T/F (has T/F as values), returns rows with SM as True

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2023-03-01 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2023-03-01 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2023-03-01 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2023-03-01 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2023-03-01 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2023-03-01 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2023-03-01 17:47:00,98874,4.479,True,Marketing


In [41]:
df[df['Salary'] > 110000] # returns rows where salary greater than 110,000

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-03-01 13:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,2023-03-01 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2023-03-01 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2023-03-01 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,2023-03-01 05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,2023-03-01 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2023-03-01 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2023-03-01 06:09:00,132483,16.655,False,Distribution


In [42]:
df[df['Start Date'] <= '1985-01-01'] # date comparison, get all records before specified date

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2023-03-01 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2023-03-01 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2023-03-01 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2023-03-01 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2023-03-01 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2023-03-01 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2023-03-01 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2023-03-01 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2023-03-01 07:04:00,82871,17.999,False,Marketing


### Filter with More than one Condition (AND)

In [43]:
mask1 = df['Gender'] == "Male"
mask2 = df['Team'] == 'Marketing' 

In [44]:
df[mask1 & mask2].head(5) # gives rows that have gender = Male and Team = Marketing

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2023-03-01 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2023-03-01 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2023-03-01 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2023-03-01 20:13:00,107391,1.26,True,Marketing


### OR Condition

In [45]:
mask1 = df['Senior Management']
mask2 = df['Start Date'] < '1990-01-01'

df[mask1 | mask2].head(5) # rows are outputed if either of the conditions are True

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2023-03-01 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-03-01 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-03-01 01:35:00,115163,10.125,False,Legal


In [46]:
# Combining AND and OR conditions

mask1 = df['First Name'] == 'Robert'
mask2 = df['Team'] == 'Client Services'
mask3 = df['Start Date'] > '2016-06-01'

df[(mask1 & mask2) | mask3] # more complex than other egs, combines the 2 logical operators

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2023-03-01 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2023-03-01 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2023-03-01 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2023-03-01 00:29:00,140002,19.49,True,Marketing


### The .isin() Method

Checks if values are present in a List, tuple, Series etc.

In [47]:
mask = df['Team'].isin(['Legal', 'Sales', 'Product']) # checks for these Team values, clean and ez syntax
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2023-03-01 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2023-03-01 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2023-03-01 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2023-03-01 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2023-03-01 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2023-03-01 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2023-03-01 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2023-03-01 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2023-03-01 12:39:00,96914,1.421,False,Product


### The isnull() and notnull() Methods

In [73]:
df = pd.read_csv('employees.csv', parse_dates = ['Start Date', 'Last Login Time']) 
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance


In [49]:
df['Team'].isnull()


0      False
1       True
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: Team, Length: 1000, dtype: bool

In [50]:
mask = df['Team'].isnull()
df[mask].head()     # returns rows where Team is null(NaN)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2023-03-01 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2023-03-01 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2023-03-01 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2023-03-01 23:00:00,128771,8.309,False,


In [51]:
condition = df['Team'].notnull()
df[condition].head()     # returns rows where Team is not null

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2023-03-01 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2023-03-01 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2023-03-01 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2023-03-01 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2023-03-01 01:35:00,115163,10.125,False,Legal


### The between() method

In [52]:
df[df['Salary'].between(60000, 70000)].head() # returns rows with salary >= 60000 and <=70000. Both bounds are inclusive

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2023-03-01 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2023-03-01 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2023-03-01 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2023-03-01 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2023-03-01 01:08:00,66582,11.308,True,Business Development


In [53]:
df[df['Start Date'].between('1991-01-01', '1992-01-01')].head() # works even for dates

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2023-03-01 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2023-03-01 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2023-03-01 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2023-03-01 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2023-03-01 02:24:00,124488,14.837,True,Sales


### The duplicated() method

In [54]:
df.sort_values('First Name', inplace = True)
df.head(7)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-03-01 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2023-03-01 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2023-03-01 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2023-03-01 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2023-03-01 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2023-03-01 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2023-03-01 11:59:00,71276,5.027,True,Human Resources


In [55]:
df['First Name'].duplicated(keep = 'first') # does not consider(keeps) the first occurence as a duplicate (False)

101    False
327     True
440     True
937     True
137    False
       ...  
902     True
925     True
946     True
947     True
951     True
Name: First Name, Length: 1000, dtype: bool

In [56]:
df[df['First Name'].duplicated(keep = 'first')] # wont return the row pertaining to the first occurrence of a duplicate val

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2023-03-01 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2023-03-01 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2023-03-01 19:39:00,63126,18.424,False,Client Services
141,Adam,Male,1990-12-24,2023-03-01 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2023-03-01 11:59:00,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2023-03-01 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2023-03-01 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2023-03-01 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2023-03-01 15:07:00,107351,5.329,True,Marketing


In [57]:
df[df['First Name'].duplicated(keep = 'last')].head(6) # does not consider(keeps) the last occurence as a duplicate 

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-03-01 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2023-03-01 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2023-03-01 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2023-03-01 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2023-03-01 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2023-03-01 11:59:00,71276,5.027,True,Human Resources


In [58]:
df[df['First Name'].duplicated(keep = False)].head(6) # considers all occurrences as duplicates

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-03-01 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2023-03-01 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2023-03-01 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2023-03-01 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2023-03-01 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2023-03-01 20:57:00,110194,14.727,True,Product


In [59]:
mask = ~df['First Name'].duplicated(keep = False) # all duplicates will not be considered (False)

### ~ operator can be used to switch operations to exact opposite. 

df[mask] # got rid of all the rows which have duplicate values in First Name col

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2023-03-01 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2023-03-01 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2023-03-01 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2023-03-01 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2023-03-01 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2023-03-01 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2023-03-01 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2023-03-01 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2023-03-01 10:30:00,132839,17.463,True,Client Services


### The drop_duplicates() method

In [60]:
df.drop_duplicates() # looks for rows which have all values the same and drops them

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2023-03-01 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2023-03-01 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2023-03-01 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2023-03-01 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2023-03-01 01:45:00,95327,15.120,False,Distribution
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2023-03-01 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2023-03-01 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2023-03-01 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2023-03-01 15:07:00,107351,5.329,True,Marketing


In [67]:
df.drop_duplicates(subset = ['First Name'], keep = 'last').head() #checks for duplicates in First Name, keep works the same

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2023-03-01 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2023-03-01 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2023-03-01 00:26:00,41453,10.084,False,Product
959,Albert,Male,1992-09-19,2023-03-01 02:35:00,45094,5.85,True,Business Development
693,Alice,Female,1995-10-16,2023-03-01 21:19:00,92799,2.782,False,Sales


In [69]:
df.drop_duplicates(subset = ['Gender'], keep = False).head() # wont get anything, no unique values in Gender -> keep= false

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


In [72]:
df.drop_duplicates(subset = ['First Name', 'Team'], keep = False, inplace = True)
# drops rows only if values are the same in both these cols(subset)
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
137,Adam,Male,2011-05-21,2023-03-01 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2023-03-01 20:57:00,110194,14.727,True,Product
300,Alan,Male,1988-06-26,2023-03-01 03:54:00,111786,3.592,True,Engineering
53,Alan,,2014-03-03,2023-03-01 13:28:00,40341,17.578,True,Finance
610,Alan,Male,2012-02-17,2023-03-01 00:26:00,41453,10.084,False,Product


### The unique() and nunique() methods

In [75]:
df['Gender'].unique()
df['Team'].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [76]:
len(df['Team'].unique())

11

In [77]:
df['Team'].nunique() # Does not consider nan in the count by default

10

In [78]:
df['Team'].nunique(dropna = False) # made to consider nan in the count as well

11