In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("employees.csv")

In [6]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


**Date and time problems**

In [9]:
df["Start Date"] = pd.to_datetime(df["Start Date"])

In [10]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance


In [11]:
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])

In [12]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance


In [16]:
df["Senior Management"] = df["Senior Management"].astype("bool")

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    object        
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), datetime64[ns](2), float64(1), int64(1), object(3)
memory usage: 55.8+ KB


In [22]:
df.Gender = df.Gender.astype("category")
df.Team = df.Team.astype("category")

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


**Easy Way**

In [24]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df.Gender = df.Gender.astype("category")
df.Team = df.Team.astype("category")

In [25]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


## Filter A Dataframe Based On Condition

In [27]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance


In [29]:
df.Gender == "Male"

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [30]:
df[df.Gender == "Male"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-03-19 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-03-19 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-03-19 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-03-19 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-03-19 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-03-19 16:45:00,60500,11.985,False,Business Development


In [33]:
cond = df.Team == "Finance"

In [34]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-03-19 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2022-03-19 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2022-03-19 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2022-03-19 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2022-03-19 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2022-03-19 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2022-03-19 08:35:00,112769,11.625,True,Finance


In [35]:
cond = df["Senior Management"]

In [36]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-03-19 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-03-19 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-03-19 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-03-19 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-03-19 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-03-19 17:47:00,98874,4.479,True,Marketing


In [41]:
cond = df.Team != "Marketing"

In [42]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-03-19 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-03-19 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2022-03-19 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2022-03-19 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-03-19 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-03-19 16:45:00,60500,11.985,False,Business Development


In [43]:
cond = df.Salary >= 100000

In [44]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-03-19 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-03-19 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2022-03-19 06:51:00,139852,7.524,True,Business Development
...,...,...,...,...,...,...,...,...
990,Robin,Female,1987-07-24,2022-03-19 13:35:00,100765,10.982,True,Client Services
991,Rose,Female,2002-08-25,2022-03-19 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-03-19 08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,2022-03-19 06:09:00,132483,16.655,False,Distribution


In [45]:
cond = df["Bonus %"] < 1.5

In [46]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2022-03-19 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2022-03-19 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2022-03-19 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2022-03-19 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2022-03-19 20:20:00,146651,1.451,True,Engineering
189,Clarence,Male,1998-05-02,2022-03-19 03:16:00,85700,1.215,False,Sales
217,Douglas,Male,1999-09-03,2022-03-19 16:00:00,83341,1.015,True,Client Services
273,Nicholas,Male,1994-04-12,2022-03-19 20:21:00,74669,1.113,True,Product
279,Ruby,Female,2000-11-08,2022-03-19 19:35:00,105946,1.139,False,Business Development
365,Gloria,,1983-07-19,2022-03-19 01:57:00,140885,1.113,False,Human Resources


In [49]:
cond = df["Start Date"] <= "1985-01-01"

In [50]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2022-03-19 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2022-03-19 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2022-03-19 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2022-03-19 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2022-03-19 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2022-03-19 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2022-03-19 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2022-03-19 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2022-03-19 07:04:00,82871,17.999,False,Marketing


## Multiple Conditions

In [56]:
cond1 = df.Gender == "Male"
cond2 = df.Team == "Marketing"
cond3 = df.Salary >= 100000

In [60]:
df[cond1 & (cond2 | cond3)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-03-19 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-03-19 01:35:00,115163,10.125,False,Legal
12,Brandon,Male,1980-12-01,2022-03-19 01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-03-19 17:19:00,148985,19.280,False,Legal
983,John,Male,1982-12-23,2022-03-19 22:35:00,146907,11.738,False,Engineering
992,Anthony,Male,2011-10-16,2022-03-19 08:35:00,112769,11.625,True,Finance
994,George,Male,2013-06-21,2022-03-19 17:47:00,98874,4.479,True,Marketing


## .isin( ) Method

In [63]:
cond = df.Team.isin(["Legal", "Sales", "Product"])

In [64]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-03-19 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-03-19 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-03-19 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-03-19 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-03-19 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-03-19 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2022-03-19 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-03-19 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2022-03-19 12:39:00,96914,1.421,False,Product


## The .isnull() and .notnull() Methods

In [65]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance


In [69]:
condis = df.Team.isnull()
condnot = df.Team.notnull()

In [71]:
df[condis].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2022-03-19 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2022-03-19 16:19:00,125792,5.042,True,


In [72]:
df[condnot].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-03-19 13:00:00,138705,9.34,True,Finance


## The .between() Method

In [7]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [75]:
cond = df.Salary.between(60000, 70000)

In [76]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.170,True,
6,Ruby,Female,1987-08-17,2022-03-19 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2022-03-19 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2022-03-19 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2022-03-19 01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
965,Catherine,Female,1989-09-25,2022-03-19 01:31:00,68164,18.393,False,Client Services
970,Alice,Female,1988-09-03,2022-03-19 20:54:00,63571,15.397,True,Product
974,Harry,Male,2011-08-30,2022-03-19 18:31:00,67656,16.455,True,Client Services
978,Sean,Male,1983-01-17,2022-03-19 14:23:00,66146,11.178,False,Human Resources


In [77]:
cond = df["Bonus %"].between(2.0, 5.0)

In [78]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.170,True,
20,Lois,,1995-04-22,2022-03-19 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2022-03-19 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2022-03-19 12:13:00,113590,3.055,False,Sales
60,Paula,,2005-11-23,2022-03-19 14:01:00,48866,4.271,False,Distribution
...,...,...,...,...,...,...,...,...
943,Wayne,Male,2006-09-08,2022-03-19 11:09:00,67471,2.728,False,Engineering
961,Antonio,,1989-06-18,2022-03-19 21:37:00,103050,3.050,False,Legal
976,Denise,Female,1992-10-19,2022-03-19 05:42:00,137954,4.195,True,Legal
989,Justin,,1991-02-10,2022-03-19 16:58:00,38344,3.794,False,Legal


In [8]:
cond = df["Start Date"].between("1999-01-01", "2000-01-01")

In [9]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
26,Craig,Male,2/27/2000,7:45 AM,37598,7.757,True,Marketing
31,Joyce,,2/20/2005,2:40 PM,88657,12.752,False,Product
36,Rachel,Female,2/16/2009,8:47 PM,142032,12.599,False,Business Development
52,Todd,Male,2/18/1990,2:41 AM,49339,1.695,True,Human Resources
85,Jeremy,Male,2/1/2008,8:50 AM,100238,3.887,True,Client Services
...,...,...,...,...,...,...,...,...
923,Irene,,2/28/1991,10:23 PM,135369,4.380,False,Business Development
953,Randy,Male,2/25/2012,5:33 AM,57266,14.077,False,Product
957,Jeffrey,Male,2/3/1984,1:26 PM,70990,15.901,True,Sales
969,Linda,Female,2/4/2010,8:49 PM,44486,17.308,True,Engineering


In [10]:
cond = df["Last Login Time"].between("08:30AM", "9:00AM")

In [86]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
85,Jeremy,Male,2008-02-01,2022-03-19 08:50:00,100238,3.887,True,Client Services
96,Cynthia,Female,1994-03-21,2022-03-19 08:34:00,142321,1.737,False,Finance
121,Kathleen,,2016-05-09,2022-03-19 08:55:00,119735,18.74,False,Product
157,,Female,2005-07-27,2022-03-19 08:32:00,79536,14.443,True,Product
259,Henry,Male,1995-02-06,2022-03-19 08:34:00,89258,15.585,True,Human Resources
345,Steven,Male,2006-11-21,2022-03-19 08:30:00,83706,6.96,True,Human Resources
385,Debra,Female,2010-01-19,2022-03-19 08:48:00,70492,8.895,False,Client Services
397,Clarence,Male,2005-01-13,2022-03-19 09:00:00,116693,13.835,True,Distribution
429,Rose,Female,2015-05-28,2022-03-19 08:40:00,149903,5.63,False,Human Resources
446,Cheryl,Female,1994-08-16,2022-03-19 08:33:00,67150,15.85,True,Marketing


## The .duplicated( ) Method

In [14]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df.Gender = df.Gender.astype("category")
df.Team = df.Team.astype("category")
df.sort_values("First Name", inplace=True)

In [16]:
cond = ~df["First Name"].duplicated(keep = False)

In [19]:
df[cond]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-03-19 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-03-19 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-03-19 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-03-19 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-03-19 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2022-03-19 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2022-03-19 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2022-03-19 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2022-03-19 10:30:00,132839,17.463,True,Client Services


## .drop_duplicates( ) Method

In [104]:
df.sort_values("First Name", inplace=True)

In [113]:
df.head(4)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-03-19 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-03-19 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-03-19 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-03-19 19:39:00,63126,18.424,False,Client Services


In [108]:
df.shape

(1000, 8)

In [109]:
len(df.drop_duplicates())

1000

In [111]:
df.drop_duplicates(subset = ["First Name"], keep="first").head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-03-19 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2022-03-19 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2022-03-19 03:54:00,111786,3.592,True,Engineering


In [112]:
df.drop_duplicates(subset = ["First Name"], keep="last").head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2022-03-19 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2022-03-19 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2022-03-19 00:26:00,41453,10.084,False,Product


In [116]:
df.drop_duplicates(subset = ["First Name", "Team"]).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-03-19 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2022-03-19 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2022-03-19 01:45:00,95327,15.12,False,Distribution


## The .unique( ) and .nunique( ) Methods

In [119]:
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df.Gender = df.Gender.astype("category")
df.Team = df.Team.astype("category")

In [120]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-03-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-03-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-03-19 11:17:00,130590,11.858,False,Finance


In [121]:
df.Gender.unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [122]:
df.Team.unique()

['Marketing', NaN, 'Finance', 'Client Services', 'Legal', ..., 'Engineering', 'Business Development', 'Human Resources', 'Sales', 'Distribution']
Length: 11
Categories (10, object): ['Business Development', 'Client Services', 'Distribution', 'Engineering', ..., 'Legal', 'Marketing', 'Product', 'Sales']

In [123]:
df.Team.nunique()

10