# Memory Optimization

In [1]:
import pandas as pd
df = pd.read_csv("employees.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [2]:
df["Start Date"] = pd.to_datetime(df["Start Date"])

In [3]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [4]:
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


In [6]:
df = pd.read_csv("employees.csv", parse_dates = ["Last Login Time", "Start Date"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services


# Filter a DataFrame based on A Condition

In [7]:
df["Gender"] == "Male"

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [8]:
df[df["Gender"] == "Male"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-06 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-12-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-06 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-06 16:45:00,60500,11.985,False,Business Development


In [9]:
mask = df["Senior Management"]
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2022-12-06 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-12-06 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-06 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2022-12-06 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2022-12-06 17:47:00,98874,4.479,True,Marketing


In [10]:
df[df["Start Date"] > "1993-08-06"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services
7,,Female,2015-07-20,2022-12-06 10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,2022-12-06 06:29:00,95570,18.523,True,Engineering
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-12-06 17:47:00,98874,4.479,True,Marketing
995,Henry,,2014-11-23,2022-12-06 06:09:00,132483,16.655,False,Distribution
997,Russell,Male,2013-05-20,2022-12-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-06 16:45:00,60500,11.985,False,Business Development


# Filter DataFrame with more than one condition (AND-&)


In [11]:
df = pd.read_csv("employees.csv", parse_dates = ["Last Login Time", "Start Date"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services


In [12]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Finance"
df[mask1 & mask2]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
46,Bruce,Male,2009-11-28,2022-12-06 22:47:00,114796,6.796,False,Finance
56,Carl,Male,2006-05-03,2022-12-06 17:55:00,130276,16.084,True,Finance
68,Jose,Male,2004-10-30,2022-12-06 13:39:00,84834,14.33,True,Finance
83,Shawn,Male,2005-09-23,2022-12-06 02:55:00,148115,6.539,True,Finance
117,Steven,Male,1995-03-01,2022-12-06 15:03:00,109095,9.494,False,Finance
200,Gary,Male,1987-08-12,2022-12-06 00:04:00,89661,8.525,False,Finance
205,Brandon,Male,2006-03-27,2022-12-06 17:54:00,115711,8.012,True,Finance
219,Billy,Male,1995-03-13,2022-12-06 12:05:00,120444,7.768,True,Finance
222,Jason,Male,1999-10-17,2022-12-06 22:09:00,78417,3.067,False,Finance


# Filter DataFrame with more than one condition (OR-|)

In [13]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Finance"
df[mask1 | mask2]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-12-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-06 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-06 16:45:00,60500,11.985,False,Business Development


In [14]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Finance"
mask3 = df["Start Date"] > "2016-06-01"
df[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
15,Lillian,Female,2016-06-05,2022-12-06 06:09:00,59414,1.256,False,Product
46,Bruce,Male,2009-11-28,2022-12-06 22:47:00,114796,6.796,False,Finance
56,Carl,Male,2006-05-03,2022-12-06 17:55:00,130276,16.084,True,Finance
68,Jose,Male,2004-10-30,2022-12-06 13:39:00,84834,14.33,True,Finance
83,Shawn,Male,2005-09-23,2022-12-06 02:55:00,148115,6.539,True,Finance
98,Tina,Female,2016-06-16,2022-12-06 19:47:00,100705,16.961,True,Marketing
117,Steven,Male,1995-03-01,2022-12-06 15:03:00,109095,9.494,False,Finance
200,Gary,Male,1987-08-12,2022-12-06 00:04:00,89661,8.525,False,Finance
205,Brandon,Male,2006-03-27,2022-12-06 17:54:00,115711,8.012,True,Finance


# isin() Mehod

In [15]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services


In [16]:
df["Team"].isin(["Marketing", "Finance", "Sales"])

0       True
1      False
2       True
3       True
4      False
       ...  
995    False
996     True
997    False
998    False
999     True
Name: Team, Length: 1000, dtype: bool

In [17]:
mask = df["Team"].isin(["Marketing", "Finance", "Sales"])
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2022-12-06 10:43:00,45906,11.598,True,Finance
13,Gary,Male,2008-01-27,2022-12-06 23:40:00,109831,5.831,False,Sales
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2022-12-06 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2022-12-06 08:35:00,112769,11.625,True,Finance
994,George,Male,2013-06-21,2022-12-06 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-12-06 06:30:00,42392,19.675,False,Finance


# isnull() & notnull()

In [18]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services


In [19]:
df["Team"].isnull()

0      False
1       True
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: Team, Length: 1000, dtype: bool

In [20]:
df["Team"].notnull()

0       True
1      False
2       True
3       True
4       True
       ...  
995     True
996     True
997     True
998     True
999     True
Name: Team, Length: 1000, dtype: bool

In [21]:
df[df["Team"].notnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-12-06 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2022-12-06 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2022-12-06 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-12-06 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-12-06 16:45:00,60500,11.985,False,Business Development


# between()

In [22]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-12-06 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-12-06 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-12-06 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2022-12-06 16:47:00,101004,1.389,True,Client Services


In [23]:
df["Salary"].between(60000, 70000)

0      False
1       True
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998     True
999    False
Name: Salary, Length: 1000, dtype: bool

In [24]:
df[df["Salary"].between(60000, 70000)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-12-06 06:53:00,61933,4.170,True,
6,Ruby,Female,1987-08-17,2022-12-06 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2022-12-06 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2022-12-06 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2022-12-06 01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
965,Catherine,Female,1989-09-25,2022-12-06 01:31:00,68164,18.393,False,Client Services
970,Alice,Female,1988-09-03,2022-12-06 20:54:00,63571,15.397,True,Product
974,Harry,Male,2011-08-30,2022-12-06 18:31:00,67656,16.455,True,Client Services
978,Sean,Male,1983-01-17,2022-12-06 14:23:00,66146,11.178,False,Human Resources


In [25]:
df[df["Start Date"].between("1991-01-01", "1992-01-01")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2022-12-06 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2022-12-06 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2022-12-06 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2022-12-06 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2022-12-06 02:24:00,124488,14.837,True,Sales
166,,Female,1991-07-09,2022-12-06 18:52:00,42341,7.014,True,Sales
172,Sara,Female,1991-09-23,2022-12-06 18:17:00,97058,9.402,False,Finance
220,,Female,1991-06-17,2022-12-06 12:49:00,71945,5.56,True,Marketing
245,Victor,Male,1991-04-11,2022-12-06 07:44:00,70817,17.138,False,Engineering
277,Brenda,,1991-05-29,2022-12-06 06:32:00,82439,19.062,False,Sales


# duplicated Method

In [26]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.sort_values(by = "First Name", inplace = True)
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-12-06 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-12-06 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-12-06 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-12-06 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2022-12-06 01:45:00,95327,15.12,False,Distribution


In [27]:
df["First Name"].duplicated()

101    False
327     True
440     True
937     True
137    False
       ...  
902     True
925     True
946     True
947     True
951     True
Name: First Name, Length: 1000, dtype: bool

In [28]:
df[df["First Name"].duplicated()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2022-12-06 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-12-06 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-12-06 19:39:00,63126,18.424,False,Client Services
141,Adam,Male,1990-12-24,2022-12-06 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2022-12-06 11:59:00,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2022-12-06 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2022-12-06 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2022-12-06 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2022-12-06 15:07:00,107351,5.329,True,Marketing


In [29]:
df[df["First Name"].duplicated(keep = False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-12-06 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-12-06 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-12-06 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-12-06 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2022-12-06 01:45:00,95327,15.120,False,Distribution
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2022-12-06 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2022-12-06 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2022-12-06 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2022-12-06 15:07:00,107351,5.329,True,Marketing


In [30]:
df[~df["First Name"].duplicated(keep = False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-12-06 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-12-06 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-12-06 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-12-06 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-12-06 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2022-12-06 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2022-12-06 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2022-12-06 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2022-12-06 10:30:00,132839,17.463,True,Client Services


# drop_duplicates()

In [31]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.sort_values(by = "First Name", inplace = True)
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-12-06 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-12-06 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-12-06 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-12-06 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2022-12-06 01:45:00,95327,15.12,False,Distribution


In [34]:
len(df)

1000

In [33]:
len(df.drop_duplicates())

1000

In [35]:
df.drop_duplicates(subset = ["First Name"])

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-12-06 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2022-12-06 01:45:00,95327,15.120,False,Distribution
300,Alan,Male,1988-06-26,2022-12-06 03:54:00,111786,3.592,True,Engineering
372,Albert,Male,1997-02-01,2022-12-06 16:20:00,67827,19.717,True,Engineering
988,Alice,Female,2004-10-05,2022-12-06 09:34:00,47638,11.209,False,Human Resources
...,...,...,...,...,...,...,...,...
433,Wanda,Female,2008-07-20,2022-12-06 13:44:00,65362,7.132,True,Legal
177,Wayne,Male,2012-04-07,2022-12-06 08:00:00,102652,14.085,True,Distribution
820,William,Male,1993-11-18,2022-12-06 12:27:00,54058,5.182,True,Human Resources
450,Willie,Male,2009-08-22,2022-12-06 13:03:00,55038,19.691,False,Legal


In [36]:
len(df.drop_duplicates(subset = ["First Name"]))

201

In [37]:
len(df.drop_duplicates(subset = ["First Name", "Team"]))

769

In [38]:
len(df.drop_duplicates(subset = ["First Name"], keep = False))

9

In [39]:
df.drop_duplicates(subset = ["First Name"], keep = False, inplace = True)

In [40]:
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-12-06 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-12-06 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-12-06 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-12-06 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-12-06 01:35:00,115163,10.125,False,Legal
