# This Module's Dataset + Memory Optimization

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("employees.csv")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 39.1+ KB


In [7]:
df["Start Date"] = pd.to_datetime(df["Start Date"])
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance


In [8]:
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance


In [16]:
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 41.2+ KB


In [18]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date","Last Login Time"])
## df["Start Date"] = pd.to_datetime(df["Start Date"])
## df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance


# Filter a DataFrame Based On a Condition

In [19]:
df["Gender"]

0        Male
1        Male
2      Female
3        Male
4        Male
        ...  
995       NaN
996      Male
997      Male
998      Male
999      Male
Name: Gender, Length: 1000, dtype: category
Categories (2, object): ['Female', 'Male']

In [21]:
df[df["Gender"] == "Male"]
#remeber == is to filter, = is to assign value

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2022-10-16 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2022-10-16 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2022-10-16 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2022-10-16 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2022-10-16 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2022-10-16 16:45:00,60500,11.985,False,Business Development


In [34]:
df[df["Team"] == "Finance"].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2022-10-16 10:43:00,45906,11.598,True,Finance


In [33]:
mask = df["Team"] == "Finance"
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2022-10-16 10:43:00,45906,11.598,True,Finance


In [32]:
df[df["Senior Management"]].head(3)
# for boolens, it will anyway default true

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.34,True,Finance


In [35]:
df[df["Team"] != "Marketing"].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.34,True,Finance


In [38]:
df[df["Salary"] > 110000].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.34,True,Finance
5,Dennis,Male,1987-04-18,2022-10-16 01:35:00,115163,10.125,False,Legal


In [39]:
df[df["Bonus %"] < 1.5].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2022-10-16 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2022-10-16 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2022-10-16 07:18:00,72670,1.481,True,Engineering


In [43]:
df["Start Date"] <= "1985-01-01" 

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996     True
997    False
998    False
999    False
Name: Start Date, Length: 1000, dtype: bool

# Filter with More than One Condition (AND &)

In [44]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date","Last Login Time"])
## df["Start Date"] = pd.to_datetime(df["Start Date"])
## df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance


In [47]:
mask1 = df["Gender"] == "Male"
mask2 = df["Team"] == "Marketing"

In [49]:
df[mask1 & mask2].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2022-10-16 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2022-10-16 07:45:00,37598,7.757,True,Marketing


# Filter with More than One Condition (OR |)

In [52]:
mask1 = df["Senior Management"]
mask2 = df["Start Date"] < "1990-01-01"
df[mask1 | mask2].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2022-10-16 13:00:00,138705,9.34,True,Finance


In [54]:
mask1 = df["First Name"] == "Robert"
mask2 = df["Team"] == "Client Services"
mask3 = df["Start Date"] > "2016-06-01"
df[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2022-10-16 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2022-10-16 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2022-10-16 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2022-10-16 00:29:00,140002,19.49,True,Marketing


# The .isin() Method

In [55]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date","Last Login Time"])
## df["Start Date"] = pd.to_datetime(df["Start Date"])
## df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance


In [57]:
mask1= df["Team"] == "Legal"
mask2= df["Team"] == "Sales"
mask3= df["Team"] == "Product"
df[mask1 | mask2 | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-10-16 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-10-16 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-10-16 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2022-10-16 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2022-10-16 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2022-10-16 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2022-10-16 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2022-10-16 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2022-10-16 12:39:00,96914,1.421,False,Product


In [60]:
mask = df["Team"].isin(["Legal","Sales","Product"])
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2022-10-16 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2022-10-16 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2022-10-16 15:19:00,102508,12.637,True,Legal


# The .isnull() and .notnull() Methods

In [63]:
mask = df["Team"].isnull()
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2022-10-16 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2022-10-16 16:19:00,125792,5.042,True,


In [67]:
condition = df["Gender"].notnull()
df[condition].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance


# The .between() Method

In [72]:
condition = df["Salary"].between(60000 , 70000)
df[condition].head(3)
## between including the upper/lower bound

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2022-10-16 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2022-10-16 09:01:00,63241,15.132,True,


In [74]:
df[df["Bonus %"].between(2.0, 5.0)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
20,Lois,,1995-04-22,2022-10-16 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2022-10-16 11:25:00,99283,2.665,True,Distribution


In [75]:
df[df["Start Date"].between("1991-01-01", "1992-01-01")].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2022-10-16 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2022-10-16 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2022-10-16 13:59:00,64088,6.155,True,Legal


In [76]:
df[df["Last Login Time"].between("2022-10-16 08:30:00", "2022-10-16 12:00:00")].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2022-10-16 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2022-10-16 09:01:00,63241,15.132,True,


In [77]:
df[df["Last Login Time"].between("08:30AM", "12:00PM")].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2022-10-16 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2022-10-16 09:01:00,63241,15.132,True,


# The .duplicated() Method

In [78]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date","Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.sort_values("First Name", inplace=True)
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-10-16 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-10-16 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-10-16 14:53:00,52119,11.343,True,Client Services


In [84]:
df[df["First Name"].duplicated()].head(3)
# the first is a unique value, it will be not included

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2022-10-16 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-10-16 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2022-10-16 19:39:00,63126,18.424,False,Client Services


In [85]:
df[df["First Name"].duplicated(keep= "last")].head(3)
# the last is a unique value, it will not be included

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-10-16 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-10-16 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-10-16 14:53:00,52119,11.343,True,Client Services


In [87]:
df[df["First Name"].duplicated(keep = False)].head(3)
# show all duplicated

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-10-16 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-10-16 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-10-16 14:53:00,52119,11.343,True,Client Services


In [88]:
df["First Name"].duplicated(keep = False)

101    True
327    True
440    True
937    True
137    True
       ... 
902    True
925    True
946    True
947    True
951    True
Name: First Name, Length: 1000, dtype: bool

In [90]:
~df["First Name"].duplicated(keep = False) #reverse false/true

101    False
327    False
440    False
937    False
137    False
       ...  
902    False
925    False
946    False
947    False
951    False
Name: First Name, Length: 1000, dtype: bool

In [91]:
mask = ~df["First Name"].duplicated(keep = False)
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-10-16 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-10-16 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-10-16 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2022-10-16 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2022-10-16 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2022-10-16 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2022-10-16 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2022-10-16 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2022-10-16 10:30:00,132839,17.463,True,Client Services


# The .drop_duplicates() Method

In [92]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date","Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.sort_values("First Name", inplace=True)
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-10-16 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2022-10-16 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2022-10-16 14:53:00,52119,11.343,True,Client Services


In [93]:
len(df)

1000

In [95]:
len(df.drop_duplicates())
# default is identical roll

1000

In [98]:
df.drop_duplicates(subset = ["First Name"], keep = "first").head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2022-10-16 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2022-10-16 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2022-10-16 03:54:00,111786,3.592,True,Engineering


In [99]:
df.drop_duplicates(subset = ["First Name"], keep = "last").head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2022-10-16 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2022-10-16 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2022-10-16 00:26:00,41453,10.084,False,Product


In [100]:
df.drop_duplicates(subset = ["First Name"], keep = False).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2022-10-16 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2022-10-16 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2022-10-16 03:39:00,57783,9.129,False,Finance


In [102]:
df.drop_duplicates(subset = ["Team"], keep = False)
# no single team value occurs once

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


In [107]:
df.drop_duplicates(subset = ["First Name", "Team"], inplace = True)
# identical first name and team

In [109]:
len(df)

769

# The .unique() and .nunique() Methods

In [110]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date","Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2022-10-16 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2022-10-16 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2022-10-16 11:17:00,130590,11.858,False,Finance


In [112]:
df["Gender"].unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [113]:
df["Team"].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [114]:
len(df["Team"].unique())

11

In [115]:
df["Team"].nunique()

10