# Ch05 DataFrame 필터링

In [1]:
import pandas as pd
import numpy as np

In [2]:
employees = pd.read_csv('../../DATA/employees.csv', parse_dates=["Start Date"])
employees.head(10)

  employees = pd.read_csv('../../DATA/employees.csv', parse_dates=["Start Date"])


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
7,,Female,2015-07-20,45906.0,,Finance
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev


In [3]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        933 non-null    object        
 5   Team        957 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 47.0+ KB


#### 5.1.1 astype() 메서드를 사용하여 데이터 유형 변환

In [4]:
employees["Mgmt"].astype(bool)

0        True
1        True
2       False
3        True
4        True
        ...  
996     False
997     False
998     False
999      True
1000     True
Name: Mgmt, Length: 1001, dtype: bool

In [5]:
employees["Mgmt"] = employees["Mgmt"].astype(bool)

In [6]:
# "Mgmt" 열을 object -> bool 로 형변환 하였더니 메모리 사용량이 줄었다.
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 40.2+ KB


In [7]:
# NaN 값을 정수로 변환할 수 없으니 'Salary' 열이 부동소수점 타입인 것.
# 'Salary' 열을 정수로 형변환하기 위해 NaN 값을 0으로 대체.
employees["Salary"].fillna(0).head()

0         0.0
1     61933.0
2    130590.0
3    138705.0
4    101004.0
Name: Salary, dtype: float64

In [8]:
employees["Salary"].fillna(0).astype(int).head()

0         0
1     61933
2    130590
3    138705
4    101004
Name: Salary, dtype: int32

In [9]:
employees["Salary"] = employees["Salary"].fillna(0).astype(int)

In [10]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int32         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), datetime64[ns](1), int32(1), object(3)
memory usage: 36.3+ KB


In [11]:
employees.nunique()

First Name    200
Gender          2
Start Date    971
Salary        995
Mgmt            2
Team           10
dtype: int64

In [12]:
# 1001개 행 중에서 'Gender'열은 2개, 'Team'열은 10개의 고유값만을 가진다.
# category 자료형으로 형변환
employees["Gender"].astype("category")

0         Male
1         Male
2       Female
3          NaN
4         Male
         ...  
996       Male
997       Male
998       Male
999       Male
1000       NaN
Name: Gender, Length: 1001, dtype: category
Categories (2, object): ['Female', 'Male']

In [13]:
employees["Gender"] = employees["Gender"].astype("category")

In [14]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int32         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), int32(1), object(2)
memory usage: 29.6+ KB


In [15]:
employees["Team"] = employees["Team"].astype("category")

In [16]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    category      
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      1001 non-null   int32         
 4   Mgmt        1001 non-null   bool          
 5   Team        957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](1), int32(1), object(1)
memory usage: 23.1+ KB


#### 5.2 단일 조건으로 필터링

In [17]:
"Maria" == "Maria", "Maria" == "Taylor"

(True, False)

In [18]:
employees["First Name"] == "Maria"

0       False
1       False
2        True
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: First Name, Length: 1001, dtype: bool

In [20]:
# 불리언 인덱싱
employees[employees["First Name"] == "Maria"]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


In [21]:
mariaMask = employees["First Name"] == "Maria"
employees[mariaMask]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,NaT,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


In [22]:
employees["Team"] != "Finance"

0        True
1        True
2       False
3       False
4        True
        ...  
996     False
997      True
998      True
999      True
1000     True
Name: Team, Length: 1001, dtype: bool

In [23]:
employees[employees["Team"] != "Finance"]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483,False,Distribution
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
999,Albert,Male,2012-05-15,129949,True,Sales


In [24]:
employees[employees["Mgmt"]]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
6,Ruby,Female,1987-08-17,65476,True,Product
...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,112769,True,Finance
993,Tina,Female,1997-05-15,56450,True,Engineering
994,George,Male,2013-06-21,98874,True,Marketing
999,Albert,Male,2012-05-15,129949,True,Sales


In [25]:
employees[employees["Salary"] > 140000]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
36,Rachel,Female,2009-02-16,142032,False,Business Dev
44,Cynthia,Female,1988-11-16,145146,True,Product
83,Shawn,Male,2005-09-23,148115,True,Finance
87,Annie,Female,1993-01-30,144887,True,Sales
96,Cynthia,Female,1994-03-21,142321,False,Finance
...,...,...,...,...,...,...
948,Ashley,Female,2006-03-31,142410,True,Engineering
951,,Female,2010-09-14,143638,True,
979,Ernest,Male,2013-07-20,142935,True,Product
981,James,Male,1993-01-15,148985,False,Legal


#### 5.3 다중 조건으로 필터링

- 5.3.1 AND 조건

In [26]:
is_female = employees["Gender"] == "Female"

In [27]:
in_biz_dev = employees["Team"] == "Business Dev"

In [29]:
employees[is_female & in_biz_dev].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
33,Jean,Female,1993-12-18,119082,False,Business Dev
36,Rachel,Female,2009-02-16,142032,False,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
61,Denise,Female,2001-11-06,106862,False,Business Dev


In [30]:
is_manager = employees["Mgmt"]

In [32]:
employees[is_female & in_biz_dev & is_manager].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
66,Nancy,Female,2012-12-15,125250,True,Business Dev
92,Linda,Female,2000-05-25,119009,True,Business Dev
111,Bonnie,Female,1999-12-17,42153,True,Business Dev


- 5.3.2 OR 조건

In [35]:
earning_below_40k = employees['Salary'] < 40000
started_after_2015 = employees['Start Date'] > "2015-01-01"

In [36]:
employees[earning_below_40k | started_after_2015].tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
958,Gloria,Female,1987-10-24,39833,False,Engineering
964,Bruce,Male,1980-05-07,35802,True,Sales
967,Thomas,Male,2016-03-12,105681,False,Engineering
989,Justin,,1991-02-10,38344,False,Legal
1000,,,NaT,0,True,


- 5.3.3 ~ 기호로 반전

In [38]:
my_series = pd.Series([True, False, True])
my_series

0     True
1    False
2     True
dtype: bool

In [39]:
~my_series

0    False
1     True
2    False
dtype: bool

In [40]:
employees[employees['Salary'] < 100000].tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
1000,,,NaT,0,True,


In [41]:
employees[~(employees['Salary'] >= 100000)].tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
994,George,Male,2013-06-21,98874,True,Marketing
996,Phillip,Male,1984-01-31,42392,False,Finance
997,Russell,Male,2013-05-20,96914,False,Product
998,Larry,Male,2013-04-20,60500,False,Business Dev
1000,,,NaT,0,True,


- 5.3.4 불리언 메서드

In [42]:
employees["Team"] == "Marketing"

0        True
1       False
2       False
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: Team, Length: 1001, dtype: bool

In [43]:
employees["Team"].eq("Marketing")

0        True
1       False
2       False
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: Team, Length: 1001, dtype: bool

## 5.4 조건별 필터링

- 5.4.1 isin 메서드

In [44]:
all_star_teams = ["Sales", "Legal", "Marketing"]
on_all_star_teams = employees["Team"].isin(all_star_teams)
employees[on_all_star_teams].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
5,Dennis,Male,1987-04-18,115163,False,Legal
11,Julie,Female,1997-10-26,102508,True,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
20,Lois,,1995-04-22,64714,True,Legal


5.4.2 between 메서드

In [45]:
between_80k_and_90k = employees['Salary'].between(80000, 90000)
employees[between_80k_and_90k].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
19,Donna,Female,2010-07-22,81014,False,Product
31,Joyce,,2005-02-20,88657,False,Product
35,Theresa,Female,2006-10-10,85182,False,Sales
45,Roger,Male,1980-04-17,88010,True,Sales
54,Sara,Female,2007-08-15,83677,False,Engineering


In [46]:
eighties_folk = employees['Start Date'].between(left="1980-01-01", right="1990-01-01")
employees[eighties_folk]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
10,Louise,Female,1980-08-12,63241,True,
12,Brandon,Male,1980-12-01,112807,True,HR
17,Shawn,Male,1986-12-07,111737,False,Product
...,...,...,...,...,...,...
983,John,Male,1982-12-23,146907,False,Engineering
985,Stephen,,1983-07-10,85668,False,Legal
986,Donna,Female,1982-11-26,82871,False,Marketing
990,Robin,Female,1987-07-24,100765,True,IT


In [47]:
name_starts_with_r = employees["First Name"].between('R', 'S')
employees[name_starts_with_r].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
6,Ruby,Female,1987-08-17,65476,True,Product
36,Rachel,Female,2009-02-16,142032,False,Business Dev
45,Roger,Male,1980-04-17,88010,True,Sales
67,Rachel,Female,1999-08-16,51178,True,Finance
78,Robin,Female,1983-06-04,114797,True,Sales


- 5.4.3 isnull과 notnull 메서드

In [48]:
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT


In [49]:
employees["Team"].isnull().head()

0    False
1     True
2    False
3    False
4    False
Name: Team, dtype: bool

In [50]:
employees["Start Date"].isnull().head()

0    False
1    False
2     True
3    False
4    False
Name: Start Date, dtype: bool

In [51]:
employees["Team"].notnull().head()

0     True
1    False
2     True
3     True
4     True
Name: Team, dtype: bool

In [52]:
(~employees["Team"].isnull()).head()

0     True
1    False
2     True
3     True
4     True
Name: Team, dtype: bool

In [54]:
# 'Team' 열의 값이 누락된 모든 직원을 추출
no_team = employees['Team'].isnull()
employees[no_team].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
1,Thomas,Male,1996-03-31,61933,True,
10,Louise,Female,1980-08-12,63241,True,
23,,Male,2012-06-14,125792,True,
32,,Male,1998-08-21,122340,True,
91,James,,2005-01-26,128771,False,


In [56]:
# 'First Name' 열의 값이 존재하는 모든 직원을 추출
has_name = employees['First Name'].notnull()
employees[has_name].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,NaT,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT


- 5.4.4 null 값 다루기

In [58]:
# employees 새로 불러오기
employees = pd.read_csv('../../DATA/employees.csv', parse_dates=["Start Date"])
employees

  employees = pd.read_csv('../../DATA/employees.csv', parse_dates=["Start Date"])


Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [59]:
employees.dropna()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
...,...,...,...,...,...,...
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


In [60]:
employees.dropna(how='all')

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
995,Henry,,2014-11-23,132483.0,False,Distribution
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev


In [61]:
employees.dropna(subset=["Gender"]).tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
994,George,Male,2013-06-21,98874.0,True,Marketing
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [62]:
employees.dropna(subset=["Start Date", "Salary"]).tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
995,Henry,,2014-11-23,132483.0,False,Distribution
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


## 5.5 중복 처리

- 5.5.1 duplicated 메서드

In [63]:
employees['Team'].head(10)

0       Marketing
1             NaN
2         Finance
3         Finance
4              IT
5           Legal
6         Product
7         Finance
8     Engineering
9    Business Dev
Name: Team, dtype: object

In [64]:
employees['Team'].duplicated().head(10)

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7     True
8    False
9    False
Name: Team, dtype: bool

In [69]:
employees['Team'].duplicated(keep='last')

0        True
1        True
2        True
3        True
4        True
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: Team, Length: 1001, dtype: bool

In [70]:
first_one_in_team = ~employees['Team'].duplicated(keep='first')
employees[first_one_in_team]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
12,Brandon,Male,1980-12-01,112807.0,True,HR
13,Gary,Male,2008-01-27,109831.0,False,Sales


- 5.5.2 drop_duplicates 메서드

In [71]:
# 기본적으로, 행의 모든 값이 일치하는 행을 제거
# employees 에는 6개 열의 값이 모두 동일한 행이 없기 때문에 제거되는 행이 X
employees.drop_duplicates()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [72]:
employees.drop_duplicates(subset=['Team'])

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev
12,Brandon,Male,1980-12-01,112807.0,True,HR
13,Gary,Male,2008-01-27,109831.0,False,Sales


In [73]:
employees.drop_duplicates(subset=['Team'], keep='last')

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
988,Alice,Female,2004-10-05,47638.0,False,HR
989,Justin,,1991-02-10,38344.0,False,Legal
990,Robin,Female,1987-07-24,100765.0,True,IT
993,Tina,Female,1997-05-15,56450.0,True,Engineering
994,George,Male,2013-06-21,98874.0,True,Marketing
995,Henry,,2014-11-23,132483.0,False,Distribution
996,Phillip,Male,1984-01-31,42392.0,False,Finance
997,Russell,Male,2013-05-20,96914.0,False,Product
998,Larry,Male,2013-04-20,60500.0,False,Business Dev
999,Albert,Male,2012-05-15,129949.0,True,Sales


In [74]:
employees.drop_duplicates(subset=['Team'], keep=False)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team


In [75]:
employees.drop_duplicates(subset=['First Name', 'Gender'])

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
...,...,...,...,...,...,...
939,Ralph,,1995-07-28,70635.0,False,IT
950,Paula,Female,1983-05-21,58423.0,False,Business Dev
961,Antonio,,1989-06-18,103050.0,False,Legal
985,Stephen,,1983-07-10,85668.0,False,Legal


## 5.6 코딩 챌린지

In [77]:
pd.read_csv('../../DATA/netflix.csv')

Unnamed: 0,title,director,date_added,type
0,Alias Grace,,3-Nov-17,TV Show
1,A Patch of Fog,Michael Lennox,15-Apr-17,Movie
2,Lunatics,,19-Apr-19,TV Show
3,Uriyadi 2,Vijay Kumar,2-Aug-19,Movie
4,Shrek the Musical,Jason Moore,29-Dec-13,Movie
...,...,...,...,...
5832,The Pursuit,John Papola,7-Aug-19,Movie
5833,Hurricane Bianca,Matt Kugelman,1-Jan-17,Movie
5834,Amar's Hands,Khaled Youssef,26-Apr-19,Movie
5835,Bill Nye: Science Guy,Jason Sussberg,25-Apr-18,Movie


1. 제한된 메모리 사용량과 사용성을 최대화하기 위해 데이터셋을 최적화하세요.

In [78]:
netflix = pd.read_csv('../../DATA/netflix.csv', parse_dates=['date_added'])

  netflix = pd.read_csv('../../DATA/netflix.csv', parse_dates=['date_added'])


In [79]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5837 non-null   object        
 1   director    3936 non-null   object        
 2   date_added  5195 non-null   datetime64[ns]
 3   type        5837 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 182.5+ KB


In [80]:
netflix.nunique()

title         5780
director      3024
date_added    1092
type             2
dtype: int64

In [81]:
netflix['type'] = netflix['type'].astype('category')

In [82]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5837 non-null   object        
 1   director    3936 non-null   object        
 2   date_added  5195 non-null   datetime64[ns]
 3   type        5837 non-null   category      
dtypes: category(1), datetime64[ns](1), object(2)
memory usage: 142.8+ KB


2. 제목이 'Limitless'인 모든 콘텐츠를 찾으세요.

In [83]:
netflix[netflix['title'] == 'Limitless']

Unnamed: 0,title,director,date_added,type
1559,Limitless,Neil Burger,2019-05-16,Movie
2564,Limitless,,2016-07-01,TV Show
4579,Limitless,Vrinda Samartha,2019-10-01,Movie


3. 'Robert Rodriguez' 감독이 제작하고 유형이 'Movie'인 모든 콘텐츠를 찾으세요.

In [84]:
rr = netflix['director'] == 'Robert Rodriguez'
is_movie = netflix['type'] == 'Movie'
netflix[rr & is_movie]

Unnamed: 0,title,director,date_added,type
1384,Spy Kids: All the Time in the World,Robert Rodriguez,2019-02-19,Movie
1416,Spy Kids 3: Game Over,Robert Rodriguez,2019-04-01,Movie
1460,Spy Kids 2: The Island of Lost Dreams,Robert Rodriguez,2019-03-08,Movie
2890,Sin City,Robert Rodriguez,2019-10-01,Movie
3836,Shorts,Robert Rodriguez,2019-07-01,Movie
3883,Spy Kids,Robert Rodriguez,2019-04-01,Movie


4. 등록된 날짜가 '2019-07-31'이거나 감독이 'Robert Altman'인 모든 콘텐츠를 찾으세요.

In [85]:
is_theday = netflix['date_added'] == '2019-07-31'
ra = netflix['director'] == 'Robert Altman'
netflix[is_theday | ra]

Unnamed: 0,title,director,date_added,type
611,Popeye,Robert Altman,2019-11-24,Movie
1028,The Red Sea Diving Resort,Gideon Raff,2019-07-31,Movie
1092,Gosford Park,Robert Altman,2019-11-01,Movie
3473,Bangkok Love Stories: Innocence,,2019-07-31,TV Show
5117,Ramen Shop,Eric Khoo,2019-07-31,Movie


5. 감독이 'Orson Welles', 'Aditya Kripalani' 또는 'Sam Raimi'인 모든 콘텐츠를 찾으세요.

In [86]:
director3 = ['Orson Welles', 'Aditya Kripalani', 'Sam Raimi']
is_director3 = netflix['director'].isin(director3)
netflix[is_director3]

Unnamed: 0,title,director,date_added,type
946,The Stranger,Orson Welles,2018-07-19,Movie
1870,The Gift,Sam Raimi,2019-11-20,Movie
3706,Spider-Man 3,Sam Raimi,2019-11-01,Movie
4243,Tikli and Laxmi Bomb,Aditya Kripalani,2018-08-01,Movie
4475,The Other Side of the Wind,Orson Welles,2018-11-02,Movie
5115,Tottaa Pataaka Item Maal,Aditya Kripalani,2019-06-25,Movie


6. 2019년 5월 1일과 2019년 6월 1일 사이에 등록된 모든 콘텐츠를 찾으세요.

In [87]:
between_2019_May = netflix['date_added'].between('2019-05-01', '2019-06-01')
netflix[between_2019_May]

Unnamed: 0,title,director,date_added,type
29,Chopsticks,Sachin Yardi,2019-05-31,Movie
60,Away From Home,,2019-05-08,TV Show
82,III Smoking Barrels,Sanjib Dey,2019-06-01,Movie
108,Jailbirds,,2019-05-10,TV Show
124,Pegasus,Han Han,2019-05-31,Movie
...,...,...,...,...
5671,Satan & Adam,V. Scott Balcerek,2019-06-01,Movie
5675,Rim of the World,McG,2019-05-24,Movie
5677,Malibu Rescue,Savage Steve Holland,2019-05-13,Movie
5739,Mission Istaanbul: Darr Ke Aagey Jeet Hai,Apoorva Lakhia,2019-05-16,Movie


7. director 열에서 NaN 값이 있는 모든 행을 삭제하세요.

In [89]:
netflix.dropna(subset=['director'])

Unnamed: 0,title,director,date_added,type
1,A Patch of Fog,Michael Lennox,2017-04-15,Movie
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
5,Schubert In Love,Lars Büchel,2018-03-01,Movie
6,We Have Always Lived in the Castle,Stacie Passon,2019-09-14,Movie
...,...,...,...,...
5830,Bibi & Tina,Detlev Buck,2017-04-15,Movie
5832,The Pursuit,John Papola,2019-08-07,Movie
5833,Hurricane Bianca,Matt Kugelman,2017-01-01,Movie
5834,Amar's Hands,Khaled Youssef,2019-04-26,Movie


8. 넷플릭스가 콘텐츠를 단 하나만 등록한 날짜를 식별하세요.

In [90]:
netflix.drop_duplicates(subset=['date_added'], keep=False)

Unnamed: 0,title,director,date_added,type
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
12,Without Gorky,Cosima Spender,2017-05-31,Movie
30,Anjelah Johnson: Not Fancy,Jay Karas,2015-10-02,Movie
38,One Last Thing,Tim Rouhana,2019-08-25,Movie
70,Marvel's Iron Man & Hulk: Heroes United,Leo Riley,2014-02-16,Movie
...,...,...,...,...
5748,Menorca,John Barnard,2017-08-27,Movie
5749,Green Room,Jeremy Saulnier,2018-11-12,Movie
5788,Chris Brown: Welcome to My Life,Andrew Sandler,2017-10-07,Movie
5789,A Very Murray Christmas,Sofia Coppola,2015-12-04,Movie
