# chp7. 데이터 정제 및 준비
# 7.1 누락된 데이터 처리하기

In [1]:
# pandas의 목표 중 하나는 누락된 데이터를 최대한 쉽게 처리할 수 있도록 하는 것이다.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Four-element Series of strings whose entry at position 2 is missing (NaN).
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Boolean mask that is True wherever the value is missing.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Python's None is also treated as a missing value by isnull().
string_data.loc[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

- 7.1.1 누락된 데이터 골라내기

In [6]:
# 누락된 값을 직접 걸러낼 수도 있지만, dropna를 이용하면 쉽게 제외할 수 있다.

In [7]:
from numpy import nan as NA

In [8]:
# Float Series with NaN at positions 1 and 3.
data = pd.Series(data=[1, NA, 3.5, NA, 7])
# dropna() returns the Series with the missing entries left out.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
# Same result as dropna(): select the entries where the not-null mask is True.
data.loc[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# DataFrame의 경우, dropna는 기본적으로 NA 값을 하나라도 포함한 로우를 제외시키며, 옵션에 따라 모든 값이 NA인 로우나 컬럼만 제외시킬 수도 있다.

In [15]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing, row 2 is all NaN.
df = pd.DataFrame(
    data=[
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ],
)
# Row-wise, "any" mode: every row holding at least one NaN is removed.
cleaned = df.dropna(axis=0, how='any')
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [17]:
# Passing how='all' drops only the rows in which every value is NA.
df.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
# Add a new column labelled 4 in which every value is NaN.
df[4] = NA

In [19]:
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [20]:
# With axis=1 ('columns') the rule applies to columns: only all-NA columns are dropped.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
# 7x3 frame of standard-normal draws (unseeded, so the values differ between runs).
df1 = pd.DataFrame(data=np.random.standard_normal(size=(7, 3)))
# Blank out the first four rows of column 1 and the first two rows of column 2.
df1.iloc[0:4, 1] = NA
df1.iloc[0:2, 2] = NA
df1

Unnamed: 0,0,1,2
0,-0.701111,,
1,0.208145,,
2,1.491992,,1.645306
3,-1.185335,,0.015737
4,0.098682,-0.664833,-0.956607
5,0.463207,1.495176,-0.141645
6,0.813217,1.410566,1.35918


In [22]:
# Keep only the rows that contain no NaN at all.
df1.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
4,0.098682,-0.664833,-0.956607
5,0.463207,1.495176,-0.141645
6,0.813217,1.410566,1.35918


In [25]:
# thresh=2 keeps only the rows that hold at least two non-NA values.
df1.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,1.491992,,1.645306
3,-1.185335,,0.015737
4,0.098682,-0.664833,-0.956607
5,0.463207,1.495176,-0.141645
6,0.813217,1.410566,1.35918


- 7.1.2 결측치 채우기

In [26]:
# 결측치를 채우려면 fillna 메서드를 이용하면 된다

In [27]:
# Return a copy of df1 in which every NaN is replaced by 0.
df1.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.701111,0.0,0.0
1,0.208145,0.0,0.0
2,1.491992,0.0,1.645306
3,-1.185335,0.0,0.015737
4,0.098682,-0.664833,-0.956607
5,0.463207,1.495176,-0.141645
6,0.813217,1.410566,1.35918


In [28]:
# A dict maps column label -> fill value, so each column gets its own
# replacement: NaN in column 1 becomes 0.5, NaN in column 2 becomes 0.
df1.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.701111,0.5,0.0
1,0.208145,0.5,0.0
2,1.491992,0.5,1.645306
3,-1.185335,0.5,0.015737
4,0.098682,-0.664833,-0.956607
5,0.463207,1.495176,-0.141645
6,0.813217,1.410566,1.35918


In [29]:
# inplace=True modifies the existing df1 object itself, so the call's return
# value is not needed. It is deliberately not bound to `_`, which would shadow
# IPython's "last output" variable.
df1.fillna(0, inplace=True)
df1

Unnamed: 0,0,1,2
0,-0.701111,0.0,0.0
1,0.208145,0.0,0.0
2,1.491992,0.0,1.645306
3,-1.185335,0.0,0.015737
4,0.098682,-0.664833,-0.956607
5,0.463207,1.495176,-0.141645
6,0.813217,1.410566,1.35918


In [30]:
# 6x3 frame of standard-normal draws (unseeded, so the values differ between runs).
df2 = pd.DataFrame(data=np.random.standard_normal(size=(6, 3)))
# Blank out column 1 from row 2 onward and column 2 from row 4 onward.
df2.iloc[2:6, 1] = NA
df2.iloc[4:6, 2] = NA
df2

Unnamed: 0,0,1,2
0,1.32159,1.625496,-0.229751
1,-0.350889,-1.505681,1.187448
2,-1.132664,,1.096875
3,0.835042,,-0.512577
4,-0.402757,,
5,-1.161097,,


In [32]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill() is used because the `method` keyword of fillna() is deprecated in
# recent pandas releases; the result is the same.
df2.ffill()

Unnamed: 0,0,1,2
0,1.32159,1.625496,-0.229751
1,-0.350889,-1.505681,1.187448
2,-1.132664,-1.505681,1.096875
3,0.835042,-1.505681,-0.512577
4,-0.402757,-1.505681,-0.512577
5,-1.161097,-1.505681,-0.512577


In [33]:
# limit caps how many consecutive NaNs are forward-filled in each gap. With
# limit=2 only the first two NaNs of column 1 (rows 2-3) are filled and
# rows 4-5 stay NaN. ffill() replaces the deprecated fillna(method='ffill').
df2.ffill(limit=2)

Unnamed: 0,0,1,2
0,1.32159,1.625496,-0.229751
1,-0.350889,-1.505681,1.187448
2,-1.132664,-1.505681,1.096875
3,0.835042,-1.505681,-0.512577
4,-0.402757,,-0.512577
5,-1.161097,,-0.512577


In [35]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [36]:
# Replace each NaN with the mean of the non-missing values (mean() skips NaN).
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64