## 결측치 처리

In [3]:
import pandas as pd
import numpy as np

In [5]:
# Draw a 5x5 matrix of samples from the standard normal distribution.
arr = np.random.standard_normal((5, 5))

In [8]:
# Wrap the random matrix in a labelled frame: rows a-e, columns one-five.
row_labels = ['a', 'b', 'c', 'd', 'e']
column_labels = ['one', 'two', 'three', 'four', 'five']
df = pd.DataFrame(
    arr,
    index=row_labels,
    columns=column_labels,
)
df

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,0.518652,0.841761,1.295932,-0.070911
c,-0.059954,-0.300117,0.080563,0.09157,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [11]:
# Blank out rows b and c (positions 1-2) in column 'two' (position 1).
df.iloc[1:3, 1] = np.nan
df

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,,0.841761,1.295932,-0.070911
c,-0.059954,,0.080563,0.09157,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [14]:
# Blank out columns 'three' and 'four' (positions 2-3) in row c (position 2).
df.iloc[2, 2:4] = np.nan
df

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,,0.841761,1.295932,-0.070911
c,-0.059954,,,,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


### isnull()
    null 값일 경우 True, 아닐 경우 False

In [15]:
# Summarise the frame; the per-column non-null counts reveal where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     5 non-null      float64
 1   two     3 non-null      float64
 2   three   4 non-null      float64
 3   four    4 non-null      float64
 4   five    5 non-null      float64
dtypes: float64(5)
memory usage: 400.0+ bytes


In [16]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isnull()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,False,True,False,False,False
c,False,True,True,True,False
d,False,False,False,False,False
e,False,False,False,False,False


In [17]:
# Number of missing values in each column (each True counts as 1 when summed).
df.isnull().sum()

one      0
two      2
three    1
four     1
five     0
dtype: int64

In [18]:
# Number of missing values in each row.
df.isnull().sum(axis=1)

a    0
b    1
c    3
d    0
e    0
dtype: int64

### 처리방법
    1) null이 속한 행이나 칼럼을 삭제
    2) null 값을 다른 값으로 채우기

#### 1) null이 속한 행 or 칼럼 삭제
    dropna()
        *dropna(thresh=N) : null이 아닌 데이터가 N개 이상 존재하면 삭제하지 않음

In [19]:
# Show the frame with its missing values before dropping anything.
df

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,,0.841761,1.295932,-0.070911
c,-0.059954,,,,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [21]:
# Drop every row that contains at least one missing value; df itself is not modified.
df.dropna()

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [24]:
df.dropna(axis=1)  # drop every column that contains a null value

Unnamed: 0,one,five
a,-2.548996,-1.202978
b,-0.404242,-0.070911
c,-0.059954,1.001134
d,0.572133,-0.449538
e,0.719889,0.587895


In [26]:
df.dropna(thresh=3)  # keep only rows that have at least 3 non-null values

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,,0.841761,1.295932,-0.070911
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


#### 2) null 값에 다른 값을 채우기
    fillna()

In [27]:
# Show the frame with its missing values before filling them in.
df

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,,0.841761,1.295932,-0.070911
c,-0.059954,,,,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [29]:
# Replace every missing value with 0; df itself is not modified.
df.fillna(0)

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,0.0,0.841761,1.295932,-0.070911
c,-0.059954,0.0,0.0,0.0,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [31]:
# Forward fill: replace each NaN with the last valid value above it in the same column.
# DataFrame.ffill() is used because fillna(method=...) is deprecated in pandas 2.1+.
df.ffill()

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,0.346011,0.841761,1.295932,-0.070911
c,-0.059954,0.346011,0.841761,1.295932,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [34]:
# Backward fill: replace each NaN with the next valid value below it in the same column.
# DataFrame.bfill() is used because fillna(method=...) is deprecated in pandas 2.1+.
df.bfill()

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,0.470004,0.841761,1.295932,-0.070911
c,-0.059954,0.470004,-0.289977,-1.626867,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [35]:
# Replace every missing value with the constant 100.
df.fillna(100)

Unnamed: 0,one,two,three,four,five
a,-2.548996,0.346011,0.962642,-0.584553,-1.202978
b,-0.404242,100.0,0.841761,1.295932,-0.070911
c,-0.059954,100.0,100.0,100.0,1.001134
d,0.572133,0.470004,-0.289977,-1.626867,-0.449538
e,0.719889,-1.254132,0.003653,0.117221,0.587895


In [None]:
# fillna() needs a fill value; calling it with no argument raises ValueError.
# Fill each column's missing values with that column's mean (NaNs are skipped by mean()).
df.fillna(df.mean())