In [207]:
import numpy as np
import pandas as pd

## 누락된 데이터 처리

In [208]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [209]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [210]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

|인자|설명|
|:------:|:---|
|dropna|누락된 데이터가 있는 축(low,column)을 제외시킨다. 어느정도의 누락데이터까지 용인할것인지 지정 할 수 있다.|
|fillna|누락된 데이터를 대신할 값을 채우거나 **'ffill'**이나 'bfill'같은 보간 메서드를 적용한다.|
|isnull|누락되거나 NA인 값을 알려주는 불리언 값이 저장된 같은 형의 객체를 반환|
|notnull|isnull과 반대되는 메서드|

In [211]:
from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [212]:
data[data.notnull()] #notnull이 아닌 것만 출력하라.

0    1.0
2    3.5
4    7.0
dtype: float64

In [213]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [214]:
# dropna() : NaN이 하나라도 있으면 삭제
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [215]:
# how=all : 모두 NaN이면 삭제
cleaned = data.dropna(how="all")
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [216]:
#4번 열을 추가하고 NA값으로 지정
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [217]:
cleaned = data.dropna(axis='columns', how="all") #또는 axis = 1
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [218]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,-2.177068,1.211273,-1.16988
1,-0.816505,-0.199263,-0.571461
2,-0.246924,0.303722,1.022237
3,0.585755,-0.329682,0.774167
4,-1.905758,1.610484,0.244408
5,1.083834,-0.037916,1.115177
6,-0.371978,-0.005408,-0.177064


In [219]:
#변경할 자료에 접근하기 위해서 iloc사용
df.iloc[:4,1] =NA
df.iloc[:2,2] =NA
df


Unnamed: 0,0,1,2
0,-2.177068,,
1,-0.816505,,
2,-0.246924,,1.022237
3,0.585755,,0.774167
4,-1.905758,1.610484,0.244408
5,1.083834,-0.037916,1.115177
6,-0.371978,-0.005408,-0.177064


In [220]:
#cleaned = df.dropna()
cleaned = df.dropna(thresh=2) #하나만 비어있는 것은 남아있고, 2개가 NA인것은 삭제
cleaned

Unnamed: 0,0,1,2
2,-0.246924,,1.022237
3,0.585755,,0.774167
4,-1.905758,1.610484,0.244408
5,1.083834,-0.037916,1.115177
6,-0.371978,-0.005408,-0.177064


In [221]:
# NaN을 0으로 채워보자

## 결측치 채우기

In [222]:
filled = df.fillna(0)
filled

Unnamed: 0,0,1,2
0,-2.177068,0.0,0.0
1,-0.816505,0.0,0.0
2,-0.246924,0.0,1.022237
3,0.585755,0.0,0.774167
4,-1.905758,1.610484,0.244408
5,1.083834,-0.037916,1.115177
6,-0.371978,-0.005408,-0.177064


In [223]:
# column별로 다르게 입력하고 싶을 때
filled = df.fillna({1:0.9,2:0})
filled

Unnamed: 0,0,1,2
0,-2.177068,0.9,0.0
1,-0.816505,0.9,0.0
2,-0.246924,0.9,1.022237
3,0.585755,0.9,0.774167
4,-1.905758,1.610484,0.244408
5,1.083834,-0.037916,1.115177
6,-0.371978,-0.005408,-0.177064


In [224]:
df.fillna(0,inplace=True) #위와 다르게 따로 변수를 지정해 주지 않아도, inplace가 업데이트 시킴.
df

Unnamed: 0,0,1,2
0,-2.177068,0.0,0.0
1,-0.816505,0.0,0.0
2,-0.246924,0.0,1.022237
3,0.585755,0.0,0.774167
4,-1.905758,1.610484,0.244408
5,1.083834,-0.037916,1.115177
6,-0.371978,-0.005408,-0.177064


In [225]:
df = pd.DataFrame(np.random.randn(6,3))
df.iloc[:2,1] =NA
df.iloc[:4,2] =NA
df

Unnamed: 0,0,1,2
0,-0.323764,,
1,-0.043704,,
2,0.166316,-0.988915,
3,-1.083881,1.042617,
4,-0.838486,0.047442,0.646993
5,-1.713974,1.064576,0.857991


In [226]:
filled = df.fillna(method='ffill')
filled

Unnamed: 0,0,1,2
0,-0.323764,,
1,-0.043704,,
2,0.166316,-0.988915,
3,-1.083881,1.042617,
4,-0.838486,0.047442,0.646993
5,-1.713974,1.064576,0.857991


In [227]:
filled = df.fillna(method='ffill', limit=2)
filled

Unnamed: 0,0,1,2
0,-0.323764,,
1,-0.043704,,
2,0.166316,-0.988915,
3,-1.083881,1.042617,
4,-0.838486,0.047442,0.646993
5,-1.713974,1.064576,0.857991


### df.fillna(value=, method='ffill',axis=0, inplace=False, limit=)
- value : 비어있는 값을 채울 스칼라 값이나 dictionary 형식의 객체
- method : 보간법(기본 ffill)
- axis : 값을 채워 넣을 축 (기본 axis=0)
- inplace : 복사본을 생성하지 않고 호출한 객체에 값을 반환, 기본값=False
- limit : 값을 앞 또는 뒤로 몇개까지 채울지 지정

## 데이터변형
### 데이터 중복 제거

In [228]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [229]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [230]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [231]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [232]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [233]:
data.drop_duplicates(['k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [234]:
data.drop_duplicates(['k1','k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [235]:
data.drop_duplicates(['k1','k2'], keep='last') #keep='last' : 중복중에 첫번째삭제 마지막거 남겨짐.

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 데이터 변형하기

In [236]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [237]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [238]:
lower_cased = data['food'].str.lower()
lower_cased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [239]:
data['animal']= lower_cased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### 람다식으로 똑같이 표현

In [240]:
data['food'].map(lambda x :meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [241]:
# lower_cased = data['food'].str.lower()
# data['animal']= lower_cased.map(meat_to_animal)

data['animal']= data['food'].map(lambda x :meat_to_animal[x.lower()])
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [242]:
data = pd.Series([1.,-999.,2,-1000])
data

0       1.0
1    -999.0
2       2.0
3   -1000.0
dtype: float64

In [243]:
data2 = data.replace([-999,-1000], np.nan)
data2

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [244]:
data

0       1.0
1    -999.0
2       2.0
3   -1000.0
dtype: float64

In [245]:
data2 = data.replace([-999,-1000], [np.nan,0])
data2

0    1.0
1    NaN
2    2.0
3    0.0
dtype: float64

In [246]:
data2 = data.replace({-999: np.nan, -1000:0})
#-999 -> Nan, -1000 -> 0
data2

0    1.0
1    NaN
2    2.0
3    0.0
dtype: float64