#### 7.1 누락된 데이터 처리하기

In [4]:
#판다스에서는 float64 dtype을 가지는 데이터의 경우 NaN으로 누락된 데이터를 표시한다
import pandas as pd
import numpy as np
float_data = pd.Series([1.2, -3.5, np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [5]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
#파이썬의 내장 None 값도 NA 값으로 취급됨
string_data = pd.Series(["aadrdvark", np.nan, None, "avocado"])
string_data

0    aadrdvark
1          NaN
2         None
3      avocado
dtype: object

In [7]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [8]:
float_data = pd.Series([1,2,None], dtype='float64')
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [9]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [10]:
#### 1) 누락된 데이터 골라내기
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data = pd.DataFrame([[1.,6.5,3.], [1.,np.nan, np.nan], [np.nan,np.nan,np.nan], [np.nan,6.5,3]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
#all 옵션을 넘기면 모든 값이 NA인 행만 제외
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [25]:
df = pd.DataFrame(np.random.standard_normal((7,3)))

In [26]:
df.iloc[:4,1] = np.nan

In [27]:
df.iloc[:2,2] = np.nan

In [28]:
df.dropna()

Unnamed: 0,0,1,2
4,1.159349,0.701773,-0.017852
5,-0.756757,0.419342,-0.20869
6,-0.136325,-0.075884,0.559326


In [29]:
df.dropna(thresh=2) #thresh=2: NA가 아닌 값이 2개 이상 있는 행만 남긴다

Unnamed: 0,0,1,2
2,-1.19997,,-0.066207
3,0.650569,,-0.800995
4,1.159349,0.701773,-0.017852
5,-0.756757,0.419342,-0.20869
6,-0.136325,-0.075884,0.559326


#### 2) 결측치 채우기 

In [30]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.214331,0.0,0.0
1,-0.309056,0.0,0.0
2,-1.19997,0.0,-0.066207
3,0.650569,0.0,-0.800995
4,1.159349,0.701773,-0.017852
5,-0.756757,0.419342,-0.20869
6,-0.136325,-0.075884,0.559326


In [31]:
#딕셔너리 값을 넘기면 각 열마다 다른 값이 채워진다
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,0.214331,0.5,0.0
1,-0.309056,0.5,0.0
2,-1.19997,0.5,-0.066207
3,0.650569,0.5,-0.800995
4,1.159349,0.701773,-0.017852
5,-0.756757,0.419342,-0.20869
6,-0.136325,-0.075884,0.559326


In [32]:
#재색인에서 사용가능한 보간 메서드는 fillna메서드에서도 사용가능
df = pd.DataFrame(np.random.standard_normal((6,3)))
df.iloc[2:,1]=np.nan
df.iloc[4:,2]=np.nan
df

Unnamed: 0,0,1,2
0,-0.503029,1.351297,1.719671
1,-0.470722,-0.875412,-0.309999
2,-0.11027,,-0.531089
3,-0.92802,,1.425601
4,1.058886,,
5,-2.302875,,


In [33]:
# Forward-fill missing values with the last valid value in each column.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` keyword
# is deprecated since pandas 2.1; the result is the same.
df.ffill()

Unnamed: 0,0,1,2
0,-0.503029,1.351297,1.719671
1,-0.470722,-0.875412,-0.309999
2,-0.11027,-0.875412,-0.531089
3,-0.92802,-0.875412,1.425601
4,1.058886,-0.875412,1.425601
5,-2.302875,-0.875412,1.425601


In [34]:
#평균값이나 중간값을 넘겨서 데이터를 채울 수도 있다
data = pd.Series([1., np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

#### fillna함수 인수
- value : 비어있는 값을 채울 스칼라 값이나 딕셔너리 형식의 객체
- method : 보간 방식. bfill은 뒤에 오는 값으로, ffill은 앞에 있는 값으로 결측치를 채움. 기본값은 None (pandas 2.1부터 deprecated이므로 ffill()/bfill() 메서드 사용을 권장)
- axis : 값을 채워 넣을 축
- limit : 값을 앞 혹은 뒤에서부터 몇 개까지 채울지 지정

### 2. 데이터 변형
#### 1) 중복 제거하기

In [36]:
data = pd.DataFrame({"k1":["one","two"]*3 +["two"], "k2":[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [37]:
# duplicated 메서드는 각 행이 (앞에 나온 행과) 중복인지 아닌지를 알려주는 불리언 Series를 반환
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [38]:
#drop_duplicates는 duplicated 결과가 False인 행만 남긴 DataFrame을 반환
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [39]:
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


#### 2) 함수나 매핑을 이용해서 데이터 변형하기

In [51]:
# Sample frame: one row per item, with the food name and its amount in ounces.
data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [52]:
# Lookup table from each food name to the animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [53]:
#data의 food 열을 meat_to_animal 딕셔너리로 매핑해서 animal 열을 추가해보자
data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [54]:
def get_animal(x):
    """Return the animal that ``meat_to_animal`` associates with food ``x``.

    The lookup uses plain subscripting on the module-level ``meat_to_animal``
    mapping, so a food name that is not a key raises ``KeyError``.
    """
    animal = meat_to_animal[x]
    return animal

In [55]:
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

#### 3) 값 치환하기

In [61]:
data = pd.Series([1., -999., 2.,-999.,-1000.,3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [57]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [58]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [60]:
#각 값을 다른 값으로 치환하려면 새로 지정할 값의 리스트를 전달
data.replace([-999, -1000], [np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [62]:
#딕셔너리를 이용할 수 있다
data.replace({-999:np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

#### 4) 축 색인 이름 바꾸기

In [63]:
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=["Ohio","Colorado","New York"], columns=["one","two","three","four"])

In [64]:
#축 색인에도 map 메서드가 있다
def transform(x):
    """Return the first four characters of ``x`` converted to upper case.

    Labels shorter than four characters are upper-cased whole, and any
    whitespace inside the prefix is kept (``"New York"`` -> ``"NEW "``).
    """
    prefix = x[:4]
    return prefix.upper()

data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [66]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [67]:
# 원래 객체를 변경하지 않고 새로운 객체를 생성하려면 rename 메서드를 사용한다
data.rename(index = str.title, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [68]:
data.rename(index={"OHIO":"INDIANA"}, columns={"three":"peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


#### 5) 이산화

In [69]:
# Bin edges: consecutive edges form the right-closed intervals
# (18, 25], (25, 35], (35, 60] and (60, 100].
bins = [18, 25, 35, 60, 100]
# Raw ages to be bucketed into the intervals above.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# pd.cut assigns each age to the interval that contains it.
age_categories = pd.cut(x=ages, bins=bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [70]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [71]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [72]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [73]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [74]:
# Count how many ages fall into each bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() gives the same counts.
pd.Series(age_categories).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64