In [3]:
### 누락된 데이터 처리하기
import pandas as pd
import numpy as np

# A small string Series with one missing entry (position 2).
string_data = pd.Series(data=["a", "b", np.nan, "d"])
# Boolean mask that is True wherever a value is missing.
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [9]:
# Keep only the entries that are not missing.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [4]:
from numpy import nan as NA
# 4x3 frame mixing a complete row, partially missing rows and an all-NA row.
data = pd.DataFrame(
    [
        [1, 6.5, 3],
        [1, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3],
    ]
)

# Default behaviour: drop every row that contains at least one NA.
cleaned_data1 = data.dropna(axis="index")
cleaned_data1

# how="all": drop only the rows in which every value is NA.
cleaned_data2 = data.dropna(axis="index", how="all")
cleaned_data2

# thresh=2: keep only the rows holding at least two non-NA values.
cleaned_data2.dropna(axis="index", thresh=2)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
3,,6.5,3.0


In [33]:
# Replace every NA with 0 (returns a new frame).
data.fillna(value=0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [37]:
# Per-column fill values: the dict maps column label -> replacement.
data.fillna(value={0: 0.2, 1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,0.0
2,0.2,0.5,0.0
3,0.2,6.5,3.0


In [44]:
# fillna returns a new object by default, so `data` is still unchanged here.
df2 = data.fillna(0)

# Rebind the name instead of passing inplace=True: the in-place call returns
# None, so a cell ending on it displays nothing. Finish on the frame itself
# so the filled result is rendered.
data = data.fillna(0)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


In [45]:
# Build a 6x3 random frame, then blank out the tail of column 1 (rows 2+)
# and of column 2 (rows 4+) so there are NA runs to fill.
df = pd.DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-2.182808,-0.770665,-2.323487
1,-0.636426,0.070412,-2.042965
2,1.066373,,2.01369
3,0.279141,,-1.728624
4,0.250511,,
5,-1.211823,,


In [46]:
# Forward-fill: propagate the last valid value down each column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

Unnamed: 0,0,1,2
0,-2.182808,-0.770665,-2.323487
1,-0.636426,0.070412,-2.042965
2,1.066373,0.070412,2.01369
3,0.279141,0.070412,-1.728624
4,0.250511,0.070412,-1.728624
5,-1.211823,0.070412,-1.728624


In [47]:
# Forward-fill at most two consecutive NA values per gap.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill', ...).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-2.182808,-0.770665,-2.323487
1,-0.636426,0.070412,-2.042965
2,1.066373,0.070412,2.01369
3,0.279141,0.070412,-1.728624
4,0.250511,,-1.728624
5,-1.211823,,-1.728624


In [53]:
# Fill the gaps of a Series with its mean or its median.
# Only the last expression (the median fill) is displayed by the cell.
data = pd.Series([1, NA, 3.5, NA, 7])
data.fillna(value=data.mean())
data.fillna(value=data.median())

0    1.0
1    3.5
2    3.5
3    3.5
4    7.0
dtype: float64

In [54]:
### Data transformation
# Removing duplicates

dic_data = dict(
    k1=["one", "two", "one", "two", "one", "two", "two"],
    k2=[1, 1, 2, 3, 3, 4, 4],
)
data = pd.DataFrame(data=dic_data)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [55]:
# Flag each row that repeats an earlier row (the first occurrence stays False).
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [57]:
# Add a helper column, then de-duplicate on 'k1' alone:
# only the first row for each distinct k1 value is kept.
data["v1"] = list(range(7))
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [61]:
# Display the frame itself to inspect its current contents.
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [59]:
# De-duplicate on the (k1, k2) pair, keeping the last occurrence of each pair.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [62]:
# Discretization and binning

# Sample ages to be grouped into bins.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [63]:
# Bin edges; consecutive pairs define the intervals 18-25, 25-35, 35-60, 60-100.
bins = [18, 25, 35, 60, 100]

In [64]:
# Assign every age to the interval it falls in.
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [66]:
# Count how many ages landed in each bin.
# The top-level pd.value_counts() is deprecated; wrap the Categorical in a
# Series and use the method form, which also sorts by descending count.
pd.Series(cats).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [None]:
### 정규 표현식

In [5]:
import re
# Sample string whose words are separated by a mix of spaces and tabs.
text = "foo bar\t baz \tqux"
print(text)

foo bar	 baz 	qux


In [70]:
# r'\s+' matches one or more whitespace characters of any kind, so it splits
# on runs of mixed spaces and tabs. The raw-string prefix keeps the backslash
# literal; "\s" in a plain string is an invalid escape sequence that newer
# Python versions warn about.
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [71]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

# E-mail pattern: local part, '@', domain, a literal dot, then a 2-4 letter TLD.
# The dot before the TLD is escaped: left unescaped it matches any character,
# so strings such as "user@hostXcom" were accepted. The hyphen sits last in
# the domain class so it can only be read as a literal.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# IGNORECASE lets the upper-case character classes match lower-case input too.
re1 = re.compile(pattern, flags=re.IGNORECASE)

In [72]:
# List every non-overlapping match of the compiled pattern found in text.
re1.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [73]:
# Nine random values under a flat index with repeated labels.
data = pd.Series(np.random.randn(9), index=list("aaabbccdd"))
data

a   -1.155278
a   -1.116339
a    0.975689
b   -0.564532
b    0.803827
c   -0.740317
c   -0.283511
d    0.591088
d    1.284811
dtype: float64

In [76]:
# Nine random values under a two-level index:
# outer level = letters, inner level = a counter within each letter.
data = pd.Series(
    np.random.randn(9),
    index=[
        list("aaabbccdd"),
        [1, 2, 3, 1, 2, 1, 2, 1, 2],
    ],
)
data

a  1   -0.480165
   2    0.252967
   3   -0.900676
b  1    0.644486
   2   -0.129333
c  1   -0.926244
   2   -0.470468
d  1   -1.189769
   2   -0.753108
dtype: float64

In [77]:
# Partial indexing: every entry under the outer label 'b'.
data.loc['b']

1    0.644486
2   -0.129333
dtype: float64

In [80]:
# Label slice on the outer level; both endpoints ('b' and 'd') are included.
data.loc['b':'d']

b  1    0.644486
   2   -0.129333
c  1   -0.926244
   2   -0.470468
d  1   -1.189769
   2   -0.753108
dtype: float64

In [79]:
# Select several outer-level labels at once by passing a list to .loc.
data.loc[['b', 'd']]

b  1    0.644486
   2   -0.129333
d  1   -1.189769
   2   -0.753108
dtype: float64

In [82]:
# A full (outer, inner) key addresses a single scalar value.
data.loc[('b', 2)]

-0.1293326498282228

In [81]:
# Select on the inner level: every outer label, inner label 2.
data.loc[:, 2]

a    0.252967
b   -0.129333
c   -0.470468
d   -0.753108
dtype: float64

In [85]:
# Pivot the innermost index level into columns;
# (outer, inner) combinations that do not exist show up as NaN.
data.unstack(level=-1)

Unnamed: 0,1,2,3
a,-0.480165,0.252967,-0.900676
b,0.644486,-0.129333,
c,-0.926244,-0.470468,
d,-1.189769,-0.753108,


In [None]:
# 데이터 합치기

In [89]:
# Two frames that share a 'key' column, used by the merge examples.
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
# Only the last expression of a cell is displayed automatically; a bare `df1`
# in the middle of the cell is silently discarded, so print it explicitly.
print(df1)
df2

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6


Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [88]:
# With no key given, merge joins on the column names the two frames share.
df1.merge(df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [90]:
# Name the join column explicitly with on='key'.
df1.merge(df2, on="key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [91]:
# The key column has a different name on each side,
# so each one is named separately via left_on / right_on.
df3 = pd.DataFrame({"lkey": list("bbacaab"), "data1": range(7)})
df4 = pd.DataFrame({"rkey": list("abd"), "data2": range(3)})
df3.merge(df4, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [None]:
### 축 따라 이어붙이기(concatenation)
# 연결binding, 적층stacking


In [6]:
# Concatenating with NumPy: start from a 3x4 grid holding 0..11.
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [10]:
# axis=0 stacks the second copy below the first (more rows).
np.concatenate((arr, arr), axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [9]:
# axis=1 places the second copy to the right of the first (more columns).
np.concatenate((arr, arr), axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [11]:
# Concatenating with pandas: three Series with non-overlapping labels.
s1 = pd.Series([0, 1], index=list("ab"))
s2 = pd.Series([2, 3, 4], index=list("cde"))
s3 = pd.Series([5, 6], index=list("fg"))
print(s1, s2, s3, sep="\n")

a    0
b    1
dtype: int64
c    2
d    3
e    4
dtype: int64
f    5
g    6
dtype: int64


In [12]:
# concat is the tool of choice when the pieces have a similar structure
# and simply need to be glued together end to end.
pd.concat([s1, s2, s3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [13]:
# Along the column axis each Series becomes its own column;
# labels missing from a Series are filled with NaN.
pd.concat([s1, s2, s3], axis="columns")

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [16]:
# Two random frames whose columns overlap but come in a different order;
# df2 has no 'c' column.
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# Assignments alone display nothing; end on df2 so the cell renders it.
df2


Unnamed: 0,b,d,a
0,-0.368661,-1.232475,0.429867
1,0.417466,0.548468,-0.813038


In [17]:
# Stack the frames row-wise; columns are aligned by name and a column
# absent from one frame is filled with NaN. Original row labels are kept.
pd.concat([df1, df2], axis=0)

Unnamed: 0,a,b,c,d
0,-0.176221,-2.034161,-2.41205,1.524847
1,4.414395,-0.889813,1.07604,-1.999482
2,1.489594,-0.202737,-2.150486,1.329201
0,0.429867,-0.368661,,-1.232475
1,-0.813038,0.417466,,0.548468


In [18]:
# ignore_index=True discards the original row labels and
# numbers the combined rows sequentially from 0.
pd.concat([df1, df2], axis=0, ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.176221,-2.034161,-2.41205,1.524847
1,4.414395,-0.889813,1.07604,-1.999482
2,1.489594,-0.202737,-2.150486,1.329201
3,0.429867,-0.368661,,-1.232475
4,-0.813038,0.417466,,0.548468
