In [1]:
import pandas as pd
import numpy as np

In [17]:
# data.frame 생성하기
df = pd.DataFrame(
    {"a" : [4,5,6],
    "b" : [7,8,9],
    "c": [10,11,12]}, # dict type으로 지정, Series 데이터
    index = [1,2,3]) # row name 설정

In [18]:
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [8]:
df["b"] # 특정 column

1    7
2    8
3    9
Name: b, dtype: int64

In [None]:
df[["a","b"]] # list에 column 담아주기

In [25]:
df.loc[1] # 1st row

a     4
b     7
c    10
Name: 1, dtype: int64

In [30]:
df.loc[3,"a"]

6

In [4]:
df.loc[[1,2],["a","b"]] # 여러개의 행과 열 인덱싱

Unnamed: 0,a,b
1,4,7
2,5,8


In [6]:
df = pd.DataFrame(
    [[4,7,10],[5,8,11],[6,9,12]],
    index = [1,2,3],
    columns = ['a','b','c'])
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [8]:
W = pd.DataFrame(np.arange(12).reshape(3,4),
                columns=[list('abcd')])
print(W)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [9]:
W.loc[0:1,:] # pd 계열: 마지막 값 포함O

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7


In [10]:
W.iloc[0:1, :] # np계열 : 마지막 값 포함X

Unnamed: 0,a,b,c,d
0,0,1,2,3


In [12]:
D1 = pd.DataFrame([[1,2],[3,4]],
                 columns = ["a","b"])
D2 = pd.DataFrame([[11,22,33],[44,55,66],[77,88,99]],
                 columns = ['a','b','c'])

In [17]:
print(D1+D2)
print(D1-D2)
print(D1*D2)
print(D2%D1)

      a     b   c
0  12.0  24.0 NaN
1  47.0  59.0 NaN
2   NaN   NaN NaN
      a     b   c
0 -10.0 -20.0 NaN
1 -41.0 -51.0 NaN
2   NaN   NaN NaN
       a      b   c
0   11.0   44.0 NaN
1  132.0  220.0 NaN
2    NaN    NaN NaN
     a    b   c
0  0.0  0.0 NaN
1  2.0  3.0 NaN
2  NaN  NaN NaN


## 7.1 결측 자료의 처리

In [18]:
N = pd.Series([1, np.nan, "missing", None])
print(N)

0          1
1        NaN
2    missing
3       None
dtype: object


In [19]:
N.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [20]:
N.fillna(0) # 특정 숫자로 missing 대체

0          1
1          0
2    missing
3          0
dtype: object

In [22]:
M = pd.DataFrame([[1,None,2,None],
                 [4,5,6,None],
                 [None, 7,8,None]],
                index = list('123'),
                columns = list('abcd'))
print(M)

     a    b  c     d
1  1.0  NaN  2  None
2  4.0  5.0  6  None
3  NaN  7.0  8  None


In [30]:
# 열을 기준으로 모두 missing이면 열 삭제
M.dropna(axis='columns', how="all")
# M.dropna(axis=1,how="all") : 동일 결과

Unnamed: 0,a,b,c
1,1.0,,2
2,4.0,5.0,6
3,,7.0,8


In [33]:
# 3개 이상의 non-missing인 row만 남기고 삭제
M.dropna(axis = "rows", thresh=3)
# M.dropna(axis = 0, thresh=3)

Unnamed: 0,a,b,c,d
2,4.0,5.0,6,


In [34]:
# missing의 바로 뒤 열값으로 채움 (b = backward)
M.fillna(method = "bfill", axis = 1)

Unnamed: 0,a,b,c,d
1,1.0,2.0,2.0,
2,4.0,5.0,6.0,
3,7.0,7.0,8.0,


In [35]:
# missing의 바로 뒤 열값으로 채움 (f = forward)
M.fillna(method = "ffill", axis= 1)

Unnamed: 0,a,b,c,d
1,1.0,1.0,2.0,2.0
2,4.0,5.0,6.0,6.0
3,,7.0,8.0,8.0


## 7.3 논리적 인덱스

In [19]:
# 예제 1
x = pd.DataFrame([[1,None,2,None],
                 [3,4,5,None],
                 [None, 6,7, None]],
                index = [1,2,3],
                columns =list('abcd'))
x

Unnamed: 0,a,b,c,d
1,1.0,,2,
2,3.0,4.0,5,
3,,6.0,7,


In [33]:
x.loc[2,"d"] = 99
print(x)

     a    b  c     d
1  1.0  NaN  2  None
2  3.0  4.0  5    99
3  NaN  6.0  7  None


In [37]:
pd.isnull(x['b']) # [True, False, False]

1     True
2    False
3    False
Name: b, dtype: bool

In [38]:
x[pd.isnull(x['b'])]

Unnamed: 0,a,b,c,d
1,1.0,,2,


In [44]:
pd.isnull(x).any(axis=1) # 각 행에서 하나라도 missing이 있는가

1     True
2    False
3     True
dtype: bool

In [46]:
x[pd.isnull(x).any(axis=1)]['b']

1    NaN
3    6.0
Name: b, dtype: float64

In [47]:
x.loc[pd.isnull(x).any(axis=1),'b']

1    NaN
3    6.0
Name: b, dtype: float64

In [48]:
# loc을 쓰지않고 numpy처럼 indexing하면 에러남
x[3,'b']

KeyError: (3, 'b')

In [50]:
# 예제 2
K = pd.DataFrame(np.arange(12).reshape(3,4), columns = list('abcd'))
K

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [51]:
K.loc[K['b']>4,'c']

1     6
2    10
Name: c, dtype: int32

In [53]:
K.loc[K['b']>4]['c'] # K.loc[r,c] = K.loc[r][c]

1     6
2    10
Name: c, dtype: int32

In [54]:
K.loc[K['b']>4, ['c','d']]

Unnamed: 0,c,d
1,6,7
2,10,11


## 7.4 데이터 붙이기

### stack(), unstack()

In [55]:
P = pd.DataFrame(np.arange(6).reshape(2,3),
                index = list('ab'),
                columns = list('123'))
P

Unnamed: 0,1,2,3
a,0,1,2
b,3,4,5


In [56]:
P.stack()

a  1    0
   2    1
   3    2
b  1    3
   2    4
   3    5
dtype: int32

In [57]:
P.stack().unstack() # 원상복귀

Unnamed: 0,1,2,3
a,0,1,2
b,3,4,5


### concat() : 밑으로 붙이기

In [60]:
Q1 = pd.DataFrame([[1,2,3],[4,5,6]], columns=list('abc'))
Q2 = pd.DataFrame([[11,22,33],[44,55,66]],columns=list('bcd'))

In [61]:
pd.concat([Q1,Q2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
0,1.0,2,3,
1,4.0,5,6,
0,,11,22,33.0
1,,44,55,66.0


In [62]:
pd.concat([Q1,Q2], ignore_index = True) # 순서 있는 인덱스

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
0,1.0,2,3,
1,4.0,5,6,
2,,11,22,33.0
3,,44,55,66.0


In [63]:
pd.concat([Q1,Q2], join='inner')

Unnamed: 0,b,c
0,2,3
1,5,6
0,11,22
1,44,55
