### DataFrame 이리저리 조작하기


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a 6x4 DataFrame filled with samples from the standard normal distribution.
df = pd.DataFrame(data=np.random.standard_normal(size=(6, 4)))

In [3]:
# Display the freshly created DataFrame (default integer row and column labels).
df

Unnamed: 0,0,1,2,3
0,0.69416,0.147135,-0.282096,-0.999366
1,-0.304167,0.176898,2.668515,0.031599
2,0.776855,-0.519929,-1.710675,0.365366
3,-0.997207,2.103128,1.53681,-1.159432
4,-0.783679,0.077847,-0.705187,1.038576
5,0.158427,-0.016749,0.063688,-0.014445


In [6]:
# Name the four columns A through D.
df.columns = list("ABCD")

# pd.date_range builds an index made of datetime values, which is the usual
# choice for time-series style datasets: six periods starting at 2016-07-01.
df.index = pd.date_range(start="20160701", periods=6)

In [7]:
# Inspect the new row index (a DatetimeIndex with daily frequency).
df.index

DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
               '2016-07-05', '2016-07-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# Display the DataFrame with its new column names and date index.
df

Unnamed: 0,A,B,C,D
2016-07-01,0.69416,0.147135,-0.282096,-0.999366
2016-07-02,-0.304167,0.176898,2.668515,0.031599
2016-07-03,0.776855,-0.519929,-1.710675,0.365366
2016-07-04,-0.997207,2.103128,1.53681,-1.159432
2016-07-05,-0.783679,0.077847,-0.705187,1.038576
2016-07-06,0.158427,-0.016749,0.063688,-0.014445


In [9]:
# Add a column "F" that deliberately contains missing values, written as np.nan.
df["F"] = [1.0 , np.nan, 3.5 , 6.1 , np.nan , 7.0]

In [10]:
# Display the DataFrame including the new column F and its NaN entries.
df

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-02,-0.304167,0.176898,2.668515,0.031599,
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [11]:
# Drop every row that contains at least one NaN.
# The result is a new DataFrame; df is not reassigned here.
df.dropna(axis=0, how="any")

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [12]:
# Drop only those rows in which every value is NaN.
# No row here is entirely NaN, so all six rows are kept.
df.dropna(axis=0, how="all")

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-02,-0.304167,0.176898,2.668515,0.031599,
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [14]:
# Replace each NaN with a substitute value (5.0) in the returned DataFrame.
df.fillna(5.0)

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-02,-0.304167,0.176898,2.668515,0.031599,5.0
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,5.0
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [15]:
# Display df again: the fillna result was not assigned back, so the NaNs are still present.
df

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-02,-0.304167,0.176898,2.668515,0.031599,
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [16]:
# Boolean mask shaped like df: True only at the positions holding NaN.
pd.isnull(df)

Unnamed: 0,A,B,C,D,F
2016-07-01,False,False,False,False,False
2016-07-02,False,False,False,False,True
2016-07-03,False,False,False,False,False
2016-07-04,False,False,False,False,False
2016-07-05,False,False,False,False,True
2016-07-06,False,False,False,False,False


In [17]:
# Show only the rows whose value in column F is NaN.
df.loc[df["F"].isnull()]

Unnamed: 0,A,B,C,D,F
2016-07-02,-0.304167,0.176898,2.668515,0.031599,
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,


In [18]:
# Display df again; the row selection above returned a view of the data and left df unchanged.
df

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-02,-0.304167,0.176898,2.668515,0.031599,
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [20]:
# Convert the input string "20160701" into a datetime value (a pandas Timestamp).
pd.to_datetime("20160701")

Timestamp('2016-07-01 00:00:00')

In [21]:
# Remove the row labelled with the given date (2016-07-01) from the result.
df.drop(labels=pd.to_datetime("20160701"), axis=0)

Unnamed: 0,A,B,C,D,F
2016-07-02,-0.304167,0.176898,2.668515,0.031599,
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-04,-0.997207,2.103128,1.53681,-1.159432,6.1
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [22]:
# Remove several rows at once: convert both date strings in one call and
# pass the resulting labels to drop.
df.drop(pd.to_datetime(["20160702", "20160704"]))

Unnamed: 0,A,B,C,D,F
2016-07-01,0.69416,0.147135,-0.282096,-0.999366,1.0
2016-07-03,0.776855,-0.519929,-1.710675,0.365366,3.5
2016-07-05,-0.783679,0.077847,-0.705187,1.038576,
2016-07-06,0.158427,-0.016749,0.063688,-0.014445,7.0


In [24]:
# Remove a column: the axis argument makes drop look up "F" among the columns.
df.drop(labels="F", axis="columns")

Unnamed: 0,A,B,C,D
2016-07-01,0.69416,0.147135,-0.282096,-0.999366
2016-07-02,-0.304167,0.176898,2.668515,0.031599
2016-07-03,0.776855,-0.519929,-1.710675,0.365366
2016-07-04,-0.997207,2.103128,1.53681,-1.159432
2016-07-05,-0.783679,0.077847,-0.705187,1.038576
2016-07-06,0.158427,-0.016749,0.063688,-0.014445


In [25]:
# Remove several columns at once by passing a list of column labels.
df.drop(labels=["B", "F"], axis="columns")

Unnamed: 0,A,C,D
2016-07-01,0.69416,-0.282096,-0.999366
2016-07-02,-0.304167,2.668515,0.031599
2016-07-03,0.776855,-1.710675,0.365366
2016-07-04,-0.997207,1.53681,-1.159432
2016-07-05,-0.783679,-0.705187,1.038576
2016-07-06,0.158427,0.063688,-0.014445
