# Reference 
* 파이썬 라이브러리를 활용한 데이터 분석

In [1]:
from datetime import datetime

In [2]:
# Capture the current date and time as a datetime object and display it.
now = datetime.now()
now

datetime.datetime(2017, 9, 5, 16, 5, 23, 534177)

In [3]:
# Individual calendar fields are exposed as attributes of the datetime.
now.year, now.month, now.day

(2017, 9, 5)

In [5]:
# Subtracting one datetime from another produces a timedelta.
delta = (
    datetime(year=2017, month=9, day=5)
    - datetime(year=2011, month=1, day=7)
)
delta

datetime.timedelta(2433)

In [6]:
# The whole-day component of the timedelta.
delta.days

2433

In [7]:
from datetime import timedelta  # timedelta: represents the difference between two datetime values (days, seconds, microseconds)

In [8]:
# Adding a timedelta to a datetime moves it forward by that many days.
start = datetime(year=2017, month=9, day=5)
start + timedelta(days=12)

datetime.datetime(2017, 9, 17, 0, 0)

In [9]:
# A timedelta can be scaled by an integer before being subtracted from a datetime.
start - 2 * timedelta(12)

datetime.datetime(2017, 8, 12, 0, 0)

* 문자열을 datetime으로 변환하기

In [11]:
# str() renders a datetime as 'YYYY-MM-DD HH:MM:SS'.
stamp = datetime(year=2017, month=9, day=5)
str(stamp)

'2017-09-05 00:00:00'

In [12]:
# Format the datetime with an explicit format spec (year-month-day).
stamp.strftime('%Y-%m-%d')

'2017-09-05'

In [13]:
from dateutil.parser import parse

In [14]:
# dateutil's parse() converts a date string to a datetime without a format spec.
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [15]:
# Month names and a 12-hour clock with AM/PM are understood as well.
parse('Mar 7, 1993 10:45 PM')

datetime.datetime(1993, 3, 7, 22, 45)

In [16]:
# With dayfirst=True the first field is read as the day, giving 6 December 2011.
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [20]:
import pandas as pd
import numpy as np

In [21]:
# pd.to_datetime converts a list of date strings into a DatetimeIndex.
datestrs = ['7/6/2011', '8/6/2011']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06', '2011-08-06'], dtype='datetime64[ns]', freq=None)

* NaT = Not a Time (pandas에서 누락된 타임스탬프 데이터를 나타냄.)

In [22]:
# A None entry is converted to NaT in the resulting DatetimeIndex.
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2011-07-06', '2011-08-06', 'NaT'], dtype='datetime64[ns]', freq=None)

In [23]:
# Six irregularly spaced days in January 2011 used as the index of a
# Series of random values.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]

ts = pd.Series(data=np.random.randn(6), index=dates)
ts

2011-01-02   -0.819150
2011-01-05    1.359027
2011-01-07   -2.265938
2011-01-08   -1.000112
2011-01-10    1.705976
2011-01-12    0.868105
dtype: float64

In [24]:
# The datetime labels passed to the Series are held in a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

* pandas는 NumPy의 datetime64 자료형을 사용해서 나노초의 정밀도를 가지는 타임스탬프를 저장한다.

In [25]:
# Inspect the dtype used to store the index values.
ts.index.dtype

dtype('<M8[ns]')

* 인덱싱, 선택, 부분 선택

In [26]:
# A single element of a DatetimeIndex is a pandas Timestamp.
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

In [27]:
# Use a Timestamp taken from the index as the lookup key.
stamp = ts.index[2]
ts[stamp]

-2.2659382072204668

In [29]:
# Date strings in different formats that denote the same day select the same value.
ts['1/10/2011'], ts['20110110']

(1.7059755608245812, 1.7059755608245812)

In [34]:
# 1000 consecutive daily observations starting on 2000-01-01; show the last five.
longer_ts = pd.Series(
    np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)
longer_ts.tail()

2002-09-22   -0.106814
2002-09-23   -1.419299
2002-09-24   -0.297540
2002-09-25    1.007907
2002-09-26   -1.135671
Freq: D, dtype: float64

In [37]:
# A year string selects every entry in that year; show the last five of them.
longer_ts['2001'].tail()

2001-12-27    0.185114
2001-12-28   -0.865469
2001-12-29    1.151239
2001-12-30   -2.434433
2001-12-31   -0.462157
Freq: D, dtype: float64

In [38]:
# Slice from a datetime onward; the start label itself is included.
ts[datetime(2011, 1,7):]

2011-01-07   -2.265938
2011-01-08   -1.000112
2011-01-10    1.705976
2011-01-12    0.868105
dtype: float64

In [41]:
# Slice with date strings; both endpoints are included in the result.
ts['1/5/2011':'1/12/2011']

2011-01-05    1.359027
2011-01-07   -2.265938
2011-01-08   -1.000112
2011-01-10    1.705976
2011-01-12    0.868105
dtype: float64

In [42]:
# truncate(after=...) drops every entry later than the given date.
ts.truncate(after='1/9/2011')

2011-01-02   -0.819150
2011-01-05    1.359027
2011-01-07   -2.265938
2011-01-08   -1.000112
dtype: float64

In [47]:
# 100 weekly timestamps, one per Wednesday, indexing a frame of random values.
dates = pd.date_range(start='1/1/2017', periods=100, freq='W-WED')
long_df = pd.DataFrame(
    data=np.random.randn(100, 4),
    index=dates,
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
)
# A month-year string selects all rows falling in that month.
long_df.loc['5-2017']

Unnamed: 0,Colorado,Texas,New York,Ohio
2017-05-03,-0.040497,1.753916,1.778631,-2.050843
2017-05-10,-0.843313,-1.344803,-0.375495,0.364178
2017-05-17,-1.213776,-0.881384,1.138867,2.138576
2017-05-24,0.806269,-1.058437,0.538174,0.487562
2017-05-31,-0.832626,-1.324052,0.970963,-0.203554


* 중복된 색인을 갖는 시계열

In [48]:
# Index in which 2000-01-02 appears three times.
dates = pd.DatetimeIndex(
    ['1/1/2000'] + 3 * ['1/2/2000'] + ['1/3/2000']
)
dup_ts = pd.Series(data=np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [52]:
# Check whether every index label occurs only once.
dup_ts.index.is_unique

False

In [49]:
dup_ts['1/3/2000']  # label is not duplicated, so a scalar value is produced

4

In [51]:
dup_ts['1/2/2000']  # label is duplicated, so all matching entries are returned

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [57]:
# To aggregate data with non-unique timestamps, group on the index itself by
# passing level=0 (the only level of a single-level index).
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [58]:
# Number of observations recorded for each distinct timestamp.
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

* 날짜 범위 생성하기

In [60]:
# Every calendar day from 1 April 2017 through 1 September 2017, inclusive.
index = pd.date_range(start='4/1/2017', end='9/1/2017')
index

DatetimeIndex(['2017-04-01', '2017-04-02', '2017-04-03', '2017-04-04',
               '2017-04-05', '2017-04-06', '2017-04-07', '2017-04-08',
               '2017-04-09', '2017-04-10',
               ...
               '2017-08-23', '2017-08-24', '2017-08-25', '2017-08-26',
               '2017-08-27', '2017-08-28', '2017-08-29', '2017-08-30',
               '2017-08-31', '2017-09-01'],
              dtype='datetime64[ns]', length=154, freq='D')

In [61]:
# With a start date and a period count, consecutive daily timestamps are generated.
pd.date_range('5/1/2017', periods=31)

DatetimeIndex(['2017-05-01', '2017-05-02', '2017-05-03', '2017-05-04',
               '2017-05-05', '2017-05-06', '2017-05-07', '2017-05-08',
               '2017-05-09', '2017-05-10', '2017-05-11', '2017-05-12',
               '2017-05-13', '2017-05-14', '2017-05-15', '2017-05-16',
               '2017-05-17', '2017-05-18', '2017-05-19', '2017-05-20',
               '2017-05-21', '2017-05-22', '2017-05-23', '2017-05-24',
               '2017-05-25', '2017-05-26', '2017-05-27', '2017-05-28',
               '2017-05-29', '2017-05-30', '2017-05-31'],
              dtype='datetime64[ns]', freq='D')

In [67]:
# The time of day in the start value is carried into every generated timestamp.
pd.date_range('5/2/2017 12:56:31',  periods = 20)

DatetimeIndex(['2017-05-02 12:56:31', '2017-05-03 12:56:31',
               '2017-05-04 12:56:31', '2017-05-05 12:56:31',
               '2017-05-06 12:56:31', '2017-05-07 12:56:31',
               '2017-05-08 12:56:31', '2017-05-09 12:56:31',
               '2017-05-10 12:56:31', '2017-05-11 12:56:31',
               '2017-05-12 12:56:31', '2017-05-13 12:56:31',
               '2017-05-14 12:56:31', '2017-05-15 12:56:31',
               '2017-05-16 12:56:31', '2017-05-17 12:56:31',
               '2017-05-18 12:56:31', '2017-05-19 12:56:31',
               '2017-05-20 12:56:31', '2017-05-21 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [68]:
# normalize=True resets the generated timestamps to midnight.
pd.date_range('5/2/2017 12:56:31',  periods = 20, normalize=True)

DatetimeIndex(['2017-05-02', '2017-05-03', '2017-05-04', '2017-05-05',
               '2017-05-06', '2017-05-07', '2017-05-08', '2017-05-09',
               '2017-05-10', '2017-05-11', '2017-05-12', '2017-05-13',
               '2017-05-14', '2017-05-15', '2017-05-16', '2017-05-17',
               '2017-05-18', '2017-05-19', '2017-05-20', '2017-05-21'],
              dtype='datetime64[ns]', freq='D')

* 빈도와 날짜 오프셋

In [70]:
from pandas.tseries.offsets import Hour, Minute

In [71]:
# An Hour offset built with no argument.
hour = Hour()
hour

<Hour>

In [73]:
# Passing an integer creates a multiple of the offset (here four hours).
four_hours = Hour(4)
four_hours

<4 * Hours>

In [74]:
# A frequency string with a multiplier: one timestamp every four hours.
pd.date_range('1/1/2017', '1/3/2017 23:59', freq='4h')

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 04:00:00',
               '2017-01-01 08:00:00', '2017-01-01 12:00:00',
               '2017-01-01 16:00:00', '2017-01-01 20:00:00',
               '2017-01-02 00:00:00', '2017-01-02 04:00:00',
               '2017-01-02 08:00:00', '2017-01-02 12:00:00',
               '2017-01-02 16:00:00', '2017-01-02 20:00:00',
               '2017-01-03 00:00:00', '2017-01-03 04:00:00',
               '2017-01-03 08:00:00', '2017-01-03 12:00:00',
               '2017-01-03 16:00:00', '2017-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [75]:
# Offsets can be added together; the sum is a single 150-minute offset.
Hour(2) + Minute(30)

<150 * Minutes>

In [76]:
# A combined frequency string: '1h30min' yields 90-minute steps.
pd.date_range('1/1/2017', periods=10, freq='1h30min')

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 01:30:00',
               '2017-01-01 03:00:00', '2017-01-01 04:30:00',
               '2017-01-01 06:00:00', '2017-01-01 07:30:00',
               '2017-01-01 09:00:00', '2017-01-01 10:30:00',
               '2017-01-01 12:00:00', '2017-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

* 월별주차

In [78]:
# 'WOM-3FRI' (week of month) picks the third Friday of each month.
rng = pd.date_range(start='1/1/2017', end='9/1/2017', freq='WOM-3FRI')
list(rng)

[Timestamp('2017-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-03-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-04-21 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-05-19 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-06-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-07-21 00:00:00', freq='WOM-3FRI'),
 Timestamp('2017-08-18 00:00:00', freq='WOM-3FRI')]

* 데이터 시프트
    * 시프트 : 데이터를 시간 축에서 앞이나 뒤로 이동하는 것을 말함

In [80]:
# Four month-end observations starting in January 2017.
# The frequency is given as an offset object rather than the string alias 'M':
# newer pandas releases deprecate that alias in favour of 'ME', while older
# releases do not know 'ME'. MonthEnd() produces the same month-end index on both.
ts = pd.Series(
    np.random.randn(4),
    index=pd.date_range('1.1.2017', periods=4, freq=pd.offsets.MonthEnd()),
)
ts

2017-01-31    1.783282
2017-02-28    0.503006
2017-03-31    0.680261
2017-04-30   -0.289385
Freq: M, dtype: float64

In [81]:
# Shift the values two periods forward; the index is unchanged and the first
# two slots become NaN.
ts.shift(2)

2017-01-31         NaN
2017-02-28         NaN
2017-03-31    1.783282
2017-04-30    0.503006
Freq: M, dtype: float64

In [82]:
# A negative shift moves the values backward, leaving NaN at the end.
ts.shift(-2)

2017-01-31    0.680261
2017-02-28   -0.289385
2017-03-31         NaN
2017-04-30         NaN
Freq: M, dtype: float64

In [83]:
# Relative change from the previous period, computed with a one-step shift.
ts / ts.shift(1) - 1

2017-01-31         NaN
2017-02-28   -0.717933
2017-03-31    0.352392
2017-04-30   -1.425403
Freq: M, dtype: float64