In [1]:
import pandas as pd
import numpy as np

## 문자열 연산

- 파이썬의 문자열 연산자를 거의 모두 반영

In [5]:
name_tuple = ['Suan Lee', 'Steven Jobs', 'Larry Page', 'Elon Musk', None, 'Bill Gates', 'Mark Zuckerberg',
             'Jeff Bezos']
names = pd.Series(name_tuple)
names

0           Suan Lee
1        Steven Jobs
2         Larry Page
3          Elon Musk
4               None
5         Bill Gates
6    Mark Zuckerberg
7         Jeff Bezos
dtype: object

In [6]:
# 문자열 연산 처리에 접근할때는 'str'
names.str.lower()

0           suan lee
1        steven jobs
2         larry page
3          elon musk
4               None
5         bill gates
6    mark zuckerberg
7         jeff bezos
dtype: object

In [7]:
names.str.split()

0           [Suan, Lee]
1        [Steven, Jobs]
2         [Larry, Page]
3          [Elon, Musk]
4                  None
5         [Bill, Gates]
6    [Mark, Zuckerberg]
7         [Jeff, Bezos]
dtype: object

- 기타 연산자

In [8]:
names.str[0:4]

0    Suan
1    Stev
2    Larr
3    Elon
4    None
5    Bill
6    Mark
7    Jeff
dtype: object

In [9]:
names.str.split().str.get(-1)

0           Lee
1          Jobs
2          Page
3          Musk
4          None
5         Gates
6    Zuckerberg
7         Bezos
dtype: object

In [10]:
names.str.repeat(2)

0                  Suan LeeSuan Lee
1            Steven JobsSteven Jobs
2              Larry PageLarry Page
3                Elon MuskElon Musk
4                              None
5              Bill GatesBill Gates
6    Mark ZuckerbergMark Zuckerberg
7              Jeff BezosJeff Bezos
dtype: object

In [11]:
names.str.join('*')

0                  S*u*a*n* *L*e*e
1            S*t*e*v*e*n* *J*o*b*s
2              L*a*r*r*y* *P*a*g*e
3                E*l*o*n* *M*u*s*k
4                             None
5              B*i*l*l* *G*a*t*e*s
6    M*a*r*k* *Z*u*c*k*e*r*b*e*r*g
7              J*e*f*f* *B*e*z*o*s
dtype: object

- 정규표현식

In [12]:
names.str.match('([A-Za-z]+)')

0    True
1    True
2    True
3    True
4    None
5    True
6    True
7    True
dtype: object

In [13]:
# 정규표현식에 해당하는 것들 모두 출력
names.str.findall('([A-Za-z]+)')

0           [Suan, Lee]
1        [Steven, Jobs]
2         [Larry, Page]
3          [Elon, Musk]
4                  None
5         [Bill, Gates]
6    [Mark, Zuckerberg]
7         [Jeff, Bezos]
dtype: object

- 시계열 처리

In [14]:
idx = pd.DatetimeIndex(['2019-01-01', '2020-01-01', '2020-02-01', '2020-02-02', '2020-03-01'])
s = pd.Series([0, 1, 2, 3, 4], index = idx)
s

2019-01-01    0
2020-01-01    1
2020-02-01    2
2020-02-02    3
2020-03-01    4
dtype: int64

In [15]:
s['2020-01-01':]

2020-01-01    1
2020-02-01    2
2020-02-02    3
2020-03-01    4
dtype: int64

In [16]:
s[:'2020-01-01']

2019-01-01    0
2020-01-01    1
dtype: int64

In [17]:
s['2019']

2019-01-01    0
dtype: int64

- 시계열 데이터 구조

In [19]:
from datetime import datetime
dates = pd.to_datetime(['12-12-2019', datetime(2020, 1, 1), '2nd of Feb, 2020', '2020-Mar-4', '20200701'])
dates

DatetimeIndex(['2019-12-12', '2020-01-01', '2020-02-02', '2020-03-04',
               '2020-07-01'],
              dtype='datetime64[ns]', freq=None)

In [20]:
dates.to_period('D')

PeriodIndex(['2019-12-12', '2020-01-01', '2020-02-02', '2020-03-04',
             '2020-07-01'],
            dtype='period[D]', freq='D')

In [21]:
dates - dates[0]

TimedeltaIndex(['0 days', '20 days', '52 days', '83 days', '202 days'], dtype='timedelta64[ns]', freq=None)

In [22]:
pd.date_range('2020-01-01', '2020-07-01')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10',
               ...
               '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25',
               '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29',
               '2020-06-30', '2020-07-01'],
              dtype='datetime64[ns]', length=183, freq='D')

In [23]:
pd.date_range('2020-01-01', periods = 7)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07'],
              dtype='datetime64[ns]', freq='D')

In [24]:
pd.date_range('2020-01-01', periods = 7, freq = 'M')

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31', '2020-06-30', '2020-07-31'],
              dtype='datetime64[ns]', freq='M')

In [25]:
pd.date_range('2020-01-01', periods = 7, freq = 'H')

DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 01:00:00',
               '2020-01-01 02:00:00', '2020-01-01 03:00:00',
               '2020-01-01 04:00:00', '2020-01-01 05:00:00',
               '2020-01-01 06:00:00'],
              dtype='datetime64[ns]', freq='H')

In [26]:
idx = pd.to_datetime(['2020-01-01 12:00:00', '2020-01-02 00:00:00'] + [None])
idx

DatetimeIndex(['2020-01-01 12:00:00', '2020-01-02 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [27]:
idx[2]
# 'NaT' = Not a Time

NaT

In [28]:
pd.isnull(idx)

array([False, False,  True])

- 시계열 기본

In [29]:
dates = [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 4), datetime(2020, 1, 7),
        datetime(2020, 1, 10), datetime(2020, 1, 11), datetime(2020, 1, 15)]
dates

[datetime.datetime(2020, 1, 1, 0, 0),
 datetime.datetime(2020, 1, 2, 0, 0),
 datetime.datetime(2020, 1, 4, 0, 0),
 datetime.datetime(2020, 1, 7, 0, 0),
 datetime.datetime(2020, 1, 10, 0, 0),
 datetime.datetime(2020, 1, 11, 0, 0),
 datetime.datetime(2020, 1, 15, 0, 0)]

In [30]:
# 날짜 인덱스를 가지는 series
ts = pd.Series(np.random.randn(7), index = dates)
ts

2020-01-01    1.136985
2020-01-02    0.087183
2020-01-04    0.985372
2020-01-07   -0.971801
2020-01-10   -1.961379
2020-01-11    1.094167
2020-01-15    2.558868
dtype: float64

In [31]:
ts.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-04', '2020-01-07',
               '2020-01-10', '2020-01-11', '2020-01-15'],
              dtype='datetime64[ns]', freq=None)

In [32]:
ts.index[0]

Timestamp('2020-01-01 00:00:00')

In [33]:
ts[ts.index[2]]

0.9853718088748329

In [34]:
ts['20200104']

0.9853718088748329

In [35]:
ts['1/4/2020']

0.9853718088748329

In [38]:
ts['4th of Jan, 2020']

0.9853718088748329

In [39]:
ts = pd.Series(np.random.randn(1000),
              index = pd.date_range('2017-10-01', periods = 1000))
ts

2017-10-01   -0.409191
2017-10-02   -1.792816
2017-10-03    0.124101
2017-10-04   -0.709377
2017-10-05   -1.437921
                ...   
2020-06-22   -0.519861
2020-06-23   -1.637668
2020-06-24   -0.094069
2020-06-25    0.212469
2020-06-26   -1.014740
Freq: D, Length: 1000, dtype: float64

In [40]:
ts['2020']

2020-01-01   -0.096746
2020-01-02    2.180322
2020-01-03    0.646927
2020-01-04    2.094955
2020-01-05   -0.115152
                ...   
2020-06-22   -0.519861
2020-06-23   -1.637668
2020-06-24   -0.094069
2020-06-25    0.212469
2020-06-26   -1.014740
Freq: D, Length: 178, dtype: float64

In [41]:
ts['2020-06']

2020-06-01    0.452413
2020-06-02   -0.150912
2020-06-03    0.454692
2020-06-04   -1.807691
2020-06-05    0.175658
2020-06-06    1.302744
2020-06-07    0.147726
2020-06-08    1.022687
2020-06-09    0.213381
2020-06-10    0.136294
2020-06-11   -0.799113
2020-06-12   -0.041401
2020-06-13   -2.424204
2020-06-14   -1.194920
2020-06-15   -0.263316
2020-06-16    1.838632
2020-06-17   -0.869476
2020-06-18    0.060265
2020-06-19    1.763964
2020-06-20   -0.330324
2020-06-21    1.578547
2020-06-22   -0.519861
2020-06-23   -1.637668
2020-06-24   -0.094069
2020-06-25    0.212469
2020-06-26   -1.014740
Freq: D, dtype: float64

In [42]:
ts[datetime(2020, 6, 20):]

2020-06-20   -0.330324
2020-06-21    1.578547
2020-06-22   -0.519861
2020-06-23   -1.637668
2020-06-24   -0.094069
2020-06-25    0.212469
2020-06-26   -1.014740
Freq: D, dtype: float64

In [43]:
ts['2020-06-10' : '2020-06-20']

2020-06-10    0.136294
2020-06-11   -0.799113
2020-06-12   -0.041401
2020-06-13   -2.424204
2020-06-14   -1.194920
2020-06-15   -0.263316
2020-06-16    1.838632
2020-06-17   -0.869476
2020-06-18    0.060265
2020-06-19    1.763964
2020-06-20   -0.330324
Freq: D, dtype: float64

In [45]:
tdf = pd.DataFrame(np.random.randn(1000, 4),
                  index = pd.date_range('2017-10-01', periods = 1000),
                  columns = ['A', 'B', 'C', 'D'])
tdf

Unnamed: 0,A,B,C,D
2017-10-01,0.720867,1.734528,0.598323,1.163599
2017-10-02,-1.099413,0.483295,-0.178570,-0.671705
2017-10-03,0.019158,-1.394781,0.254253,0.743069
2017-10-04,0.044595,-0.617181,1.186326,-0.004021
2017-10-05,1.002718,0.675530,-0.818820,1.275040
...,...,...,...,...
2020-06-22,0.461751,0.626018,0.190462,1.343334
2020-06-23,-0.433068,-0.018556,-0.189112,-1.227210
2020-06-24,0.184964,-1.381077,-1.347275,-0.128053
2020-06-25,-2.513773,0.051145,0.836708,-1.427391


In [46]:
tdf['2020']

  tdf['2020']


Unnamed: 0,A,B,C,D
2020-01-01,1.116451,0.276545,1.426699,-0.658740
2020-01-02,-1.683228,0.520364,-0.411974,1.454870
2020-01-03,0.709543,0.054508,-1.019315,-0.405311
2020-01-04,0.059825,-0.973861,-0.025839,-1.227207
2020-01-05,0.440751,1.074110,0.480269,-0.203180
...,...,...,...,...
2020-06-22,0.461751,0.626018,0.190462,1.343334
2020-06-23,-0.433068,-0.018556,-0.189112,-1.227210
2020-06-24,0.184964,-1.381077,-1.347275,-0.128053
2020-06-25,-2.513773,0.051145,0.836708,-1.427391


In [47]:
tdf.loc['2020-06']

Unnamed: 0,A,B,C,D
2020-06-01,1.049683,0.857956,0.519267,1.434431
2020-06-02,0.103522,0.130278,-0.701808,-0.25539
2020-06-03,-1.931962,-1.763423,0.551779,-2.053994
2020-06-04,-0.030922,-0.804416,-0.904462,-0.522922
2020-06-05,-0.051265,0.468168,-1.807717,-1.443721
2020-06-06,-0.203037,-2.715865,-0.523128,0.607149
2020-06-07,-0.758824,-0.147089,-2.266838,-0.64677
2020-06-08,0.66658,0.241577,-0.039889,-1.060455
2020-06-09,-1.207883,-0.983443,-1.086066,1.040171
2020-06-10,-1.117354,-0.093855,-0.836269,-1.266392


In [50]:
tdf['2020-06-20':'2020-06-20']

Unnamed: 0,A,B,C,D
2020-06-20,0.694778,0.838745,0.328035,0.16195


In [51]:
tdf['C']

2017-10-01    0.598323
2017-10-02   -0.178570
2017-10-03    0.254253
2017-10-04    1.186326
2017-10-05   -0.818820
                ...   
2020-06-22    0.190462
2020-06-23   -0.189112
2020-06-24   -1.347275
2020-06-25    0.836708
2020-06-26    1.153269
Freq: D, Name: C, Length: 1000, dtype: float64

In [52]:
# 중복을 포함한 데이터프레임 생성
ts = pd.Series(np.random.randn(10),
              index = pd.DatetimeIndex(['2020-01-01', '2020-01-01', '2020-01-02', '2020-01-02', '2020-01-03', '2020-01-04',
                                       '2020-01-05', '2020-01-05', '2020-01-06', '2020-01-07']))
ts

2020-01-01    1.201521
2020-01-01   -0.237099
2020-01-02   -0.348933
2020-01-02   -2.570433
2020-01-03    0.416693
2020-01-04    0.531273
2020-01-05   -1.688731
2020-01-05   -0.995941
2020-01-06   -1.035633
2020-01-07   -1.197641
dtype: float64

In [53]:
ts.index.is_unique

False

In [54]:
ts['2020-01-01']

2020-01-01    1.201521
2020-01-01   -0.237099
dtype: float64

In [55]:
ts.groupby(level = 0).mean()

2020-01-01    0.482211
2020-01-02   -1.459683
2020-01-03    0.416693
2020-01-04    0.531273
2020-01-05   -1.342336
2020-01-06   -1.035633
2020-01-07   -1.197641
dtype: float64

In [56]:
pd.date_range('2020-01-01', '2020-07-01')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10',
               ...
               '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25',
               '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29',
               '2020-06-30', '2020-07-01'],
              dtype='datetime64[ns]', length=183, freq='D')

In [57]:
pd.date_range(start = '2020-01-01', periods = 10)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10'],
              dtype='datetime64[ns]', freq='D')

In [58]:
pd.date_range(end = '2020-07-01', periods = 10)

DatetimeIndex(['2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25',
               '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29',
               '2020-06-30', '2020-07-01'],
              dtype='datetime64[ns]', freq='D')

In [59]:
# freq = 'B', Business (영업일)만 산출
pd.date_range('2020-07-01', '2020-07-07', freq = 'B')

DatetimeIndex(['2020-07-01', '2020-07-02', '2020-07-03', '2020-07-06',
               '2020-07-07'],
              dtype='datetime64[ns]', freq='B')

- 주기와 오프셋

    - 주기

In [61]:
pd.timedelta_range(0, periods = 12, freq = 'H')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',
                '0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
                '0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
                '0 days 09:00:00', '0 days 10:00:00', '0 days 11:00:00'],
               dtype='timedelta64[ns]', freq='H')

In [63]:
pd.timedelta_range(0, periods = 60, freq = 'T')

TimedeltaIndex(['0 days 00:00:00', '0 days 00:01:00', '0 days 00:02:00',
                '0 days 00:03:00', '0 days 00:04:00', '0 days 00:05:00',
                '0 days 00:06:00', '0 days 00:07:00', '0 days 00:08:00',
                '0 days 00:09:00', '0 days 00:10:00', '0 days 00:11:00',
                '0 days 00:12:00', '0 days 00:13:00', '0 days 00:14:00',
                '0 days 00:15:00', '0 days 00:16:00', '0 days 00:17:00',
                '0 days 00:18:00', '0 days 00:19:00', '0 days 00:20:00',
                '0 days 00:21:00', '0 days 00:22:00', '0 days 00:23:00',
                '0 days 00:24:00', '0 days 00:25:00', '0 days 00:26:00',
                '0 days 00:27:00', '0 days 00:28:00', '0 days 00:29:00',
                '0 days 00:30:00', '0 days 00:31:00', '0 days 00:32:00',
                '0 days 00:33:00', '0 days 00:34:00', '0 days 00:35:00',
                '0 days 00:36:00', '0 days 00:37:00', '0 days 00:38:00',
                '0 days 00:39:00', '0 days 00:40:00

In [64]:
pd.timedelta_range(0, periods = 10, freq = '1H30T')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:30:00', '0 days 03:00:00',
                '0 days 04:30:00', '0 days 06:00:00', '0 days 07:30:00',
                '0 days 09:00:00', '0 days 10:30:00', '0 days 12:00:00',
                '0 days 13:30:00'],
               dtype='timedelta64[ns]', freq='90T')

In [66]:
pd.date_range('2020-01-01', periods = 20, freq = 'B')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-06',
               '2020-01-07', '2020-01-08', '2020-01-09', '2020-01-10',
               '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-16',
               '2020-01-17', '2020-01-20', '2020-01-21', '2020-01-22',
               '2020-01-23', '2020-01-24', '2020-01-27', '2020-01-28'],
              dtype='datetime64[ns]', freq='B')

In [65]:
pd.date_range('2020-01-01', periods = 30, freq = '2H')

DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 02:00:00',
               '2020-01-01 04:00:00', '2020-01-01 06:00:00',
               '2020-01-01 08:00:00', '2020-01-01 10:00:00',
               '2020-01-01 12:00:00', '2020-01-01 14:00:00',
               '2020-01-01 16:00:00', '2020-01-01 18:00:00',
               '2020-01-01 20:00:00', '2020-01-01 22:00:00',
               '2020-01-02 00:00:00', '2020-01-02 02:00:00',
               '2020-01-02 04:00:00', '2020-01-02 06:00:00',
               '2020-01-02 08:00:00', '2020-01-02 10:00:00',
               '2020-01-02 12:00:00', '2020-01-02 14:00:00',
               '2020-01-02 16:00:00', '2020-01-02 18:00:00',
               '2020-01-02 20:00:00', '2020-01-02 22:00:00',
               '2020-01-03 00:00:00', '2020-01-03 02:00:00',
               '2020-01-03 04:00:00', '2020-01-03 06:00:00',
               '2020-01-03 08:00:00', '2020-01-03 10:00:00'],
              dtype='datetime64[ns]', freq='2H')

In [67]:
pd.date_range('2020-01-01', periods = 30, freq = 'S')

DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 00:00:01',
               '2020-01-01 00:00:02', '2020-01-01 00:00:03',
               '2020-01-01 00:00:04', '2020-01-01 00:00:05',
               '2020-01-01 00:00:06', '2020-01-01 00:00:07',
               '2020-01-01 00:00:08', '2020-01-01 00:00:09',
               '2020-01-01 00:00:10', '2020-01-01 00:00:11',
               '2020-01-01 00:00:12', '2020-01-01 00:00:13',
               '2020-01-01 00:00:14', '2020-01-01 00:00:15',
               '2020-01-01 00:00:16', '2020-01-01 00:00:17',
               '2020-01-01 00:00:18', '2020-01-01 00:00:19',
               '2020-01-01 00:00:20', '2020-01-01 00:00:21',
               '2020-01-01 00:00:22', '2020-01-01 00:00:23',
               '2020-01-01 00:00:24', '2020-01-01 00:00:25',
               '2020-01-01 00:00:26', '2020-01-01 00:00:27',
               '2020-01-01 00:00:28', '2020-01-01 00:00:29'],
              dtype='datetime64[ns]', freq='S')

- 시프트(Shift)

In [68]:
ts = pd.Series(np.random.randn(5),
              index = pd.date_range('2020-01-01', periods = 5, freq = 'B'))
ts

2020-01-01   -0.198722
2020-01-02   -0.189387
2020-01-03    0.941405
2020-01-06   -1.922176
2020-01-07   -0.315469
Freq: B, dtype: float64

In [69]:
ts.shift(1)

2020-01-01         NaN
2020-01-02   -0.198722
2020-01-03   -0.189387
2020-01-06    0.941405
2020-01-07   -1.922176
Freq: B, dtype: float64

In [70]:
ts.shift(3)

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-06   -0.198722
2020-01-07   -0.189387
Freq: B, dtype: float64

In [71]:
ts.shift(-1)

2020-01-01   -0.189387
2020-01-02    0.941405
2020-01-03   -1.922176
2020-01-06   -0.315469
2020-01-07         NaN
Freq: B, dtype: float64

In [72]:
ts.shift(3, freq = 'B')

2020-01-06   -0.198722
2020-01-07   -0.189387
2020-01-08    0.941405
2020-01-09   -1.922176
2020-01-10   -0.315469
Freq: B, dtype: float64

In [73]:
ts.shift(2, freq = 'W')

2020-01-12   -0.198722
2020-01-12   -0.189387
2020-01-12    0.941405
2020-01-19   -1.922176
2020-01-19   -0.315469
dtype: float64

- 시간대 처리
    * 국제표준시(Coordinated Universal Time, UTC)를 기준으로 떨어진 거리만큼 오프셋으로 시간대처리
    * 전 세계의 시간대 정보를 모아놓은 올슨 데이터베이스를 활용한 라이브러리인 pytz 사용

In [74]:
import pytz
pytz.common_timezones

['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers', 'Africa/Asmara', 'Africa/Bamako', 'Africa/Bangui', 'Africa/Banjul', 'Africa/Bissau', 'Africa/Blantyre', 'Africa/Brazzaville', 'Africa/Bujumbura', 'Africa/Cairo', 'Africa/Casablanca', 'Africa/Ceuta', 'Africa/Conakry', 'Africa/Dakar', 'Africa/Dar_es_Salaam', 'Africa/Djibouti', 'Africa/Douala', 'Africa/El_Aaiun', 'Africa/Freetown', 'Africa/Gaborone', 'Africa/Harare', 'Africa/Johannesburg', 'Africa/Juba', 'Africa/Kampala', 'Africa/Khartoum', 'Africa/Kigali', 'Africa/Kinshasa', 'Africa/Lagos', 'Africa/Libreville', 'Africa/Lome', 'Africa/Luanda', 'Africa/Lubumbashi', 'Africa/Lusaka', 'Africa/Malabo', 'Africa/Maputo', 'Africa/Maseru', 'Africa/Mbabane', 'Africa/Mogadishu', 'Africa/Monrovia', 'Africa/Nairobi', 'Africa/Ndjamena', 'Africa/Niamey', 'Africa/Nouakchott', 'Africa/Ouagadougou', 'Africa/Porto-Novo', 'Africa/Sao_Tome', 'Africa/Tripoli', 'Africa/Tunis', 'Africa/Windhoek', 'America/Adak', 'America/Anchorage', 'Amer

In [75]:
tz = pytz.timezone('Asia/Seoul')

In [77]:
dinx = pd.date_range('2020-01-01 09:00', periods = 7, freq = 'B')
ts = pd.Series(np.random.randn(len(dinx)),
                              index = dinx)
ts

2020-01-01 09:00:00   -1.443452
2020-01-02 09:00:00   -1.553388
2020-01-03 09:00:00   -0.053615
2020-01-06 09:00:00    0.268996
2020-01-07 09:00:00   -1.156622
2020-01-08 09:00:00    0.580169
2020-01-09 09:00:00   -0.595840
Freq: B, dtype: float64

In [78]:
pd.date_range('2020-01-01 09:00', periods = 7, freq = 'B', tz = 'UTC')

DatetimeIndex(['2020-01-01 09:00:00+00:00', '2020-01-02 09:00:00+00:00',
               '2020-01-03 09:00:00+00:00', '2020-01-06 09:00:00+00:00',
               '2020-01-07 09:00:00+00:00', '2020-01-08 09:00:00+00:00',
               '2020-01-09 09:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='B')

In [79]:
ts_utc = ts.tz_localize('UTC')
ts_utc

2020-01-01 09:00:00+00:00   -1.443452
2020-01-02 09:00:00+00:00   -1.553388
2020-01-03 09:00:00+00:00   -0.053615
2020-01-06 09:00:00+00:00    0.268996
2020-01-07 09:00:00+00:00   -1.156622
2020-01-08 09:00:00+00:00    0.580169
2020-01-09 09:00:00+00:00   -0.595840
Freq: B, dtype: float64

In [80]:
ts_utc.index

DatetimeIndex(['2020-01-01 09:00:00+00:00', '2020-01-02 09:00:00+00:00',
               '2020-01-03 09:00:00+00:00', '2020-01-06 09:00:00+00:00',
               '2020-01-07 09:00:00+00:00', '2020-01-08 09:00:00+00:00',
               '2020-01-09 09:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='B')

In [81]:
ts_utc.tz_convert('Asia/Seoul')

2020-01-01 18:00:00+09:00   -1.443452
2020-01-02 18:00:00+09:00   -1.553388
2020-01-03 18:00:00+09:00   -0.053615
2020-01-06 18:00:00+09:00    0.268996
2020-01-07 18:00:00+09:00   -1.156622
2020-01-08 18:00:00+09:00    0.580169
2020-01-09 18:00:00+09:00   -0.595840
Freq: B, dtype: float64

In [82]:
ts_seoul = ts.tz_localize('Asia/Seoul')
ts_seoul

2020-01-01 09:00:00+09:00   -1.443452
2020-01-02 09:00:00+09:00   -1.553388
2020-01-03 09:00:00+09:00   -0.053615
2020-01-06 09:00:00+09:00    0.268996
2020-01-07 09:00:00+09:00   -1.156622
2020-01-08 09:00:00+09:00    0.580169
2020-01-09 09:00:00+09:00   -0.595840
dtype: float64

In [84]:
ts_seoul.tz_convert('UTC')

2020-01-01 00:00:00+00:00   -1.443452
2020-01-02 00:00:00+00:00   -1.553388
2020-01-03 00:00:00+00:00   -0.053615
2020-01-06 00:00:00+00:00    0.268996
2020-01-07 00:00:00+00:00   -1.156622
2020-01-08 00:00:00+00:00    0.580169
2020-01-09 00:00:00+00:00   -0.595840
dtype: float64

In [85]:
ts_seoul.tz_convert('Europe/Berlin')

2020-01-01 01:00:00+01:00   -1.443452
2020-01-02 01:00:00+01:00   -1.553388
2020-01-03 01:00:00+01:00   -0.053615
2020-01-06 01:00:00+01:00    0.268996
2020-01-07 01:00:00+01:00   -1.156622
2020-01-08 01:00:00+01:00    0.580169
2020-01-09 01:00:00+01:00   -0.595840
dtype: float64

In [86]:
ts.index.tz_localize('America/New_York')

DatetimeIndex(['2020-01-01 09:00:00-05:00', '2020-01-02 09:00:00-05:00',
               '2020-01-03 09:00:00-05:00', '2020-01-06 09:00:00-05:00',
               '2020-01-07 09:00:00-05:00', '2020-01-08 09:00:00-05:00',
               '2020-01-09 09:00:00-05:00'],
              dtype='datetime64[ns, America/New_York]', freq=None)

In [88]:
stamp = pd.Timestamp('2020-01-01 12:00')
stamp_utc = stamp.tz_localize('UTC')
stamp_utc

Timestamp('2020-01-01 12:00:00+0000', tz='UTC')

In [89]:
stamp_utc.value

1577880000000000000

In [91]:
stamp_utc.tz_convert('Asia/Seoul')

Timestamp('2020-01-01 21:00:00+0900', tz='Asia/Seoul')

In [92]:
stamp_utc.tz_convert('Asia/Seoul').value

1577880000000000000

In [93]:
stamp_ny = pd.Timestamp('2020-01-01 12:00', tz = 'America/New_York')
stamp_ny

Timestamp('2020-01-01 12:00:00-0500', tz='America/New_York')

In [95]:
stamp_ny.value

1577898000000000000

In [96]:
stamp_utc.tz_convert('Asia/Shanghai')

Timestamp('2020-01-01 20:00:00+0800', tz='Asia/Shanghai')

In [97]:
stamp = pd.Timestamp('2020-01-01 12:00', tz = 'Asia/Seoul')
stamp

Timestamp('2020-01-01 12:00:00+0900', tz='Asia/Seoul')

In [98]:
from pandas.tseries.offsets import Hour
stamp + Hour()

Timestamp('2020-01-01 13:00:00+0900', tz='Asia/Seoul')

In [99]:
stamp + 3 * Hour()

Timestamp('2020-01-01 15:00:00+0900', tz='Asia/Seoul')

In [100]:
ts_utc

2020-01-01 09:00:00+00:00   -1.443452
2020-01-02 09:00:00+00:00   -1.553388
2020-01-03 09:00:00+00:00   -0.053615
2020-01-06 09:00:00+00:00    0.268996
2020-01-07 09:00:00+00:00   -1.156622
2020-01-08 09:00:00+00:00    0.580169
2020-01-09 09:00:00+00:00   -0.595840
Freq: B, dtype: float64

In [102]:
ts1 = ts_utc[:5].tz_convert('Europe/Berlin')
ts2 = ts_utc[2:].tz_convert('America/New_York')
ts = ts1 + ts2
ts

2020-01-01 09:00:00+00:00         NaN
2020-01-02 09:00:00+00:00         NaN
2020-01-03 09:00:00+00:00   -0.107231
2020-01-06 09:00:00+00:00    0.537992
2020-01-07 09:00:00+00:00   -2.313244
2020-01-08 09:00:00+00:00         NaN
2020-01-09 09:00:00+00:00         NaN
Freq: B, dtype: float64

In [103]:
ts.index

DatetimeIndex(['2020-01-01 09:00:00+00:00', '2020-01-02 09:00:00+00:00',
               '2020-01-03 09:00:00+00:00', '2020-01-06 09:00:00+00:00',
               '2020-01-07 09:00:00+00:00', '2020-01-08 09:00:00+00:00',
               '2020-01-09 09:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='B')

- 기간과 기간연산

In [105]:
p = pd.Period(2020, freq = 'A-JAN')
p

Period('2020', 'A-JAN')

In [106]:
p + 2

Period('2022', 'A-JAN')

In [107]:
p - 3

Period('2017', 'A-JAN')

In [108]:
p1 = pd.Period(2010, freq = 'A-JAN')
p2 = pd.Period(2020, freq = 'A-JAN')
p2 - p1

<10 * YearEnds: month=1>

In [110]:
pr = pd.period_range('2020-01-01', '2020-06-30', freq = 'M')
pr

PeriodIndex(['2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06'], dtype='period[M]', freq='M')

In [111]:
pd.Series(np.random.randn(6), index = pr)

2020-01   -0.337020
2020-02   -0.013285
2020-03   -0.709514
2020-04   -1.635426
2020-05   -1.380349
2020-06    0.259642
Freq: M, dtype: float64

In [112]:
pidx = pd.PeriodIndex(['2020-1', '2020-2', '2020-4'], freq = 'M')
pidx

PeriodIndex(['2020-01', '2020-02', '2020-04'], dtype='period[M]', freq='M')

In [113]:
p = pd.Period('2020', freq = 'A-FEB')
p

Period('2020', 'A-FEB')

In [114]:
p.asfreq('M', how = 'start')

Period('2019-03', 'M')

In [115]:
p.asfreq('M', how = 'end')

Period('2020-02', 'M')

In [116]:
p = pd.Period('2020', freq = 'A-OCT')
p

Period('2020', 'A-OCT')

In [117]:
p.asfreq('M', how = 'start')

Period('2019-11', 'M')

In [118]:
p.asfreq('M', how = 'end')

Period('2020-10', 'M')

In [120]:
pr = pd.period_range('2010', '2020', freq = 'A-JAN')
ts = pd.Series(np.random.randn(len(pr)), index = pr)
ts

2010    0.402038
2011    0.830841
2012   -0.033734
2013   -0.417865
2014   -0.749306
2015    2.955912
2016   -0.644602
2017    1.291225
2018   -0.295806
2019    0.451721
2020   -0.612757
Freq: A-JAN, dtype: float64

In [121]:
ts.asfreq('M', how = 'start')

2009-02    0.402038
2010-02    0.830841
2011-02   -0.033734
2012-02   -0.417865
2013-02   -0.749306
2014-02    2.955912
2015-02   -0.644602
2016-02    1.291225
2017-02   -0.295806
2018-02    0.451721
2019-02   -0.612757
Freq: M, dtype: float64

In [123]:
# 영업일 마지막 일 출력
ts.asfreq('B', how = 'end')

2010-01-29    0.402038
2011-01-31    0.830841
2012-01-31   -0.033734
2013-01-31   -0.417865
2014-01-31   -0.749306
2015-01-30    2.955912
2016-01-29   -0.644602
2017-01-31    1.291225
2018-01-31   -0.295806
2019-01-31    0.451721
2020-01-31   -0.612757
Freq: B, dtype: float64

In [124]:
p = pd.Period('2020Q2', freq = 'Q-JAN')
p

Period('2020Q2', 'Q-JAN')

In [125]:
p.asfreq('D', 'start')

Period('2019-05-01', 'D')

In [126]:
p.asfreq('D', 'end')

Period('2019-07-31', 'D')

In [127]:
pr = pd.period_range('2019Q3', '2020Q3', freq = 'Q-JAN')
ts = pd.Series(np.arange(len(pr)), index = pr)
ts

2019Q3    0
2019Q4    1
2020Q1    2
2020Q2    3
2020Q3    4
Freq: Q-JAN, dtype: int32

In [129]:
pr = pd.date_range('2020-01-01', periods = 5, freq = 'Q-JAN')
ts = pd.Series(np.random.randn(5), index = pr)
ts

2020-01-31   -0.910114
2020-04-30    0.301654
2020-07-31    2.892785
2020-10-31    1.331901
2021-01-31    0.523367
Freq: Q-JAN, dtype: float64

In [130]:
ts.to_period()

2020Q4   -0.910114
2021Q1    0.301654
2021Q2    2.892785
2021Q3    1.331901
2021Q4    0.523367
Freq: Q-JAN, dtype: float64

In [131]:
pr = pd.date_range('2020-01-01', periods = 5, freq = 'D')
ts = pd.Series(np.random.randn(5), index = pr)
ts

2020-01-01    1.028630
2020-01-02   -0.455313
2020-01-03    1.284885
2020-01-04    0.720725
2020-01-05   -1.309146
Freq: D, dtype: float64

In [132]:
p = ts.to_period('M')
p

2020-01    1.028630
2020-01   -0.455313
2020-01    1.284885
2020-01    0.720725
2020-01   -1.309146
Freq: M, dtype: float64

In [133]:
p.to_timestamp(how = 'start')

2020-01-01    1.028630
2020-01-01   -0.455313
2020-01-01    1.284885
2020-01-01    0.720725
2020-01-01   -1.309146
dtype: float64

- 리샘플링(Resampling)
    - 리샘플링(Resmapling) : 시계열의 빈도 변환<br>
    - 다운샘플링(Down Sampling) : 상위 빈도 데이터를 하위 빈도 데이터로 집계<br>
    - 업샘플링(Up Sampling) : 하위 빈도 데이터를 상위 빈도 데이터로 집계<br>
    - resample 메소드

In [137]:
dr = pd.date_range('2020-01-01', periods = 200, freq = 'D')
ts = pd.Series(np.random.randn(len(dr)), index = dr)
ts

2020-01-01    0.405835
2020-01-02    1.663908
2020-01-03   -0.015920
2020-01-04   -2.276931
2020-01-05   -0.657906
                ...   
2020-07-14    1.123832
2020-07-15   -1.538537
2020-07-16    0.598701
2020-07-17   -0.382718
2020-07-18   -0.056492
Freq: D, Length: 200, dtype: float64

In [138]:
ts.resample('M').mean()

2020-01-31   -0.082638
2020-02-29    0.013033
2020-03-31    0.107519
2020-04-30   -0.126931
2020-05-31   -0.035675
2020-06-30    0.044013
2020-07-31   -0.044374
Freq: M, dtype: float64

In [139]:
ts.resample('M', kind = 'period').mean()

2020-01   -0.082638
2020-02    0.013033
2020-03    0.107519
2020-04   -0.126931
2020-05   -0.035675
2020-06    0.044013
2020-07   -0.044374
Freq: M, dtype: float64

In [140]:
dr = pd.date_range('2020-01-01', periods = 10, freq = 'T')
ts = pd.Series(np.arange(10), index = dr)
ts

2020-01-01 00:00:00    0
2020-01-01 00:01:00    1
2020-01-01 00:02:00    2
2020-01-01 00:03:00    3
2020-01-01 00:04:00    4
2020-01-01 00:05:00    5
2020-01-01 00:06:00    6
2020-01-01 00:07:00    7
2020-01-01 00:08:00    8
2020-01-01 00:09:00    9
Freq: T, dtype: int32

In [141]:
ts.resample('2T', closed = 'left').sum()

2020-01-01 00:00:00     1
2020-01-01 00:02:00     5
2020-01-01 00:04:00     9
2020-01-01 00:06:00    13
2020-01-01 00:08:00    17
Freq: 2T, dtype: int32

In [142]:
ts.resample('2T', closed = 'right').sum()

2019-12-31 23:58:00     0
2020-01-01 00:00:00     3
2020-01-01 00:02:00     7
2020-01-01 00:04:00    11
2020-01-01 00:06:00    15
2020-01-01 00:08:00     9
Freq: 2T, dtype: int32

In [143]:
ts.resample('2T', closed = 'right', label = 'right').sum()

2020-01-01 00:00:00     0
2020-01-01 00:02:00     3
2020-01-01 00:04:00     7
2020-01-01 00:06:00    11
2020-01-01 00:08:00    15
2020-01-01 00:10:00     9
Freq: 2T, dtype: int32

In [144]:
ts.resample('2T', closed = 'right', label = 'right', loffset = '-1s').sum()


>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  ts.resample('2T', closed = 'right', label = 'right', loffset = '-1s').sum()


2019-12-31 23:59:59     0
2020-01-01 00:01:59     3
2020-01-01 00:03:59     7
2020-01-01 00:05:59    11
2020-01-01 00:07:59    15
2020-01-01 00:09:59     9
Freq: 2T, dtype: int32

In [145]:
ts.resample('2T').ohlc()

Unnamed: 0,open,high,low,close
2020-01-01 00:00:00,0,1,0,1
2020-01-01 00:02:00,2,3,2,3
2020-01-01 00:04:00,4,5,4,5
2020-01-01 00:06:00,6,7,6,7
2020-01-01 00:08:00,8,9,8,9


In [146]:
df = pd.DataFrame(np.random.randn(10, 4),
                 index = pd.date_range('2019-10-01', periods = 10, freq = 'M'),
                 columns = (['C1', 'C2', 'C3', 'C4']))
df

Unnamed: 0,C1,C2,C3,C4
2019-10-31,-0.648955,1.739272,1.004685,1.02141
2019-11-30,1.421717,-1.228527,0.597574,-0.045161
2019-12-31,-0.273312,0.584845,-0.579013,-0.470661
2020-01-31,-0.040667,2.605468,-0.777648,0.46074
2020-02-29,-0.019076,0.475923,-1.12544,-0.77298
2020-03-31,1.112657,0.996717,0.390377,-0.729871
2020-04-30,-0.795512,1.612379,0.227131,-0.372255
2020-05-31,-0.097638,0.107969,-0.664631,-0.682801
2020-06-30,0.317354,0.562318,-0.161104,-1.11396
2020-07-31,-0.560826,-0.055732,-0.659389,0.141019


In [147]:
df.resample('Y').asfreq()

Unnamed: 0,C1,C2,C3,C4
2019-12-31,-0.273312,0.584845,-0.579013,-0.470661
2020-12-31,,,,


In [148]:
df.resample('W-FRI').asfreq()

Unnamed: 0,C1,C2,C3,C4
2019-11-01,,,,
2019-11-08,,,,
2019-11-15,,,,
2019-11-22,,,,
2019-11-29,,,,
2019-12-06,,,,
2019-12-13,,,,
2019-12-20,,,,
2019-12-27,,,,
2020-01-03,,,,


In [149]:
df.resample('H').asfreq()

Unnamed: 0,C1,C2,C3,C4
2019-10-31 00:00:00,-0.648955,1.739272,1.004685,1.021410
2019-10-31 01:00:00,,,,
2019-10-31 02:00:00,,,,
2019-10-31 03:00:00,,,,
2019-10-31 04:00:00,,,,
...,...,...,...,...
2020-07-30 20:00:00,,,,
2020-07-30 21:00:00,,,,
2020-07-30 22:00:00,,,,
2020-07-30 23:00:00,,,,


> 인용구 안에
>> **마크다운 문법을 활용 할 수도 있고,** 이렇게 $y=3^x$ 수학기호를 쓸 수도 있다.<br>
>> 허허 신기하구만<br><br>
>> 'br' 태그로 줄바꿈 가능하다
>>> 이렇게해도 되나?<br><br><br>
>> 'br' 태그 두번쓰면 두 줄 떨어짐

Fomula
$$\sqrt[3]{x^3+y^3 \over 2}$$