## 디폴트 install, import

In [None]:
!pip install finance-datareader
!pip install beautifulsoup4 
!pip install numpy
!pip install pandas

In [19]:
# default settings
import numpy as np
import pandas as pd
from datetime import datetime

# Make the notebook display the value of every top-level expression in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show floats with three decimals and never truncate columns when rendering frames.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Use the fully qualified key: the short pattern 'max_columns' is ambiguous on
# pandas versions that also define 'styler.render.max_columns' and raises OptionError.
pd.set_option('display.max_columns', None)

---

## 복습

In [20]:
# Load the sample data: the first CSV column becomes the index and "date" is parsed as datetimes.
df = pd.read_csv("./data/Small_and_Big.csv", index_col=0, parse_dates=["date"])
df.head()

Unnamed: 0,date,종목명,PBR(IFRS-연결),"베타 (M,5Yr)",수익률(%),시가총액 (보통)(평균)(원)
0,2000-07-31,BYC,0.21,0.479,-0.58,27786000000.0
1,2000-07-31,CJ,0.51,1.166,-9.0,1160889000000.0
2,2000-07-31,CJ ENM,6.56,,17.4,400467000000.0
3,2000-07-31,CJ대한통운,0.17,1.314,-7.96,194962000000.0
4,2000-07-31,CJ씨푸드,,0.227,32.0,1987000000.0


---

In [21]:
# Median market cap for each date, kept as a one-column frame indexed by date.
median_df = (
    df.groupby('date')['시가총액 (보통)(평균)(원)']
    .median()
    .to_frame("median_시가총액")
)
median_df.head()

Unnamed: 0_level_0,median_시가총액
date,Unnamed: 1_level_1
2000-07-31,34947000000.0
2000-08-31,33684000000.0
2000-09-30,33684000000.0
2000-10-31,30523000000.0
2000-11-30,30798000000.0


---

In [22]:
# Attach each date's median to every row: df's "date" column is matched against median_df's index.
df = df.join(median_df, on="date")

In [23]:
# Label each row by comparing its market cap with that date's median.
# Rows where either value is missing match neither mask and stay unlabelled.
is_small = df['시가총액 (보통)(평균)(원)'].lt(df['median_시가총액'])
is_big = df['시가총액 (보통)(평균)(원)'].ge(df['median_시가총액'])
df.loc[is_small, "size"] = "small"
df.loc[is_big, "size"] = "big"

In [24]:
df.head()

Unnamed: 0,date,종목명,PBR(IFRS-연결),"베타 (M,5Yr)",수익률(%),시가총액 (보통)(평균)(원),median_시가총액,size
0,2000-07-31,BYC,0.21,0.479,-0.58,27786000000.0,34947000000.0,small
1,2000-07-31,CJ,0.51,1.166,-9.0,1160889000000.0,34947000000.0,big
2,2000-07-31,CJ ENM,6.56,,17.4,400467000000.0,34947000000.0,big
3,2000-07-31,CJ대한통운,0.17,1.314,-7.96,194962000000.0,34947000000.0,big
4,2000-07-31,CJ씨푸드,,0.227,32.0,1987000000.0,34947000000.0,small


---

## 복습2

In [25]:
# Reload the sample data: the first CSV column becomes the index and "date" is parsed as datetimes.
df = pd.read_csv("./data/Small_and_Big.csv", index_col=0, parse_dates=["date"])
df.head()

Unnamed: 0,date,종목명,PBR(IFRS-연결),"베타 (M,5Yr)",수익률(%),시가총액 (보통)(평균)(원)
0,2000-07-31,BYC,0.21,0.479,-0.58,27786000000.0
1,2000-07-31,CJ,0.51,1.166,-9.0,1160889000000.0
2,2000-07-31,CJ ENM,6.56,,17.4,400467000000.0
3,2000-07-31,CJ대한통운,0.17,1.314,-7.96,194962000000.0
4,2000-07-31,CJ씨푸드,,0.227,32.0,1987000000.0


---

In [26]:
# Reduce the data size: keep only rows dated 2017-01-01 or later.
# Take an explicit copy so that later column assignments on `df` modify an
# independent frame rather than a slice of the original (avoids SettingWithCopyWarning).
df = df[df['date'] >= "2017-01-01"].copy()
df.head()

Unnamed: 0,date,종목명,PBR(IFRS-연결),"베타 (M,5Yr)",수익률(%),시가총액 (보통)(평균)(원)
207650,2017-01-31,3S,3.22,1.225,-2.15,121442000000.0
207651,2017-01-31,AJ네트웍스,0.89,,-17.85,286576000000.0
207652,2017-01-31,AJ렌터카,0.84,0.18,6.35,184235000000.0
207653,2017-01-31,AP위성,1.4,,-2.73,134105000000.0
207654,2017-01-31,BGF,2.53,-0.673,12.1,4328976000000.0


---

In [27]:
df.groupby(['date'])['수익률(%)'].mean()

date
2017-01-31   -1.083
2017-02-28    0.252
2017-03-31    1.845
2017-04-30    1.845
2017-05-31    4.130
2017-06-30    0.831
2017-07-31   -3.174
2017-08-31    0.540
2017-09-30    0.540
2017-10-31    3.632
2017-11-30    5.547
2017-12-31    5.547
2018-01-31   12.478
2018-02-28   -3.717
2018-03-31   -3.717
2018-04-30    6.085
2018-05-31    1.051
2018-06-30    1.051
Name: 수익률(%), dtype: float64

---

In [28]:
# Replace the date column with "YYYY-MM-DD" formatted strings
df['date'] = df['date'].dt.strftime("%Y-%m-%d")
df

Unnamed: 0,date,종목명,PBR(IFRS-연결),"베타 (M,5Yr)",수익률(%),시가총액 (보통)(평균)(원)
207650,2017-01-31,3S,3.220,1.225,-2.150,121442000000.000
207651,2017-01-31,AJ네트웍스,0.890,,-17.850,286576000000.000
207652,2017-01-31,AJ렌터카,0.840,0.180,6.350,184235000000.000
207653,2017-01-31,AP위성,1.400,,-2.730,134105000000.000
207654,2017-01-31,BGF,2.530,-0.673,12.100,4328976000000.000
...,...,...,...,...,...,...
235924,2018-06-30,흥국에프엔비,0.950,1.364,-1.690,64926000000.000
235925,2018-06-30,흥국화재,0.650,0.721,8.890,383175000000.000
235926,2018-06-30,흥아해운,0.920,1.240,-5.750,134473000000.000
235927,2018-06-30,희림,1.390,0.216,2.230,86250000000.000


---

## datetime

### timestamp 써보기

선언은 이렇게 할 수 있다

In [29]:
datetime(2021, 1, 1)
type(datetime(2021, 1, 1))

datetime.datetime(2021, 1, 1, 0, 0)

datetime.datetime

---

In [30]:
a = datetime(2014, 8, 1)
b = pd.Timestamp(a)
b

Timestamp('2014-08-01 00:00:00')

---

In [31]:
pd.Timestamp("2021-01-02")

Timestamp('2021-01-02 00:00:00')

---

datetime은 주로 index로 사용된다.

In [32]:
dates = [datetime(2014, 8, 1), datetime(2014, 8, 5)]
type(dates)

list

---

In [33]:
dti = pd.DatetimeIndex(dates)
dti

DatetimeIndex(['2014-08-01', '2014-08-05'], dtype='datetime64[ns]', freq=None)

---

In [34]:
pd.to_datetime(dates)

DatetimeIndex(['2014-08-01', '2014-08-05'], dtype='datetime64[ns]', freq=None)

---

In [35]:
pd.to_datetime(dates)[0]

Timestamp('2014-08-01 00:00:00')

---

Series와 함께 사용되는 예제를 보자

In [36]:
dates = [datetime(2014, 8, 1), datetime(2014, 8, 5)]
ts = pd.Series(np.random.randn(2), index=dates)
ts

2014-08-01   -0.875
2014-08-05    0.661
dtype: float64

---

In [37]:
ts.index

DatetimeIndex(['2014-08-01', '2014-08-05'], dtype='datetime64[ns]', freq=None)

---

In [38]:
ts.loc[pd.Timestamp("2014-08-01")]

-0.8751395143092521

---

In [39]:
ts.loc[datetime(2014, 8, 1)]
ts.loc["2014-08-01"]

-0.8751395143092521

-0.8751395143092521

---

주의사항

In [40]:
# A Timestamp compares equal to the matching datetime object -> True
pd.Timestamp(dates[0])  == datetime(2014, 8, 1)
pd.to_datetime(dates)[0] == datetime(2014, 8, 1)

# ...but comparing it with a plain date string gives False
pd.to_datetime(dates)[0] == "2014-08-01"

True

True

False

---

In [41]:
# Sorting by the index is supported as well
ts = ts.sort_index()
ts

2014-08-01   -0.875
2014-08-05    0.661
dtype: float64

---

In [42]:
ts.loc["2014-08-01"]

-0.8751395143092521

---

In [43]:
ts.loc["2014-08"]

2014-08-01   -0.875
2014-08-05    0.661
dtype: float64

---

In [44]:
ts.loc["2014-08-01":]

2014-08-01   -0.875
2014-08-05    0.661
dtype: float64

---

In [45]:
# Caution: unlike list slicing, a label slice includes both endpoints
ts.loc["2014-08-01":"2014-08-05"]

2014-08-01   -0.875
2014-08-05    0.661
dtype: float64

---

주기적으로 반복되는 datetime도 넣을 수 있음.

In [46]:
# Ten consecutive calendar days starting on 2014-08-01.
dates = pd.date_range(start='2014-08-01', periods=10, freq="D")
dates

DatetimeIndex(['2014-08-01', '2014-08-02', '2014-08-03', '2014-08-04',
               '2014-08-05', '2014-08-06', '2014-08-07', '2014-08-08',
               '2014-08-09', '2014-08-10'],
              dtype='datetime64[ns]', freq='D')

---

In [47]:
# Ten business days starting on 2014-08-01 (weekends are skipped).
dates = pd.date_range(start='2014-08-01', periods=10, freq="B")
dates

DatetimeIndex(['2014-08-01', '2014-08-04', '2014-08-05', '2014-08-06',
               '2014-08-07', '2014-08-08', '2014-08-11', '2014-08-12',
               '2014-08-13', '2014-08-14'],
              dtype='datetime64[ns]', freq='B')

---

In [48]:
# Every calendar day from 2014-08-01 through 2014-08-14, both ends included.
dates = pd.date_range(start='2014-08-01', end="2014-08-14", freq="D")
dates

DatetimeIndex(['2014-08-01', '2014-08-02', '2014-08-03', '2014-08-04',
               '2014-08-05', '2014-08-06', '2014-08-07', '2014-08-08',
               '2014-08-09', '2014-08-10', '2014-08-11', '2014-08-12',
               '2014-08-13', '2014-08-14'],
              dtype='datetime64[ns]', freq='D')

---

주기가 길어질 경우 아래와 같이 처리해 보자

In [49]:
period = pd.Period('2014-08', freq='Q')  # freq may also be "D", "M", etc.
period

Period('2014Q3', 'Q-DEC')

---

In [50]:
period.start_time
period.end_time

Timestamp('2014-07-01 00:00:00')

Timestamp('2014-09-30 23:59:59.999999999')

---

In [51]:
# +1 adds one unit of the period's `freq` (one quarter here)
period2 = period + 1
period2

Period('2014Q4', 'Q-DEC')

---

In [52]:
period2.start_time
period2.end_time

Timestamp('2014-10-01 00:00:00')

Timestamp('2014-12-31 23:59:59.999999999')

---

pandas는 period_range라는 함수도 지원한다.

In [53]:
# The twelve monthly periods of 2013.
p2013 = pd.period_range(start='2013-01-01', end='2013-12-31', freq='M')
p2013

PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
             '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
            dtype='period[M]')

---

In [54]:
p2013[0]

Period('2013-01', 'M')

---

In [55]:
# Print each monthly period with its frequency and its first and last instants.
for p in p2013:
    print(f"{p} {p.freq} {p.start_time} {p.end_time}")

2013-01 <MonthEnd> 2013-01-01 00:00:00 2013-01-31 23:59:59.999999999
2013-02 <MonthEnd> 2013-02-01 00:00:00 2013-02-28 23:59:59.999999999
2013-03 <MonthEnd> 2013-03-01 00:00:00 2013-03-31 23:59:59.999999999
2013-04 <MonthEnd> 2013-04-01 00:00:00 2013-04-30 23:59:59.999999999
2013-05 <MonthEnd> 2013-05-01 00:00:00 2013-05-31 23:59:59.999999999
2013-06 <MonthEnd> 2013-06-01 00:00:00 2013-06-30 23:59:59.999999999
2013-07 <MonthEnd> 2013-07-01 00:00:00 2013-07-31 23:59:59.999999999
2013-08 <MonthEnd> 2013-08-01 00:00:00 2013-08-31 23:59:59.999999999
2013-09 <MonthEnd> 2013-09-01 00:00:00 2013-09-30 23:59:59.999999999
2013-10 <MonthEnd> 2013-10-01 00:00:00 2013-10-31 23:59:59.999999999
2013-11 <MonthEnd> 2013-11-01 00:00:00 2013-11-30 23:59:59.999999999
2013-12 <MonthEnd> 2013-12-01 00:00:00 2013-12-31 23:59:59.999999999


---

In [56]:
# DatetimeIndex : collection of `Timestamp` objects
a = pd.date_range('1/1/2013', '12/31/2013', freq='M')
a
a[0]

DatetimeIndex(['2013-01-31', '2013-02-28', '2013-03-31', '2013-04-30',
               '2013-05-31', '2013-06-30', '2013-07-31', '2013-08-31',
               '2013-09-30', '2013-10-31', '2013-11-30', '2013-12-31'],
              dtype='datetime64[ns]', freq='M')

Timestamp('2013-01-31 00:00:00', freq='M')

---

In [57]:
# PeriodIndex : collections of `Period` objects
b = pd.period_range('1/1/2013', '12/31/2013', freq='M')
b
b[0]

PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
             '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
            dtype='period[M]')

Period('2013-01', 'M')

---

In [58]:
# The index entries now denote date ranges (periods) rather than specific points in time
ps = pd.Series(np.random.randn(12), p2013)
ps

2013-01   -2.778
2013-02    0.129
2013-03    1.789
2013-04    2.786
2013-05    0.194
2013-06    0.682
2013-07   -0.914
2013-08    1.490
2013-09   -0.465
2013-10    0.070
2013-11   -0.270
2013-12    0.839
Freq: M, dtype: float64

---

In [59]:
ps.loc["2013-11"]

-0.269937593460067

---

In [60]:
ps.loc["2013-11":]

2013-11   -0.270
2013-12    0.839
Freq: M, dtype: float64