## 7.1 数据str型转日期型

In [9]:
# 7.1 数据str型转日期型

import numpy as np
import pandas as pd

date_strings =np.array(['10-02-2021 05:47 PM','10-02-2021 06:06 PM'])

[pd.to_datetime(date,format ='%d-%m-%Y %I:%M %p') for date in date_strings]

## 注意AM PM ， 如果是PM就不要用24小时格式了 

[Timestamp('2021-02-10 17:47:00'), Timestamp('2021-02-10 18:06:00')]

In [12]:
date_strings =np.array(['10-02-2021 05:47 PM','10-02-2021 16:06 PM'])

[pd.to_datetime(date,format ='%d-%m-%Y %I:%M %p',errors = "coerce") for date in date_strings]
## 添加errors = "coerce" 这个函数之后 就算数据格式和设定格式不符也不会报错，会返回NaT

[Timestamp('2021-02-10 17:47:00'), NaT]

小结 ： %Y 年 ， %m 月 ， %d 日 ， %I 小时（12） ，%p 上下午 ， %M 分钟 ， %S 秒

# 7.2 处理时区

In [13]:
import pandas as pd 

pd.Timestamp('2021-2-10 06:00:00',tz='Europe/London') # tz=time zone 通过tz控制时区

Timestamp('2021-02-10 06:00:00+0000', tz='Europe/London')

In [14]:
date  = pd.Timestamp('2021-2-10 18:20:00')

In [18]:
date_in_london = date.tz_localize('Europe/London')
date_in_london

Timestamp('2021-02-10 18:20:00+0000', tz='Europe/London')

In [22]:
date_in_london.tz_convert('Asia/Shanghai') # 伦敦时间转换成上海时间

Timestamp('2021-02-11 02:20:00+0800', tz='Asia/Shanghai')

In [41]:
dates  = pd.Series(pd.date_range('2/10/2021',periods=3,freq='M')) # 使用range生成一大堆时间
dates

0   2021-02-28
1   2021-03-31
2   2021-04-30
dtype: datetime64[ns]

In [42]:
dates.dt.tz_localize('Europe/Paris')
dates

0   2021-02-28
1   2021-03-31
2   2021-04-30
dtype: datetime64[ns]

In [51]:
from pytz import all_timezones # 调这个包可以看全部的时区参数
len(all_timezones)

592

# 7.3 筛选日期

In [52]:
import pandas as pd 
dataframe = pd.DataFrame()
dataframe['date'] = pd.date_range('1/1/2021',periods =10000,freq='H')  # 生成数据

In [54]:
dataframe[(dataframe['date']>'2021-01-02 02:00:00') &(dataframe['date']<='2021-02-01 02:00:00') ]  # 布尔法筛选

Unnamed: 0,date
27,2021-01-02 03:00:00
28,2021-01-02 04:00:00
29,2021-01-02 05:00:00
30,2021-01-02 06:00:00
31,2021-01-02 07:00:00
32,2021-01-02 08:00:00
33,2021-01-02 09:00:00
34,2021-01-02 10:00:00
35,2021-01-02 11:00:00
36,2021-01-02 12:00:00


In [56]:
# 还可以把时间设置成索引 然后用索引筛选
dataframe =dataframe.set_index(dataframe['date'])
dataframe.loc['2021-01-02 02:00:00':'2021-02-01 02:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2021-01-02 02:00:00,2021-01-02 02:00:00
2021-01-02 03:00:00,2021-01-02 03:00:00
2021-01-02 04:00:00,2021-01-02 04:00:00
2021-01-02 05:00:00,2021-01-02 05:00:00
2021-01-02 06:00:00,2021-01-02 06:00:00
2021-01-02 07:00:00,2021-01-02 07:00:00
2021-01-02 08:00:00,2021-01-02 08:00:00
2021-01-02 09:00:00,2021-01-02 09:00:00
2021-01-02 10:00:00,2021-01-02 10:00:00
2021-01-02 11:00:00,2021-01-02 11:00:00


# 7.4 抠出年月日

In [57]:
import pandas as pd 
dataframe = pd.DataFrame()

dataframe['date'] = pd.date_range('1/10/2021',periods= 150 ,freq ='W') # 间隔是周

In [59]:
dataframe['year']=dataframe['date'].dt.year
dataframe['month']=dataframe['date'].dt.month
dataframe['day']=dataframe['date'].dt.day
dataframe['hour']=dataframe['date'].dt.hour
dataframe['minute']=dataframe['date'].dt.minute

In [61]:
dataframe.head(3) # 看结果展示的时候可以这样从头部抽几个来看

Unnamed: 0,date,year,month,day,hour,minute
0,2021-01-10,2021,1,10,0,0
1,2021-01-17,2021,1,17,0,0
2,2021-01-24,2021,1,24,0,0


# 7.5 计算日期差

In [64]:
import pandas as pd 
dataframe  = pd.DataFrame()
dataframe['Arrived'] = [pd.Timestamp('01-01-2021'),pd.Timestamp('01-04-2021')]
dataframe['Left'] = [pd.Timestamp('01-01-2021'),pd.Timestamp('01-06-2021')]

dataframe['Left'] - dataframe['Arrived']

0   0 days
1   2 days
dtype: timedelta64[ns]

In [66]:
dataframe # 横向做差 

Unnamed: 0,Arrived,Left
0,2021-01-01,2021-01-01
1,2021-01-04,2021-01-06


讨论部分：做的事是利用时间点来算时间间隔，书里的给的例子是计算旅馆客人的居住时长。

# 7.6 日期转化成周

In [78]:
import pandas as pd 
dates = pd.Series(pd.date_range("21/10/2021",periods=4,freq="M")) 

In [79]:
dates.dt.weekday_name

0     Sunday
1    Tuesday
2     Friday
3     Monday
dtype: object

In [81]:
pd.Series(pd.date_range("21/10/2021",periods=3,freq="H"))# 确定了这个地方是2021年10月21号

0   2021-10-21 00:00:00
1   2021-10-21 01:00:00
2   2021-10-21 02:00:00
dtype: datetime64[ns]

In [80]:
dates.dt.weekday # 数值化 从0开始 0-6 对应周一到周日

0    6
1    1
2    4
3    0
dtype: int64

# 7.7 lag操作

In [82]:
import pandas as pd 
dataframe = pd.DataFrame()

In [85]:
dataframe["dates"] = pd.date_range("1/1/2021",periods = 5 ,freq ="D" )
dataframe["stock_price"] = [1.1,2.2,3.3,4.4,5.5]

dataframe["previous_days_stock_price"]  = dataframe["stock_price"].shift(1) # 整体下窜，取上一期数据
dataframe 

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2021-01-01,1.1,
1,2021-01-02,2.2,1.1
2,2021-01-03,3.3,2.2
3,2021-01-04,4.4,3.3
4,2021-01-05,5.5,4.4


#  7.8 窗函数 rolling time windows 

In [87]:
import pandas as pd 
time_index = pd.date_range("1/1/2021",periods =5 ,freq = "M")
dataframe =  pd.DataFrame(index=time_index)
dataframe["Stock_price"] = [1,2,3,4,5]
dataframe.rolling(window=2).mean() # 参数是2 ， 算的就是 当前值和上一个值的均值 

Unnamed: 0,Stock_price
2021-01-31,
2021-02-28,1.5
2021-03-31,2.5
2021-04-30,3.5
2021-05-31,4.5


In [88]:
dataframe.rolling(window=3).mean()

Unnamed: 0,Stock_price
2021-01-31,
2021-02-28,
2021-03-31,2.0
2021-04-30,3.0
2021-05-31,4.0


# 7.9 处理时间序列缺失值

总的来说三种方法 1.按缺失值前后幅度做平滑 2.取前值 2.取后值

In [98]:
import pandas as pd 
import numpy as np 
time_index = pd.date_range("1/1/2021",periods=5,freq="M")
dataframe = pd.DataFrame(index=time_index)
dataframe['Sales'] = [1.0,2.0,np.nan,np.nan,5.0]
# 方法1 
dataframe.interpolate() # 默认是线性
dataframe.interpolate(method="quadratic")  # 如果是非线性关系，可以通过控制参数来调整
dataframe.interpolate(limit=1,limit_direction="forward")# 也可以控制填补个数和方向

Unnamed: 0,Sales
2021-01-31,1.0
2021-02-28,2.0
2021-03-31,3.0
2021-04-30,
2021-05-31,5.0


In [93]:
#方法2 
dataframe.ffill()

Unnamed: 0,Sales
2021-01-31,1.0
2021-02-28,2.0
2021-03-31,2.0
2021-04-30,2.0
2021-05-31,5.0


In [94]:
# 方法3
dataframe.bfill()

Unnamed: 0,Sales
2021-01-31,1.0
2021-02-28,2.0
2021-03-31,5.0
2021-04-30,5.0
2021-05-31,5.0
