#### Author：马肖
#### E-Mail：maxiaoscut@aliyun.com
#### GitHub：https://github.com/Albertsr

#### 生成实验数据集

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
from create_time_feature import create_time_feature as ctf


# Build a synthetic sales data set: `size` timestamped transactions with
# uniformly distributed random amounts, indexed by calendar date.

# The end of the series is pinned instead of being read from
# dt.datetime.today(). With a wall-clock end the covered date range moves on
# every run, so date-specific lookups in later cells (e.g. "2018-05") end up
# outside the generated range. This value reproduces the saved outputs, whose
# first row is 2017-07-11 17:30:14.852224.
END_TIME = dt.datetime(2019, 3, 21, 12, 59, 32, 313303)
today = END_TIME  # original variable name, kept for any code still using it
size = 600

# Slightly more than one day per step, so the time of day drifts across the
# series. Equivalent to the alias string '1D 45min 13s 451521us'.
step = pd.Timedelta(days=1, minutes=45, seconds=13, microseconds=451521)
full_time = pd.date_range(end=END_TIME, freq=step, periods=size)

# Transaction amounts: uniform in [0, 1000). The seed reuses the value of
# `size`, exactly as before, so the generated numbers are unchanged.
np.random.seed(size)
consume_num = np.random.uniform(0, 1000, size)

# Date-only index: truncate every timestamp to midnight. This yields the same
# dates as formatting with strftime("%Y-%m-%d") and parsing the strings back.
consume_date = full_time.normalize()

# Assemble the frame: amounts and full timestamps as columns, dates as index.
dict_ = {"Amount": consume_num, "Time": full_time}
sales = pd.DataFrame(dict_, index=consume_date)
sales.head()

Unnamed: 0,Amount,Time
2017-07-11,32.367275,2017-07-11 17:30:14.852224
2017-07-12,542.409663,2017-07-12 18:15:28.303745
2017-07-13,802.91961,2017-07-13 19:00:41.755266
2017-07-14,55.159403,2017-07-14 19:45:55.206787
2017-07-15,382.264775,2017-07-15 20:31:08.658308


In [2]:
# Derive the calendar/time features from the "Time" column with the
# project helper create_time_feature (imported as ctf) and preview the
# last five rows of what it returns.
# NOTE(review): later cells read columns such as "Season" and "Time_Range"
# straight from `sales`, so the helper appears to add its columns to `sales`
# in place - confirm against create_time_feature. The meaning of the third
# argument (3) is not visible from this notebook.
time_features = ctf(sales, sales['Time'], 3)
time_features.tail(5)

Unnamed: 0,Amount,Is_Month_Start_End,Weekday,Is_Weekend,Week_Order,Season,Hour_of_Day,Time_Range,Day_Order
2019-03-17,167.892404,0,Sun,1,10,Spring,9,AM,76
2019-03-18,728.618652,0,Mon,0,11,Spring,10,AM,77
2019-03-19,976.788669,0,Tue,0,11,Spring,11,AM,78
2019-03-20,458.933563,0,Wed,0,11,Spring,12,PM,79
2019-03-21,421.315669,0,Thu,0,11,Spring,12,PM,80


#### 结合时间型特征、数字型特征进行分析

In [3]:
# Total transaction amount for May 2018: the partial date string "2018-05"
# selects every row of the date index that falls in that month.
may_2018_amounts = sales.loc["2018-05", "Amount"]
print(may_2018_amounts.sum())

13259.086647654212


In [4]:
# Sum and mean per 3-month bin (bins are labelled by their month-end date).
# Only the numeric columns shown in the saved output are aggregated; the frame
# also holds string columns (e.g. "Weekday", "Season", "Time_Range") for which
# sum/mean are not meaningful.
# The aggregations are named by string ("sum", "mean") rather than passed as
# NumPy callables, which recent pandas versions deprecate inside .agg().
# NOTE(review): pandas 2.2+ prefers the alias "3ME" for month-end bins; "3M"
# is kept here for compatibility with older pandas.
numeric_cols = ["Amount", "Is_Month_Start_End", "Is_Weekend"]
sales[numeric_cols].resample("3M").agg(["sum", "mean"])

Unnamed: 0_level_0,Amount,Amount,Is_Month_Start_End,Is_Month_Start_End,Is_Weekend,Is_Weekend
Unnamed: 0_level_1,sum,mean,sum,mean,sum,mean
2017-07-31,7964.874424,398.243721,3,0.15,6,0.3
2017-10-31,42624.193038,478.923517,18,0.202247,25,0.280899
2018-01-31,52071.987789,578.577642,16,0.177778,25,0.277778
2018-04-30,38017.128956,442.059639,17,0.197674,26,0.302326
2018-07-31,45222.353345,508.11633,18,0.202247,25,0.280899
2018-10-31,44577.379553,500.869433,18,0.202247,25,0.280899
2019-01-31,39211.077433,440.573904,17,0.191011,25,0.280899
2019-04-30,25348.311909,528.089831,8,0.166667,13,0.270833


In [5]:
# Cross-tabulate the amount by season (rows) and time-of-day range (columns),
# reporting both the total and the average per combination.
# The aggregations are named by string ("sum", "mean") rather than passed as
# NumPy callables, which recent pandas versions deprecate; the resulting
# column labels are still "sum" and "mean".
sales.pivot_table(
    index=["Season"],
    columns=["Time_Range"],
    values=["Amount"],
    aggfunc=["sum", "mean"],
)

Unnamed: 0_level_0,sum,sum,sum,sum,mean,mean,mean,mean
Unnamed: 0_level_1,Amount,Amount,Amount,Amount,Amount,Amount,Amount,Amount
Time_Range,AM,Mid Night,Night,PM,AM,Mid Night,Night,PM
Season,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Autumn,17030.478111,28295.77291,13636.219565,25423.063901,500.896415,533.882508,413.218775,529.647165
Spring,20941.083065,24287.649816,13111.780454,25098.884251,436.272564,495.666323,624.370498,522.893422
Summer,9287.47998,15292.584821,6982.025141,9745.677805,386.978332,546.163744,436.376571,487.28389
Winter,21182.4734,22908.261242,15040.550491,26773.321497,460.488552,498.005679,485.179048,486.787664


In [6]:
time_range = sales["Time_Range"]

# One-hot encode the time-of-day range: one indicator column per category.
# The encoded columns are not attached to `sales` here; to do so, use e.g.
#   sales.drop("Time_Range", axis=1).join(time_sub_dummies)
time_sub_dummies = pd.get_dummies(time_range)

# Number of transactions in each time-of-day range, most frequent first.
time_range.value_counts()

Mid Night    176
PM           171
AM           152
Night        101
Name: Time_Range, dtype: int64