# 采用字典结构提取特征
## 从原始数据输入到特征文件生成

# 引入三方库

In [1]:
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from datetime import datetime


# 读取所需数据

In [2]:
# Load the raw CSV tables from the local data/ directory.
# Card-level train/test tables and the merchant table.
train, test, merchant = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/merchants.csv')
)
# The two transaction logs; they are concatenated into one frame further down.
new_transaction, history_transaction = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/new_merchant_transactions.csv',
        'data/historical_transactions.csv',
    )
)


# 数据预处理

## 时间段特征工程/split/join/strptime/strftime

In [3]:
## Transaction records
# Stack the new-merchant and historical transaction logs into a single frame
# so both can be processed in one pass; the row index is rebuilt from 0.
# NOTE(review): the original note says the two sources can later be told
# apart via `month_lag >= 0` -- confirm the exact threshold against the data.
transaction = pd.concat(
    [new_transaction, history_transaction],
    axis=0,
    ignore_index=True,
)
# Drop the two source frames right away and force a collection so their
# memory can be reclaimed before the feature engineering below.
del new_transaction, history_transaction
gc.collect()
# Display the raw purchase timestamps (string column in the output below).
transaction['purchase_date']

0           2018-03-11 14:57:36
1           2018-03-19 18:53:37
2           2018-04-26 14:08:44
3           2018-03-07 09:43:21
4           2018-03-22 21:07:53
                   ...         
31075387    2017-01-20 08:52:04
31075388    2017-02-20 04:40:50
31075389    2017-12-26 18:37:51
31075390    2017-11-24 14:18:15
31075391    2017-10-26 14:09:40
Name: purchase_date, Length: 31075392, dtype: object

In [4]:
# Derive coarse time features from the raw 'YYYY-MM-DD HH:MM:SS' purchase_date
# strings: calendar month, 6-hour section of the day, and a weekend flag.
# The column is parsed once with a vectorized pd.to_datetime call instead of
# running three Python-level lambdas (one of them calling strptime) on each of
# the ~31M rows.
purchase_ts = pd.to_datetime(transaction['purchase_date'], format='%Y-%m-%d %H:%M:%S')

# Calendar month as a 'YYYY-MM' string.
transaction['purchase_month'] = purchase_ts.dt.strftime('%Y-%m')
# Hour of day floor-divided by 6:
#   0 = 00:00-05:59 (early morning), 1 = 06:00-11:59 (morning),
#   2 = 12:00-17:59 (afternoon),     3 = 18:00-23:59 (evening).
transaction['purchase_hour_section'] = (purchase_ts.dt.hour // 6).astype(int)
# weekday is 0 (Monday) .. 6 (Sunday), so // 5 yields 0 = weekday, 1 = weekend.
# Cast to int64 to match the dtype the original per-row version produced.
transaction['purchase_day'] = (purchase_ts.dt.weekday // 5).astype('int64')

# The parsed timestamps are only a temporary; release them before moving on.
del purchase_ts

transaction[['purchase_month','purchase_hour_section','purchase_day']]

Unnamed: 0,purchase_month,purchase_hour_section,purchase_day
0,2018-03,2,1
1,2018-03,3,0
2,2018-04,2,0
3,2018-03,1,0
4,2018-03,3,0
...,...,...,...
31075387,2017-01,1,0
31075388,2017-02,0,0
31075389,2017-12,3,0
31075390,2017-11,2,0


In [6]:
# Count transactions for every (hour section, weekday/weekend) combination,
# ordered by the two keys rather than by frequency.
transaction.value_counts(
    subset=['purchase_hour_section', 'purchase_day']
).sort_index()

purchase_hour_section  purchase_day
0                      0               1413293
                       1                814310
1                      0               4555719
                       1               1923398
2                      0               9651946
                       1               3658022
3                      0               6703531
                       1               2355173
Name: count, dtype: int64