# import包

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# 读取数据

In [2]:
# Load the sample CSV into a DataFrame; the path is relative to the
# notebook's working directory.
csv_path = './时序特征样例数据.csv'
df = pd.read_csv(csv_path)

# 按时间排序

In [3]:
# Order the rows by the 'time' column and rebuild a clean 0..n-1 index.
# The per-user shift/rolling features computed later depend on this row order.
df = (
    df
    .sort_values(by='time')
    .reset_index(drop=True)
)

In [4]:
# Inspect the time-sorted frame (the notebook renders a truncated preview).
df

Unnamed: 0,用户id,time,消费金额
0,17,2020-01-01 00:30:04,17.0
1,38,2020-01-01 01:14:13,87.0
2,36,2020-01-01 02:48:01,12.0
3,36,2020-01-01 02:50:20,4.0
4,31,2020-01-01 03:14:22,6.0
...,...,...,...
995,49,2020-01-30 21:02:24,81.0
996,55,2020-01-30 21:26:39,37.0
997,9,2020-01-30 21:43:22,36.0
998,19,2020-01-30 22:58:34,41.0


# lag特征

In [5]:
# Lag feature: for every row, the value of `val` from the same `key` group
# `step` rows earlier. Because df is sorted by time, this is the user's
# previous purchase amount; the first `step` rows of each user are NaN.
key = '用户id'      # grouping column (user id)
val = '消费金额'    # value column (amount spent)
step = 1           # how many rows back to look within each group
name = f'{key}_{val}_lag_{step}'
# GroupBy.shift is the vectorised built-in; it returns a Series aligned to
# df's index and avoids invoking a Python lambda once per group.
df[name] = df.groupby(key)[val].shift(step)

In [6]:
# Inspect the frame with the lag column appended (truncated preview).
df

Unnamed: 0,用户id,time,消费金额,用户id_消费金额_lag_1
0,17,2020-01-01 00:30:04,17.0,
1,38,2020-01-01 01:14:13,87.0,
2,36,2020-01-01 02:48:01,12.0,
3,36,2020-01-01 02:50:20,4.0,12.0
4,31,2020-01-01 03:14:22,6.0,
...,...,...,...,...
995,49,2020-01-30 21:02:24,81.0,44.0
996,55,2020-01-30 21:26:39,37.0,3.0
997,9,2020-01-30 21:43:22,36.0,28.0
998,19,2020-01-30 22:58:34,41.0,57.0


# diff特征

In [7]:
# Diff feature: difference between a user's previous amount and the current one.
# NOTE(review): the sign convention here is (previous - current), i.e. the
# opposite of pandas' `diff` (current - previous). It is kept as-is so existing
# outputs and downstream consumers are unaffected — confirm this is intended.
key = '用户id'      # grouping column (user id)
val = '消费金额'    # value column (amount spent)
step = 1           # how many rows back to compare against within each group
name = f'{key}_{val}_diff_{step}'
# Value `step` rows earlier within the same user; NaN for a user's first rows.
lag_val = df.groupby(key)[val].shift(step).values
# The current value is just the column itself; a second groupby pass with
# shift(0) is unnecessary.
origin_val = df[val].values
df[name] = lag_val - origin_val

In [8]:
# Inspect the frame with the diff column appended (truncated preview).
df

Unnamed: 0,用户id,time,消费金额,用户id_消费金额_lag_1,用户id_消费金额_diff_1
0,17,2020-01-01 00:30:04,17.0,,
1,38,2020-01-01 01:14:13,87.0,,
2,36,2020-01-01 02:48:01,12.0,,
3,36,2020-01-01 02:50:20,4.0,12.0,8.0
4,31,2020-01-01 03:14:22,6.0,,
...,...,...,...,...,...
995,49,2020-01-30 21:02:24,81.0,44.0,-37.0
996,55,2020-01-30 21:26:39,37.0,3.0,-34.0
997,9,2020-01-30 21:43:22,36.0,28.0,-8.0
998,19,2020-01-30 22:58:34,41.0,57.0,16.0


# 窗口内统计特征

In [9]:
# Rolling-window statistics: for each user, aggregate `val` over the current
# row and the preceding `window - 1` rows of that user (rows are in time order).
# Rows where a user has fewer than `window` records so far are NaN.
key = '用户id'      # grouping column (user id)
val = '消费金额'    # value column (amount spent)
window = 3         # number of consecutive per-user rows in each window
ops = ['mean', 'std', 'median', 'max', 'min']  # names of Rolling methods to apply

# The grouping does not change between statistics, so build it once.
grouped = df.groupby(key)[val]

for op in ops:
    name = f'{key}_{val}_rolling_{window}_{op}'
    # Look the aggregation up by name on the Rolling object instead of keeping
    # one hand-written branch per statistic. An unsupported name now fails
    # loudly with AttributeError rather than silently producing no column.
    df[name] = grouped.transform(
        lambda x: getattr(x.rolling(window=window), op)()
    )

In [10]:
# Inspect the frame with the rolling-statistic columns appended (truncated preview).
df

Unnamed: 0,用户id,time,消费金额,用户id_消费金额_lag_1,用户id_消费金额_diff_1,用户id_消费金额_rolling_3_mean,用户id_消费金额_rolling_3_std,用户id_消费金额_rolling_3_median,用户id_消费金额_rolling_3_max,用户id_消费金额_rolling_3_min
0,17,2020-01-01 00:30:04,17.0,,,,,,,
1,38,2020-01-01 01:14:13,87.0,,,,,,,
2,36,2020-01-01 02:48:01,12.0,,,,,,,
3,36,2020-01-01 02:50:20,4.0,12.0,8.0,,,,,
4,31,2020-01-01 03:14:22,6.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
995,49,2020-01-30 21:02:24,81.0,44.0,-37.0,64.333333,18.770544,68.0,81.0,44.0
996,55,2020-01-30 21:26:39,37.0,3.0,-34.0,27.666667,21.571586,37.0,43.0,3.0
997,9,2020-01-30 21:43:22,36.0,28.0,-8.0,45.333333,23.437861,36.0,72.0,28.0
998,19,2020-01-30 22:58:34,41.0,57.0,16.0,66.000000,30.512293,57.0,100.0,41.0
