In [6]:
import numpy as np
import pandas as pd

# data prep

In [7]:
# Load the raw transactions, keeping only the columns used downstream.
# The builtin `str` replaces `np.str`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# CustomerID is read as text; prices and quantities use 32-bit dtypes.
OnlRt = pd.read_csv('OnlineRetail.csv',
                   usecols = ['CustomerID','InvoiceDate','UnitPrice','Quantity','Country'],
                   encoding = 'ISO-8859-1',
                   parse_dates = ['InvoiceDate'],
                   dtype = {'CustomerID':str,'UnitPrice':np.float32,'Quantity':np.int32,'Country':str})
OnlRt.head()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = {'CustomerID':np.str,'UnitPrice':np.float32,'Quantity':np.int32,'Country':np.str})


Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


## Valid customer IDs (有效 id)

In [8]:
# IDs appearing on any row with a non-positive quantity or unit price;
# per the original note, these customers have returned goods.
neg_id = OnlRt.loc[OnlRt['Quantity'].le(0) | OnlRt['UnitPrice'].le(0), 'CustomerID']

# Keep United Kingdom rows that carry a customer ID not listed above.
# Country is constant after this filter, so the column is dropped.
data0 = OnlRt.loc[
    OnlRt['CustomerID'].notnull()
    & ~OnlRt['CustomerID'].isin(neg_id)
    & OnlRt['Country'].eq('United Kingdom')
].drop(columns='Country')

In [9]:
# Line-item revenue: unit price times quantity for every remaining row.
data1 = data0.assign(amount=lambda frame: frame['UnitPrice'] * frame['Quantity'])
data1.head()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID,amount
46,80,2010-12-01 09:00:00,2.55,13748,203.999996
142,12,2010-12-01 09:45:00,0.85,16098,10.2
143,8,2010-12-01 09:45:00,3.75,16098,30.0
144,12,2010-12-01 09:45:00,1.65,16098,19.8
145,10,2010-12-01 09:45:00,1.95,16098,19.5


## Day difference from first purchase

In [10]:
# First purchase date (calendar day) of every customer.
# The per-group minimum replaces `sort_values().groupby().nth(0)`: from
# pandas 2.0 `nth` keeps the original row index instead of the group key,
# so `reset_index()` would no longer produce a CustomerID column and the
# merge below would fail. The minimum gives the same earliest timestamp.
first_time = (
    data1.groupby('CustomerID')['InvoiceDate']
    .min()
    .dt.date                 # python `date` objects, same as the earlier x.date()
    .rename('first_time')
    .reset_index()
)
# Attach each customer's first purchase date to every one of their rows.
data2 = pd.merge(data1, first_time, how='left', on=['CustomerID'])

In [11]:
# Whole calendar days between each order and the customer's first purchase.
# Vectorised with the `.dt` accessor instead of row-wise `apply` calls:
# `normalize()` truncates the order timestamp to midnight, and
# `pd.to_datetime` lifts the stored first-purchase dates to datetime64 so
# the subtraction yields a timedelta Series whose `.dt.days` is the day count.
dayth = (data2['InvoiceDate'].dt.normalize() - pd.to_datetime(data2['first_time'])).dt.days

## order time

In [12]:
# Calendar components of every order timestamp, read through the
# vectorised `.dt` accessor rather than a Python-level `apply` per row.
# The values match the Timestamp attributes used before.
month = data2['InvoiceDate'].dt.month
weekday = data2['InvoiceDate'].dt.weekday   # Monday = 0 ... Sunday = 6
hour = data2['InvoiceDate'].dt.hour
minute = data2['InvoiceDate'].dt.minute
second = data2['InvoiceDate'].dt.second

In [13]:
# Time of day as a fractional hour: seconds are folded into minutes,
# minutes into hours, then added to the whole hour.
hour_preci = hour + (minute + second / 60) / 60

In [14]:
# Add the derived time features, drop the raw timestamp columns they were
# computed from, and order rows by customer, then day offset, then time of day.
data3 = (
    data2
    .assign(dayth=dayth, hour=hour_preci, weekday=weekday)
    .drop(columns=['first_time', 'InvoiceDate'])
    .sort_values(['CustomerID', 'dayth', 'hour'])
)
data3.head()

Unnamed: 0,Quantity,UnitPrice,CustomerID,amount,dayth,hour,weekday
3205,12,2.55,12747,30.599999,0,15.633333,6
3206,6,2.55,12747,15.3,0,15.633333,6
3207,16,1.69,12747,27.040001,0,15.633333,6
3208,24,1.65,12747,39.599999,0,15.633333,6
3209,6,5.45,12747,32.699999,0,15.633333,6


## Use each customer's first 28 days (4 weeks) of data as the training input

In [15]:
# Training input: rows whose day offset is below 28, indexed by customer.
# `amount` is removed so it is not part of the feature columns.
X = (
    data3.loc[data3['dayth'].lt(28)]
    .drop(columns='amount')
    .set_index('CustomerID')
    .sort_index()
)

In [16]:
# Target: LTV, defined as the total amount within 180 days of the first
# purchase, computed only for customers that appear in X.
data180 = data3.loc[data3['dayth'].lt(180) & data3['CustomerID'].isin(X.index)]
y = data180.groupby('CustomerID')['amount'].sum().sort_index()

In [17]:
# Persist the feature rows and the per-customer target next to the notebook.
X.to_csv(path_or_buf='bookdata_X.csv')
y.to_csv(path_or_buf='bookdata_y.csv')

In [18]:
# Preview the first five feature rows.
X.head(5)

Unnamed: 0_level_0,Quantity,UnitPrice,dayth,hour,weekday
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12747,12,2.55,0,15.633333,6
12747,6,2.55,0,15.633333,6
12747,16,1.69,0,15.633333,6
12747,24,1.65,0,15.633333,6
12747,6,5.45,0,15.633333,6


In [39]:
# NOTE(review): scratch cell. It builds an all-zero 4-D array and displays it
# in full; the meaning of the four dimensions is not stated here (confirm
# intent, and consider showing only `.shape`).
np.zeros((2500, 4, 32, 4))

array([[[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]],


       [[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         ...,
         [0.,