## User data preprocessing
Read the user CSV, convert it to a numeric matrix, and cache it as a NumPy `.npy` file.
Support random batch sampling of user rows.


In [1]:
# -*- coding: UTF-8 -*-
# Basic imports
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd
import pickle
import os
import math
import matplotlib.pyplot as plt
import numpy as np
import random
from __future__ import division

# Use the ggplot look for figures and render them inline in the notebook
# (the second line is an IPython magic, not plain Python).
plt.style.use('ggplot')
%matplotlib inline

# Locations of the raw JData CSV files, relative to the working directory.
action_1_path = "./data/JData_Action_201602.csv"
action_2_path = "./data/JData_Action_201603.csv"
action_3_path = "./data/JData_Action_201604.csv"
comment_path = "./data/JData_Comment.csv"
product_path = "./data/JData_Product.csv"
user_path = "./data/JData_User.csv"

# Reference date: UserSampler.load_csv expresses user_reg_tm as seconds
# relative to test_start.
test_start = '2016-02-01'
# test_end is not referenced in the visible part of this notebook.
test_end = '2016-04-15'

In [21]:
def convert_age(age_str):
    """Translate a raw age-bracket label into an integer code.

    The position of a label in the table below is its code (0 through 6).
    Any value that equals none of the labels yields -1.

    :param age_str: raw value of the ``age`` column.
    :return: integer code in ``0..6``, or ``-1`` for an unrecognised value.
    """
    # Ordered so that index == code.
    bracket_labels = (
        u'-1',        # 0: the '-1' marker found in the raw data
        u'15岁以下',   # 1: 15 and under
        u'16-25岁',    # 2: 16-25
        u'26-35岁',    # 3: 26-35
        u'36-45岁',    # 4: 36-45
        u'46-55岁',    # 5: 46-55
        u'56岁以上',   # 6: 56 and over
    )
    for code, label in enumerate(bracket_labels):
        if age_str == label:
            return code
    return -1
        
class UserSampler(object):
    """Hold the user table as a NumPy matrix and hand out random batches of rows.

    The table is loaded either from the raw user CSV (converted and cached as
    a ``.npy`` file) or from a previously cached ``.npy`` file.  A shuffled
    row order is drawn once at construction; ``get_user_batch`` walks through
    that order.

    Relies on the module-level names ``convert_age`` and ``test_start``.
    """

    # Column name -> column index in ``user_data``.
    columns = {'user_id': 0, 'age': 1, 'sex': 2, 'user_lv_cd': 3, 'user_reg_tm': 4}

    def __init__(self, file):
        """Load the user table from ``file`` and shuffle the sampling order.

        :param file: path to the raw CSV (name ending in ``csv``) or to a
            saved NumPy array (anything else).
        """
        if file.endswith('csv'):
            self.load_csv(file)
        else:
            self.load_save(file)
        # Row indices in random order; sequence_index marks how far
        # get_user_batch has consumed them.
        self.sequence = np.arange(len(self.user_data))
        random.shuffle(self.sequence)
        self.sequence_index = 0

    def __getitem__(self, key):
        """Return the whole column named ``key`` (raises KeyError if unknown)."""
        col = self.columns[key]
        return self.user_data[:, col]

    def get_column(self, key):
        """Return the integer column index for the column named ``key``."""
        return self.columns[key]

    def load_csv(self, file):
        """Read the raw user CSV, convert every column to numbers and cache it.

        Conversions applied:
          * ``age``: bracket label -> integer code via ``convert_age``.
          * ``sex``: cast to int, missing values become 0.
          * ``user_reg_tm``: integer seconds relative to ``test_start``
            (negative for earlier registrations), missing values become 0.

        The resulting matrix is stored in ``self.user_data`` and written with
        ``np.save`` to ``data/user_simple`` (NumPy appends ``.npy``).

        :param file: path to the GBK-encoded user CSV.
        """
        self.user_data = pd.read_csv(file, encoding='gbk')
        self.user_data['age'] = self.user_data['age'].map(convert_age)
        self.user_data['sex'] = self.user_data['sex'].map(lambda x: int(x) if not pd.isnull(x) else 0)
        # No filtering on registration date: users registered after
        # test_start are kept and simply get a positive offset.
        self.user_data['user_reg_tm'] = pd.to_datetime(self.user_data['user_reg_tm']) - pd.to_datetime(test_start)
        self.user_data['user_reg_tm'] = self.user_data['user_reg_tm'].map(lambda x: int(x.total_seconds()) if not pd.isnull(x) else 0)
        # DataFrame.as_matrix() was deprecated and later removed from pandas;
        # .values returns the same ndarray on old and new versions.
        self.user_data = self.user_data.values
        np.save('data/user_simple', self.user_data)

    def load_save(self, file):
        """Load a matrix previously written by ``load_csv`` from ``file``."""
        self.user_data = np.load(file)

    def get_user_batch(self, size=300):
        """Return the next ``size`` rows in the shuffled order.

        The cursor always advances by ``size``.  Once the shuffled order is
        exhausted the slice is shorter than ``size`` (possibly empty); the
        order is not reshuffled or restarted.

        :param size: number of rows requested.
        :return: 2-D array holding the selected rows with all columns.
        """
        idx = self.sequence[self.sequence_index:(self.sequence_index + size)]
        self.sequence_index += size
        return self.user_data[idx, :]

### Unit test for UserSampler
To do

In [23]:
# Smoke check: build a sampler from the raw user CSV, draw one random batch
# of 20 rows and print its first column (user_id, index 0).
us = UserSampler(user_path)
batch = us.get_user_batch(20)
print batch[:, 0]

[275039 207905 235719 289370 236788 290811 262928 222416 281960 286678
 206947 274980 226392 270438 219119 204070 262435 296956 215513 289659]


In [4]:
# Display the most recently sampled batch.
# NOTE(review): the saved output below shows un-converted dates and carries
# execution count 4, so it appears to predate the current UserSampler --
# re-run this cell to refresh it.
batch

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm
87591,287592,3,2,4,2015-08-18
62291,262292,3,0,5,2009-12-11
28816,228817,3,0,5,2014-02-27
54232,254233,2,2,3,2014-09-11
53427,253428,3,0,5,2014-08-28
9385,209386,4,2,5,2013-06-14
29093,229094,4,0,5,2011-06-15
28536,228537,3,0,5,2011-06-05
48819,248820,0,0,3,2014-06-24
44962,244963,2,0,4,2014-05-31


### Action data
Convert each action time to integer seconds elapsed since the start date (2016-02-01).

Drop `model_id`; it is too complicated to use here.

Generate action impulses for sampled users.

Impulses use an exponential kernel: each past action at time `t` contributes `exp((till_time - t) * w)`.

Each kernel holds six weights `w`, one per action type (1-6).

In [24]:
# Simplify action data: drop model_id, turn timestamps into integer seconds
# since 2016-02-01 and cache the result as a NumPy matrix.
# NOTE: pickle.load can execute arbitrary code; only load a data/all_action
# file that was produced locally.
with open('data/all_action', 'rb') as f:  # pickles are binary data
    all_action = pickle.load(f)
all_action = all_action.drop('model_id', axis=1)
# `datetime` in this notebook is the class (from datetime import datetime),
# so datetime.date(2016, 2, 1) is not a date constructor and raises
# TypeError; use a pandas Timestamp for the same reference date instead.
all_action['time'] = pd.to_datetime(all_action['time']) - pd.Timestamp('2016-02-01')
all_action['time'] = all_action['time'].map(lambda x: int(x.total_seconds()))
all_action['user_id'] = all_action['user_id'].map(int)
# DataFrame.as_matrix() was removed from pandas; .values is equivalent.
all_action = all_action.values
# np.save appends '.npy', producing data/action_simple.npy.
np.save('data/action_simple', all_action)
# Free the large matrix; later cells reload it from disk.
del all_action

In [50]:
class ActionSampler(object):
    """Query the simplified action matrix for one user at a time.

    The matrix is loaded from a ``.npy`` file; its columns are addressed by
    name through ``columns``.  Registered kernels (one weight per action
    type) are used to summarise a user's past actions as decayed impulses.
    """

    # Number of distinct action types; also the required kernel length.
    num_action = 6
    # Column name -> column index in ``all_action``.
    columns = {'user_id': 0, 'sku_id': 1, 'time': 2, 'type': 3, 'cate': 4, 'brand': 5}
    # Codes stored in the 'type' column.
    TYPE_VIEW = 1
    TYPE_CART = 2
    TYPE_DECART = 3
    TYPE_BUY = 4
    TYPE_FAVOR = 5
    TYPE_CLICK = 6

    def __init__(self, action_npy):
        """Load the action matrix from ``action_npy``; start with no kernels."""
        self.kernel_list = []
        self.all_action = np.load(action_npy)

    def __getitem__(self, key):
        """Return the whole column named ``key``."""
        return self.all_action[:, self.columns[key]]

    def add_kernel(self, weights):
        """Register a kernel: a sequence with one weight per action type."""
        assert self.num_action == len(weights), "Each action type requires a weight"
        self.kernel_list.append(weights)

    def get_kernel(self, kernel_id):
        """Return the kernel registered at position ``kernel_id``."""
        return self.kernel_list[kernel_id]

    def get_column(self, column_name):
        """Return the integer column index for ``column_name``."""
        return self.columns[column_name]

    def get_impulse(self, user_id, till_time, kernel_id=0):
        """Summarise a user's actions before ``till_time`` per category and type.

        For every category the user has any action in, and every action type
        ``k`` in ``1..num_action``, the value is the sum over that user's
        actions of type ``k`` with ``time < till_time`` of
        ``exp((till_time - time) * kernel[k - 1])``.

        :param user_id: value matched against the ``user_id`` column.
        :param till_time: exclusive upper bound on action time.
        :param kernel_id: index of the registered kernel to use.
        :return: float array with one row per category: column 0 is the
            category value, columns ``1..num_action`` are the impulses.
        """
        user_col = self.get_column('user_id')
        user_rows = self.all_action[self.all_action[:, user_col] == user_id, :]
        weights = self.get_kernel(kernel_id)

        cate_col = self.get_column('cate')
        type_col = self.get_column('type')
        time_col = self.get_column('time')

        cate_values = np.unique(user_rows[:, cate_col])
        result = np.zeros([len(cate_values), self.num_action + 1])
        for row, cate in enumerate(cate_values):
            result[row, 0] = cate
            in_cate = user_rows[user_rows[:, cate_col] == cate, :]
            # Slot k of the output row belongs to action type k.
            for slot in range(1, self.num_action + 1):
                stamps = in_cate[in_cate[:, type_col] == slot, time_col]
                stamps = stamps[stamps < till_time]
                decayed = np.exp((till_time - stamps) * weights[slot - 1])
                result[row, slot] = decayed.sum()
        return result

    def get_purchase(self, user_id, start_time, end_time):
        """Return the user's TYPE_BUY rows with ``start_time < time < end_time``.

        Both bounds are exclusive.  All columns of the matching rows are
        returned, in their original order.
        """
        rows = self.all_action
        rows = rows[rows[:, self.get_column('user_id')] == user_id, :]
        rows = rows[rows[:, self.get_column('type')] == self.TYPE_BUY, :]
        time_col = self.get_column('time')
        rows = rows[rows[:, time_col] > start_time, :]
        return rows[rows[:, time_col] < end_time, :]
        
            

### Unit test for ActionSampler

To do

In [51]:
# Load the cached action matrix and register one kernel that uses the same
# weight (-1e-5) for all six action types, then print it back.
asampler = ActionSampler('data/action_simple.npy')
asampler.add_kernel([-0.00001, -0.00001, -0.00001, -0.00001, -0.00001, -0.00001])
print asampler.get_kernel(0)

[-1e-05, -1e-05, -1e-05, -1e-05, -1e-05, -1e-05]


In [52]:
# Spot-check one user: impulses from actions before t = 3e5, then purchases
# strictly between t = 3e5 and t = 4e6 (times are in the units of the
# action matrix's 'time' column).
print asampler.get_impulse(266079, 3e5)
print asampler.get_purchase(266079, 3e5, 4e6)

[[  4.           0.           0.           0.           0.           0.
    0.        ]
 [  8.          12.54077758   1.44942234   0.30012173   0.09082688   0.
   29.33336009]]
[]
