## User data preprocessing
Read the raw user CSV and cache the cleaned table as a pickle.
Allow random sampling of user data in batches.


In [16]:
# -*- coding: UTF-8 -*-
# Basic imports
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd
import pickle
import os
import math
import matplotlib.pyplot as plt
import numpy as np
import random
from __future__ import division

# Use the 'ggplot' matplotlib style and the inline plotting backend for this notebook.
plt.style.use('ggplot')
%matplotlib inline

# Locations of the raw CSV inputs: three action logs plus the comment,
# product and user tables.
action_1_path = "./data/JData_Action_201602.csv"
action_2_path = "./data/JData_Action_201603.csv"
action_3_path = "./data/JData_Action_201604.csv"
comment_path = "./data/JData_Comment.csv"
product_path = "./data/JData_Product.csv"
user_path = "./data/JData_User.csv"

# Date bounds as 'YYYY-MM-DD' strings. test_start is the cutoff that
# UserSampler.load_csv compares user_reg_tm against; test_end is not
# referenced in the cells shown here.
test_start = '2016-02-01'
test_end = '2016-04-15'

In [20]:
def convert_age(age_str):
    """Translate a raw age-bracket label into a small integer code.

    The label ``u'-1'`` becomes 0, the six age brackets become 1 through 6
    in ascending order of age, and every other value becomes -1.
    """
    age_labels = (
        u'-1',        # age recorded as -1 in the raw data
        u'15岁以下',   # 15 and under
        u'16-25岁',   # 16 to 25
        u'26-35岁',   # 26 to 35
        u'36-45岁',   # 36 to 45
        u'46-55岁',   # 46 to 55
        u'56岁以上',   # 56 and over
    )
    # A label's position in the tuple is its integer code.
    for code, label in enumerate(age_labels):
        if age_str == label:
            return code
    return -1
        
class UserSampler(object):
    """Serve shuffled, non-overlapping batches of rows from the user table.

    The table is loaded either from the raw CSV (which is cleaned and then
    cached as a pickle) or from a previously written pickle. Row positions
    are shuffled once at construction time and handed out sequentially by
    ``get_user_batch``.
    """

    def __init__(self, file):
        """Load the user table from ``file`` and shuffle the sampling order.

        Args:
            file: Path to the user data. A path ending in ``csv`` is parsed
                as the raw CSV; any other path is treated as a pickle.
        """
        if file.endswith('csv'):
            self.load_csv(file)
        else:
            self.load_pickle(file)
        # Positional (iloc) row indices, visited in shuffled order.
        self.sequence = np.arange(len(self.user_data))
        random.shuffle(self.sequence)
        self.sequence_index = 0

    def load_csv(self, file):
        """Read the raw user CSV, clean it, and cache it at 'data/user_simple'.

        'age' is mapped through ``convert_age`` and 'sex' through ``int``.
        Only rows whose 'user_reg_tm' compares less than the module-level
        ``test_start`` are kept.

        Args:
            file: Path to the GBK-encoded user CSV.
        """
        user_data_pkl = 'data/user_simple'
        self.user_data = pd.read_csv(file, encoding='gbk')
        self.user_data['age'] = self.user_data['age'].map(convert_age)
        self.user_data['sex'] = self.user_data['sex'].map(int)
        self.user_data = self.user_data[self.user_data['user_reg_tm'] < test_start]
        # Pickle streams are bytes: binary mode is required on Python 3 and
        # avoids newline translation on Windows.
        with open(user_data_pkl, 'wb') as f:
            pickle.dump(self.user_data, f)

    def load_pickle(self, file):
        """Load a previously cached user table from the pickle at ``file``.

        NOTE: unpickling can execute arbitrary code; only load files this
        project wrote itself.
        """
        with open(file, 'rb') as f:
            self.user_data = pickle.load(f)

    def get_user_batch(self, size=300):
        """Return the next ``size`` rows in the shuffled order.

        The cursor always advances by ``size``. Once the shuffled sequence
        is exhausted the final batch is shorter, and later calls return an
        empty frame; there is no wrap-around or reshuffle.

        Args:
            size: Number of rows requested.

        Returns:
            A DataFrame holding the selected rows of the user table.
        """
        start = self.sequence_index
        idx = self.sequence[start:start + size]
        self.sequence_index += size
        return self.user_data.iloc[idx]

### Unit test for UserSampler
To do

In [23]:
# Smoke check: load the cached user pickle and show the ids of one random batch.
us = UserSampler('data/user_simple')
batch = us.get_user_batch(20)
# The parenthesised single-argument form prints identically on Python 2 and 3.
print(batch['user_id'])

83297    283298
31167    231168
81946    281947
63562    263563
13161    213162
79488    279489
72462    272463
47275    247276
72738    272739
20271    220272
15393    215394
98458    298459
25465    225466
34059    234060
95462    295463
65616    265617
32845    232846
30066    230067
35037    235038
6744     206745
Name: user_id, dtype: int64


### Action data
Convert each action time to integer seconds elapsed since 2016-02-01 00:00:00.

Drop the `model_id` column; it is too complicated to use here.

Generate impulse of actions from sampled users.

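 Use an exponential kernel for impulses: an action at time t contributes exp((T - t) * w) when evaluated at time T, where w is the weight for that action type.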
 
 Use 6 different kernel weights, one for each of the 6 action types (1-6).

In [24]:
# Simplify action data: drop model_id, turn timestamps into integer seconds
# elapsed since 2016-02-01 00:00:00, cast user_id to int, and cache the result.
# NOTE: unpickling can execute arbitrary code; only load files this project wrote.
# Pickle streams are bytes, so both files are opened in binary mode.
with open('data/all_action', 'rb') as f:
    all_action = pickle.load(f)
all_action = all_action.drop('model_id', axis=1)
# `datetime` is the class (from `from datetime import datetime`), so the origin
# is built with datetime(...); datetime.date(2016, 2, 1) would raise TypeError.
all_action['time'] = pd.to_datetime(all_action['time']) - datetime(2016, 2, 1)
# Vectorised conversion; truncates toward zero exactly like int(x.total_seconds()).
all_action['time'] = all_action['time'].dt.total_seconds().astype('int64')
all_action['user_id'] = all_action['user_id'].astype('int64')
with open('data/simple_action', 'wb') as f:
    pickle.dump(all_action, f)
# Release the large frame before later cells allocate their own copy.
del all_action

In [25]:
class ActionSampler(object):
    """Compute exponential-kernel impulse features from users' action logs.

    The action table is expected to carry 'user_id', 'type' and 'time'
    columns. Action types are labelled 1..num_action; slot ``i`` of every
    kernel and of every impulse vector corresponds to action type ``i + 1``.
    """

    # Number of distinct action types (types are labelled 1..num_action).
    num_action = 6

    def __init__(self, action_pkl):
        """Load the pickled action table from ``action_pkl``.

        NOTE: unpickling can execute arbitrary code; only load files this
        project wrote itself.
        """
        # Pickle streams are bytes: binary mode is required on Python 3.
        with open(action_pkl, 'rb') as f:
            self.all_action = pickle.load(f)
        self.kernel_list = []

    def add_kernel(self, weights):
        """Register a kernel: one exponent weight per action type.

        Args:
            weights: Sequence of ``num_action`` weights; ``weights[i]``
                applies to action type ``i + 1``. A negative weight makes
                older actions contribute less.
        """
        assert len(weights) == self.num_action, "Each action type requires a weight"
        self.kernel_list.append(weights)

    def get_kernel(self, kernel_id):
        """Return the kernel registered at position ``kernel_id``."""
        return self.kernel_list[kernel_id]

    def get_impulse(self, user_id, kernel_id, till_time):
        """Return one user's impulse vector evaluated at ``till_time``.

        For each action type, every action of that type by ``user_id`` with
        ``time <= till_time`` contributes ``exp((till_time - time) * w)``,
        where ``w`` is the kernel weight for that type.

        Args:
            user_id: Value matched against the 'user_id' column.
            kernel_id: Index of a kernel previously added with ``add_kernel``.
            till_time: Evaluation time; later actions are ignored.

        Returns:
            A float array of length ``num_action``; slot ``i`` holds the
            impulse for action type ``i + 1``.
        """
        kernel = self.get_kernel(kernel_id)
        data = self.all_action[self.all_action['user_id'] == user_id]
        # Apply the time cut-off once instead of once per action type.
        data = data[data['time'] <= till_time]
        impulse = np.zeros(self.num_action)
        for slot in range(self.num_action):
            # Types are 1-based while array slots are 0-based.
            action_type = slot + 1
            action_time = np.array(data[data['type'] == action_type]['time'])
            impulse[slot] = np.sum(np.exp((till_time - action_time) * kernel[slot]))
        return impulse

    def get_impulse_batch(self, user_ids, kernel_id, till_time):
        """Return impulse vectors for several users, one row per user.

        Args:
            user_ids: Sized iterable of user ids.
            kernel_id: Index of a kernel previously added with ``add_kernel``.
            till_time: Evaluation time shared by all users.

        Returns:
            A float array of shape ``(len(user_ids), num_action)``.
        """
        results = np.zeros([len(user_ids), self.num_action])
        for idx, user in enumerate(user_ids):
            results[idx, :] = self.get_impulse(user, kernel_id, till_time)
        return results
            