In [1]:
import pandas as pd
import numpy as np

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def load_data(path):
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))
    data = pd.read_csv(path + 'user_behavior.csv',header=None)

    data.columns = ['userID','itemID','behavior','timestamp']
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24
    
    ## 生成behavior的onehot
    for i in ['pv','fav','cart','buy']:
        data[i] = 0
        data.loc[data['behavior'] == i, i] = 1

    ## 生成behavior的加权
    
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    data.loc[data['behavior']=='pv','behavior'] = 1
    data.loc[data['behavior']=='fav','behavior'] = 2
    data.loc[data['behavior']=='cart','behavior'] = 3
    data.loc[data['behavior']=='buy','behavior'] = 1
    max_day = max(data['day'])
    min_day = min(data['day'])
    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] 

    item.columns = ['itemID','category','shop','brand']
    user.columns = ['userID','sex','age','ability']
    
    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID',how='left')
    data = pd.merge(left=data, right=user, on='userID',how='left')

    return user, item, data
    

In [None]:
#path = '..\\data\\'
path = '../ECommAI_EUIR_round2_train_20190816/'
user, item, data = load_data(path = path)

Memory usage of dataframe is 44702560.00 MB
Memory usage after optimization is: 9778785.00 MB
Decreased by 78.1%
Memory usage of dataframe is 138182592.00 MB
Memory usage after optimization is: 60454956.00 MB
Decreased by 56.2%
Memory usage of dataframe is 7081839904.00 MB
Memory usage after optimization is: 1770460072.00 MB
Decreased by 75.0%


In [None]:
train = data[data['day'] < 15]

In [None]:
train = data[data['day'] < 15]

online_features = []
for count_feature in ['category','shop','brand']:
    train[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
                                               + count_feature + '_count'}).to_csv('user_to_' + str(count_feature)+'_count.csv', index=False)
for count_feature in ['category','shop','brand']:
    train[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'sum'}).rename(columns={'behavior':'user_to_' 
                                             + count_feature + '_sum'}).to_csv('user_to_' + str(count_feature)+'_sum.csv', index=False)

for count_feature in ['category','shop','brand']:
    for behavior_type in ['pv','fav','cart','buy']:
        train[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'
                                                   + count_feature + '_count_' + behavior_type}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '.csv', index=False)



In [None]:
yestday = data[data['day'] == 14]

for count_feature in ['category','shop','brand']:
    yestday[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
                                               + count_feature + '_count_yestday'}).to_csv('user_to_' + str(count_feature)+'_count_yestday.csv', index=False)

for count_feature in ['category','shop','brand']:
    for behavior_type in ['pv','fav','cart','buy']:
        yestday[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'
                                                   + count_feature + '_count_' + behavior_type+'_yestday'}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '_yestday.csv', index=False)


In [None]:
a5days = data[(data['day'] > 15 - 5) & (data['day'] < 15)]

for count_feature in ['category','shop','brand']:
    a5days[['behavior','userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
                                               + count_feature + '_count_5days'}).to_csv('user_to_' + str(count_feature)+'_count_5days.csv', index=False)

for count_feature in ['category','shop','brand']:
    for behavior_type in ['pv','fav','cart','buy']:
        a5days[[behavior_type,'userID',count_feature]].groupby(['userID', count_feature], as_index=False).agg(
            {behavior_type: 'sum'}).rename(columns={behavior_type:'user_to_'
                                                   + count_feature + '_count_' + behavior_type+'_5days'}).to_csv('user_to_' + str(count_feature) + '_count_' + behavior_type + '_5days.csv', index=False)


In [None]:
start_timestamp  = max(data[data['day'] < 15]['timestamp'])

In [None]:
time_features = []
test = data[data['day'] < 15]
for time_feature in ['shop', 'category','brand']:
    time_features.append(test[['last_time','userID',time_feature,'day']].groupby(['userID',time_feature], as_index=False).agg({'last_time': 'min', 'day':'max'}).rename(columns={'last_time': 'user_to_'
                                                   + time_feature + '_lasttime', 'day':'user_to_'+ time_feature + '_lastday'}))

In [None]:
for f in time_features:
    f.to_csv(str(f.columns[2])+'.csv', index=False)

In [None]:
for f in time_features:
    print(str(f.columns[2])+'.csv')

In [None]:
for count_feature in ['sex','ability','age']:
    train[['behavior','itemID',count_feature]].groupby(['itemID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
                                               + count_feature + '_count'}).to_csv('item_to_' + str(count_feature)+'_count.csv', index=False)