In [1]:
import pandas as pd
import numpy as np

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def load_data(path):
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv',header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'item.csv',header=None))
    data = pd.read_csv(path + 'user_behavior.csv',header=None)

    data.columns = ['userID','itemID','behavior','timestamp']
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24
    
    ## 生成behavior的onehot
    for i in ['pv','fav','cart','buy']:
        data[i] = 0
        data.loc[data['behavior'] == i, i] = 1

    ## 生成behavior的加权
    
    data['day_hour'] = data['day'] + data['hour'] / float(24)
    data.loc[data['behavior']=='pv','behavior'] = 1
    data.loc[data['behavior']=='fav','behavior'] = 2
    data.loc[data['behavior']=='cart','behavior'] = 3
    data.loc[data['behavior']=='buy','behavior'] = 1
    max_day = max(data['day'])
    min_day = min(data['day'])
    data['behavior'] = (1 - (max_day-data['day_hour']+2)/(max_day-min_day+2)) * data['behavior'] 

    item.columns = ['itemID','category','shop','brand']
    user.columns = ['userID','sex','age','ability']
    
    data = reduce_mem_usage(data)

    data = pd.merge(left=data, right=item, on='itemID',how='left')
    data = pd.merge(left=data, right=user, on='userID',how='left')

    return user, item, data
    

In [3]:
path = '../ECommAI_EUIR_round2_train_20190816/'

user, item, data = load_data(path = path)

Memory usage of dataframe is 44702560.00 MB
Memory usage after optimization is: 9778785.00 MB
Decreased by 78.1%
Memory usage of dataframe is 138182592.00 MB
Memory usage after optimization is: 60454956.00 MB
Decreased by 56.2%
Memory usage of dataframe is 7081839904.00 MB
Memory usage after optimization is: 1770460072.00 MB
Decreased by 75.0%


In [None]:
for count_feature in ['itemID', 'shop', 'category','brand']:
    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(
        {'behavior':'count'}).rename(columns={'behavior':count_feature + '_count'}).to_csv(str(count_feature)+'_count.csv', index=False)

for count_feature in ['itemID', 'shop', 'category','brand']:
    data[['behavior', count_feature]].groupby(count_feature, as_index=False).agg(
        {'behavior':'sum'}).rename(columns={'behavior':count_feature + '_sum'}).to_csv(str(count_feature)+'_sum.csv', index=False)

In [None]:
temp = data[['behavior','category']].groupby('category', as_index=False).agg({'behavior': ['median','std','skew']})
temp.columns = ['category','category_median','category_std','category_skew']

temp.to_csv('category_higher.csv',index=False)

In [None]:
temp = data[['behavior','itemID']].groupby('itemID', as_index=False).agg({'behavior': ['median','std','skew']})
temp.columns = ['itemID','itemID_median','itemID_std','itemID_skew']

temp.to_csv('itemID_higher.csv',index=False)

In [4]:
data['age'] = data['age'] // 10
train = data[data['day'] < 15]

In [8]:
## 注意 这个要生成一个underline版本和一个online版本
for count_feature in ['sex','ability','age']:
    data[['behavior','itemID',count_feature]].groupby(['itemID', count_feature], as_index=False).agg(
        {'behavior': 'count'}).rename(columns={'behavior':'user_to_'
                                               + count_feature + '_count'}).to_csv('item_to_' + str(count_feature)+'_count_online.csv', index=False)

In [None]:
itemcount = pd.read_csv('itemID_count.csv')

In [None]:
temp = pd.merge(left=item, right=itemcount, how='left', on='itemID')

In [None]:
item_rank = []
for eachcat in temp.groupby('category'):
    each_df = eachcat[1].sort_values('itemID_count', ascending=False).reset_index(drop=True)
    each_df['rank'] = each_df.index + 1
    lenth = each_df.shape[0]
    each_df['rank_percent'] = (each_df.index + 1) / lenth
    item_rank.append(each_df[['itemID','rank','rank_percent']])

In [None]:
item_rank = pd.concat(item_rank, sort=False)

In [None]:
item_rank.to_csv('item_rank.csv',index=False)

In [None]:
def unique_count(x):
    return len(set(x))

In [None]:
cat1 = item.groupby('category',as_index=False).agg({'itemID': unique_count}).rename(columns={'itemID':'itemnum_undercat'})

In [None]:
cat2 = item.groupby('category',as_index=False).agg({'brand': unique_count}).rename(columns={'brand':'brandnum_undercat'})

In [None]:
cat3 = item.groupby('category',as_index=False).agg({'shop': unique_count}).rename(columns={'shop':'shopnum_undercat'})

In [None]:
pd.concat([cat1, cat2[['brandnum_undercat']], cat3[['shopnum_undercat']]], axis=1).to_csv('category_lower.csv',index=False)