In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_column",100)
pd.set_option("expand_frame_repr",False)
%matplotlib inline

In [3]:
#1--读取数据
test_data=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\test_format1.csv')
train_data=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\train_format1.csv')
user_info=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\user_info_format1.csv')
user_log=pd.read_csv(r'D:\文件\学习\数据集\天猫用户复购\data_format1\user_log_format1.csv')

In [4]:
def reduce_mem_usage(df,verbose=True):
    start_mem=df.memory_usage().sum()/1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df.columns:
        col_type=df[col].dtypes
        if col_type in numerics:
            c_min =df[col].min()
            c_max=df[col].max()
            if str(col_type)[:3]=="int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col]=df[col].astype(np.int8)
                elif c_min >np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col]=df[col].astype(np.int16)
                elif c_min >np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] =df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col]=df[col].astype(np.float64)
    end_mem=df.memory_usage().sum()/1024**2
    print('Memory usage before optimization is: {:.2f} MB'.format(start_mem))
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    return df

In [5]:
train_data=reduce_mem_usage(train_data)
test_data=reduce_mem_usage(test_data)
user_info=reduce_mem_usage(user_info)
user_log=reduce_mem_usage(user_log)

Memory usage before optimization is: 5.97 MB
Memory usage after optimization is: 1.74 MB
Memory usage before optimization is: 5.98 MB
Memory usage after optimization is: 3.49 MB
Memory usage before optimization is: 9.71 MB
Memory usage after optimization is: 3.24 MB
Memory usage before optimization is: 2933.33 MB
Memory usage after optimization is: 890.48 MB


In [6]:
import gc
from collections import Counter

In [7]:
gc.collect()

60

In [8]:
all_data=train_data.append(test_data)

In [9]:
all_data.head(3)

Unnamed: 0,user_id,merchant_id,label,prob
0,34176,3906,0.0,
1,34176,121,0.0,
2,34176,4356,1.0,


In [10]:
all_data=all_data.merge(user_info,on=["user_id"],how="left")

In [11]:
user_log.head(3)

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,328862,323294,833,2882,2660.0,829,0
1,328862,844400,1271,2882,2660.0,829,0
2,328862,575153,1271,2882,2660.0,829,0


In [12]:
user_log=user_log.sort_values(["user_id","time_stamp"])
user_log.head(3)

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
23288890,1,181459,276,2245,4752.0,1009,0
23288891,1,779078,276,2245,4752.0,1009,0
23288892,1,779078,276,2245,4752.0,1009,0


In [13]:
list_join_func=lambda x:" ".join([str(i) for i in x])

In [14]:
agg_dict={
    "item_id":list_join_func,
    "cat_id":list_join_func,
    "seller_id":list_join_func,
    "brand_id":list_join_func,
    "time_stamp":list_join_func,
    "action_type":list_join_func
}

In [15]:
rename_dict={
    "item_id":"item_path",
    "cat_id":"cat_path",
    "seller_id":"seller_path",
    "brand_id":"brand_path",
    "time_stamp":"time_stamp_path",
    "action_type":"action_type_path"
}

In [16]:
user_log_path=user_log.groupby("user_id").agg(agg_dict)
user_log_path.head(3)

Unnamed: 0_level_0,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,181459 779078 779078 452837 543397 504149 5041...,276 276 276 276 276 1023 1023 1023 1023 1252 1...,2245 2245 2245 2245 2245 925 925 925 925 4026 ...,4752.0 4752.0 4752.0 4752.0 4752.0 7400.0 7400...,1009 1009 1009 1009 1009 1011 1011 1011 1011 1...,0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 ...
2,348983 749563 239288 751744 239288 714176 1972...,177 177 602 602 602 1213 602 602 602 1213 1213...,2223 2223 420 420 420 420 420 420 420 420 420 ...,3272.0 3272.0 4952.0 4952.0 4952.0 4058.0 4952...,527 527 626 626 626 626 626 626 626 626 626 62...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 ...
3,895754 895754 182882 182882 985337 175397 9463...,1505 1505 1271 1271 1271 662 1134 1134 1134 45...,795 795 2123 2123 4925 1102 4461 4461 4461 474...,3608.0 3608.0 4796.0 4796.0 8004.0 1214.0 905....,516 516 627 627 627 727 819 819 820 906 906 90...,2 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [17]:
user_log_path=user_log_path.reset_index()
user_log_path.head(3)

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type
0,1,181459 779078 779078 452837 543397 504149 5041...,276 276 276 276 276 1023 1023 1023 1023 1252 1...,2245 2245 2245 2245 2245 925 925 925 925 4026 ...,4752.0 4752.0 4752.0 4752.0 4752.0 7400.0 7400...,1009 1009 1009 1009 1009 1011 1011 1011 1011 1...,0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 ...
1,2,348983 749563 239288 751744 239288 714176 1972...,177 177 602 602 602 1213 602 602 602 1213 1213...,2223 2223 420 420 420 420 420 420 420 420 420 ...,3272.0 3272.0 4952.0 4952.0 4952.0 4058.0 4952...,527 527 626 626 626 626 626 626 626 626 626 62...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 ...
2,3,895754 895754 182882 182882 985337 175397 9463...,1505 1505 1271 1271 1271 662 1134 1134 1134 45...,795 795 2123 2123 4925 1102 4461 4461 4461 474...,3608.0 3608.0 4796.0 4796.0 8004.0 1214.0 905....,516 516 627 627 627 727 819 819 820 906 906 90...,2 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [18]:
user_log_path=user_log_path.rename(columns=rename_dict)
user_log_path.head(3)

Unnamed: 0,user_id,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,1,181459 779078 779078 452837 543397 504149 5041...,276 276 276 276 276 1023 1023 1023 1023 1252 1...,2245 2245 2245 2245 2245 925 925 925 925 4026 ...,4752.0 4752.0 4752.0 4752.0 4752.0 7400.0 7400...,1009 1009 1009 1009 1009 1011 1011 1011 1011 1...,0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 ...
1,2,348983 749563 239288 751744 239288 714176 1972...,177 177 602 602 602 1213 602 602 602 1213 1213...,2223 2223 420 420 420 420 420 420 420 420 420 ...,3272.0 3272.0 4952.0 4952.0 4952.0 4058.0 4952...,527 527 626 626 626 626 626 626 626 626 626 62...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 0 0 ...
2,3,895754 895754 182882 182882 985337 175397 9463...,1505 1505 1271 1271 1271 662 1134 1134 1134 45...,795 795 2123 2123 4925 1102 4461 4461 4461 474...,3608.0 3608.0 4796.0 4796.0 8004.0 1214.0 905....,516 516 627 627 627 727 819 819 820 906 906 90...,2 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [19]:
all_data_path=all_data.merge(user_log_path,on=["user_id"],how="left")

In [20]:
all_data_path.head(3)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
1,34176,121,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
2,34176,4356,1.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...


In [21]:
#定义统计函数

In [22]:
#总数
def cnt_(x):
    try:
        return len(x.split(" "))
    except:
        return -1

In [23]:
def nunique_(x):
    try:
        return len(set(x.split(" ")))
    except:
        return -1

In [24]:
def max_(x):
    try:
        return np.max([int(i) for i in x.split(" ")])
    except:
        return -1

In [25]:
def min_(x):
    try:
        return np.min([int(i) for i in x.split(" ")])
    except:
        return -1

In [26]:
def std_(x):
    try:
        return np.std([float(i) for i in x.split(" ")])
    except:
        return -1

In [27]:
def most_n_cnt(x,n):
    try:
        return Counter(x.split(" ")).most_common(n)[n-1][1]
    except:
        return -1

In [28]:
def most_n(x,n):
    try:
        return Counter(x.split(" ")).most_common(n)[n-1][0]
    except:
        return -1

In [29]:
#调用统计函数的函数

In [30]:
def user_cnt(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(cnt_)
    return df_data

In [31]:
def user_nunique(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(nunique_)
    return df_data

In [32]:
def user_max(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(max_)
    return df_data

In [33]:
def user_min(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(min_)
    return df_data

In [34]:
def user_std(df_data,single_col,name):
    df_data[name]=df_data[single_col].apply(std_)
    return df_data

In [35]:
def user_most_n(df_data,single_col,name,n=1):
    func=lambda x:most_n(x,n)
    df_data[name]=df_data[single_col].apply(func)
    return df_data

In [36]:
def user_most_n_cnt(df_data,single_col,name,n=1):
    func=lambda x:most_n_cnt(x,n)
    df_data[name]=df_data[single_col].apply(func)
    return df_data

In [37]:
all_data_test=all_data_path.head(2000)
all_data_test.head(3)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
1,34176,121,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...
2,34176,4356,1.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...


In [38]:
all_data_test=user_cnt(all_data_test,"seller_path","user_cnt")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451


In [39]:
all_data_test=user_nunique(all_data_test,"seller_path","seller_nunique")

In [40]:
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109


In [41]:
all_data_test=user_nunique(all_data_test,"cat_path","cat_nunique")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45


In [42]:
all_data_test=user_nunique(all_data_test,"brand_path","brand_nunique")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106


In [43]:
all_data_test=user_nunique(all_data_test,"item_path","item_nunique")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256


In [44]:
all_data_test=user_nunique(all_data_test,"time_stamp_path","time_stamp_nunique")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47


In [45]:
all_data_test=user_nunique(all_data_test,"action_type_path","action_type_nunique")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3


In [46]:
all_data_test=user_max(all_data_test,"action_type_path","time_stamp_max")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3,3


In [47]:
all_data_test=user_min(all_data_test,"action_type_path","time_stamp_min")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3,3,0


In [48]:
all_data_test=user_std(all_data_test,"action_type_path","time_stamp_std")
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3,3,0,0.63427


In [49]:
all_data_test["time_stamp_range"]=all_data_test["time_stamp_max"]-all_data_test["time_stamp_min"]
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std,time_stamp_range
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3,3,0,0.63427,3


In [50]:
all_data_test=user_most_n(all_data_test,"seller_path","seller_most_1",n=1)
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std,time_stamp_range,seller_most_1
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3,3,0,0.63427,3,331


In [51]:
all_data_test = user_most_n(all_data_test, 'brand_path', 'brand_most_1', n=1)
# 最常见的行为动作
all_data_test = user_most_n(all_data_test, 'action_type_path', 'action_type_1', n=1)
# 用户最喜欢的店铺 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'seller_path', 'seller_most_1_cnt', n=1)
# 最喜欢的类目 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'cat_path', 'cat_most_1_cnt', n=1)
# 最喜欢的品牌 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'brand_path', 'brand_most_1_cnt', n=1)
# 最常见的行为动作 行为次数
all_data_test = user_most_n_cnt(all_data_test, 'action_type_path', 'action_type_1_cnt', n=1)
all_data_test.head(1)

Unnamed: 0,user_id,merchant_id,label,prob,age_range,gender,item_path,cat_path,seller_path,brand_path,time_stamp_path,action_type_path,user_cnt,seller_nunique,cat_nunique,brand_nunique,item_nunique,time_stamp_nunique,action_type_nunique,time_stamp_max,time_stamp_min,time_stamp_std,time_stamp_range,seller_most_1,brand_most_1,action_type_1,seller_most_1_cnt,cat_most_1_cnt,brand_most_1_cnt,action_type_1_cnt
0,34176,3906,0.0,,6.0,0.0,581818 879005 581818 581818 1011673 52343 2773...,1505 662 1505 1505 1505 662 1095 1505 662 1095...,416 3606 416 416 416 3760 3606 416 1926 3004 4...,4014.0 33.0 4014.0 4014.0 4014.0 3738.0 33.0 4...,521 521 521 521 521 521 521 521 521 521 521 52...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 ...,451,109,45,106,256,47,3,3,0,0.63427,3,331,4094.0,0,70,98,70,410


In [56]:
all_data_test.to_csv(r"C:\Users\xiongyuan\Desktop\all_data_test.csv")